travel-agent-cli 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +6 -6
- package/package.json +2 -2
- package/python/agents/__init__.py +19 -0
- package/python/agents/analysis_agent.py +234 -0
- package/python/agents/base.py +377 -0
- package/python/agents/collector_agent.py +304 -0
- package/python/agents/manager_agent.py +251 -0
- package/python/agents/planning_agent.py +161 -0
- package/python/agents/product_agent.py +672 -0
- package/python/agents/report_agent.py +172 -0
- package/python/analyzers/__init__.py +10 -0
- package/python/analyzers/hot_score.py +123 -0
- package/python/analyzers/ranker.py +225 -0
- package/python/analyzers/route_planner.py +86 -0
- package/python/cli/commands.py +254 -0
- package/python/collectors/__init__.py +14 -0
- package/python/collectors/ota/ctrip.py +120 -0
- package/python/collectors/ota/fliggy.py +152 -0
- package/python/collectors/weibo.py +235 -0
- package/python/collectors/wenlv.py +155 -0
- package/python/collectors/xiaohongshu.py +170 -0
- package/python/config/__init__.py +30 -0
- package/python/config/models.py +119 -0
- package/python/config/prompts.py +105 -0
- package/python/config/settings.py +172 -0
- package/python/export/__init__.py +6 -0
- package/python/export/report.py +192 -0
- package/python/main.py +632 -0
- package/python/pyproject.toml +51 -0
- package/python/scheduler/tasks.py +77 -0
- package/python/tools/fliggy_mcp.py +553 -0
- package/python/tools/flyai_tools.py +251 -0
- package/python/tools/mcp_tools.py +412 -0
- package/python/utils/__init__.py +9 -0
- package/python/utils/http.py +73 -0
- package/python/utils/storage.py +288 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""微博数据采集器
|
|
2
|
+
|
|
3
|
+
支持:
|
|
4
|
+
1. 微博开放平台 API
|
|
5
|
+
2. 网页爬取(备用方案)
|
|
6
|
+
"""
|
|
7
|
+
import asyncio
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
|
|
11
|
+
from config.models import SocialPost, SourceType
|
|
12
|
+
from config.settings import get_settings
|
|
13
|
+
from utils.storage import Storage
|
|
14
|
+
from utils.http import get_client
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class WeiboCollector:
    """Weibo collector.

    Prefers the Weibo open-platform API; when no access token is available,
    or the API call fails, it degrades to scraping the public search page.
    """

    # Weibo open-platform timestamps look like "Tue May 31 17:46:55 +0800 2011".
    _CREATED_AT_FORMAT = "%a %b %d %H:%M:%S %z %Y"

    def __init__(self):
        self.storage = Storage()
        self.settings = get_settings()

        # Weibo open-platform API base URL and the (not yet implemented) OAuth token.
        self.api_base = "https://api.weibo.com"
        self.access_token: Optional[str] = None

    @staticmethod
    def _stable_id(text: str) -> int:
        """Return a deterministic 6-digit id derived from *text*.

        The previous code used built-in ``hash()``, which is randomized per
        process (PYTHONHASHSEED), so scraped posts received a different id on
        every run and could never be deduplicated in storage.
        """
        import hashlib

        return int(hashlib.md5(text.encode("utf-8")).hexdigest(), 16) % 1000000

    @classmethod
    def _parse_created_at(cls, value: str) -> Optional[datetime]:
        """Parse a Weibo ``created_at`` string, returning None on failure.

        The open-platform API emits the legacy ctime-like format (see
        ``_CREATED_AT_FORMAT``); ISO-8601 with an optional "CST" suffix is
        kept as a fallback.  The previous ``datetime.fromisoformat``-only
        parsing raised ValueError on real API payloads and aborted the whole
        page of results.
        """
        try:
            return datetime.strptime(value, cls._CREATED_AT_FORMAT)
        except ValueError:
            pass
        try:
            return datetime.fromisoformat(value.replace("CST", "+08:00"))
        except ValueError:
            return None

    def _get_auth_token(self) -> Optional[str]:
        """Return the Weibo API access token, or None when keys are unconfigured."""
        if not self.settings.weibo_app_key or not self.settings.weibo_app_secret:
            return None

        # TODO: implement the OAuth2.0 flow.
        # See: https://open.weibo.com/wiki/Oauth2
        return self.access_token

    async def search(
        self, keyword: str = "旅行", count: int = 20, page: int = 1
    ) -> List[SocialPost]:
        """Search Weibo for *keyword*.

        Uses the official API when a token is available and falls back to
        scraping the public search page on any API failure.
        """
        # Plan 1: Weibo open-platform API.
        token = self._get_auth_token()
        if token:
            try:
                return await self._search_by_api(keyword, count, page)
            except Exception as e:
                print(f"微博 API 调用失败:{e},降级为网页爬取")

        # Plan 2: scrape the public search page.
        return await self._search_by_web(keyword, count, page)

    async def _search_by_api(
        self, keyword: str, count: int, page: int
    ) -> List[SocialPost]:
        """Search via the official API and persist each post.

        Endpoint docs: https://open.weibo.com/wiki/2/search/statuses
        """
        url = f"{self.api_base}/2/search/statuses.json"
        params = {
            "q": keyword,
            "count": count,
            "page": page,
            "access_token": self.access_token,
        }

        async with get_client() as client:
            response = await client.get(url, params=params)
            response.raise_for_status()
            data = response.json()

        posts = []
        for status in data.get("statuses", []):
            created_at = (
                self._parse_created_at(status["created_at"])
                if "created_at" in status
                else None
            )
            post = SocialPost(
                id=str(status["id"]),
                source=SourceType.WEIBO,
                title=status.get("text", "")[:100],
                content=status.get("text", ""),
                author=status["user"]["screen_name"],
                author_id=str(status["user"]["id"]),
                likes=status.get("attitudes_count", 0),
                comments=status.get("comments_count", 0),
                shares=status.get("reposts_count", 0),
                url=f"https://weibo.com/{status['id']}",
                published_at=created_at,
            )
            posts.append(post)
            await self.storage.save_post(post)

        return posts

    async def _search_by_web(
        self, keyword: str, count: int, page: int
    ) -> List[SocialPost]:
        """Scrape the public search result page (best effort)."""
        import re  # hoisted out of the per-card loop

        collected_posts = []

        try:
            async with get_client() as client:
                # Weibo search result page.
                url = "https://s.weibo.com/weibo"
                params = {
                    "q": keyword,
                    "typeall": "sub",
                    "suball": "1",
                    "page": page,
                }

                headers = {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                    "Referer": "https://weibo.com/",
                }

                response = await client.get(url, params=params, headers=headers)

                if response.status_code == 200:
                    # Parse the HTML result page.
                    from bs4 import BeautifulSoup

                    soup = BeautifulSoup(response.text, "html.parser")

                    # Each search result card.
                    for item in soup.select('.card-wrap'):
                        try:
                            content_elem = item.select_one('.content')
                            if not content_elem:
                                continue

                            # Basic fields.
                            text = content_elem.get_text(strip=True)
                            title = text[:100]
                            author_elem = item.select_one('.name')
                            author = author_elem.get_text(strip=True) if author_elem else "未知"

                            # Engagement: the last action item carries the like count.
                            likes = 0
                            likes_elem = item.select_one('.card-act li:last-child')
                            if likes_elem:
                                match = re.search(r'\d+', likes_elem.get_text())
                                if match:
                                    likes = int(match.group())

                            post = SocialPost(
                                # Deterministic id (was a randomized hash()).
                                id=f"weibo_{self._stable_id(title)}",
                                source=SourceType.WEIBO,
                                title=title,
                                content=text,
                                author=author,
                                # str for consistency with the API path; was a raw
                                # (randomized) hash() int before.
                                author_id=str(self._stable_id(author)),
                                likes=likes,
                                url="https://s.weibo.com",
                                published_at=datetime.now(),
                            )
                            collected_posts.append(post)
                            await self.storage.save_post(post)

                            if len(collected_posts) >= count:
                                break

                        except Exception:
                            # Skip malformed cards; scraping is best effort.
                            continue

        except Exception as e:
            print(f"微博爬取失败:{e}")

        # Fall back to sample data when nothing was scraped.
        if not collected_posts:
            collected_posts = self._get_sample_posts(keyword)

        return collected_posts

    def _get_sample_posts(self, keyword: str) -> List[SocialPost]:
        """Return canned sample posts (testing / scraping fallback)."""
        samples = [
            {
                "id": "weibo_001",
                "title": "春天去哪里旅行?这 5 个目的地强烈推荐!",
                "content": "春暖花开,正是出游好时节!推荐 5 个宝藏目的地:1.云南罗平油菜花 2.西藏林芝桃花 3.江西婺源 4.江苏兴化 5.贵州平坝樱花...",
                "author": "旅游博主小王",
                "author_id": "wb_user_001",
                "likes": 8923,
                "comments": 445,
                "shares": 1234,
                "url": "https://weibo.com/001",
            },
            {
                "id": "weibo_002",
                "title": "特种兵式旅游到底值不值?",
                "content": "24 小时打卡 8 个景点,日行 3 万步...这种旅游方式引发热议。我觉得...",
                "author": "旅行观察家",
                "author_id": "wb_user_002",
                "likes": 15234,
                "comments": 2345,
                "shares": 890,
                "url": "https://weibo.com/002",
            },
            {
                "id": "weibo_003",
                "title": "五一假期出行预测:这些城市最火",
                "content": "根据各大平台数据,五一假期最热门目的地预测:北京、上海、成都、重庆、西安...",
                "author": "旅行大数据",
                "author_id": "wb_user_003",
                "likes": 5678,
                "comments": 234,
                "shares": 567,
                "url": "https://weibo.com/003",
            },
        ]

        return [
            SocialPost(
                id=s["id"],
                source=SourceType.WEIBO,
                title=s["title"],
                content=s["content"],
                author=s["author"],
                author_id=s["author_id"],
                likes=s["likes"],
                comments=s["comments"],
                shares=s["shares"],
                url=s["url"],
                published_at=datetime.now(),
            )
            for s in samples
        ]
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""文旅局信息采集器
|
|
2
|
+
|
|
3
|
+
爬取各地文旅局官网、公众号的推广信息
|
|
4
|
+
"""
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import List, Dict
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
|
|
9
|
+
from config.models import WenlvInfo, SourceType
|
|
10
|
+
from utils.storage import Storage
|
|
11
|
+
from utils.http import get_client
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Official culture-and-tourism bureau websites to crawl.
# Each entry: display "name", site root "url", and the "region" it covers
# (used verbatim as WenlvInfo.region by the collector).
WENLV_WEBSITES = [
    {"name": "文化和旅游部", "url": "https://www.mct.gov.cn", "region": "全国"},
    {"name": "北京文旅局", "url": "https://wlj.beijing.gov.cn", "region": "北京"},
    {"name": "上海文旅局", "url": "https://whlyj.sh.gov.cn", "region": "上海"},
    {"name": "广东文旅厅", "url": "http://whlt.gd.gov.cn", "region": "广东"},
    {"name": "云南文旅厅", "url": "http://whlt.yn.gov.cn", "region": "云南"},
    {"name": "四川文旅厅", "url": "http://whlyt.sc.gov.cn", "region": "四川"},
    {"name": "浙江文旅厅", "url": "http://ct.zj.gov.cn", "region": "浙江"},
    {"name": "江苏文旅厅", "url": "http://wlt.jiangsu.gov.cn", "region": "江苏"},
    {"name": "陕西文旅厅", "url": "http://whlyt.shaanxi.gov.cn", "region": "陕西"},
    {"name": "海南旅文厅", "url": "http://wlt.hainan.gov.cn", "region": "海南"},
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class WenlvCollector:
    """Culture-and-tourism bureau collector.

    Scrapes the official bureau websites listed in ``WENLV_WEBSITES`` for
    promoted policies, events, and recommended routes.
    """

    def __init__(self):
        self.storage = Storage()

    @staticmethod
    def _stable_id(text: str) -> int:
        """Return a deterministic 6-digit id derived from *text*.

        Built-in ``hash()`` (used previously) is randomized per process
        (PYTHONHASHSEED), so the same link got a different id each run.
        """
        import hashlib

        return int(hashlib.md5(text.encode("utf-8")).hexdigest(), 16) % 1000000

    async def collect(self) -> List[WenlvInfo]:
        """Collect info from every configured bureau site (best effort).

        Per-site failures are logged and skipped; when nothing at all could
        be collected, canned sample data is returned instead.
        """
        # Make sure the backing database exists before saving.
        await self.storage.init_db()

        all_info = []

        for site in WENLV_WEBSITES:
            try:
                info_list = await self._collect_site(site)
                all_info.extend(info_list)
                print(f"已采集 {site['name']}: {len(info_list)} 条")
            except Exception as e:
                print(f"采集 {site['name']} 失败:{e}")

        # Fall back to sample data when nothing was collected.
        if not all_info:
            all_info = self._get_sample_info()

        return all_info

    async def _collect_site(self, site: Dict) -> List[WenlvInfo]:
        """Collect from a single bureau site.

        Site layouts differ, so this is a generic pass over the first 20
        anchors on the landing page; reliable extraction would need
        per-site adapters.
        """
        collected = []

        async with get_client() as client:
            # Fetch the site's landing page.
            response = await client.get(site["url"])

            if response.status_code == 200:
                from bs4 import BeautifulSoup

                soup = BeautifulSoup(response.text, "html.parser")

                for link in soup.find_all("a", href=True)[:20]:
                    text = link.get_text(strip=True)
                    href = link["href"]

                    # Drop navigation stubs and over-long anchor texts.
                    if len(text) < 10 or len(text) > 100:
                        continue

                    # Classify by keywords in the title.
                    info_type = "其他"
                    if any(kw in text for kw in ["政策", "通知", "公告"]):
                        info_type = "政策"
                    elif any(kw in text for kw in ["活动", "节庆", "优惠"]):
                        info_type = "活动"
                    elif any(kw in text for kw in ["路线", "推荐", "攻略"]):
                        info_type = "推荐路线"

                    # Resolve root-relative URLs against the site base.
                    if href.startswith("/"):
                        base = site["url"].rstrip("/")
                        href = f"{base}{href}"

                    info = WenlvInfo(
                        # Deterministic id (was a randomized hash()).
                        id=f"wenlv_{self._stable_id(href)}",
                        source=SourceType.WENLV,
                        title=text,
                        content=text,
                        region=site["region"],
                        url=href,
                        info_type=info_type,
                        published_at=datetime.now(),
                    )
                    collected.append(info)
                    await self.storage.save_wenlv(info)

        return collected

    def _get_sample_info(self) -> List[WenlvInfo]:
        """Return canned sample items (testing / scraping fallback)."""
        samples = [
            {
                "title": "文化和旅游部关于促进乡村旅游高质量发展的指导意见",
                "region": "全国",
                "info_type": "政策",
                "url": "https://www.mct.gov.cn/example1",
            },
            {
                "title": "云南推出 10 条精品旅游线路,五一假期出发",
                "region": "云南",
                "info_type": "推荐路线",
                "url": "http://whlt.yn.gov.cn/example2",
            },
            {
                "title": "海南离岛免税购物额度提升至 10 万元",
                "region": "海南",
                "info_type": "政策",
                "url": "http://wlt.hainan.gov.cn/example3",
            },
            {
                "title": "四川启动文旅消费季,发放 5000 万消费券",
                "region": "四川",
                "info_type": "活动",
                "url": "http://whlyt.sc.gov.cn/example4",
            },
            {
                "title": "浙江推出'诗画江南'文旅品牌",
                "region": "浙江",
                "info_type": "推荐路线",
                "url": "http://ct.zj.gov.cn/example5",
            },
        ]

        return [
            WenlvInfo(
                id=f"wenlv_{i}",
                source=SourceType.WENLV,
                title=s["title"],
                content=s["title"],  # sample items have no separate body text
                region=s["region"],
                url=s["url"],
                info_type=s["info_type"],
                published_at=datetime.now(),
            )
            for i, s in enumerate(samples)
        ]
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""小红书数据采集器
|
|
2
|
+
|
|
3
|
+
参考:MediaCrawler, xhs_one_spider
|
|
4
|
+
注意:小红书有严格的反爬机制,需要处理签名和验证码
|
|
5
|
+
"""
|
|
6
|
+
import asyncio
|
|
7
|
+
import hashlib
|
|
8
|
+
import time
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
from config.models import SocialPost, SourceType
|
|
14
|
+
from utils.storage import Storage
|
|
15
|
+
from utils.http import get_client
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class XiaohongshuCollector:
    """Xiaohongshu (RED) collector.

    Xiaohongshu has aggressive anti-scraping measures (request signing,
    captchas), so automated collection here is a best-effort skeleton and
    ``search`` currently always falls back to canned sample data.
    """

    def __init__(self):
        self.storage = Storage()
        self.base_url = "https://www.xiaohongshu.com"
        self.api_base = "https://edith.xiaohongshu.com"

    def _generate_xs_token(self) -> str:
        """Generate an X-s signature token.

        Simplified placeholder: the real signing algorithm changes often and
        must be reverse engineered and kept in sync with the site.
        """
        timestamp = str(int(time.time() * 1000))
        return hashlib.md5(timestamp.encode()).hexdigest()

    async def search(self, keyword: str = "旅行", page: int = 1, page_size: int = 20) -> List[SocialPost]:
        """Search Xiaohongshu notes for *keyword*.

        Because of the strict anti-scraping there are two viable options:
        1. the official open-platform API (requires an application), or
        2. simulated web scraping (requires signing + captcha handling).

        This simplified implementation issues the web request, but response
        parsing is not implemented yet, so it always returns sample data.
        """
        # Option 1: official open-platform API.
        # Apply at https://open.xiaohongshu.com/

        # Option 2: web scraping skeleton; the current endpoints need to be
        # reverse engineered before this produces real results.
        try:
            async with get_client() as client:
                # Search results page.
                url = f"{self.base_url}/explore/search"
                params = {
                    "source": "web_search",
                    "keyword": keyword,
                    "page": page,
                    "page_size": page_size,
                }

                headers = {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                    "Referer": self.base_url,
                }

                response = await client.get(url, params=params, headers=headers)

                if response.status_code == 200:
                    # TODO: parse the search results once the response
                    # structure has been analysed.
                    pass

        except Exception as e:
            print(f"小红书采集失败:{e}")

        # When automated collection fails, data-export tools can be used to
        # import manually (see xhs_one_spider, MediaCrawler, etc.).
        print(f"小红书搜索关键词:{keyword}")
        print("提示:小红书有严格反爬,建议使用以下方案:")
        print(" 1. 申请官方开放平台 API: https://open.xiaohongshu.com/")
        print(" 2. 使用 MediaCrawler 等开源工具手动采集后导入")

        # Sample data for testing.
        return self._get_sample_posts(keyword)

    def _get_sample_posts(self, keyword: str) -> List[SocialPost]:
        """Return canned sample notes (testing / scraping fallback)."""
        samples = [
            {
                "id": "xhs_001",
                "title": "三亚旅行 | 5 天 4 晚超详细攻略,人均 3000+ 搞定",
                "content": "三亚真的是国内海岛游的天花板!这次整理了 5 天 4 晚的详细攻略...",
                "author": "旅行达人小 A",
                "author_id": "user_001",
                "likes": 15234,
                "comments": 892,
                "shares": 1523,
                "tags": ["三亚", "旅行攻略", "海岛游", "海南"],
                "url": "https://www.xiaohongshu.com/discovery/item/001",
            },
            {
                "id": "xhs_002",
                "title": "云南大理 | 这才是正确的打开方式!",
                "content": "大理真的太美了!洱海骑行、古城漫步、苍山徒步...",
                "author": "背包客小李",
                "author_id": "user_002",
                "likes": 8921,
                "comments": 445,
                "shares": 678,
                "tags": ["大理", "云南", "洱海", "古城"],
                "url": "https://www.xiaohongshu.com/discovery/item/002",
            },
            {
                "id": "xhs_003",
                "title": "川西小环线 | 自驾 3 天看遍雪山草甸",
                "content": "川西真的是自驾天堂!这条小环线可以看到雪山、草甸、藏寨...",
                "author": "自驾达人老王",
                "author_id": "user_003",
                "likes": 12456,
                "comments": 623,
                "shares": 1890,
                "tags": ["川西", "自驾", "雪山", "四姑娘山"],
                "url": "https://www.xiaohongshu.com/discovery/item/003",
            },
            {
                "id": "xhs_004",
                "title": "西安美食攻略 | 100 元吃遍回民街",
                "content": "西安真的是美食天堂!肉夹馍、羊肉泡馍、凉皮...",
                "author": "吃货小分队",
                "author_id": "user_004",
                "likes": 23456,
                "comments": 1234,
                "shares": 2345,
                "tags": ["西安", "美食", "回民街", "陕西"],
                "url": "https://www.xiaohongshu.com/discovery/item/004",
            },
            {
                "id": "xhs_005",
                "title": "桂林阳朔 | 山水画中的慢生活",
                "content": "桂林山水甲天下,阳朔风光甲桂林!遇龙河漂流、十里画廊...",
                "author": "慢旅行者",
                "author_id": "user_005",
                "likes": 7654,
                "comments": 321,
                "shares": 543,
                "tags": ["桂林", "阳朔", "漓江", "广西"],
                "url": "https://www.xiaohongshu.com/discovery/item/005",
            },
        ]

        return [
            SocialPost(
                id=s["id"],
                source=SourceType.XIAOHONGSHU,
                title=s["title"],
                content=s["content"],
                author=s["author"],
                author_id=s["author_id"],
                likes=s["likes"],
                comments=s["comments"],
                shares=s["shares"],
                tags=s["tags"],
                url=s["url"],
                published_at=datetime.now(),
            )
            for s in samples
        ]

    async def collect_detail(self, note_id: str) -> Optional[SocialPost]:
        """Collect a single note's details (not implemented).

        Subject to the same anti-scraping constraints as ``search``; returns
        None until a working signed client is available.  (The previous
        ``pass`` body returned None implicitly — made explicit here.)
        """
        return None
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""配置模块"""
|
|
2
|
+
from config.settings import get_settings
|
|
3
|
+
from config.prompts import (
|
|
4
|
+
DESTINATION_ANALYSIS_PROMPT,
|
|
5
|
+
ROUTE_PLANNING_PROMPT,
|
|
6
|
+
REPORT_SUMMARY_PROMPT,
|
|
7
|
+
)
|
|
8
|
+
from config.models import (
|
|
9
|
+
SocialPost,
|
|
10
|
+
WenlvInfo,
|
|
11
|
+
FlightInfo,
|
|
12
|
+
HotelInfo,
|
|
13
|
+
DestinationRecommendation,
|
|
14
|
+
TravelRoute,
|
|
15
|
+
Report,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"get_settings",
|
|
20
|
+
"DESTINATION_ANALYSIS_PROMPT",
|
|
21
|
+
"ROUTE_PLANNING_PROMPT",
|
|
22
|
+
"REPORT_SUMMARY_PROMPT",
|
|
23
|
+
"SocialPost",
|
|
24
|
+
"WenlvInfo",
|
|
25
|
+
"FlightInfo",
|
|
26
|
+
"HotelInfo",
|
|
27
|
+
"DestinationRecommendation",
|
|
28
|
+
"TravelRoute",
|
|
29
|
+
"Report",
|
|
30
|
+
]
|