douyin-cli 5.3.260515__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backend/__init__.py +1 -0
- backend/cli.py +491 -0
- backend/constants.py +38 -0
- backend/lib/__init__.py +27 -0
- backend/lib/cookies.py +205 -0
- backend/lib/douyin/__init__.py +55 -0
- backend/lib/douyin/client.py +230 -0
- backend/lib/douyin/crawler.py +410 -0
- backend/lib/douyin/js/douyin.js +434 -0
- backend/lib/douyin/openapi.py +117 -0
- backend/lib/douyin/parser.py +299 -0
- backend/lib/douyin/request.py +294 -0
- backend/lib/douyin/target.py +182 -0
- backend/lib/douyin/types.py +186 -0
- backend/lib/download.py +40 -0
- backend/lib/exceptions.py +18 -0
- backend/settings.py +189 -0
- backend/utils/__init__.py +25 -0
- backend/utils/paths.py +39 -0
- backend/utils/text.py +231 -0
- douyin_cli-5.3.260515.dist-info/METADATA +90 -0
- douyin_cli-5.3.260515.dist-info/RECORD +24 -0
- douyin_cli-5.3.260515.dist-info/WHEEL +4 -0
- douyin_cli-5.3.260515.dist-info/entry_points.txt +3 -0
backend/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""DouyinCrawler backend package."""
|
backend/cli.py
ADDED
|
@@ -0,0 +1,491 @@
|
|
|
1
|
+
"""Command line interface for DouyinCrawler."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import traceback
|
|
6
|
+
from collections.abc import Iterator
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from json import JSONDecodeError
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
import ujson as json
|
|
13
|
+
from loguru import logger
|
|
14
|
+
|
|
15
|
+
from backend.constants import APP_VERSION, SETTINGS_FILE
|
|
16
|
+
from backend.lib.cookies import CookieManager
|
|
17
|
+
from backend.lib.douyin import Douyin
|
|
18
|
+
from backend.lib.douyin.openapi import DouyinOpenAPIClient, DouyinOpenAPIError
|
|
19
|
+
from backend.lib.download import download
|
|
20
|
+
from backend.settings import settings
|
|
21
|
+
|
|
22
|
+
# Crawl types for which resolve_targets() falls back to the logged-in account
# (an empty-string target) when no URL is supplied.
ACCOUNT_ONLY_TYPES = {"favorite", "collection", "following", "follower"}
# Crawl types for which maybe_download() skips the download step.
NO_DOWNLOAD_TYPES = {"following", "follower"}
|
|
24
|
+
|
|
25
|
+
BANNER = rf"""
|
|
26
|
+
____ _ ____ _
|
|
27
|
+
| _ \ ___ _ _ _ _(_)_ __ / ___|_ __ __ ___ _| | ___ _ __
|
|
28
|
+
| | | |/ _ \| | | | | | | | '_ \ | | | '__/ _` \ \ /\ / / |/ _ \ '__|
|
|
29
|
+
| |_| | (_) | |_| | |_| | | | | | | |___| | | (_| |\ V V /| | __/ |
|
|
30
|
+
|____/ \___/ \__,_|\__, |_|_| |_| \____|_| \__,_| \_/\_/ |_|\___|_|
|
|
31
|
+
|___/
|
|
32
|
+
V{APP_VERSION}
|
|
33
|
+
Github: https://github.com/erma0/douyin
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class CrawlOptions:
    """Normalized options used by one or more crawl tasks.

    Built once in ``main`` from the parsed CLI arguments and passed unchanged
    to every task started by ``run_targets``.
    """

    # Maximum number of items to collect; 0 means "no limit".
    limit: int
    # When True, maybe_download() skips the download step.
    no_download: bool
    # One of the values accepted by the ``--type`` option.
    crawl_type: str
    # Download directory; forwarded to the crawler as ``down_path``.
    output_path: str
    # Cookie string that already passed validate_cookie().
    cookie: str
    # Search filters produced by build_filters().
    filters: dict[str, str]
    # CLI flags; create_crawler() ORs them with the matching settings entries.
    download_title: bool
    download_cover: bool
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@click.group(invoke_without_command=True)
@click.pass_context
@click.option(
    "-u",
    "--urls",
    type=click.STRING,
    multiple=True,
    help="作品/账号/话题/音乐等类型的URL链接/ID或搜索关键词,也可输入文件路径(文件内一行一个),可多次输入。",
)
@click.option("-l", "--limit", type=click.INT, default=0, help="限制最大采集数量,默认不限制(0表示不限制)")
@click.option("--no-download", is_flag=True, help="不下载文件,仅采集数据")
@click.option(
    "-t",
    "--type",
    "crawl_type",
    type=click.Choice(
        ["post", "favorite", "music", "hashtag", "search", "following", "follower", "collection", "mix", "aweme"],
        case_sensitive=False,
    ),
    default="post",
    help="采集类型,默认为post(主页作品)。支持:post/favorite/music/hashtag/search/following/follower/collection/mix/aweme",
)
@click.option("-p", "--path", "output_path", type=click.STRING, default="下载", help="下载文件夹路径,默认为[下载]")
@click.option("-c", "--cookie", type=click.STRING, help=f"已登录账号的cookie,可填写在 {SETTINGS_FILE} 中")
@click.option(
    "--sort-type",
    type=click.Choice(["0", "1", "2"], case_sensitive=False),
    help="搜索排序(仅search类型):0=综合,1=最多点赞,2=最新",
)
@click.option(
    "--publish-time",
    type=click.Choice(["0", "1", "7", "180"], case_sensitive=False),
    help="发布时间(仅search类型):0=不限,1=一天内,7=一周内,180=半年内",
)
@click.option(
    "--filter-duration",
    type=click.Choice(["", "0-1", "1-5", "5-10000"], case_sensitive=False),
    help="视频时长(仅search类型):空=不限,0-1=1分钟以下,1-5=1-5分钟,5-10000=5分钟以上",
)
@click.option("--download-title", is_flag=True, help="下载标题文本文件")
@click.option("--download-cover", is_flag=True, help="下载封面图片")
def main(
    ctx: click.Context,
    urls: tuple[str, ...],
    limit: int,
    no_download: bool,
    crawl_type: str,
    output_path: str,
    cookie: str | None,
    sort_type: str | None,
    publish_time: str | None,
    filter_duration: str | None,
    download_title: bool,
    download_cover: bool,
) -> None:
    # NOTE: click renders the docstring below as the group's --help text, so it
    # is runtime output and intentionally left in its original wording.
    """抖音数据采集命令行工具."""
    # Entry point of the crawl CLI: print the banner, obtain and validate a
    # cookie, resolve the targets, then run one crawl task per target.

    # A subcommand (such as ``api``) was requested, so the group callback
    # itself has nothing to do.
    if ctx.invoked_subcommand is not None:
        return

    click.echo(BANNER)

    # Stop early when no cookie is available or it fails validation.
    cookie_value = load_cookie(cookie)
    if not (cookie_value and validate_cookie(cookie_value)):
        return

    targets = resolve_targets(urls, crawl_type)
    if targets is None:
        return

    search_filters = build_filters(sort_type, publish_time, filter_duration)
    run_targets(
        targets,
        CrawlOptions(
            limit=limit,
            no_download=no_download,
            crawl_type=crawl_type,
            output_path=output_path,
            cookie=cookie_value,
            filters=search_filters,
            download_title=download_title,
            download_cover=download_cover,
        ),
    )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# Sub-group of ``main`` that hosts the OpenAPI commands registered below via
# ``@api.command``. The docstring is kept verbatim because click shows it as
# the group's help text.
@main.group()
def api() -> None:
    """调用抖音开放平台官方 OpenAPI."""
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@api.command("client-token")
@click.option("--client-key", envvar="DOUYIN_CLIENT_KEY", required=True)
@click.option("--client-secret", envvar="DOUYIN_CLIENT_SECRET", required=True)
def api_client_token(client_key: str, client_secret: str) -> None:
    # NOTE: the docstring is the click help text and is kept verbatim.
    """获取 client_token."""
    # Fetch a client_token with the given credentials and print the JSON reply.
    # OpenAPI failures are converted to ClickException, matching api_request,
    # so the user sees a clean error message instead of a traceback.
    # NOTE(review): assumes the client reports failures as DouyinOpenAPIError,
    # as api_request already does - confirm against openapi.py.
    try:
        with DouyinOpenAPIClient() as client:
            echo_json(client.client_token(client_key, client_secret))
    except DouyinOpenAPIError as exc:
        raise click.ClickException(str(exc)) from exc
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
@api.command("access-token")
@click.option("--client-key", envvar="DOUYIN_CLIENT_KEY", required=True)
@click.option("--client-secret", envvar="DOUYIN_CLIENT_SECRET", required=True)
@click.option("--code", required=True, help="OAuth 授权码")
def api_access_token(client_key: str, client_secret: str, code: str) -> None:
    # NOTE: the docstring is the click help text and is kept verbatim.
    """用 OAuth code 换取 access_token."""
    # Exchange an OAuth authorization code for an access_token and print the
    # JSON reply. OpenAPI failures are converted to ClickException, matching
    # api_request, so the user sees a clean error message instead of a traceback.
    # NOTE(review): assumes the client reports failures as DouyinOpenAPIError,
    # as api_request already does - confirm against openapi.py.
    try:
        with DouyinOpenAPIClient() as client:
            echo_json(client.access_token(client_key, client_secret, code))
    except DouyinOpenAPIError as exc:
        raise click.ClickException(str(exc)) from exc
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
@api.command("request")
@click.argument("method")
@click.argument("path")
@click.option("--token", envvar="DOUYIN_ACCESS_TOKEN", help="access_token/client_token")
@click.option("--param", "params", multiple=True, help="查询参数,格式 key=value")
@click.option("--json", "json_text", help="JSON 请求体")
@click.option("--form", "forms", multiple=True, help="表单参数,格式 key=value")
@click.option("--header", "headers", multiple=True, help="额外请求头,格式 key=value")
def api_request(
    method: str,
    path: str,
    token: str | None,
    params: tuple[str, ...],
    json_text: str | None,
    forms: tuple[str, ...],
    headers: tuple[str, ...],
) -> None:
    # NOTE: the docstring is the click help text and is kept verbatim.
    """调用任意官方 OpenAPI 路径."""
    # Parse the CLI-supplied body/query/form/header values, send the request
    # and print the JSON reply. Parsing errors (ValueError) and OpenAPI errors
    # are both reported to the user as a ClickException.
    try:
        body = parse_json_body(json_text)
        with DouyinOpenAPIClient() as client:
            query = parse_key_values(params)
            form_fields = parse_key_values(forms)
            extra_headers = parse_key_values(headers)
            response = client.request(
                method,
                path,
                token=token,
                params=query,
                json_body=body,
                form=form_fields,
                headers=extra_headers,
            )
            echo_json(response)
    except (DouyinOpenAPIError, ValueError) as exc:
        raise click.ClickException(str(exc)) from exc
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def build_filters(
|
|
235
|
+
sort_type: str | None,
|
|
236
|
+
publish_time: str | None,
|
|
237
|
+
filter_duration: str | None,
|
|
238
|
+
) -> dict[str, str]:
|
|
239
|
+
"""Build search filter arguments."""
|
|
240
|
+
filters = {}
|
|
241
|
+
if sort_type:
|
|
242
|
+
filters["sort_type"] = sort_type
|
|
243
|
+
if publish_time:
|
|
244
|
+
filters["publish_time"] = publish_time
|
|
245
|
+
if filter_duration is not None:
|
|
246
|
+
filters["filter_duration"] = filter_duration
|
|
247
|
+
return filters
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def echo_json(data: dict) -> None:
    """Write ``data`` to the terminal as indented JSON, keeping non-ASCII text."""
    rendered = json.dumps(data, ensure_ascii=False, indent=2)
    click.echo(rendered)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def parse_json_body(text: str | None) -> dict | list | None:
|
|
256
|
+
"""Parse a JSON body option."""
|
|
257
|
+
if text is None:
|
|
258
|
+
return None
|
|
259
|
+
try:
|
|
260
|
+
data = json.loads(text)
|
|
261
|
+
except JSONDecodeError as exc:
|
|
262
|
+
msg = f"--json 不是合法 JSON: {exc}"
|
|
263
|
+
raise ValueError(msg) from exc
|
|
264
|
+
if not isinstance(data, (dict, list)):
|
|
265
|
+
msg = "--json 必须是 JSON object 或 array"
|
|
266
|
+
raise ValueError(msg)
|
|
267
|
+
return data
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def parse_key_values(values: tuple[str, ...]) -> dict[str, str] | None:
|
|
271
|
+
"""Parse repeated key=value CLI options."""
|
|
272
|
+
if not values:
|
|
273
|
+
return None
|
|
274
|
+
parsed = {}
|
|
275
|
+
for value in values:
|
|
276
|
+
if "=" not in value:
|
|
277
|
+
msg = f"参数必须是 key=value 格式: {value}"
|
|
278
|
+
raise ValueError(msg)
|
|
279
|
+
key, item_value = value.split("=", 1)
|
|
280
|
+
if not key:
|
|
281
|
+
msg = f"参数 key 不能为空: {value}"
|
|
282
|
+
raise ValueError(msg)
|
|
283
|
+
parsed[key] = item_value
|
|
284
|
+
return parsed
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def load_cookie(cookie: str | None) -> str | None:
    """Return the cookie to use, or ``None`` when none could be obtained.

    Sources are tried in order: the ``--cookie`` option, the ``cookie``
    entry in settings, then an interactive prompt. A ``--cookie`` value that
    is blank after stripping is an error and does not fall through to the
    other sources.
    """
    if cookie is None:
        stored = settings.get("cookie", "").strip()
        if not stored:
            return prompt_for_cookie()
        logger.info("✓ 已从配置文件加载Cookie")
        return stored

    logger.info("正在加载命令行指定的Cookie...")
    explicit = cookie.strip()
    if not explicit:
        logger.error("无法加载指定的Cookie")
        return None
    return explicit
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def prompt_for_cookie() -> str | None:
    """Explain how to configure a cookie, then ask the user to paste one.

    Returns:
        The stripped cookie, or ``None`` when the prompt is aborted or the
        input is blank.
    """
    divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    logger.warning(divider)
    logger.warning("⚠ 未找到Cookie配置")
    logger.info("配置方法:")
    logger.info(f" 方法1:在 {SETTINGS_FILE} 中设置 cookie 字段")
    logger.info(" 方法2:使用 -c 参数:douyin -c 'your_cookie'")
    logger.warning(divider)

    try:
        entered = click.prompt("请粘贴Cookie字符串", default="", show_default=False)
    except (click.Abort, EOFError):
        logger.warning("\n用户取消输入")
        return None

    entered = entered.strip()
    if entered:
        logger.success("✓ Cookie已输入")
        return entered

    logger.error("未输入Cookie,程序退出")
    return None
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def validate_cookie(cookie: str) -> bool:
    """Check ``cookie`` with ``CookieManager`` and log the outcome.

    Returns:
        ``True`` when the cookie is accepted; ``False`` after logging the
        likely causes otherwise.
    """
    if not CookieManager.validate_cookie(cookie):
        divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
        logger.error(divider)
        logger.error("✗ Cookie验证失败")
        logger.info("可能原因:")
        logger.info(" 1. Cookie已过期,请重新获取")
        logger.info(" 2. Cookie格式不正确")
        logger.info(" 3. 账号已退出登录")
        logger.error(divider)
        return False

    logger.success("✓ Cookie验证通过")
    return True
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def resolve_targets(urls: tuple[str, ...], crawl_type: str) -> tuple[str, ...] | None:
    """Decide which targets to crawl.

    Returns:
        ``urls`` unchanged when any were given. Otherwise ``("",)`` for crawl
        types listed in ``ACCOUNT_ONLY_TYPES``, or a one-element tuple with
        the value typed at the prompt. ``None`` when the prompt is left blank.
    """
    if urls:
        return urls

    if crawl_type in ACCOUNT_ONLY_TYPES:
        logger.info(f"采集本账号的 {crawl_type} 数据")
        return ("",)

    question = f"采集类型:{crawl_type},请输入目标关键词/URL链接/ID或文件路径"
    answer = click.prompt(question, default="", show_default=False)
    answer = answer.strip()
    if not answer:
        logger.error("未输入目标,退出程序")
        return None
    return (answer,)
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def run_targets(targets: tuple[str, ...], options: CrawlOptions) -> None:
    """Run one crawl task per expanded target and log a success/failure tally."""
    # Keyed by task outcome: True counts successes, False counts failures.
    tally = {True: 0, False: 0}
    for item in iter_targets(targets):
        tally[bool(run_task(item, options))] += 1

    divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    logger.info(divider)
    logger.success(f"✓ 任务完成:成功 {tally[True]} 个,失败 {tally[False]} 个")
    logger.info(divider)
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def iter_targets(targets: tuple[str, ...]) -> Iterator[str | None]:
|
|
386
|
+
"""Yield individual targets, expanding files when an argument is a path."""
|
|
387
|
+
for raw_target in targets:
|
|
388
|
+
target = raw_target.strip()
|
|
389
|
+
if not target:
|
|
390
|
+
continue
|
|
391
|
+
|
|
392
|
+
path = Path(target)
|
|
393
|
+
if not path.exists():
|
|
394
|
+
yield target
|
|
395
|
+
continue
|
|
396
|
+
|
|
397
|
+
logger.info(f"从文件读取目标:{target}")
|
|
398
|
+
lines = read_target_file(path)
|
|
399
|
+
if not lines:
|
|
400
|
+
logger.error(f"文件 [{target}] 中没有发现目标URL")
|
|
401
|
+
yield None
|
|
402
|
+
continue
|
|
403
|
+
|
|
404
|
+
logger.info(f"文件中共有 {len(lines)} 个目标")
|
|
405
|
+
for index, line in enumerate(lines, 1):
|
|
406
|
+
logger.info(f"处理第 {index}/{len(lines)} 个目标")
|
|
407
|
+
yield line
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def read_target_file(path: Path) -> list[str]:
    """Read one target per non-empty line of ``path``.

    The file is decoded as UTF-8 with an optional byte-order mark, so a BOM
    written by some editors does not end up glued to the first target.

    Returns:
        The stripped, non-empty lines. An empty list when the file cannot be
        read or is not valid UTF-8; the error is logged.
    """
    try:
        text = path.read_text(encoding="utf-8-sig")
    # UnicodeDecodeError is not an OSError; without catching it a file in
    # another encoding would abort the whole batch with a traceback.
    except (OSError, UnicodeDecodeError) as exc:
        logger.error(f"读取文件失败: {exc}")
        return []
    return [stripped for line in text.splitlines() if (stripped := line.strip())]
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def run_task(target: str | None, options: CrawlOptions) -> bool:
    """Crawl a single target and, if applicable, download its files.

    Returns:
        ``True`` when the task finished without raising. ``False`` when
        ``target`` is ``None``, the user interrupted the task, or any
        exception occurred (it is logged, with the traceback at debug level).
    """
    # ``None`` marks a target that could not be expanded (see iter_targets).
    if target is None:
        return False

    try:
        log_task_start(target, options)
        crawler = create_crawler(target, options)
        crawler.run()
        maybe_download(crawler, options)
    except KeyboardInterrupt:
        logger.warning("用户中断任务")
        return False
    except Exception as exc:
        logger.error(f"任务执行失败: {exc}")
        logger.debug(traceback.format_exc())
        return False
    else:
        return True
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def log_task_start(target: str, options: CrawlOptions) -> None:
    """Log a framed summary of the task that is about to run.

    An empty ``target`` is shown as the logged-in account and a ``limit`` of
    0 as unlimited. Filters and the title/cover flags are listed only when set.
    """
    divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    limit_text = "不限" if options.limit == 0 else f"{options.limit}条"

    logger.info(divider)
    logger.info("开始采集任务")
    logger.info(f" 目标: {target or '本账号'}")
    logger.info(f" 类型: {options.crawl_type}")
    logger.info(f" 数量限制: {limit_text}")
    if options.filters:
        logger.info(f" 筛选条件: {options.filters}")
    if options.download_title:
        logger.info(" 下载标题: ✓ 是")
    if options.download_cover:
        logger.info(" 下载封面: ✓ 是")
    logger.info(divider)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def create_crawler(target: str, options: CrawlOptions) -> Douyin:
    """Build the ``Douyin`` crawler for ``target``.

    The user agent comes from settings. Title and cover downloads are enabled
    when either the CLI flag or the matching settings entry is truthy.
    """
    user_agent = settings.get("userAgent", "")
    want_title = options.download_title or settings.get("enableDownloadTitle", False)
    want_cover = options.download_cover or settings.get("enableDownloadCover", False)
    return Douyin(
        target=target,
        limit=options.limit,
        type=options.crawl_type,
        down_path=options.output_path,
        cookie=options.cookie,
        user_agent=user_agent,
        filters=options.filters,
        enable_download_title=want_title,
        enable_download_cover=want_cover,
    )
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def maybe_download(douyin: Douyin, options: CrawlOptions) -> None:
    """Start the download for a finished crawl unless it should be skipped.

    Nothing is downloaded when ``--no-download`` was given or when the
    crawler's type is listed in ``NO_DOWNLOAD_TYPES``.
    """
    if options.no_download:
        logger.info("已跳过下载(--no-download)")
    elif douyin.type in NO_DOWNLOAD_TYPES:
        logger.info("此类型不需要下载文件")
    else:
        logger.info("开始下载文件...")
        download(douyin.down_path, douyin.aria2_conf)
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
# Invoke the click group when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|
backend/constants.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Application constants."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from loguru import logger
|
|
6
|
+
|
|
7
|
+
# Support both running as a standalone script and importing as a package module.
try:
    from .utils.paths import get_app_root
except ImportError:
    from utils.paths import get_app_root

# Resolve the version of the installed "douyin-cli" distribution; on any
# failure log it at debug level and fall back to a placeholder version.
try:
    from importlib.metadata import version as get_version

    APP_VERSION = get_version("douyin-cli")
except Exception as e:
    logger.debug(f"获取版本号失败: {e}")
    APP_VERSION = "0.0.0"

# Project root (application directory).
PROJECT_ROOT = get_app_root()
# Full paths derived from the project root.
CONFIG_DIR = str(Path(PROJECT_ROOT) / "config")
DOWNLOAD_DIR = str(Path(PROJECT_ROOT) / "download")
SETTINGS_FILE = str(Path(CONFIG_DIR) / "settings.json")

# Default settings (used to create the config file on first run).
DEFAULT_SETTINGS = {
    "cookie": "",
    "userAgent": "",  # Leave empty to use the built-in default (RequestHeaders.USER_AGENT)
    "downloadPath": DOWNLOAD_DIR,
    "enableIncrementalFetch": True,
    "enableDownloadTitle": False,
    "enableDownloadCover": False,
    "filenameFields": ["id", "title"],
    "filenameSeparator": "_",
}
|
backend/lib/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Business logic package.

Contains the application's core business logic and shared services.

Module layout:
- douyin/: Douyin crawler package (core business logic)
  - crawler.py: main crawler class; coordinates the modules to collect data
  - client.py: API client wrapping Douyin API calls
  - parser.py: data parser for the data returned by the API
  - target.py: target handler; identifies and parses user input
  - types.py: type definitions and constants (work types, API endpoints, request parameters, etc.)
  - request.py: HTTP request wrapper; handles signing and cookies
  - js/: JavaScript scripts (signature generation)
- cookies.py: cookie management and validation
- download.py: downloads files by calling the system aria2c

Usage example:
    from backend.lib.douyin import Douyin

    douyin = Douyin(target="https://www.douyin.com/user/xxx", limit=10)
    douyin.run()
"""

# Backward compatibility: keep the original import path working.
from .douyin import Douyin

__all__ = ["Douyin"]
|