douyin-cli 5.3.260515__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
backend/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """DouyinCrawler backend package."""
backend/cli.py ADDED
@@ -0,0 +1,491 @@
1
+ """Command line interface for DouyinCrawler."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import traceback
6
+ from collections.abc import Iterator
7
+ from dataclasses import dataclass
8
+ from json import JSONDecodeError
9
+ from pathlib import Path
10
+
11
+ import click
12
+ import ujson as json
13
+ from loguru import logger
14
+
15
+ from backend.constants import APP_VERSION, SETTINGS_FILE
16
+ from backend.lib.cookies import CookieManager
17
+ from backend.lib.douyin import Douyin
18
+ from backend.lib.douyin.openapi import DouyinOpenAPIClient, DouyinOpenAPIError
19
+ from backend.lib.download import download
20
+ from backend.settings import settings
21
+
22
# Crawl types that can run without a target: resolve_targets() falls back to
# the logged-in account for these instead of prompting for a URL.
ACCOUNT_ONLY_TYPES = {"favorite", "collection", "following", "follower"}
# Crawl types for which maybe_download() skips the file download step.
NO_DOWNLOAD_TYPES = {"following", "follower"}
24
+
25
+ BANNER = rf"""
26
+ ____ _ ____ _
27
+ | _ \ ___ _ _ _ _(_)_ __ / ___|_ __ __ ___ _| | ___ _ __
28
+ | | | |/ _ \| | | | | | | | '_ \ | | | '__/ _` \ \ /\ / / |/ _ \ '__|
29
+ | |_| | (_) | |_| | |_| | | | | | | |___| | | (_| |\ V V /| | __/ |
30
+ |____/ \___/ \__,_|\__, |_|_| |_| \____|_| \__,_| \_/\_/ |_|\___|_|
31
+ |___/
32
+ V{APP_VERSION}
33
+ Github: https://github.com/erma0/douyin
34
+ """
35
+
36
+
37
@dataclass(frozen=True)
class CrawlOptions:
    """Normalized options used by one or more crawl tasks.

    Built once by ``main`` from the parsed command-line options and passed,
    unchanged, to every target processed in the same run.
    """

    # Maximum number of items to collect; 0 means "no limit".
    limit: int
    # When True, only collect data and skip the file download step.
    no_download: bool
    # One of the ``--type`` choices (e.g. "post", "search").
    crawl_type: str
    # Download directory; handed to the crawler as ``down_path``.
    output_path: str
    # Cookie string used for crawling.
    cookie: str
    # Search filter arguments as produced by ``build_filters``.
    filters: dict[str, str]
    # Whether title text files were requested on the command line.
    download_title: bool
    # Whether cover images were requested on the command line.
    download_cover: bool
49
+
50
+
51
# Root command group. ``invoke_without_command=True`` lets the group callback
# itself run the crawler when no subcommand (such as ``api``) is given.
@click.group(invoke_without_command=True)
@click.pass_context
@click.option(
    "-u",
    "--urls",
    type=click.STRING,
    multiple=True,
    help="作品/账号/话题/音乐等类型的URL链接/ID或搜索关键词,也可输入文件路径(文件内一行一个),可多次输入。",
)
@click.option(
    "-l",
    "--limit",
    type=click.INT,
    default=0,
    help="限制最大采集数量,默认不限制(0表示不限制)",
)
@click.option(
    "--no-download",
    is_flag=True,
    help="不下载文件,仅采集数据",
)
@click.option(
    "-t",
    "--type",
    "crawl_type",
    type=click.Choice(
        [
            "post",
            "favorite",
            "music",
            "hashtag",
            "search",
            "following",
            "follower",
            "collection",
            "mix",
            "aweme",
        ],
        case_sensitive=False,
    ),
    default="post",
    help="采集类型,默认为post(主页作品)。支持:post/favorite/music/hashtag/search/following/follower/collection/mix/aweme",
)
@click.option(
    "-p",
    "--path",
    "output_path",
    type=click.STRING,
    default="下载",
    help="下载文件夹路径,默认为[下载]",
)
@click.option(
    "-c",
    "--cookie",
    type=click.STRING,
    help=f"已登录账号的cookie,可填写在 {SETTINGS_FILE} 中",
)
# The next three options are documented as applying to the "search" type only.
@click.option(
    "--sort-type",
    type=click.Choice(["0", "1", "2"], case_sensitive=False),
    help="搜索排序(仅search类型):0=综合,1=最多点赞,2=最新",
)
@click.option(
    "--publish-time",
    type=click.Choice(["0", "1", "7", "180"], case_sensitive=False),
    help="发布时间(仅search类型):0=不限,1=一天内,7=一周内,180=半年内",
)
@click.option(
    "--filter-duration",
    type=click.Choice(["", "0-1", "1-5", "5-10000"], case_sensitive=False),
    help="视频时长(仅search类型):空=不限,0-1=1分钟以下,1-5=1-5分钟,5-10000=5分钟以上",
)
@click.option(
    "--download-title",
    is_flag=True,
    help="下载标题文本文件",
)
@click.option(
    "--download-cover",
    is_flag=True,
    help="下载封面图片",
)
def main(
    ctx: click.Context,
    urls: tuple[str, ...],
    limit: int,
    no_download: bool,
    crawl_type: str,
    output_path: str,
    cookie: str | None,
    sort_type: str | None,
    publish_time: str | None,
    filter_duration: str | None,
    download_title: bool,
    download_cover: bool,
) -> None:
    """抖音数据采集命令行工具."""
    # NOTE: the docstring above is what click prints as the command help, so
    # it is user-facing text and is intentionally left in its original form.

    # A subcommand was invoked; the group callback has nothing left to do.
    if ctx.invoked_subcommand is not None:
        return

    click.echo(BANNER)

    # Both helpers log their own diagnostics, so a failure simply ends the run.
    # NOTE(review): these early returns exit with status 0 even though nothing
    # was crawled - confirm whether a non-zero exit code is wanted here.
    cookie_value = load_cookie(cookie)
    if not cookie_value or not validate_cookie(cookie_value):
        return

    # None means no target could be resolved (nothing entered at the prompt).
    targets = resolve_targets(urls, crawl_type)
    if targets is None:
        return

    # Freeze the parsed CLI values into one object shared by every task.
    options = CrawlOptions(
        limit=limit,
        no_download=no_download,
        crawl_type=crawl_type,
        output_path=output_path,
        cookie=cookie_value,
        filters=build_filters(sort_type, publish_time, filter_duration),
        download_title=download_title,
        download_cover=download_cover,
    )
    run_targets(targets, options)
172
+
173
+
174
# Sub-group of ``main`` that holds the OpenAPI commands defined below
# ("client-token", "access-token", "request").
@main.group()
def api() -> None:
    """调用抖音开放平台官方 OpenAPI."""
    # NOTE: the docstring is click's help text for this group; keep it as-is.
177
+
178
+
179
@api.command("client-token")
@click.option("--client-key", envvar="DOUYIN_CLIENT_KEY", required=True)
@click.option("--client-secret", envvar="DOUYIN_CLIENT_SECRET", required=True)
def api_client_token(client_key: str, client_secret: str) -> None:
    """获取 client_token."""
    # NOTE: the docstring above is click's help text and is kept verbatim.
    #
    # Requests a client_token with the given credentials (each may also come
    # from the DOUYIN_CLIENT_KEY / DOUYIN_CLIENT_SECRET environment variables)
    # and prints the response as JSON.
    try:
        with DouyinOpenAPIClient() as client:
            echo_json(client.client_token(client_key, client_secret))
    except DouyinOpenAPIError as exc:
        # Same policy as the "request" command: report API failures as a
        # clean CLI error (message + exit status 1) instead of a traceback.
        raise click.ClickException(str(exc)) from exc
186
+
187
+
188
@api.command("access-token")
@click.option("--client-key", envvar="DOUYIN_CLIENT_KEY", required=True)
@click.option("--client-secret", envvar="DOUYIN_CLIENT_SECRET", required=True)
@click.option("--code", required=True, help="OAuth 授权码")
def api_access_token(client_key: str, client_secret: str, code: str) -> None:
    """用 OAuth code 换取 access_token."""
    # NOTE: the docstring above is click's help text and is kept verbatim.
    #
    # Exchanges an OAuth authorization code for an access_token and prints the
    # response as JSON. Key and secret may also come from the
    # DOUYIN_CLIENT_KEY / DOUYIN_CLIENT_SECRET environment variables.
    try:
        with DouyinOpenAPIClient() as client:
            echo_json(client.access_token(client_key, client_secret, code))
    except DouyinOpenAPIError as exc:
        # Same policy as the "request" command: report API failures as a
        # clean CLI error (message + exit status 1) instead of a traceback.
        raise click.ClickException(str(exc)) from exc
196
+
197
+
198
@api.command("request")
@click.argument("method")
@click.argument("path")
@click.option("--token", envvar="DOUYIN_ACCESS_TOKEN", help="access_token/client_token")
@click.option("--param", "params", multiple=True, help="查询参数,格式 key=value")
@click.option("--json", "json_text", help="JSON 请求体")
@click.option("--form", "forms", multiple=True, help="表单参数,格式 key=value")
@click.option("--header", "headers", multiple=True, help="额外请求头,格式 key=value")
def api_request(
    method: str,
    path: str,
    token: str | None,
    params: tuple[str, ...],
    json_text: str | None,
    forms: tuple[str, ...],
    headers: tuple[str, ...],
) -> None:
    """调用任意官方 OpenAPI 路径."""
    # NOTE: the docstring above is click's help text and is kept verbatim.
    #
    # Sends METHOD PATH through DouyinOpenAPIClient with the optional token,
    # query parameters, JSON body, form fields and extra headers, then prints
    # the response as JSON. Malformed options (ValueError) and API failures
    # (DouyinOpenAPIError) are both surfaced as a click.ClickException.
    try:
        body = parse_json_body(json_text)
        with DouyinOpenAPIClient() as client:
            query = parse_key_values(params)
            form_fields = parse_key_values(forms)
            extra_headers = parse_key_values(headers)
            response = client.request(
                method,
                path,
                token=token,
                params=query,
                json_body=body,
                form=form_fields,
                headers=extra_headers,
            )
            echo_json(response)
    except (DouyinOpenAPIError, ValueError) as exc:
        raise click.ClickException(str(exc)) from exc
232
+
233
+
234
+ def build_filters(
235
+ sort_type: str | None,
236
+ publish_time: str | None,
237
+ filter_duration: str | None,
238
+ ) -> dict[str, str]:
239
+ """Build search filter arguments."""
240
+ filters = {}
241
+ if sort_type:
242
+ filters["sort_type"] = sort_type
243
+ if publish_time:
244
+ filters["publish_time"] = publish_time
245
+ if filter_duration is not None:
246
+ filters["filter_duration"] = filter_duration
247
+ return filters
248
+
249
+
250
def echo_json(data: dict) -> None:
    """Write ``data`` to the terminal as indented, non-ASCII-escaped JSON."""
    rendered = json.dumps(data, ensure_ascii=False, indent=2)
    click.echo(rendered)
253
+
254
+
255
+ def parse_json_body(text: str | None) -> dict | list | None:
256
+ """Parse a JSON body option."""
257
+ if text is None:
258
+ return None
259
+ try:
260
+ data = json.loads(text)
261
+ except JSONDecodeError as exc:
262
+ msg = f"--json 不是合法 JSON: {exc}"
263
+ raise ValueError(msg) from exc
264
+ if not isinstance(data, (dict, list)):
265
+ msg = "--json 必须是 JSON object 或 array"
266
+ raise ValueError(msg)
267
+ return data
268
+
269
+
270
+ def parse_key_values(values: tuple[str, ...]) -> dict[str, str] | None:
271
+ """Parse repeated key=value CLI options."""
272
+ if not values:
273
+ return None
274
+ parsed = {}
275
+ for value in values:
276
+ if "=" not in value:
277
+ msg = f"参数必须是 key=value 格式: {value}"
278
+ raise ValueError(msg)
279
+ key, item_value = value.split("=", 1)
280
+ if not key:
281
+ msg = f"参数 key 不能为空: {value}"
282
+ raise ValueError(msg)
283
+ parsed[key] = item_value
284
+ return parsed
285
+
286
+
287
def load_cookie(cookie: str | None) -> str | None:
    """Pick the cookie to use: CLI value, then settings, then a prompt.

    Args:
        cookie: Value of the ``-c/--cookie`` option, or ``None`` if absent.

    Returns:
        The stripped cookie string, or ``None`` when the CLI value is blank
        or the interactive prompt yields nothing.
    """
    if cookie is None:
        # No CLI value: try the settings store, then ask the user.
        configured = settings.get("cookie", "").strip()
        if not configured:
            return prompt_for_cookie()
        logger.info("✓ 已从配置文件加载Cookie")
        return configured

    # An explicit CLI value never falls back to settings or the prompt.
    logger.info("正在加载命令行指定的Cookie...")
    stripped = cookie.strip()
    if not stripped:
        logger.error("无法加载指定的Cookie")
        return None
    return stripped
303
+
304
+
305
def prompt_for_cookie() -> str | None:
    """Explain how to configure a cookie, then ask the user to paste one.

    Returns:
        The stripped cookie string, or ``None`` if the prompt was aborted or
        the user entered nothing.
    """
    divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    logger.warning(divider)
    logger.warning("⚠ 未找到Cookie配置")
    logger.info("配置方法:")
    logger.info(f" 方法1:在 {SETTINGS_FILE} 中设置 cookie 字段")
    logger.info(" 方法2:使用 -c 参数:douyin -c 'your_cookie'")
    logger.warning(divider)

    try:
        entered = click.prompt("请粘贴Cookie字符串", default="", show_default=False)
    except (click.Abort, EOFError):
        logger.warning("\n用户取消输入")
        return None

    entered = entered.strip()
    if entered:
        logger.success("✓ Cookie已输入")
        return entered

    logger.error("未输入Cookie,程序退出")
    return None
330
+
331
+
332
def validate_cookie(cookie: str) -> bool:
    """Check the cookie with ``CookieManager`` and log the outcome.

    Returns:
        ``True`` when ``CookieManager.validate_cookie`` accepts the cookie,
        ``False`` otherwise (after logging a list of possible causes).
    """
    if not CookieManager.validate_cookie(cookie):
        divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
        logger.error(divider)
        logger.error("✗ Cookie验证失败")
        hints = (
            "可能原因:",
            " 1. Cookie已过期,请重新获取",
            " 2. Cookie格式不正确",
            " 3. 账号已退出登录",
        )
        for hint in hints:
            logger.info(hint)
        logger.error(divider)
        return False

    logger.success("✓ Cookie验证通过")
    return True
346
+
347
+
348
+ def resolve_targets(urls: tuple[str, ...], crawl_type: str) -> tuple[str, ...] | None:
349
+ """Resolve CLI targets or prompt for one when needed."""
350
+ if urls:
351
+ return urls
352
+
353
+ if crawl_type in ACCOUNT_ONLY_TYPES:
354
+ logger.info(f"采集本账号的 {crawl_type} 数据")
355
+ return ("",)
356
+
357
+ url_input = click.prompt(
358
+ f"采集类型:{crawl_type},请输入目标关键词/URL链接/ID或文件路径",
359
+ default="",
360
+ show_default=False,
361
+ ).strip()
362
+ if url_input:
363
+ return (url_input,)
364
+
365
+ logger.error("未输入目标,退出程序")
366
+ return None
367
+
368
+
369
def run_targets(targets: tuple[str, ...], options: CrawlOptions) -> None:
    """Run every expanded target in turn and log a success/failure tally."""
    # The comprehension pulls one target at a time from the generator, so each
    # task still runs right after its target is yielded.
    outcomes = [run_task(target, options) for target in iter_targets(targets)]
    success_count = sum(1 for succeeded in outcomes if succeeded)
    fail_count = len(outcomes) - success_count

    divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    logger.info(divider)
    logger.success(f"✓ 任务完成:成功 {success_count} 个,失败 {fail_count} 个")
    logger.info(divider)
383
+
384
+
385
+ def iter_targets(targets: tuple[str, ...]) -> Iterator[str | None]:
386
+ """Yield individual targets, expanding files when an argument is a path."""
387
+ for raw_target in targets:
388
+ target = raw_target.strip()
389
+ if not target:
390
+ continue
391
+
392
+ path = Path(target)
393
+ if not path.exists():
394
+ yield target
395
+ continue
396
+
397
+ logger.info(f"从文件读取目标:{target}")
398
+ lines = read_target_file(path)
399
+ if not lines:
400
+ logger.error(f"文件 [{target}] 中没有发现目标URL")
401
+ yield None
402
+ continue
403
+
404
+ logger.info(f"文件中共有 {len(lines)} 个目标")
405
+ for index, line in enumerate(lines, 1):
406
+ logger.info(f"处理第 {index}/{len(lines)} 个目标")
407
+ yield line
408
+
409
+
410
def read_target_file(path: Path) -> list[str]:
    """Read one target per non-empty line of ``path``.

    The file is decoded as UTF-8; a leading byte-order mark is dropped so it
    cannot end up glued to the first target.

    Returns:
        The stripped, non-empty lines in file order, or an empty list when
        the file cannot be read or is not valid UTF-8 (the error is logged).
    """
    try:
        # "utf-8-sig" reads plain UTF-8 too, and strips a BOM if one exists.
        content = path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is not an OSError; without catching it a file in
        # another encoding would abort the whole run instead of being logged.
        logger.error(f"读取文件失败: {exc}")
        return []

    stripped_lines = (line.strip() for line in content.splitlines())
    return [line for line in stripped_lines if line]
421
+
422
+
423
def run_task(target: str | None, options: CrawlOptions) -> bool:
    """Crawl a single target and, when applicable, download its files.

    Args:
        target: The target to crawl; ``None`` marks a target that could not
            be produced and is reported as a failure straight away.
        options: Options shared by all tasks of this run.

    Returns:
        ``True`` when crawling (and any download) finished without raising,
        ``False`` on ``None`` input, user interruption, or any other error.
    """
    if target is None:
        return False

    try:
        log_task_start(target, options)
        crawler = create_crawler(target, options)
        crawler.run()
        maybe_download(crawler, options)
    except KeyboardInterrupt:
        logger.warning("用户中断任务")
        return False
    except Exception as exc:
        # Task boundary: log the failure and report it instead of raising.
        logger.error(f"任务执行失败: {exc}")
        logger.debug(traceback.format_exc())
        return False
    else:
        return True
441
+
442
+
443
def log_task_start(target: str, options: CrawlOptions) -> None:
    """Log a banner describing the task that is about to run.

    An empty ``target`` is displayed as the logged-in account. Filters and
    the title/cover flags are listed only when set.
    """
    divider = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    shown_target = target or "本账号"
    if options.limit == 0:
        limit_text = "不限"
    else:
        limit_text = f"{options.limit}条"

    logger.info(divider)
    logger.info("开始采集任务")
    logger.info(f" 目标: {shown_target}")
    logger.info(f" 类型: {options.crawl_type}")
    logger.info(f" 数量限制: {limit_text}")
    if options.filters:
        logger.info(f" 筛选条件: {options.filters}")
    if options.download_title:
        logger.info(" 下载标题: ✓ 是")
    if options.download_cover:
        logger.info(" 下载封面: ✓ 是")
    logger.info(divider)
457
+
458
+
459
def create_crawler(target: str, options: CrawlOptions) -> Douyin:
    """Build the ``Douyin`` crawler for one target.

    The user agent always comes from settings. Title and cover downloads are
    enabled when either the CLI flag or the matching settings entry is truthy.
    """
    user_agent = settings.get("userAgent", "")
    want_title = options.download_title or settings.get("enableDownloadTitle", False)
    want_cover = options.download_cover or settings.get("enableDownloadCover", False)
    return Douyin(
        target=target,
        limit=options.limit,
        type=options.crawl_type,
        down_path=options.output_path,
        cookie=options.cookie,
        user_agent=user_agent,
        filters=options.filters,
        enable_download_title=want_title,
        enable_download_cover=want_cover,
    )
474
+
475
+
476
def maybe_download(douyin: Douyin, options: CrawlOptions) -> None:
    """Start the file download unless it was disabled or does not apply.

    Nothing is downloaded when ``--no-download`` was given or when the
    crawler's type is listed in ``NO_DOWNLOAD_TYPES``.
    """
    if options.no_download:
        logger.info("已跳过下载(--no-download)")
    elif douyin.type in NO_DOWNLOAD_TYPES:
        logger.info("此类型不需要下载文件")
    else:
        logger.info("开始下载文件...")
        download(douyin.down_path, douyin.aria2_conf)
488
+
489
+
490
# Allow running this module directly as a script; click parses sys.argv.
if __name__ == "__main__":
    main()
backend/constants.py ADDED
@@ -0,0 +1,38 @@
1
+ """Application constants."""
2
+
3
+ from pathlib import Path
4
+
5
+ from loguru import logger
6
+
7
# Support both running as a standalone script and importing as a package module.
try:
    from .utils.paths import get_app_root
except ImportError:
    from utils.paths import get_app_root

# Resolve the version of the installed "douyin-cli" distribution; fall back to
# a placeholder when the lookup fails for any reason.
try:
    from importlib.metadata import version as get_version

    APP_VERSION = get_version("douyin-cli")
except Exception as e:
    logger.debug(f"获取版本号失败: {e}")
    APP_VERSION = "0.0.0"

# Project root directory (the application directory).
PROJECT_ROOT = get_app_root()
# Full paths.
CONFIG_DIR = str(Path(PROJECT_ROOT) / "config")
DOWNLOAD_DIR = str(Path(PROJECT_ROOT) / "download")
SETTINGS_FILE = str(Path(CONFIG_DIR) / "settings.json")

# Default settings (used to create the config file on first run).
DEFAULT_SETTINGS = {
    "cookie": "",
    "userAgent": "",  # leave empty to use the built-in default (RequestHeaders.USER_AGENT)
    "downloadPath": DOWNLOAD_DIR,
    "enableIncrementalFetch": True,
    "enableDownloadTitle": False,
    "enableDownloadCover": False,
    "filenameFields": ["id", "title"],
    "filenameSeparator": "_",
}
@@ -0,0 +1,27 @@
1
"""Business logic package.

Contains the application's core business logic and shared services.

Package layout:
- douyin/: Douyin crawler package (core business logic)
    - crawler.py: main crawler class; coordinates the other modules to collect data
    - client.py: API client wrapping the Douyin API calls
    - parser.py: data parser for the data returned by the API
    - target.py: target handler; recognizes and parses user input
    - types.py: type definitions and constants (work types, API endpoints, request parameters, ...)
    - request.py: HTTP request wrapper; handles signing and cookies
    - js/: JavaScript scripts (signature generation)
- cookies.py: cookie management and validation
- download.py: downloads files by calling the system aria2c

Example:
    from backend.lib.douyin import Douyin

    douyin = Douyin(target="https://www.douyin.com/user/xxx", limit=10)
    douyin.run()
"""

# Backward compatibility: keep the original import path working.
from .douyin import Douyin

__all__ = ["Douyin"]