@tikomni/skills 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/pipelines/douyin_metadata.py +151 -0
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +40 -37
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +5 -11
- package/skills/social-media-crawl/scripts/pipelines/input_contracts.py +318 -0
- package/skills/social-media-crawl/scripts/pipelines/media_url_rules.py +86 -0
- package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py +77 -30
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +79 -73
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +84 -6
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +86 -60
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +5 -3
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared media URL classification helpers."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from typing import Iterable, List
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _is_http_url(url: str) -> bool:
|
|
10
|
+
lower = (url or "").lower()
|
|
11
|
+
return lower.startswith("http://") or lower.startswith("https://")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def is_probable_image_url(url: str) -> bool:
    """Heuristically decide whether *url* points at an image.

    The URL is lower-cased, must pass ``_is_http_url``, and must contain at
    least one of the marker substrings below. Markers are matched anywhere in
    the URL, not only in the path or extension.
    """
    candidate = (url or "").lower()
    markers = (
        ".jpg",
        ".jpeg",
        ".png",
        ".webp",
        ".gif",
        "imageview2",
        "imagemogr2",
        "redimage",
        "frame/",
        "sns-img",
        "sns-webpic",
        "notes_pre_post",
        "/image/",
        "/img/",
    )
    if _is_http_url(candidate):
        for marker in markers:
            if marker in candidate:
                return True
    return False
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def is_probable_audio_url(url: str) -> bool:
    """Heuristically decide whether *url* points at an audio resource.

    The URL is lower-cased, must pass ``_is_http_url``, and must contain at
    least one of the marker substrings below. Markers are matched anywhere in
    the URL, not only in the path or extension.
    """
    candidate = (url or "").lower()
    markers = (
        ".m4a",
        ".mp3",
        ".aac",
        ".wav",
        ".flac",
        ".ogg",
        "/audio/",
        "sns-audio",
        "redaudio",
    )
    if _is_http_url(candidate):
        for marker in markers:
            if marker in candidate:
                return True
    return False
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def is_probable_video_url(url: str) -> bool:
    """Heuristically decide whether *url* points at a video resource.

    A URL qualifies when it is http(s), is NOT classified as an image or as
    audio by the sibling helpers, and contains at least one of the video
    marker substrings below (matched anywhere in the lower-cased URL).
    """
    candidate = (url or "").lower()
    if not _is_http_url(candidate):
        return False

    # Image/audio classification takes precedence: a URL that matches either
    # is rejected even if it also carries a video marker.
    claimed_by_other_media = is_probable_image_url(candidate) or is_probable_audio_url(candidate)
    if claimed_by_other_media:
        return False

    markers = (
        ".mp4",
        ".m3u8",
        ".mov",
        ".flv",
        "/video/",
        "sns-video",
        "redvideo",
        "play",
        "stream",
        "master",
        "vod",
    )
    for marker in markers:
        if marker in candidate:
            return True
    return False
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def filter_video_urls(urls: Iterable[str]) -> List[str]:
    """Return the distinct probable-video URLs from *urls*, in first-seen order.

    Each entry is coerced with ``str(raw or "")`` and stripped of surrounding
    whitespace; blank entries and entries rejected by
    ``is_probable_video_url`` are dropped.
    """
    cleaned = (str(raw or "").strip() for raw in urls)
    # dict keys keep insertion order, so this de-duplicates while preserving
    # the position of each URL's first occurrence.
    kept = dict.fromkeys(
        entry for entry in cleaned if entry and is_probable_video_url(entry)
    )
    return list(kept)
|
|
@@ -13,6 +13,14 @@ from scripts.pipelines.schema import (
|
|
|
13
13
|
validate_work_item,
|
|
14
14
|
validate_works_collection,
|
|
15
15
|
)
|
|
16
|
+
from scripts.pipelines.douyin_metadata import (
|
|
17
|
+
extract_douyin_author,
|
|
18
|
+
extract_douyin_caption,
|
|
19
|
+
extract_douyin_metrics,
|
|
20
|
+
extract_douyin_title,
|
|
21
|
+
normalize_douyin_author_handle,
|
|
22
|
+
)
|
|
23
|
+
from scripts.pipelines.media_url_rules import is_probable_video_url as is_shared_probable_video_url
|
|
16
24
|
from scripts.core.tikomni_common import deep_find_all, deep_find_first
|
|
17
25
|
from scripts.pipelines.select_low_quality_video_url import select_low_quality_video_url
|
|
18
26
|
|
|
@@ -117,12 +125,7 @@ def _normalize_douyin_tags(value: Any) -> List[str]:
|
|
|
117
125
|
|
|
118
126
|
|
|
119
127
|
def _is_probable_video_url(url: str) -> bool:
|
|
120
|
-
|
|
121
|
-
if not (lower.startswith("http://") or lower.startswith("https://")):
|
|
122
|
-
return False
|
|
123
|
-
if any(token in lower for token in [".jpg", ".jpeg", ".png", ".webp", "image", "img"]):
|
|
124
|
-
return False
|
|
125
|
-
return any(token in lower for token in [".mp4", ".m3u8", ".m4a", "video", "stream", "play"])
|
|
128
|
+
return is_shared_probable_video_url(url)
|
|
126
129
|
|
|
127
130
|
|
|
128
131
|
def _extract_douyin_video_down_url(item: Dict[str, Any]) -> str:
|
|
@@ -348,14 +351,21 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
|
|
|
348
351
|
|
|
349
352
|
internal_author_id = _t(_first(profile_data, ["sec_user_id", "sec_uid"], raw.get("resolved_author_id")))
|
|
350
353
|
stable_author_id = _t(_first(profile_data, ["uid", "user_id", "id"]))
|
|
351
|
-
author_handle =
|
|
354
|
+
author_handle = normalize_douyin_author_handle(
|
|
355
|
+
_first(profile_data, ["unique_id"]),
|
|
356
|
+
_first(profile_data, ["short_id"]),
|
|
357
|
+
_first(profile_data, ["douyin_id"]),
|
|
358
|
+
_first(profile_data, ["display_id"]),
|
|
359
|
+
_first(profile_data, ["nickname", "name"]),
|
|
360
|
+
)
|
|
361
|
+
nickname = _t(_first(profile_data, ["nickname", "name"]))
|
|
352
362
|
|
|
353
363
|
author_id = internal_author_id or stable_author_id
|
|
354
364
|
profile = build_author_profile(
|
|
355
365
|
platform="douyin",
|
|
356
366
|
platform_author_id=author_id,
|
|
357
367
|
author_handle=author_handle,
|
|
358
|
-
nickname=
|
|
368
|
+
nickname=nickname,
|
|
359
369
|
ip_location=_t(_first(profile_data, ["ip_location", "ip_label", "ipLocation"])),
|
|
360
370
|
fans_count=_i(_first(profile_data, ["follower_count", "fans_count", "mplatform_followers_count"])),
|
|
361
371
|
liked_count=_i(_first(profile_data, ["total_favorited", "liked_count", "favoriting_count"])),
|
|
@@ -383,23 +393,27 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
|
|
|
383
393
|
if not isinstance(item, dict):
|
|
384
394
|
continue
|
|
385
395
|
aweme_id = _t(_first(item, ["aweme_id", "item_id", "id"]))
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
"comment": _i(_first(item, ["comment_count"], 0)),
|
|
389
|
-
"collect": _i(_first(item, ["collect_count"], 0)),
|
|
390
|
-
"share": _i(_first(item, ["share_count"], 0)),
|
|
391
|
-
"play": _optional_i(_first(item, ["play_count", "view_count"], None)),
|
|
392
|
-
}
|
|
396
|
+
author_info = extract_douyin_author(item)
|
|
397
|
+
metrics = extract_douyin_metrics(item)
|
|
393
398
|
video_down_url = _extract_douyin_video_down_url(item)
|
|
394
399
|
tags = _normalize_douyin_tags(_first(item, ["hashtags", "tags", "text_extra"], []))
|
|
400
|
+
work_author_handle = normalize_douyin_author_handle(
|
|
401
|
+
author_info.get("author_handle"),
|
|
402
|
+
author_handle,
|
|
403
|
+
nickname,
|
|
404
|
+
)
|
|
405
|
+
work_platform_author_id = _t(author_info.get("platform_author_id") or author_id)
|
|
406
|
+
work_author_platform_id = _t(author_info.get("author_platform_id") or stable_author_id or author_id)
|
|
407
|
+
work_nickname = _t(author_info.get("nickname") or nickname)
|
|
408
|
+
work_signature = _t(author_info.get("signature") or profile.get("signature"))
|
|
395
409
|
work = build_work_item(
|
|
396
410
|
platform="douyin",
|
|
397
411
|
platform_work_id=aweme_id,
|
|
398
|
-
platform_author_id=
|
|
399
|
-
author_handle=
|
|
400
|
-
author_platform_id=
|
|
401
|
-
title=
|
|
402
|
-
caption_raw=
|
|
412
|
+
platform_author_id=work_platform_author_id,
|
|
413
|
+
author_handle=work_author_handle,
|
|
414
|
+
author_platform_id=work_author_platform_id,
|
|
415
|
+
title=extract_douyin_title(item),
|
|
416
|
+
caption_raw=extract_douyin_caption(item),
|
|
403
417
|
subtitle_raw="",
|
|
404
418
|
subtitle_source="missing",
|
|
405
419
|
publish_time=_t(_first(item, ["create_time", "publish_time"])),
|
|
@@ -407,7 +421,12 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
|
|
|
407
421
|
content_type="video",
|
|
408
422
|
duration_ms=_i(_first(item, ["duration_ms", "duration"], 0)),
|
|
409
423
|
tags=tags,
|
|
410
|
-
metrics=
|
|
424
|
+
metrics={
|
|
425
|
+
"digg_count": int(metrics.get("digg_count") or 0),
|
|
426
|
+
"comment_count": int(metrics.get("comment_count") or 0),
|
|
427
|
+
"collect_count": int(metrics.get("collect_count") or 0),
|
|
428
|
+
"share_count": int(metrics.get("share_count") or 0),
|
|
429
|
+
},
|
|
411
430
|
cover_image=(
|
|
412
431
|
_extract_first_url(_first(item, ["cover_url"], ""))
|
|
413
432
|
or _extract_first_url(_first(item, ["cover"], ""))
|
|
@@ -420,18 +439,31 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
|
|
|
420
439
|
asr_error_reason="",
|
|
421
440
|
asr_source="fallback_none",
|
|
422
441
|
platform_native_refs={
|
|
423
|
-
"douyin_sec_uid": internal_author_id,
|
|
424
|
-
"douyin_aweme_author_id": stable_author_id or author_id,
|
|
442
|
+
"douyin_sec_uid": _t(author_info.get("douyin_sec_uid") or internal_author_id),
|
|
443
|
+
"douyin_aweme_author_id": _t(author_info.get("douyin_aweme_author_id") or stable_author_id or author_id),
|
|
444
|
+
"douyin_unique_id": _t(author_info.get("unique_id")),
|
|
425
445
|
},
|
|
426
446
|
raw_ref={"aweme_id": aweme_id, "raw_item": item},
|
|
427
447
|
)
|
|
428
448
|
work.update(
|
|
429
449
|
{
|
|
430
|
-
"
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
450
|
+
"author": {
|
|
451
|
+
"author_handle": work_author_handle,
|
|
452
|
+
"platform_author_id": work_platform_author_id,
|
|
453
|
+
"author_platform_id": work_author_platform_id,
|
|
454
|
+
"douyin_sec_uid": _t(author_info.get("douyin_sec_uid") or internal_author_id),
|
|
455
|
+
"douyin_aweme_author_id": _t(author_info.get("douyin_aweme_author_id") or stable_author_id or author_id),
|
|
456
|
+
"unique_id": _t(author_info.get("unique_id")),
|
|
457
|
+
"nickname": work_nickname,
|
|
458
|
+
"signature": work_signature,
|
|
459
|
+
},
|
|
460
|
+
"nickname": work_nickname,
|
|
461
|
+
"signature": work_signature,
|
|
462
|
+
"digg_count": int(metrics.get("digg_count") or 0),
|
|
463
|
+
"comment_count": int(metrics.get("comment_count") or 0),
|
|
464
|
+
"collect_count": int(metrics.get("collect_count") or 0),
|
|
465
|
+
"share_count": int(metrics.get("share_count") or 0),
|
|
466
|
+
"play_count": metrics.get("play_count"),
|
|
435
467
|
}
|
|
436
468
|
)
|
|
437
469
|
|
|
@@ -448,16 +480,18 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
|
|
|
448
480
|
|
|
449
481
|
author_id = _t(_first(profile_data, ["user_id", "userid", "id"], raw.get("resolved_author_id")))
|
|
450
482
|
author_handle = _t(_first(profile_data, ["red_id", "redid", "display_id", "username"]))
|
|
483
|
+
nickname = _t(_first(profile_data, ["nickname", "name"]))
|
|
484
|
+
signature = _t(_first(profile_data, ["desc", "signature", "bio", "introduction"]))
|
|
451
485
|
profile = build_author_profile(
|
|
452
486
|
platform="xiaohongshu",
|
|
453
487
|
platform_author_id=author_id,
|
|
454
488
|
author_handle=author_handle,
|
|
455
|
-
nickname=
|
|
489
|
+
nickname=nickname,
|
|
456
490
|
ip_location=_t(_first(profile_data, ["ip_location", "ip_location_desc", "ipLocation"])),
|
|
457
491
|
fans_count=_i(_first(profile_data, ["fans", "fans_count", "follower_count", "followers"])),
|
|
458
492
|
liked_count=_i(_first(profile_data, ["liked_count", "likes", "total_liked", "like_count"])),
|
|
459
493
|
collected_count=_i(_first(profile_data, ["collected_count", "collect_count", "total_collected", "favorite_count"])),
|
|
460
|
-
signature=
|
|
494
|
+
signature=signature,
|
|
461
495
|
avatar_url=_extract_xhs_avatar_url(profile_data),
|
|
462
496
|
works_count=_i(_first(profile_data, ["notes", "note_count", "works_count", "post_count"])),
|
|
463
497
|
verified=bool(_first(profile_data, ["official_verified", "verified"], False)),
|
|
@@ -480,6 +514,8 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
|
|
|
480
514
|
"share": _i(_first(item, ["share_count"], 0)),
|
|
481
515
|
"play": _optional_i(_first(item, ["view_count", "play_count"], None)),
|
|
482
516
|
}
|
|
517
|
+
if (metrics["play"] or 0) <= 0 and max(metrics["like"], metrics["comment"], metrics["collect"], metrics["share"]) > 0:
|
|
518
|
+
metrics["play"] = None
|
|
483
519
|
subtitle_inline = _extract_xhs_subtitle_inline(item)
|
|
484
520
|
subtitle_urls = _extract_xhs_subtitle_urls(item)
|
|
485
521
|
video_down_url = _extract_xhs_video_down_url(item)
|
|
@@ -489,6 +525,8 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
|
|
|
489
525
|
cover_image = _extract_xhs_cover_image(item)
|
|
490
526
|
source_url = _extract_xhs_source_url(item, note_id)
|
|
491
527
|
share_url = _extract_xhs_share_url(item, note_id)
|
|
528
|
+
work_nickname = nickname
|
|
529
|
+
work_signature = signature
|
|
492
530
|
|
|
493
531
|
work = build_work_item(
|
|
494
532
|
platform="xiaohongshu",
|
|
@@ -523,6 +561,15 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
|
|
|
523
561
|
)
|
|
524
562
|
work.update(
|
|
525
563
|
{
|
|
564
|
+
"author": {
|
|
565
|
+
"author_handle": author_handle,
|
|
566
|
+
"platform_author_id": author_id,
|
|
567
|
+
"author_platform_id": author_id,
|
|
568
|
+
"nickname": work_nickname,
|
|
569
|
+
"signature": work_signature,
|
|
570
|
+
},
|
|
571
|
+
"nickname": work_nickname,
|
|
572
|
+
"signature": work_signature,
|
|
526
573
|
"digg_count": metrics["like"],
|
|
527
574
|
"comment_count": metrics["comment"],
|
|
528
575
|
"collect_count": metrics["collect"],
|
|
@@ -27,10 +27,15 @@ from scripts.core.config_loader import config_get, load_tikomni_config, resolve_
|
|
|
27
27
|
from scripts.core.progress_report import build_progress_reporter
|
|
28
28
|
from scripts.core.storage_router import resolve_author_directory_name
|
|
29
29
|
from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
|
|
30
|
+
from scripts.pipelines.input_contracts import normalize_douyin_creator_input
|
|
31
|
+
from scripts.pipelines.schema import build_author_profile
|
|
30
32
|
from scripts.pipelines.douyin_creator_home_helpers import collect_and_adapt
|
|
31
33
|
from scripts.pipelines.home_asr import enrich_author_home_asr
|
|
32
34
|
from scripts.writers.write_work_fact_card import build_work_fact_card, persist_output_envelope, write_work_fact_card
|
|
33
35
|
|
|
36
|
+
DEFAULT_MAX_ITEMS = 200
|
|
37
|
+
MAX_ITEMS_HARD_LIMIT = 200
|
|
38
|
+
|
|
34
39
|
|
|
35
40
|
def _write_collection_artifacts(
|
|
36
41
|
*,
|
|
@@ -81,11 +86,12 @@ def run_douyin_creator_home(
|
|
|
81
86
|
*,
|
|
82
87
|
input_value: str,
|
|
83
88
|
config: Dict[str, Any],
|
|
84
|
-
runtime: Dict[str, Any],
|
|
89
|
+
runtime: Dict[str, Any] | None,
|
|
85
90
|
max_items: int,
|
|
86
91
|
write_card: bool,
|
|
87
92
|
persist_output: bool,
|
|
88
93
|
) -> Dict[str, Any]:
|
|
94
|
+
bounded_max_items = max(1, min(int(max_items), MAX_ITEMS_HARD_LIMIT))
|
|
89
95
|
progress = build_progress_reporter(
|
|
90
96
|
workflow="social-media-crawl",
|
|
91
97
|
platform="douyin",
|
|
@@ -94,15 +100,69 @@ def run_douyin_creator_home(
|
|
|
94
100
|
scope="workflow",
|
|
95
101
|
)
|
|
96
102
|
progress.started(stage="author_home.workflow", message="douyin author_home workflow started")
|
|
103
|
+
preflight = normalize_douyin_creator_input(input_value)
|
|
104
|
+
normalized_input_value = str(preflight.get("input_value") or "")
|
|
105
|
+
if preflight.get("error_reason"):
|
|
106
|
+
request_id = ensure_request_id(None, fallback_seed=input_value)
|
|
107
|
+
empty_profile = build_author_profile(platform="douyin", request_id=request_id)
|
|
108
|
+
extract_trace = [
|
|
109
|
+
{
|
|
110
|
+
"step": "input.preflight",
|
|
111
|
+
"ok": False,
|
|
112
|
+
"input_kind": "creator_url_or_sec_uid",
|
|
113
|
+
"normalized_input_value": normalized_input_value or None,
|
|
114
|
+
"error_reason": preflight.get("error_reason"),
|
|
115
|
+
"missing_fields": list(preflight.get("missing_fields") or []),
|
|
116
|
+
}
|
|
117
|
+
]
|
|
118
|
+
envelope = {
|
|
119
|
+
"object_type": "creator",
|
|
120
|
+
"platform": "douyin",
|
|
121
|
+
"input": input_value,
|
|
122
|
+
"normalized": {
|
|
123
|
+
"creator_profile": {**empty_profile, "request_id": request_id, "extract_trace": extract_trace},
|
|
124
|
+
"work_collection": {
|
|
125
|
+
"platform": "douyin",
|
|
126
|
+
"platform_author_id": "",
|
|
127
|
+
"count": 0,
|
|
128
|
+
"items": [],
|
|
129
|
+
"request_id": request_id,
|
|
130
|
+
"extract_trace": extract_trace,
|
|
131
|
+
},
|
|
132
|
+
},
|
|
133
|
+
"completeness": evaluate_collection(empty_profile, []),
|
|
134
|
+
"missing_fields": normalize_missing_fields(preflight.get("missing_fields")),
|
|
135
|
+
"error_reason": str(preflight.get("error_reason") or "invalid_creator_input"),
|
|
136
|
+
"extract_trace": extract_trace,
|
|
137
|
+
"request_id": request_id,
|
|
138
|
+
"card_write": {
|
|
139
|
+
"enabled": bool(write_card),
|
|
140
|
+
"ok": False,
|
|
141
|
+
"count": 0,
|
|
142
|
+
"results": [],
|
|
143
|
+
"reason": "skipped_invalid_input",
|
|
144
|
+
},
|
|
145
|
+
"collection_artifacts": {},
|
|
146
|
+
"output_persist": {"enabled": False, "skipped": True, "reason": "invalid_input"},
|
|
147
|
+
}
|
|
148
|
+
progress.done(
|
|
149
|
+
stage="author_home.workflow",
|
|
150
|
+
message="douyin author_home workflow finished",
|
|
151
|
+
data={"request_id": request_id, "works_count": 0, "error_reason": envelope["error_reason"]},
|
|
152
|
+
)
|
|
153
|
+
return envelope
|
|
154
|
+
|
|
155
|
+
if runtime is None:
|
|
156
|
+
raise ValueError("runtime_required_for_valid_input")
|
|
97
157
|
|
|
98
158
|
raw, profile, works, missing = collect_and_adapt(
|
|
99
|
-
input_value=input_value,
|
|
159
|
+
input_value=normalized_input_value or input_value,
|
|
100
160
|
base_url=runtime["base_url"],
|
|
101
161
|
token=runtime["token"],
|
|
102
162
|
timeout_ms=runtime["timeout_ms"],
|
|
103
163
|
page_size=20,
|
|
104
164
|
pages_max=50,
|
|
105
|
-
max_items=
|
|
165
|
+
max_items=bounded_max_items,
|
|
106
166
|
progress=progress.child(scope="author_home.collect"),
|
|
107
167
|
)
|
|
108
168
|
|
|
@@ -138,7 +198,7 @@ def run_douyin_creator_home(
|
|
|
138
198
|
|
|
139
199
|
request_id = ensure_request_id(
|
|
140
200
|
raw.get("request_id") or profile.get("request_id"),
|
|
141
|
-
fallback_seed=input_value,
|
|
201
|
+
fallback_seed=normalized_input_value or input_value,
|
|
142
202
|
)
|
|
143
203
|
extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
|
|
144
204
|
|
|
@@ -206,7 +266,12 @@ def main() -> None:
|
|
|
206
266
|
parser.add_argument("--allow-process-env", action="store_true", help="Allow process env overrides")
|
|
207
267
|
parser.add_argument("--base-url", default=None, help="Override Tikomni base URL")
|
|
208
268
|
parser.add_argument("--timeout-ms", type=int, default=None, help="Override timeout in ms")
|
|
209
|
-
parser.add_argument(
|
|
269
|
+
parser.add_argument(
|
|
270
|
+
"--max-items",
|
|
271
|
+
type=int,
|
|
272
|
+
default=DEFAULT_MAX_ITEMS,
|
|
273
|
+
help=f"Max works to collect from homepage (default full crawl, capped at {MAX_ITEMS_HARD_LIMIT})",
|
|
274
|
+
)
|
|
210
275
|
parser.set_defaults(write_card=True, persist_output=True)
|
|
211
276
|
parser.add_argument("--write-card", dest="write_card", action="store_true", help="Write work fact cards")
|
|
212
277
|
parser.add_argument("--no-write-card", dest="write_card", action="store_false", help="Skip card writing")
|
|
@@ -215,6 +280,19 @@ def main() -> None:
|
|
|
215
280
|
args = parser.parse_args()
|
|
216
281
|
|
|
217
282
|
config, _ = load_tikomni_config(args.config, env_file=args.env_file, allow_process_env=args.allow_process_env)
|
|
283
|
+
preflight = normalize_douyin_creator_input(args.input)
|
|
284
|
+
if preflight.get("error_reason"):
|
|
285
|
+
write_json_stdout(
|
|
286
|
+
run_douyin_creator_home(
|
|
287
|
+
input_value=args.input,
|
|
288
|
+
config=config,
|
|
289
|
+
runtime=None,
|
|
290
|
+
max_items=int(args.max_items),
|
|
291
|
+
write_card=bool(args.write_card),
|
|
292
|
+
persist_output=bool(args.persist_output),
|
|
293
|
+
)
|
|
294
|
+
)
|
|
295
|
+
return
|
|
218
296
|
runtime = resolve_runtime(
|
|
219
297
|
env_file=args.env_file,
|
|
220
298
|
api_key_env=str(config_get(config, "runtime.auth_env_key", "TIKOMNI_API_KEY")),
|
|
@@ -224,7 +302,7 @@ def main() -> None:
|
|
|
224
302
|
)
|
|
225
303
|
write_json_stdout(
|
|
226
304
|
run_douyin_creator_home(
|
|
227
|
-
input_value=args.input,
|
|
305
|
+
input_value=str(preflight.get("input_value") or args.input),
|
|
228
306
|
config=config,
|
|
229
307
|
runtime=runtime,
|
|
230
308
|
max_items=int(args.max_items),
|
|
@@ -31,6 +31,13 @@ from scripts.core.config_loader import config_get, load_tikomni_config
|
|
|
31
31
|
from scripts.core.extract_pipeline import resolve_trace_error_context
|
|
32
32
|
from scripts.core.progress_report import ProgressReporter, build_progress_reporter
|
|
33
33
|
from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
|
|
34
|
+
from scripts.pipelines.douyin_metadata import (
|
|
35
|
+
extract_douyin_author as extract_shared_douyin_author,
|
|
36
|
+
extract_douyin_caption as extract_shared_douyin_caption,
|
|
37
|
+
extract_douyin_metrics as extract_shared_douyin_metrics,
|
|
38
|
+
extract_douyin_title as extract_shared_douyin_title,
|
|
39
|
+
)
|
|
40
|
+
from scripts.pipelines.input_contracts import normalize_douyin_work_input
|
|
34
41
|
from scripts.core.asr_pipeline import derive_asr_clean_text, run_u2_asr_with_timeout_retry
|
|
35
42
|
from scripts.pipelines.select_low_quality_video_url import select_low_quality_video_url
|
|
36
43
|
from scripts.core.tikomni_common import (
|
|
@@ -156,14 +163,8 @@ def _normalize_input(
|
|
|
156
163
|
input_value: Optional[str],
|
|
157
164
|
share_url: Optional[str],
|
|
158
165
|
) -> Dict[str, Optional[str]]:
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
if input_value and not normalized_share:
|
|
162
|
-
candidate = input_value.strip()
|
|
163
|
-
if candidate.startswith("http://") or candidate.startswith("https://"):
|
|
164
|
-
normalized_share = candidate
|
|
165
|
-
|
|
166
|
-
return {"share_url": normalized_share}
|
|
166
|
+
normalized = normalize_douyin_work_input(input_value, share_url)
|
|
167
|
+
return {"share_url": normalize_text(normalized.get("share_url")) or None}
|
|
167
168
|
|
|
168
169
|
|
|
169
170
|
def _extract_aweme_detail(payload: Any) -> Optional[Dict[str, Any]]:
|
|
@@ -238,76 +239,19 @@ def _normalize_duration_ms(item: Dict[str, Any]) -> Optional[int]:
|
|
|
238
239
|
|
|
239
240
|
|
|
240
241
|
def _pick_title(item: Dict[str, Any]) -> str:
|
|
241
|
-
|
|
242
|
-
value = item.get(key)
|
|
243
|
-
text = normalize_text(value)
|
|
244
|
-
if text:
|
|
245
|
-
return text
|
|
246
|
-
return ""
|
|
242
|
+
return extract_shared_douyin_title(item)
|
|
247
243
|
|
|
248
244
|
|
|
249
245
|
def _pick_desc(item: Dict[str, Any]) -> str:
|
|
250
|
-
|
|
251
|
-
value = item.get(key)
|
|
252
|
-
text = normalize_text(value)
|
|
253
|
-
if text:
|
|
254
|
-
return text
|
|
255
|
-
return ""
|
|
246
|
+
return extract_shared_douyin_caption(item)
|
|
256
247
|
|
|
257
248
|
|
|
258
249
|
def _extract_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
|
|
259
|
-
|
|
260
|
-
if not isinstance(author, dict):
|
|
261
|
-
author = {}
|
|
262
|
-
|
|
263
|
-
author_platform_id = normalize_text(author.get("uid")) or normalize_text(author.get("id")) or normalize_text(item.get("author_user_id"))
|
|
264
|
-
author_handle = normalize_text(author.get("short_id")) or normalize_text(author.get("nickname"))
|
|
265
|
-
douyin_sec_uid = normalize_text(author.get("sec_uid"))
|
|
266
|
-
douyin_aweme_author_id = normalize_text(item.get("author_user_id")) or author_platform_id
|
|
267
|
-
|
|
268
|
-
return {
|
|
269
|
-
"author_handle": author_handle or None,
|
|
270
|
-
"platform_author_id": author_platform_id or None,
|
|
271
|
-
"author_platform_id": author_platform_id or None,
|
|
272
|
-
"douyin_sec_uid": douyin_sec_uid or None,
|
|
273
|
-
"douyin_aweme_author_id": douyin_aweme_author_id or None,
|
|
274
|
-
"nickname": normalize_text(author.get("nickname")) or None,
|
|
275
|
-
"signature": normalize_text(author.get("signature")) or None,
|
|
276
|
-
}
|
|
250
|
+
return extract_shared_douyin_author(item)
|
|
277
251
|
|
|
278
252
|
|
|
279
253
|
def _extract_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
|
|
280
|
-
|
|
281
|
-
if not isinstance(statistics, dict):
|
|
282
|
-
statistics = {}
|
|
283
|
-
|
|
284
|
-
def metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
|
|
285
|
-
for key in keys:
|
|
286
|
-
value = _safe_int(statistics.get(key))
|
|
287
|
-
if value is not None:
|
|
288
|
-
return value
|
|
289
|
-
value = _safe_int(item.get(key))
|
|
290
|
-
if value is not None:
|
|
291
|
-
return value
|
|
292
|
-
return default
|
|
293
|
-
|
|
294
|
-
metrics = {
|
|
295
|
-
"digg_count": metric("digg_count"),
|
|
296
|
-
"comment_count": metric("comment_count"),
|
|
297
|
-
"collect_count": metric("collect_count"),
|
|
298
|
-
"share_count": metric("share_count", "forward_count"),
|
|
299
|
-
"play_count": metric("play_count", default=None),
|
|
300
|
-
}
|
|
301
|
-
play_count = metrics.get("play_count")
|
|
302
|
-
engagement_floor = max(
|
|
303
|
-
int(metrics.get("digg_count") or 0),
|
|
304
|
-
int(metrics.get("comment_count") or 0),
|
|
305
|
-
int(metrics.get("collect_count") or 0),
|
|
306
|
-
int(metrics.get("share_count") or 0),
|
|
307
|
-
)
|
|
308
|
-
if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
|
|
309
|
-
metrics["play_count"] = None
|
|
310
|
-
return metrics
|
|
254
|
+
return extract_shared_douyin_metrics(item)
|
|
311
255
|
|
|
312
256
|
|
|
313
257
|
def _extract_platform_work_id(item: Dict[str, Any]) -> Optional[str]:
|
|
@@ -633,6 +577,7 @@ def _build_result(
|
|
|
633
577
|
cover_image: Optional[str] = None,
|
|
634
578
|
asr_source: str = "fallback_none",
|
|
635
579
|
timings: Optional[Dict[str, int]] = None,
|
|
580
|
+
missing_fields: Optional[List[Dict[str, str]]] = None,
|
|
636
581
|
) -> Dict[str, Any]:
|
|
637
582
|
summary_block = summarize_content(raw_content, source="douyin:single-video-low-quality")
|
|
638
583
|
insights = list(summary_block.get("insights", []))
|
|
@@ -698,13 +643,13 @@ def _build_result(
|
|
|
698
643
|
"insights": insights,
|
|
699
644
|
"confidence": confidence,
|
|
700
645
|
"error_reason": error_reason,
|
|
701
|
-
"missing_fields": _build_missing_fields(
|
|
646
|
+
"missing_fields": list(missing_fields or _build_missing_fields(
|
|
702
647
|
title=title,
|
|
703
648
|
desc=desc,
|
|
704
649
|
platform_work_id=platform_work_id,
|
|
705
650
|
video_down_url=video_down_url,
|
|
706
651
|
author=author,
|
|
707
|
-
),
|
|
652
|
+
)),
|
|
708
653
|
"extract_trace": extract_trace,
|
|
709
654
|
"fallback_trace": fallback_trace,
|
|
710
655
|
"request_id": request_id,
|
|
@@ -741,7 +686,8 @@ def run_douyin_single_video(
|
|
|
741
686
|
workflow_started_at = time.perf_counter()
|
|
742
687
|
timings = _empty_timings()
|
|
743
688
|
parse_started_at = time.perf_counter()
|
|
744
|
-
|
|
689
|
+
preflight = normalize_douyin_work_input(input_value, share_url)
|
|
690
|
+
source_input = {"share_url": normalize_text(preflight.get("share_url")) or None}
|
|
745
691
|
timings["url_parse_ms"] = _elapsed_ms(parse_started_at)
|
|
746
692
|
if progress is not None:
|
|
747
693
|
progress.started(
|
|
@@ -749,6 +695,66 @@ def run_douyin_single_video(
|
|
|
749
695
|
message="douyin single_video workflow started",
|
|
750
696
|
data={"analysis_mode": analysis_mode, "write_card": bool(write_card), "persist_output": bool(persist_output)},
|
|
751
697
|
)
|
|
698
|
+
preflight_trace = [
|
|
699
|
+
{
|
|
700
|
+
"step": "input.preflight",
|
|
701
|
+
"ok": preflight.get("error_reason") is None,
|
|
702
|
+
"input_kind": "share_url",
|
|
703
|
+
"normalized_share_url": source_input.get("share_url"),
|
|
704
|
+
"error_reason": preflight.get("error_reason"),
|
|
705
|
+
"missing_fields": list(preflight.get("missing_fields") or []),
|
|
706
|
+
}
|
|
707
|
+
]
|
|
708
|
+
if preflight.get("error_reason"):
|
|
709
|
+
result = _build_result(
|
|
710
|
+
source_input=source_input,
|
|
711
|
+
platform_work_id=None,
|
|
712
|
+
title="",
|
|
713
|
+
desc="",
|
|
714
|
+
duration_ms=None,
|
|
715
|
+
video_down_url=None,
|
|
716
|
+
author={"author_handle": None, "author_platform_id": None, "douyin_sec_uid": None, "douyin_aweme_author_id": None, "nickname": None, "signature": None},
|
|
717
|
+
metrics=_empty_metrics(),
|
|
718
|
+
tags=[],
|
|
719
|
+
is_video=False,
|
|
720
|
+
video_type_reason="invalid_input",
|
|
721
|
+
raw_content="",
|
|
722
|
+
confidence="low",
|
|
723
|
+
error_reason=str(preflight.get("error_reason") or "invalid_share_url"),
|
|
724
|
+
extract_trace=preflight_trace,
|
|
725
|
+
fallback_trace=[],
|
|
726
|
+
request_id=None,
|
|
727
|
+
u2_task_id=None,
|
|
728
|
+
u2_task_status="UNKNOWN",
|
|
729
|
+
u2_gate_reason="invalid_input",
|
|
730
|
+
analysis_mode=analysis_mode,
|
|
731
|
+
timings=timings,
|
|
732
|
+
missing_fields=list(preflight.get("missing_fields") or []),
|
|
733
|
+
)
|
|
734
|
+
if write_card:
|
|
735
|
+
card_started_at = time.perf_counter()
|
|
736
|
+
result["card_write"] = write_work_fact_card(
|
|
737
|
+
payload=result,
|
|
738
|
+
platform="douyin",
|
|
739
|
+
card_type=card_type,
|
|
740
|
+
card_root=card_root,
|
|
741
|
+
content_kind=content_kind,
|
|
742
|
+
storage_config=storage_config,
|
|
743
|
+
analysis_mode=analysis_mode,
|
|
744
|
+
progress=progress.child(scope="card_write") if progress is not None else None,
|
|
745
|
+
)
|
|
746
|
+
timings["card_write_ms"] = _elapsed_ms(card_started_at)
|
|
747
|
+
timings["llm_analysis_ms"] = _safe_int((result.get("card_write") or {}).get("llm_analysis_ms"))
|
|
748
|
+
timings["total_ms"] = _elapsed_ms(workflow_started_at)
|
|
749
|
+
result["timings"] = dict(timings)
|
|
750
|
+
_update_pipeline_status(result)
|
|
751
|
+
return _finalize_result(
|
|
752
|
+
result=result,
|
|
753
|
+
source_input=source_input,
|
|
754
|
+
platform_work_id=None,
|
|
755
|
+
storage_config=storage_config,
|
|
756
|
+
persist_output=persist_output,
|
|
757
|
+
)
|
|
752
758
|
if not source_input.get("share_url"):
|
|
753
759
|
result = _build_result(
|
|
754
760
|
source_input=source_input,
|
|
@@ -765,7 +771,7 @@ def run_douyin_single_video(
|
|
|
765
771
|
raw_content="",
|
|
766
772
|
confidence="low",
|
|
767
773
|
error_reason="missing_share_url",
|
|
768
|
-
extract_trace=
|
|
774
|
+
extract_trace=preflight_trace,
|
|
769
775
|
fallback_trace=[],
|
|
770
776
|
request_id=None,
|
|
771
777
|
u2_task_id=None,
|