@tikomni/skills 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,86 @@
1
+ #!/usr/bin/env python3
2
+ """Shared media URL classification helpers."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import Iterable, List
7
+
8
+
9
+ def _is_http_url(url: str) -> bool:
10
+ lower = (url or "").lower()
11
+ return lower.startswith("http://") or lower.startswith("https://")
12
+
13
+
14
def is_probable_image_url(url: str) -> bool:
    """Heuristically decide whether *url* looks like an image resource.

    Only HTTP(S) URLs can qualify. The lowercased URL is searched for image
    file extensions and image-related fragments as plain substrings; one hit
    anywhere in the URL (host, path or query string) is sufficient.

    Returns:
        True if the URL is HTTP(S) and contains at least one image marker,
        otherwise False.
    """
    candidate = (url or "").lower()
    if not _is_http_url(candidate):
        return False

    markers = (
        # File extensions.
        ".jpg",
        ".jpeg",
        ".png",
        ".webp",
        ".gif",
        # Presumably CDN image-processing / platform-specific fragments;
        # NOTE(review): confirm the origin of these tokens with the callers.
        "imageview2",
        "imagemogr2",
        "redimage",
        "frame/",
        "sns-img",
        "sns-webpic",
        "notes_pre_post",
        # Generic path segments.
        "/image/",
        "/img/",
    )
    for marker in markers:
        if marker in candidate:
            return True
    return False
35
+
36
+
37
def is_probable_audio_url(url: str) -> bool:
    """Heuristically decide whether *url* looks like an audio resource.

    Only HTTP(S) URLs can qualify. The lowercased URL is searched for audio
    file extensions and audio-related fragments as plain substrings; one hit
    anywhere in the URL is sufficient.

    Returns:
        True if the URL is HTTP(S) and contains at least one audio marker,
        otherwise False.
    """
    candidate = (url or "").lower()
    if _is_http_url(candidate):
        markers = (
            # File extensions.
            ".m4a",
            ".mp3",
            ".aac",
            ".wav",
            ".flac",
            ".ogg",
            # Path / host fragments.
            "/audio/",
            "sns-audio",
            "redaudio",
        )
        return any(marker in candidate for marker in markers)
    return False
53
+
54
+
55
def is_probable_video_url(url: str) -> bool:
    """Heuristically decide whether *url* looks like a video resource.

    A URL is accepted only when all of the following hold:

    * it is an HTTP(S) URL;
    * it is classified neither as an image nor as audio (those
      classifications take precedence over any video marker);
    * its lowercased form contains at least one video marker.

    NOTE: markers are matched as plain substrings of the whole URL, so short
    tokens such as ``play``, ``stream``, ``master`` and ``vod`` also match
    when they occur inside a host name or a longer word.
    """
    candidate = (url or "").lower()
    if not _is_http_url(candidate):
        return False

    # Image and audio detection win over video markers.
    if is_probable_image_url(candidate):
        return False
    if is_probable_audio_url(candidate):
        return False

    markers = (
        # File extensions / playlist formats.
        ".mp4",
        ".m3u8",
        ".mov",
        ".flv",
        # Path / host fragments.
        "/video/",
        "sns-video",
        "redvideo",
        # Loose keywords.
        "play",
        "stream",
        "master",
        "vod",
    )
    for marker in markers:
        if marker in candidate:
            return True
    return False
75
+
76
+
77
def filter_video_urls(urls: Iterable[str]) -> List[str]:
    """Return the distinct probable-video URLs from *urls*, in first-seen order.

    Each entry is converted to ``str`` (falsy entries become ``""``) and
    stripped of surrounding whitespace. Entries that are empty after
    stripping, that repeat an already accepted URL, or that
    ``is_probable_video_url`` rejects are dropped.
    """
    # A dict keeps insertion order, so it serves as both the "seen" set and
    # the ordered result.
    accepted = {}
    for entry in urls:
        cleaned = str(entry or "").strip()
        if cleaned and cleaned not in accepted and is_probable_video_url(cleaned):
            accepted[cleaned] = None
    return list(accepted)
@@ -13,6 +13,14 @@ from scripts.pipelines.schema import (
13
13
  validate_work_item,
14
14
  validate_works_collection,
15
15
  )
16
+ from scripts.pipelines.douyin_metadata import (
17
+ extract_douyin_author,
18
+ extract_douyin_caption,
19
+ extract_douyin_metrics,
20
+ extract_douyin_title,
21
+ normalize_douyin_author_handle,
22
+ )
23
+ from scripts.pipelines.media_url_rules import is_probable_video_url as is_shared_probable_video_url
16
24
  from scripts.core.tikomni_common import deep_find_all, deep_find_first
17
25
  from scripts.pipelines.select_low_quality_video_url import select_low_quality_video_url
18
26
 
@@ -117,12 +125,7 @@ def _normalize_douyin_tags(value: Any) -> List[str]:
117
125
 
118
126
 
119
127
  def _is_probable_video_url(url: str) -> bool:
120
- lower = (url or "").lower()
121
- if not (lower.startswith("http://") or lower.startswith("https://")):
122
- return False
123
- if any(token in lower for token in [".jpg", ".jpeg", ".png", ".webp", "image", "img"]):
124
- return False
125
- return any(token in lower for token in [".mp4", ".m3u8", ".m4a", "video", "stream", "play"])
128
+ return is_shared_probable_video_url(url)
126
129
 
127
130
 
128
131
  def _extract_douyin_video_down_url(item: Dict[str, Any]) -> str:
@@ -348,14 +351,21 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
348
351
 
349
352
  internal_author_id = _t(_first(profile_data, ["sec_user_id", "sec_uid"], raw.get("resolved_author_id")))
350
353
  stable_author_id = _t(_first(profile_data, ["uid", "user_id", "id"]))
351
- author_handle = _t(_first(profile_data, ["short_id", "unique_id", "douyin_id", "display_id"]))
354
+ author_handle = normalize_douyin_author_handle(
355
+ _first(profile_data, ["unique_id"]),
356
+ _first(profile_data, ["short_id"]),
357
+ _first(profile_data, ["douyin_id"]),
358
+ _first(profile_data, ["display_id"]),
359
+ _first(profile_data, ["nickname", "name"]),
360
+ )
361
+ nickname = _t(_first(profile_data, ["nickname", "name"]))
352
362
 
353
363
  author_id = internal_author_id or stable_author_id
354
364
  profile = build_author_profile(
355
365
  platform="douyin",
356
366
  platform_author_id=author_id,
357
367
  author_handle=author_handle,
358
- nickname=_t(_first(profile_data, ["nickname", "name"])),
368
+ nickname=nickname,
359
369
  ip_location=_t(_first(profile_data, ["ip_location", "ip_label", "ipLocation"])),
360
370
  fans_count=_i(_first(profile_data, ["follower_count", "fans_count", "mplatform_followers_count"])),
361
371
  liked_count=_i(_first(profile_data, ["total_favorited", "liked_count", "favoriting_count"])),
@@ -383,23 +393,27 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
383
393
  if not isinstance(item, dict):
384
394
  continue
385
395
  aweme_id = _t(_first(item, ["aweme_id", "item_id", "id"]))
386
- metrics = {
387
- "like": _i(_first(item, ["digg_count", "like_count"], 0)),
388
- "comment": _i(_first(item, ["comment_count"], 0)),
389
- "collect": _i(_first(item, ["collect_count"], 0)),
390
- "share": _i(_first(item, ["share_count"], 0)),
391
- "play": _optional_i(_first(item, ["play_count", "view_count"], None)),
392
- }
396
+ author_info = extract_douyin_author(item)
397
+ metrics = extract_douyin_metrics(item)
393
398
  video_down_url = _extract_douyin_video_down_url(item)
394
399
  tags = _normalize_douyin_tags(_first(item, ["hashtags", "tags", "text_extra"], []))
400
+ work_author_handle = normalize_douyin_author_handle(
401
+ author_info.get("author_handle"),
402
+ author_handle,
403
+ nickname,
404
+ )
405
+ work_platform_author_id = _t(author_info.get("platform_author_id") or author_id)
406
+ work_author_platform_id = _t(author_info.get("author_platform_id") or stable_author_id or author_id)
407
+ work_nickname = _t(author_info.get("nickname") or nickname)
408
+ work_signature = _t(author_info.get("signature") or profile.get("signature"))
395
409
  work = build_work_item(
396
410
  platform="douyin",
397
411
  platform_work_id=aweme_id,
398
- platform_author_id=author_id,
399
- author_handle=author_handle,
400
- author_platform_id=stable_author_id or author_id,
401
- title=_t(_first(item, ["title"])),
402
- caption_raw=_t(_first(item, ["desc"])),
412
+ platform_author_id=work_platform_author_id,
413
+ author_handle=work_author_handle,
414
+ author_platform_id=work_author_platform_id,
415
+ title=extract_douyin_title(item),
416
+ caption_raw=extract_douyin_caption(item),
403
417
  subtitle_raw="",
404
418
  subtitle_source="missing",
405
419
  publish_time=_t(_first(item, ["create_time", "publish_time"])),
@@ -407,7 +421,12 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
407
421
  content_type="video",
408
422
  duration_ms=_i(_first(item, ["duration_ms", "duration"], 0)),
409
423
  tags=tags,
410
- metrics=metrics,
424
+ metrics={
425
+ "digg_count": int(metrics.get("digg_count") or 0),
426
+ "comment_count": int(metrics.get("comment_count") or 0),
427
+ "collect_count": int(metrics.get("collect_count") or 0),
428
+ "share_count": int(metrics.get("share_count") or 0),
429
+ },
411
430
  cover_image=(
412
431
  _extract_first_url(_first(item, ["cover_url"], ""))
413
432
  or _extract_first_url(_first(item, ["cover"], ""))
@@ -420,18 +439,31 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
420
439
  asr_error_reason="",
421
440
  asr_source="fallback_none",
422
441
  platform_native_refs={
423
- "douyin_sec_uid": internal_author_id,
424
- "douyin_aweme_author_id": stable_author_id or author_id,
442
+ "douyin_sec_uid": _t(author_info.get("douyin_sec_uid") or internal_author_id),
443
+ "douyin_aweme_author_id": _t(author_info.get("douyin_aweme_author_id") or stable_author_id or author_id),
444
+ "douyin_unique_id": _t(author_info.get("unique_id")),
425
445
  },
426
446
  raw_ref={"aweme_id": aweme_id, "raw_item": item},
427
447
  )
428
448
  work.update(
429
449
  {
430
- "digg_count": metrics["like"],
431
- "comment_count": metrics["comment"],
432
- "collect_count": metrics["collect"],
433
- "share_count": metrics["share"],
434
- "play_count": metrics["play"],
450
+ "author": {
451
+ "author_handle": work_author_handle,
452
+ "platform_author_id": work_platform_author_id,
453
+ "author_platform_id": work_author_platform_id,
454
+ "douyin_sec_uid": _t(author_info.get("douyin_sec_uid") or internal_author_id),
455
+ "douyin_aweme_author_id": _t(author_info.get("douyin_aweme_author_id") or stable_author_id or author_id),
456
+ "unique_id": _t(author_info.get("unique_id")),
457
+ "nickname": work_nickname,
458
+ "signature": work_signature,
459
+ },
460
+ "nickname": work_nickname,
461
+ "signature": work_signature,
462
+ "digg_count": int(metrics.get("digg_count") or 0),
463
+ "comment_count": int(metrics.get("comment_count") or 0),
464
+ "collect_count": int(metrics.get("collect_count") or 0),
465
+ "share_count": int(metrics.get("share_count") or 0),
466
+ "play_count": metrics.get("play_count"),
435
467
  }
436
468
  )
437
469
 
@@ -448,16 +480,18 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
448
480
 
449
481
  author_id = _t(_first(profile_data, ["user_id", "userid", "id"], raw.get("resolved_author_id")))
450
482
  author_handle = _t(_first(profile_data, ["red_id", "redid", "display_id", "username"]))
483
+ nickname = _t(_first(profile_data, ["nickname", "name"]))
484
+ signature = _t(_first(profile_data, ["desc", "signature", "bio", "introduction"]))
451
485
  profile = build_author_profile(
452
486
  platform="xiaohongshu",
453
487
  platform_author_id=author_id,
454
488
  author_handle=author_handle,
455
- nickname=_t(_first(profile_data, ["nickname", "name"])),
489
+ nickname=nickname,
456
490
  ip_location=_t(_first(profile_data, ["ip_location", "ip_location_desc", "ipLocation"])),
457
491
  fans_count=_i(_first(profile_data, ["fans", "fans_count", "follower_count", "followers"])),
458
492
  liked_count=_i(_first(profile_data, ["liked_count", "likes", "total_liked", "like_count"])),
459
493
  collected_count=_i(_first(profile_data, ["collected_count", "collect_count", "total_collected", "favorite_count"])),
460
- signature=_t(_first(profile_data, ["desc", "signature", "bio", "introduction"])),
494
+ signature=signature,
461
495
  avatar_url=_extract_xhs_avatar_url(profile_data),
462
496
  works_count=_i(_first(profile_data, ["notes", "note_count", "works_count", "post_count"])),
463
497
  verified=bool(_first(profile_data, ["official_verified", "verified"], False)),
@@ -480,6 +514,8 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
480
514
  "share": _i(_first(item, ["share_count"], 0)),
481
515
  "play": _optional_i(_first(item, ["view_count", "play_count"], None)),
482
516
  }
517
+ if (metrics["play"] or 0) <= 0 and max(metrics["like"], metrics["comment"], metrics["collect"], metrics["share"]) > 0:
518
+ metrics["play"] = None
483
519
  subtitle_inline = _extract_xhs_subtitle_inline(item)
484
520
  subtitle_urls = _extract_xhs_subtitle_urls(item)
485
521
  video_down_url = _extract_xhs_video_down_url(item)
@@ -489,6 +525,8 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
489
525
  cover_image = _extract_xhs_cover_image(item)
490
526
  source_url = _extract_xhs_source_url(item, note_id)
491
527
  share_url = _extract_xhs_share_url(item, note_id)
528
+ work_nickname = nickname
529
+ work_signature = signature
492
530
 
493
531
  work = build_work_item(
494
532
  platform="xiaohongshu",
@@ -523,6 +561,15 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
523
561
  )
524
562
  work.update(
525
563
  {
564
+ "author": {
565
+ "author_handle": author_handle,
566
+ "platform_author_id": author_id,
567
+ "author_platform_id": author_id,
568
+ "nickname": work_nickname,
569
+ "signature": work_signature,
570
+ },
571
+ "nickname": work_nickname,
572
+ "signature": work_signature,
526
573
  "digg_count": metrics["like"],
527
574
  "comment_count": metrics["comment"],
528
575
  "collect_count": metrics["collect"],
@@ -27,10 +27,15 @@ from scripts.core.config_loader import config_get, load_tikomni_config, resolve_
27
27
  from scripts.core.progress_report import build_progress_reporter
28
28
  from scripts.core.storage_router import resolve_author_directory_name
29
29
  from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
30
+ from scripts.pipelines.input_contracts import normalize_douyin_creator_input
31
+ from scripts.pipelines.schema import build_author_profile
30
32
  from scripts.pipelines.douyin_creator_home_helpers import collect_and_adapt
31
33
  from scripts.pipelines.home_asr import enrich_author_home_asr
32
34
  from scripts.writers.write_work_fact_card import build_work_fact_card, persist_output_envelope, write_work_fact_card
33
35
 
36
+ DEFAULT_MAX_ITEMS = 200
37
+ MAX_ITEMS_HARD_LIMIT = 200
38
+
34
39
 
35
40
  def _write_collection_artifacts(
36
41
  *,
@@ -81,11 +86,12 @@ def run_douyin_creator_home(
81
86
  *,
82
87
  input_value: str,
83
88
  config: Dict[str, Any],
84
- runtime: Dict[str, Any],
89
+ runtime: Dict[str, Any] | None,
85
90
  max_items: int,
86
91
  write_card: bool,
87
92
  persist_output: bool,
88
93
  ) -> Dict[str, Any]:
94
+ bounded_max_items = max(1, min(int(max_items), MAX_ITEMS_HARD_LIMIT))
89
95
  progress = build_progress_reporter(
90
96
  workflow="social-media-crawl",
91
97
  platform="douyin",
@@ -94,15 +100,69 @@ def run_douyin_creator_home(
94
100
  scope="workflow",
95
101
  )
96
102
  progress.started(stage="author_home.workflow", message="douyin author_home workflow started")
103
+ preflight = normalize_douyin_creator_input(input_value)
104
+ normalized_input_value = str(preflight.get("input_value") or "")
105
+ if preflight.get("error_reason"):
106
+ request_id = ensure_request_id(None, fallback_seed=input_value)
107
+ empty_profile = build_author_profile(platform="douyin", request_id=request_id)
108
+ extract_trace = [
109
+ {
110
+ "step": "input.preflight",
111
+ "ok": False,
112
+ "input_kind": "creator_url_or_sec_uid",
113
+ "normalized_input_value": normalized_input_value or None,
114
+ "error_reason": preflight.get("error_reason"),
115
+ "missing_fields": list(preflight.get("missing_fields") or []),
116
+ }
117
+ ]
118
+ envelope = {
119
+ "object_type": "creator",
120
+ "platform": "douyin",
121
+ "input": input_value,
122
+ "normalized": {
123
+ "creator_profile": {**empty_profile, "request_id": request_id, "extract_trace": extract_trace},
124
+ "work_collection": {
125
+ "platform": "douyin",
126
+ "platform_author_id": "",
127
+ "count": 0,
128
+ "items": [],
129
+ "request_id": request_id,
130
+ "extract_trace": extract_trace,
131
+ },
132
+ },
133
+ "completeness": evaluate_collection(empty_profile, []),
134
+ "missing_fields": normalize_missing_fields(preflight.get("missing_fields")),
135
+ "error_reason": str(preflight.get("error_reason") or "invalid_creator_input"),
136
+ "extract_trace": extract_trace,
137
+ "request_id": request_id,
138
+ "card_write": {
139
+ "enabled": bool(write_card),
140
+ "ok": False,
141
+ "count": 0,
142
+ "results": [],
143
+ "reason": "skipped_invalid_input",
144
+ },
145
+ "collection_artifacts": {},
146
+ "output_persist": {"enabled": False, "skipped": True, "reason": "invalid_input"},
147
+ }
148
+ progress.done(
149
+ stage="author_home.workflow",
150
+ message="douyin author_home workflow finished",
151
+ data={"request_id": request_id, "works_count": 0, "error_reason": envelope["error_reason"]},
152
+ )
153
+ return envelope
154
+
155
+ if runtime is None:
156
+ raise ValueError("runtime_required_for_valid_input")
97
157
 
98
158
  raw, profile, works, missing = collect_and_adapt(
99
- input_value=input_value,
159
+ input_value=normalized_input_value or input_value,
100
160
  base_url=runtime["base_url"],
101
161
  token=runtime["token"],
102
162
  timeout_ms=runtime["timeout_ms"],
103
163
  page_size=20,
104
164
  pages_max=50,
105
- max_items=max(1, int(max_items)),
165
+ max_items=bounded_max_items,
106
166
  progress=progress.child(scope="author_home.collect"),
107
167
  )
108
168
 
@@ -138,7 +198,7 @@ def run_douyin_creator_home(
138
198
 
139
199
  request_id = ensure_request_id(
140
200
  raw.get("request_id") or profile.get("request_id"),
141
- fallback_seed=input_value,
201
+ fallback_seed=normalized_input_value or input_value,
142
202
  )
143
203
  extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
144
204
 
@@ -206,7 +266,12 @@ def main() -> None:
206
266
  parser.add_argument("--allow-process-env", action="store_true", help="Allow process env overrides")
207
267
  parser.add_argument("--base-url", default=None, help="Override Tikomni base URL")
208
268
  parser.add_argument("--timeout-ms", type=int, default=None, help="Override timeout in ms")
209
- parser.add_argument("--max-items", type=int, default=5, help="Max works to collect from homepage")
269
+ parser.add_argument(
270
+ "--max-items",
271
+ type=int,
272
+ default=DEFAULT_MAX_ITEMS,
273
+ help=f"Max works to collect from homepage (default full crawl, capped at {MAX_ITEMS_HARD_LIMIT})",
274
+ )
210
275
  parser.set_defaults(write_card=True, persist_output=True)
211
276
  parser.add_argument("--write-card", dest="write_card", action="store_true", help="Write work fact cards")
212
277
  parser.add_argument("--no-write-card", dest="write_card", action="store_false", help="Skip card writing")
@@ -215,6 +280,19 @@ def main() -> None:
215
280
  args = parser.parse_args()
216
281
 
217
282
  config, _ = load_tikomni_config(args.config, env_file=args.env_file, allow_process_env=args.allow_process_env)
283
+ preflight = normalize_douyin_creator_input(args.input)
284
+ if preflight.get("error_reason"):
285
+ write_json_stdout(
286
+ run_douyin_creator_home(
287
+ input_value=args.input,
288
+ config=config,
289
+ runtime=None,
290
+ max_items=int(args.max_items),
291
+ write_card=bool(args.write_card),
292
+ persist_output=bool(args.persist_output),
293
+ )
294
+ )
295
+ return
218
296
  runtime = resolve_runtime(
219
297
  env_file=args.env_file,
220
298
  api_key_env=str(config_get(config, "runtime.auth_env_key", "TIKOMNI_API_KEY")),
@@ -224,7 +302,7 @@ def main() -> None:
224
302
  )
225
303
  write_json_stdout(
226
304
  run_douyin_creator_home(
227
- input_value=args.input,
305
+ input_value=str(preflight.get("input_value") or args.input),
228
306
  config=config,
229
307
  runtime=runtime,
230
308
  max_items=int(args.max_items),
@@ -31,6 +31,13 @@ from scripts.core.config_loader import config_get, load_tikomni_config
31
31
  from scripts.core.extract_pipeline import resolve_trace_error_context
32
32
  from scripts.core.progress_report import ProgressReporter, build_progress_reporter
33
33
  from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
34
+ from scripts.pipelines.douyin_metadata import (
35
+ extract_douyin_author as extract_shared_douyin_author,
36
+ extract_douyin_caption as extract_shared_douyin_caption,
37
+ extract_douyin_metrics as extract_shared_douyin_metrics,
38
+ extract_douyin_title as extract_shared_douyin_title,
39
+ )
40
+ from scripts.pipelines.input_contracts import normalize_douyin_work_input
34
41
  from scripts.core.asr_pipeline import derive_asr_clean_text, run_u2_asr_with_timeout_retry
35
42
  from scripts.pipelines.select_low_quality_video_url import select_low_quality_video_url
36
43
  from scripts.core.tikomni_common import (
@@ -156,14 +163,8 @@ def _normalize_input(
156
163
  input_value: Optional[str],
157
164
  share_url: Optional[str],
158
165
  ) -> Dict[str, Optional[str]]:
159
- normalized_share = (share_url or "").strip() or None
160
-
161
- if input_value and not normalized_share:
162
- candidate = input_value.strip()
163
- if candidate.startswith("http://") or candidate.startswith("https://"):
164
- normalized_share = candidate
165
-
166
- return {"share_url": normalized_share}
166
+ normalized = normalize_douyin_work_input(input_value, share_url)
167
+ return {"share_url": normalize_text(normalized.get("share_url")) or None}
167
168
 
168
169
 
169
170
  def _extract_aweme_detail(payload: Any) -> Optional[Dict[str, Any]]:
@@ -238,76 +239,19 @@ def _normalize_duration_ms(item: Dict[str, Any]) -> Optional[int]:
238
239
 
239
240
 
240
241
  def _pick_title(item: Dict[str, Any]) -> str:
241
- for key in ("item_title", "title", "desc", "preview_title"):
242
- value = item.get(key)
243
- text = normalize_text(value)
244
- if text:
245
- return text
246
- return ""
242
+ return extract_shared_douyin_title(item)
247
243
 
248
244
 
249
245
  def _pick_desc(item: Dict[str, Any]) -> str:
250
- for key in ("desc", "item_title", "title", "preview_title"):
251
- value = item.get(key)
252
- text = normalize_text(value)
253
- if text:
254
- return text
255
- return ""
246
+ return extract_shared_douyin_caption(item)
256
247
 
257
248
 
258
249
  def _extract_author(item: Dict[str, Any]) -> Dict[str, Optional[str]]:
259
- author = item.get("author")
260
- if not isinstance(author, dict):
261
- author = {}
262
-
263
- author_platform_id = normalize_text(author.get("uid")) or normalize_text(author.get("id")) or normalize_text(item.get("author_user_id"))
264
- author_handle = normalize_text(author.get("short_id")) or normalize_text(author.get("nickname"))
265
- douyin_sec_uid = normalize_text(author.get("sec_uid"))
266
- douyin_aweme_author_id = normalize_text(item.get("author_user_id")) or author_platform_id
267
-
268
- return {
269
- "author_handle": author_handle or None,
270
- "platform_author_id": author_platform_id or None,
271
- "author_platform_id": author_platform_id or None,
272
- "douyin_sec_uid": douyin_sec_uid or None,
273
- "douyin_aweme_author_id": douyin_aweme_author_id or None,
274
- "nickname": normalize_text(author.get("nickname")) or None,
275
- "signature": normalize_text(author.get("signature")) or None,
276
- }
250
+ return extract_shared_douyin_author(item)
277
251
 
278
252
 
279
253
  def _extract_metrics(item: Dict[str, Any]) -> Dict[str, Optional[int]]:
280
- statistics = item.get("statistics")
281
- if not isinstance(statistics, dict):
282
- statistics = {}
283
-
284
- def metric(*keys: str, default: Optional[int] = 0) -> Optional[int]:
285
- for key in keys:
286
- value = _safe_int(statistics.get(key))
287
- if value is not None:
288
- return value
289
- value = _safe_int(item.get(key))
290
- if value is not None:
291
- return value
292
- return default
293
-
294
- metrics = {
295
- "digg_count": metric("digg_count"),
296
- "comment_count": metric("comment_count"),
297
- "collect_count": metric("collect_count"),
298
- "share_count": metric("share_count", "forward_count"),
299
- "play_count": metric("play_count", default=None),
300
- }
301
- play_count = metrics.get("play_count")
302
- engagement_floor = max(
303
- int(metrics.get("digg_count") or 0),
304
- int(metrics.get("comment_count") or 0),
305
- int(metrics.get("collect_count") or 0),
306
- int(metrics.get("share_count") or 0),
307
- )
308
- if play_count is not None and int(play_count) <= 0 and engagement_floor > 0:
309
- metrics["play_count"] = None
310
- return metrics
254
+ return extract_shared_douyin_metrics(item)
311
255
 
312
256
 
313
257
  def _extract_platform_work_id(item: Dict[str, Any]) -> Optional[str]:
@@ -633,6 +577,7 @@ def _build_result(
633
577
  cover_image: Optional[str] = None,
634
578
  asr_source: str = "fallback_none",
635
579
  timings: Optional[Dict[str, int]] = None,
580
+ missing_fields: Optional[List[Dict[str, str]]] = None,
636
581
  ) -> Dict[str, Any]:
637
582
  summary_block = summarize_content(raw_content, source="douyin:single-video-low-quality")
638
583
  insights = list(summary_block.get("insights", []))
@@ -698,13 +643,13 @@ def _build_result(
698
643
  "insights": insights,
699
644
  "confidence": confidence,
700
645
  "error_reason": error_reason,
701
- "missing_fields": _build_missing_fields(
646
+ "missing_fields": list(missing_fields or _build_missing_fields(
702
647
  title=title,
703
648
  desc=desc,
704
649
  platform_work_id=platform_work_id,
705
650
  video_down_url=video_down_url,
706
651
  author=author,
707
- ),
652
+ )),
708
653
  "extract_trace": extract_trace,
709
654
  "fallback_trace": fallback_trace,
710
655
  "request_id": request_id,
@@ -741,7 +686,8 @@ def run_douyin_single_video(
741
686
  workflow_started_at = time.perf_counter()
742
687
  timings = _empty_timings()
743
688
  parse_started_at = time.perf_counter()
744
- source_input = _normalize_input(input_value, share_url)
689
+ preflight = normalize_douyin_work_input(input_value, share_url)
690
+ source_input = {"share_url": normalize_text(preflight.get("share_url")) or None}
745
691
  timings["url_parse_ms"] = _elapsed_ms(parse_started_at)
746
692
  if progress is not None:
747
693
  progress.started(
@@ -749,6 +695,66 @@ def run_douyin_single_video(
749
695
  message="douyin single_video workflow started",
750
696
  data={"analysis_mode": analysis_mode, "write_card": bool(write_card), "persist_output": bool(persist_output)},
751
697
  )
698
+ preflight_trace = [
699
+ {
700
+ "step": "input.preflight",
701
+ "ok": preflight.get("error_reason") is None,
702
+ "input_kind": "share_url",
703
+ "normalized_share_url": source_input.get("share_url"),
704
+ "error_reason": preflight.get("error_reason"),
705
+ "missing_fields": list(preflight.get("missing_fields") or []),
706
+ }
707
+ ]
708
+ if preflight.get("error_reason"):
709
+ result = _build_result(
710
+ source_input=source_input,
711
+ platform_work_id=None,
712
+ title="",
713
+ desc="",
714
+ duration_ms=None,
715
+ video_down_url=None,
716
+ author={"author_handle": None, "author_platform_id": None, "douyin_sec_uid": None, "douyin_aweme_author_id": None, "nickname": None, "signature": None},
717
+ metrics=_empty_metrics(),
718
+ tags=[],
719
+ is_video=False,
720
+ video_type_reason="invalid_input",
721
+ raw_content="",
722
+ confidence="low",
723
+ error_reason=str(preflight.get("error_reason") or "invalid_share_url"),
724
+ extract_trace=preflight_trace,
725
+ fallback_trace=[],
726
+ request_id=None,
727
+ u2_task_id=None,
728
+ u2_task_status="UNKNOWN",
729
+ u2_gate_reason="invalid_input",
730
+ analysis_mode=analysis_mode,
731
+ timings=timings,
732
+ missing_fields=list(preflight.get("missing_fields") or []),
733
+ )
734
+ if write_card:
735
+ card_started_at = time.perf_counter()
736
+ result["card_write"] = write_work_fact_card(
737
+ payload=result,
738
+ platform="douyin",
739
+ card_type=card_type,
740
+ card_root=card_root,
741
+ content_kind=content_kind,
742
+ storage_config=storage_config,
743
+ analysis_mode=analysis_mode,
744
+ progress=progress.child(scope="card_write") if progress is not None else None,
745
+ )
746
+ timings["card_write_ms"] = _elapsed_ms(card_started_at)
747
+ timings["llm_analysis_ms"] = _safe_int((result.get("card_write") or {}).get("llm_analysis_ms"))
748
+ timings["total_ms"] = _elapsed_ms(workflow_started_at)
749
+ result["timings"] = dict(timings)
750
+ _update_pipeline_status(result)
751
+ return _finalize_result(
752
+ result=result,
753
+ source_input=source_input,
754
+ platform_work_id=None,
755
+ storage_config=storage_config,
756
+ persist_output=persist_output,
757
+ )
752
758
  if not source_input.get("share_url"):
753
759
  result = _build_result(
754
760
  source_input=source_input,
@@ -765,7 +771,7 @@ def run_douyin_single_video(
765
771
  raw_content="",
766
772
  confidence="low",
767
773
  error_reason="missing_share_url",
768
- extract_trace=[],
774
+ extract_trace=preflight_trace,
769
775
  fallback_trace=[],
770
776
  request_id=None,
771
777
  u2_task_id=None,