@tikomni/skills 0.1.11 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tikomni/skills",
3
- "version": "0.1.11",
3
+ "version": "1.0.0",
4
4
  "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
@@ -24,6 +24,7 @@
24
24
 
25
25
  ## Optional Fields
26
26
 
27
+ - `duration_ms`
27
28
  - `digg_count`
28
29
  - `comment_count`
29
30
  - `collect_count`
@@ -37,6 +38,7 @@
37
38
  ## Field Rules
38
39
 
39
40
  - `author` is the display name, not an object.
41
+ - `duration_ms` uses milliseconds. Write `null` when the duration is unavailable or not applicable.
40
42
  - Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
41
43
  - The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
42
44
  - `primary_text` is the text that is best suited for reading and indexing in the current task.
@@ -32,6 +32,7 @@
32
32
  "subtitle_raw": { "type": "string" },
33
33
  "work_modality": { "type": "string" },
34
34
  "published_date": { "type": "string" },
35
+ "duration_ms": { "type": ["integer", "null"] },
35
36
  "digg_count": { "type": "integer" },
36
37
  "comment_count": { "type": "integer" },
37
38
  "collect_count": { "type": "integer" },
@@ -18,6 +18,7 @@ from scripts.core.asr_pipeline import (
18
18
  run_u2_asr_candidates_with_timeout_retry,
19
19
  )
20
20
  from scripts.core.u3_fallback import run_u3_public_url_fallback
21
+ from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
21
22
 
22
23
  DEFAULT_BATCH_SUBMIT_SIZE = 50
23
24
  MAX_BATCH_SUBMIT_SIZE = 100
@@ -76,13 +77,17 @@ def _resolve_is_video(work: Dict[str, Any], *, platform: str) -> bool:
76
77
  return False
77
78
 
78
79
  if platform == "douyin":
79
- return True
80
+ raw_ref = work.get("raw_ref") if isinstance(work.get("raw_ref"), dict) else {}
81
+ raw_item = raw_ref.get("raw_item") if isinstance(raw_ref.get("raw_item"), dict) else {}
82
+ if raw_item:
83
+ return bool(normalize_douyin_video_type(raw_item).get("is_video"))
84
+ return False
80
85
 
81
86
  raw_ref = work.get("raw_ref") if isinstance(work.get("raw_ref"), dict) else {}
82
87
  xhs_type_hint = normalize_text(raw_ref.get("type") or raw_ref.get("note_type")).lower()
83
- if xhs_type_hint in {"video", "0", "normal", "mixed", "mix"}:
88
+ if xhs_type_hint in {"video", "0", "mixed", "mix", "video_note", "note_video"}:
84
89
  return True
85
- if xhs_type_hint in {"image", "1", "note", "photo"}:
90
+ if xhs_type_hint in {"normal", "image", "1", "note", "photo", "text", "album"}:
86
91
  return False
87
92
 
88
93
  return False
@@ -275,7 +280,8 @@ def _run_u2_for_work(
275
280
  gate = _evaluate_u2_gate(work, platform=platform)
276
281
  if not gate.get("can_u2"):
277
282
  gate_reason = normalize_text(gate.get("gate_reason")) or "skip:unknown"
278
- return _fallback_none_result(gate_reason), {
283
+ fallback_result = _video_caption_fallback_result(work, gate_reason) if gate.get("is_video") else _fallback_none_result(gate_reason)
284
+ return fallback_result, {
279
285
  "step": "author_home.asr.u2_gate",
280
286
  "platform_work_id": work.get("platform_work_id"),
281
287
  "ok": False,
@@ -325,19 +331,7 @@ def _run_u2_for_work(
325
331
  asr_source="external_asr",
326
332
  ), trace
327
333
 
328
- return {
329
- "subtitle_raw": "",
330
- "subtitle_source": "missing",
331
- "asr_raw": "",
332
- "asr_clean": "",
333
- "primary_text": "",
334
- "primary_text_source": "asr_clean",
335
- "analysis_eligibility": "incomplete",
336
- "analysis_exclusion_reason": "video_asr_unavailable",
337
- "asr_status": "failed",
338
- "asr_error_reason": normalize_text(poll_result.get("error_reason")) or "u2_failed",
339
- "asr_source": "fallback_none",
340
- }, trace
334
+ return _video_caption_fallback_result(work, normalize_text(poll_result.get("error_reason")) or "u2_failed"), trace
341
335
 
342
336
 
343
337
  def _iter_xhs_interface_text_candidates(work: Dict[str, Any]) -> List[Tuple[str, str]]:
@@ -489,6 +483,23 @@ def _fallback_none_result(reason: str) -> Dict[str, Any]:
489
483
  }
490
484
 
491
485
 
486
def _video_caption_fallback_result(work: Dict[str, Any], reason: str) -> Dict[str, Any]:
    """Build the ASR result fields for a video work whose ASR did not succeed.

    The work's ``caption_raw`` becomes the primary text when it is non-empty;
    in that case the work is still marked eligible for analysis. Without a
    caption the work is marked incomplete and the normalized ``reason`` is
    recorded as the exclusion reason.

    Args:
        work: Work record; only ``caption_raw`` is read.
        reason: Why ASR was skipped or failed.

    Returns:
        A dict of subtitle/ASR/primary-text fields suitable for ``work.update``.
    """
    caption = normalize_text(work.get("caption_raw"))
    reason_text = normalize_text(reason)

    if caption:
        text_source = "caption_raw"
        eligibility = "eligible"
        exclusion_reason = ""
    else:
        text_source = "missing"
        eligibility = "incomplete"
        exclusion_reason = reason_text or "video_asr_unavailable"

    return {
        "subtitle_raw": "",
        "subtitle_source": "missing",
        "asr_raw": "",
        "asr_clean": "",
        "primary_text": caption,
        "primary_text_source": text_source,
        "analysis_eligibility": eligibility,
        "analysis_exclusion_reason": exclusion_reason,
        # ASR itself is always reported as failed, even when the caption rescues the work.
        "asr_status": "failed",
        "asr_error_reason": reason_text or "asr_failed",
        "asr_source": "fallback_none",
    }
501
+
502
+
492
503
  def _run_xhs_u3_then_u2_batch_for_entries(
493
504
  *,
494
505
  batch_id: str,
@@ -517,7 +528,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
517
528
  subtitle_invalid = normalize_text(entry.get("subtitle_invalid")) or "subtitle_missing"
518
529
 
519
530
  if not source_url:
520
- work.update(_fallback_none_result("skip:video_download_url_missing"))
531
+ work.update(_video_caption_fallback_result(work, "skip:video_download_url_missing"))
521
532
  trace.append(
522
533
  {
523
534
  "step": "author_home.asr.xhs_u3",
@@ -554,7 +565,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
554
565
  )
555
566
 
556
567
  if not u3_result.get("ok") or not public_url:
557
- work.update(_fallback_none_result(normalize_text(u3_result.get("error_reason")) or "u3_bridge_failed"))
568
+ work.update(_video_caption_fallback_result(work, normalize_text(u3_result.get("error_reason")) or "u3_bridge_failed"))
558
569
  u3_failed_count += 1
559
570
  continue
560
571
 
@@ -597,7 +608,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
597
608
  work = entry.get("work")
598
609
  if not isinstance(work, dict):
599
610
  continue
600
- work.update(_fallback_none_result(normalize_text(entry.get("fallback_reason")) or "xhs_u3_then_u2_failed"))
611
+ work.update(_video_caption_fallback_result(work, normalize_text(entry.get("fallback_reason")) or "xhs_u3_then_u2_failed"))
601
612
 
602
613
  return {
603
614
  "trace": trace,
@@ -905,7 +916,10 @@ def enrich_author_home_asr(
905
916
  )
906
917
 
907
918
  if not gate.get("can_u2"):
908
- work.update(_fallback_none_result(str(gate.get("gate_reason") or "skip:unknown")))
919
+ if gate.get("is_video"):
920
+ work.update(_video_caption_fallback_result(work, str(gate.get("gate_reason") or "skip:unknown")))
921
+ else:
922
+ work.update(_mark_text_work_ready(work))
909
923
  else:
910
924
  batch_u2_entries.append(
911
925
  {
@@ -987,7 +1001,7 @@ def enrich_author_home_asr(
987
1001
  }
988
1002
  )
989
1003
  if not gate.get("can_u2"):
990
- work.update(_fallback_none_result(str(gate.get("gate_reason") or "skip:unknown")))
1004
+ work.update(_video_caption_fallback_result(work, str(gate.get("gate_reason") or "skip:unknown")))
991
1005
  else:
992
1006
  batch_xhs_u3_entries.append(
993
1007
  {
@@ -72,6 +72,177 @@ def _pick_list(payload: Any, keys: List[str]) -> List[Any]:
72
72
  return hit if isinstance(hit, list) else []
73
73
 
74
74
 
75
+ def _pick_raw(payload: Any, keys: List[str]) -> Any:
76
+ ordered_keys = [str(key) for key in keys if str(key).strip()]
77
+
78
+ def _walk(node: Any) -> Any:
79
+ if isinstance(node, dict):
80
+ for key in ordered_keys:
81
+ if key in node and node.get(key) is not None:
82
+ return node.get(key)
83
+ for value in node.values():
84
+ hit = _walk(value)
85
+ if hit is not None:
86
+ return hit
87
+ elif isinstance(node, list):
88
+ for item in node:
89
+ hit = _walk(item)
90
+ if hit is not None:
91
+ return hit
92
+ return None
93
+
94
+ return _walk(payload)
95
+
96
+
97
+ def _payload_candidates(payload: Any) -> List[Any]:
98
+ candidates: List[Any] = []
99
+ if payload is not None:
100
+ candidates.append(payload)
101
+ if isinstance(payload, dict):
102
+ nested = payload.get("data")
103
+ if nested is not None:
104
+ candidates.append(nested)
105
+ return candidates
106
+
107
+
108
def _pick_raw_from_candidates(payload: Any, keys: List[str]) -> Any:
    """Run ``_pick_raw`` over each payload candidate and return the first hit.

    Candidates come from ``_payload_candidates`` and are tried in order;
    evaluation is lazy, so later candidates are not searched once one matches.

    Returns:
        The first non-None value found, or None.
    """
    hits = (_pick_raw(candidate, keys) for candidate in _payload_candidates(payload))
    return next((hit for hit in hits if hit is not None), None)
114
+
115
+
116
def _pick_list_from_candidates(payload: Any, keys: List[str]) -> List[Any]:
    """Run ``_pick_list`` over each payload candidate and return the first non-empty list.

    Candidates come from ``_payload_candidates`` and are tried in order.

    Returns:
        The first truthy result of ``_pick_list``, or ``[]`` when every
        candidate yields nothing.
    """
    results = (_pick_list(candidate, keys) for candidate in _payload_candidates(payload))
    return next((result for result in results if result), [])
122
+
123
+
124
+ def _unwrap_data_layers(payload: Any, *, max_depth: int = 3) -> Any:
125
+ node = payload
126
+ depth = 0
127
+ while depth < max_depth and isinstance(node, dict) and isinstance(node.get("data"), dict):
128
+ node = node.get("data")
129
+ depth += 1
130
+ return node
131
+
132
+
133
def _extract_douyin_posts_page(payload: Any) -> Dict[str, Any]:
    """Return the unwrapped Douyin posts page as a dict.

    Returns:
        The node left after ``_unwrap_data_layers``, or ``{}`` when that node
        is not a dict.
    """
    page = _unwrap_data_layers(payload)
    return page if isinstance(page, dict) else {}
138
+
139
+
140
def _extract_douyin_posts_items(payload: Any) -> List[Any]:
    """Return the ``aweme_list`` of a Douyin posts page.

    Returns:
        The list stored under ``aweme_list`` on the unwrapped page, or ``[]``
        when the entry is missing or not a list.
    """
    candidate = _extract_douyin_posts_page(payload).get("aweme_list")
    if isinstance(candidate, list):
        return candidate
    return []
144
+
145
+
146
def _extract_douyin_posts_next_cursor(payload: Any) -> Any:
    """Return the raw next-page cursor of a Douyin posts page.

    Only the unwrapped page level is inspected (no deep search). The keys
    ``max_cursor``, ``cursor`` and ``next_cursor`` are tried in that order.

    Returns:
        The first non-None cursor value, unconverted, or None.
    """
    page = _extract_douyin_posts_page(payload)
    if isinstance(page, dict):
        for name in ("max_cursor", "cursor", "next_cursor"):
            value = page.get(name)
            if value is not None:
                return value
    return None
154
+
155
+
156
def _extract_douyin_posts_has_more(payload: Any) -> Any:
    """Return the raw has-more flag of a Douyin posts page.

    Only the unwrapped page level is inspected; ``has_more`` is preferred
    over ``hasMore``.

    Returns:
        The first non-None flag value, unconverted, or None.
    """
    page = _extract_douyin_posts_page(payload)
    if not isinstance(page, dict):
        return None
    flags = (page.get(name) for name in ("has_more", "hasMore"))
    return next((flag for flag in flags if flag is not None), None)
164
+
165
+
166
def _extract_xhs_posts_page(payload: Any) -> Dict[str, Any]:
    """Return the unwrapped XHS posts page as a dict.

    Returns:
        The node left after ``_unwrap_data_layers``, or ``{}`` when that node
        is not a dict.
    """
    page = _unwrap_data_layers(payload)
    if isinstance(page, dict):
        return page
    return {}
171
+
172
+
173
def _extract_xhs_posts_items(payload: Any) -> List[Any]:
    """Return the note list of an XHS posts page.

    The keys ``notes``, ``note_list``, ``noteList``, ``items`` and ``list``
    are tried in that order on the unwrapped page; the first value that is a
    list wins, even when it is empty.

    Returns:
        The matching list, or ``[]`` when no key holds a list.
    """
    page = _extract_xhs_posts_page(payload)
    values = (page.get(name) for name in ("notes", "note_list", "noteList", "items", "list"))
    return next((value for value in values if isinstance(value, list)), [])
180
+
181
+
182
def _extract_xhs_response_cursor(payload: Any) -> str:
    """Return the next-page cursor reported by an XHS posts page, as text.

    The keys ``cursor``, ``next_cursor``, ``last_cursor`` and ``last_note_id``
    are tried in that order on the unwrapped page; each value is passed
    through ``_to_text`` and the first non-empty result wins.

    Returns:
        The cursor text, or ``""`` when the page reports none.
    """
    page = _extract_xhs_posts_page(payload)
    if not isinstance(page, dict):
        return ""
    texts = (
        _to_text(page.get(name))
        for name in ("cursor", "next_cursor", "last_cursor", "last_note_id")
    )
    return next((text for text in texts if text), "")
191
+
192
+
193
def _extract_xhs_posts_has_more(payload: Any) -> Any:
    """Return the raw has-more flag of an XHS posts page.

    Only the unwrapped page level is inspected; ``has_more`` is preferred
    over ``hasMore``.

    Returns:
        The first non-None flag value, unconverted, or None.
    """
    page = _extract_xhs_posts_page(payload)
    if isinstance(page, dict):
        for name in ("has_more", "hasMore"):
            flag = page.get(name)
            if flag is not None:
                return flag
    return None
201
+
202
+
203
+ def _normalize_has_more(value: Any) -> Optional[bool]:
204
+ if value is None:
205
+ return None
206
+ if isinstance(value, bool):
207
+ return value
208
+ if isinstance(value, (int, float)):
209
+ return bool(int(value))
210
+ text = _to_text(value).lower()
211
+ if not text:
212
+ return None
213
+ if text in {"1", "true", "yes", "y"}:
214
+ return True
215
+ if text in {"0", "false", "no", "n"}:
216
+ return False
217
+ return None
218
+
219
+
220
+ def _normalize_int_like(value: Any) -> Optional[int]:
221
+ if value is None:
222
+ return None
223
+ try:
224
+ if isinstance(value, bool):
225
+ return int(value)
226
+ if isinstance(value, (int, float)):
227
+ return int(value)
228
+ text = _to_text(value)
229
+ if not text:
230
+ return None
231
+ return int(float(text.replace(",", "")))
232
+ except Exception:
233
+ return None
234
+
235
+
236
def _last_xhs_note_id(items: List[Any]) -> str:
    """Return the note id of the last item that has one.

    Items are scanned from the end; non-dict entries are skipped. The id is
    looked up with ``_pick_text`` under ``note_id``, ``id`` or ``item_id``.

    Returns:
        The first non-empty id found scanning backwards, or ``""``.
    """
    note_ids = (
        _pick_text(entry, ["note_id", "id", "item_id"])
        for entry in reversed(items)
        if isinstance(entry, dict)
    )
    return next((note_id for note_id in note_ids if note_id), "")
244
+
245
+
75
246
  def _looks_like_douyin_sec_user_id(value: str) -> bool:
76
247
  return value.startswith("MS4wLjA")
77
248
 
@@ -206,10 +377,10 @@ def _pick_first_mapping(items: List[Any]) -> Dict[str, Any]:
206
377
 
207
378
 
208
379
  def _xhs_posts_field_completeness(payload: Any) -> Dict[str, Any]:
209
- page_items = _pick_list(payload, ["notes", "note_list", "noteList", "items", "list"])
380
+ page_items = _extract_xhs_posts_items(payload)
210
381
  first_item = _pick_first_mapping(page_items)
211
- has_more_flag = _pick_int(payload, ["has_more", "hasMore"], default=-1) >= 0
212
- cursor_hit = bool(_pick_text(payload, ["cursor", "next_cursor", "last_cursor", "last_note_id"]))
382
+ has_more_flag = _extract_xhs_posts_has_more(payload) is not None
383
+ cursor_hit = bool(_extract_xhs_response_cursor(payload))
213
384
  cover_hit = bool(_extract_first_url(_first_url_candidate(first_item, ["cover", "cover_url", "cover_image", "image", "image_url"])))
214
385
  share_or_source = bool(_pick_text(first_item, ["share_url", "share_link", "url", "note_url"])) or bool(_pick_text(first_item, ["note_id", "id", "item_id"]))
215
386
  interaction_values = [
@@ -466,7 +637,10 @@ def collect_douyin_author_home_raw(
466
637
  page = 0
467
638
  pagination_trace: List[Dict[str, Any]] = []
468
639
 
469
- while has_more and page < max(pages_max, 1) and len(works) < max_items:
640
+ max_pages = max(pages_max, 1)
641
+ page_limit = min(max(page_size, 1), 20)
642
+
643
+ while has_more and page < max_pages and len(works) < max_items:
470
644
  page += 1
471
645
  posts_resp = call_json_api(
472
646
  base_url=base_url,
@@ -476,7 +650,7 @@ def collect_douyin_author_home_raw(
476
650
  timeout_ms=timeout_ms,
477
651
  params={
478
652
  "sec_user_id": sec_user_id,
479
- "count": min(max(page_size, 1), 20),
653
+ "count": page_limit,
480
654
  "max_cursor": cursor,
481
655
  "sort_type": 0,
482
656
  },
@@ -497,22 +671,12 @@ def collect_douyin_author_home_raw(
497
671
  )
498
672
  request_id_candidates.append(posts_resp)
499
673
  response_payload = posts_resp.get("data")
500
- page_items = _pick_list(response_payload, ["aweme_list", "items", "list"])
501
- if not page_items and isinstance(response_payload, dict):
502
- page_items = _pick_list(response_payload.get("data"), ["aweme_list", "items", "list"])
674
+ page_items = _extract_douyin_posts_items(response_payload)
503
675
 
504
- data = response_payload
505
- next_cursor = _pick_int(data, ["max_cursor", "cursor", "next_cursor"], default=0)
506
- has_more_flag = _pick_int(data, ["has_more", "hasMore"], default=0)
507
- pagination_trace.append(
508
- {
509
- "page": page,
510
- "cursor_in": cursor,
511
- "cursor_out": next_cursor,
512
- "has_more": has_more_flag,
513
- "items": len(page_items),
514
- }
515
- )
676
+ next_cursor_raw = _extract_douyin_posts_next_cursor(response_payload)
677
+ has_more_raw = _extract_douyin_posts_has_more(response_payload)
678
+ next_cursor = _normalize_int_like(next_cursor_raw)
679
+ has_more_normalized = _normalize_has_more(has_more_raw)
516
680
 
517
681
  for item in page_items:
518
682
  if not isinstance(item, dict):
@@ -526,8 +690,44 @@ def collect_douyin_author_home_raw(
526
690
  if len(works) >= max_items:
527
691
  break
528
692
 
529
- has_more = bool(has_more_flag == 1 and next_cursor != cursor)
530
- cursor = next_cursor
693
+ next_cursor_changed = next_cursor is not None and next_cursor != cursor
694
+ stop_reason = ""
695
+ should_continue = False
696
+
697
+ if len(works) >= max_items:
698
+ stop_reason = "max_items_reached"
699
+ elif not page_items:
700
+ stop_reason = "page_empty"
701
+ elif has_more_normalized is False:
702
+ stop_reason = "has_more_false"
703
+ elif next_cursor is not None and next_cursor == cursor:
704
+ stop_reason = "cursor_not_advanced"
705
+ elif has_more_normalized is True and next_cursor is None:
706
+ stop_reason = "pagination_contract_incomplete"
707
+ elif has_more_normalized is True or next_cursor_changed:
708
+ should_continue = True
709
+ else:
710
+ stop_reason = "pagination_contract_incomplete"
711
+
712
+ if should_continue and page >= max_pages:
713
+ should_continue = False
714
+ stop_reason = "pages_max_reached"
715
+
716
+ pagination_trace.append(
717
+ {
718
+ "page": page,
719
+ "cursor_in": cursor,
720
+ "cursor_out": next_cursor,
721
+ "has_more_raw": has_more_raw,
722
+ "has_more_normalized": has_more_normalized,
723
+ "items": len(page_items),
724
+ "stop_reason": stop_reason,
725
+ }
726
+ )
727
+
728
+ has_more = should_continue
729
+ if should_continue and next_cursor is not None:
730
+ cursor = next_cursor
531
731
 
532
732
  request_id = _pick_request_id(request_id_candidates, trace)
533
733
  if progress is not None:
@@ -661,7 +861,10 @@ def collect_xhs_author_home_raw(
661
861
  page = 0
662
862
  pagination_trace: List[Dict[str, Any]] = []
663
863
 
664
- while has_more and page < max(pages_max, 1) and len(works) < max_items:
864
+ max_pages = max(pages_max, 1)
865
+ page_limit = min(max(page_size, 1), 20)
866
+
867
+ while has_more and page < max_pages and len(works) < max_items:
665
868
  page += 1
666
869
  if progress is not None:
667
870
  progress.progress(
@@ -684,7 +887,7 @@ def collect_xhs_author_home_raw(
684
887
  "user_id": user_id,
685
888
  "share_text": input_value,
686
889
  "cursor": cursor or None,
687
- "num": min(max(page_size, 1), 20),
890
+ "num": page_limit,
688
891
  "xsec_token": xsec_token or None,
689
892
  },
690
893
  fallback_reason=posts_reason,
@@ -739,20 +942,14 @@ def collect_xhs_author_home_raw(
739
942
  )
740
943
 
741
944
  data = posts_resp.get("data")
742
- page_items = _pick_list(data, ["notes", "note_list", "noteList", "items", "list"])
743
- next_cursor = _pick_text(data, ["cursor", "next_cursor", "last_cursor", "last_note_id"])
744
- has_more_flag = _pick_int(data, ["has_more", "hasMore"], default=0)
745
- pagination_trace.append(
746
- {
747
- "page": page,
748
- "cursor_in": cursor,
749
- "cursor_out": next_cursor,
750
- "has_more": has_more_flag,
751
- "items": len(page_items),
752
- "route_label": posts_resp.get("_route_label"),
753
- "request_id": posts_resp.get("request_id"),
754
- }
755
- )
945
+ page_items = _extract_xhs_posts_items(data)
946
+ next_cursor_raw = _extract_xhs_response_cursor(data)
947
+ has_more_raw = _extract_xhs_posts_has_more(data)
948
+ has_more_normalized = _normalize_has_more(has_more_raw)
949
+ fallback_cursor = _last_xhs_note_id(page_items)
950
+ explicit_cursor = _to_text(next_cursor_raw)
951
+ cursor_source = "response_cursor" if explicit_cursor else ("last_note_id_fallback" if fallback_cursor else "missing")
952
+ next_cursor = explicit_cursor or fallback_cursor
756
953
 
757
954
  for item in page_items:
758
955
  if not isinstance(item, dict):
@@ -766,8 +963,47 @@ def collect_xhs_author_home_raw(
766
963
  if len(works) >= max_items:
767
964
  break
768
965
 
769
- has_more = bool(has_more_flag == 1 and next_cursor and str(next_cursor) != str(cursor))
770
- cursor = next_cursor
966
+ next_cursor_changed = bool(next_cursor and str(next_cursor) != str(cursor))
967
+ stop_reason = ""
968
+ should_continue = False
969
+
970
+ if len(works) >= max_items:
971
+ stop_reason = "max_items_reached"
972
+ elif not page_items:
973
+ stop_reason = "page_empty"
974
+ elif has_more_normalized is False:
975
+ stop_reason = "has_more_false"
976
+ elif next_cursor and str(next_cursor) == str(cursor):
977
+ stop_reason = "cursor_not_advanced"
978
+ elif has_more_normalized is True and not next_cursor:
979
+ stop_reason = "pagination_contract_incomplete"
980
+ elif has_more_normalized is True or next_cursor_changed:
981
+ should_continue = True
982
+ else:
983
+ stop_reason = "pagination_contract_incomplete"
984
+
985
+ if should_continue and page >= max_pages:
986
+ should_continue = False
987
+ stop_reason = "pages_max_reached"
988
+
989
+ pagination_trace.append(
990
+ {
991
+ "page": page,
992
+ "cursor_in": cursor,
993
+ "cursor_out": next_cursor,
994
+ "cursor_source": cursor_source,
995
+ "has_more_raw": has_more_raw,
996
+ "has_more_normalized": has_more_normalized,
997
+ "items": len(page_items),
998
+ "route_label": posts_resp.get("_route_label"),
999
+ "request_id": posts_resp.get("request_id"),
1000
+ "stop_reason": stop_reason,
1001
+ }
1002
+ )
1003
+
1004
+ has_more = should_continue
1005
+ if should_continue and next_cursor:
1006
+ cursor = next_cursor
771
1007
 
772
1008
  request_id = _pick_request_id(request_id_candidates, trace)
773
1009
  if progress is not None:
@@ -20,6 +20,7 @@ from scripts.pipelines.douyin_metadata import (
20
20
  extract_douyin_title,
21
21
  normalize_douyin_author_handle,
22
22
  )
23
+ from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
23
24
  from scripts.pipelines.media_url_rules import is_probable_video_url as is_shared_probable_video_url
24
25
  from scripts.core.tikomni_common import deep_find_all, deep_find_first
25
26
  from scripts.pipelines.select_low_quality_video_url import select_low_quality_video_url
@@ -54,6 +55,15 @@ def _optional_i(value: Any) -> int | None:
54
55
  return None
55
56
 
56
57
 
58
def _normalize_duration_ms(value: Any) -> int:
    """Normalize a raw duration to milliseconds.

    Args:
        value: Raw duration; parsed with ``_optional_i``.

    Returns:
        0 when the value is missing, unparseable or not positive. Positive
        values below 10000 are multiplied by 1000; larger values are returned
        unchanged.
    """
    duration = _optional_i(value)
    if duration is None or duration <= 0:
        return 0
    # NOTE(review): values under 10000 are presumably taken to be seconds; a
    # genuine millisecond duration shorter than 10 s would be scaled as well.
    # Confirm against the upstream field units.
    return duration * 1000 if duration < 10000 else duration
65
+
66
+
57
67
  def _first(payload: Any, keys: List[str], default: Any = "") -> Any:
58
68
  hit = deep_find_first(payload, keys)
59
69
  return default if hit is None else hit
@@ -134,6 +144,16 @@ def _extract_douyin_video_down_url(item: Dict[str, Any]) -> str:
134
144
  return _t(selected.get("video_down_url"))
135
145
 
136
146
 
147
def _extract_douyin_duration_ms(item: Dict[str, Any]) -> int:
    """Extract a Douyin item's duration in milliseconds.

    Candidates are tried in priority order: ``item["duration_ms"]``,
    ``item["duration"]``, then ``item["video"]["duration"]`` when ``video`` is
    a dict. Each is normalized with ``_normalize_duration_ms`` and the first
    positive result wins, so a zero or unparseable top-level field no longer
    hides a usable nested duration.

    Args:
        item: Raw Douyin item.

    Returns:
        The duration in milliseconds, or 0 when no candidate is usable.
    """
    candidates = [item.get("duration_ms"), item.get("duration")]
    video = item.get("video")
    if isinstance(video, dict):
        candidates.append(video.get("duration"))

    for candidate in candidates:
        normalized = _normalize_duration_ms(candidate)
        if normalized > 0:
            return normalized
    return 0
155
+
156
+
137
157
  def _extract_xhs_video_down_url(item: Dict[str, Any]) -> str:
138
158
  urls = _pick_http_urls(
139
159
  item,
@@ -154,6 +174,33 @@ def _extract_xhs_video_down_url(item: Dict[str, Any]) -> str:
154
174
  return ""
155
175
 
156
176
 
177
def _extract_xhs_duration_ms(item: Dict[str, Any]) -> int:
    """Extract an XHS item's duration in milliseconds.

    Candidates, in priority order: the item's own ``duration_ms``,
    ``duration``, ``video_duration`` and ``duration_sec``; then
    ``video_info_v2.media.video.duration`` and ``video_info_v2.capa.duration``;
    then ``video.duration`` and ``note.duration`` when those containers are
    dicts. Each is normalized with ``_normalize_duration_ms`` and the first
    positive result is returned.

    Args:
        item: Raw XHS note item.

    Returns:
        The duration in milliseconds, or 0 when no candidate is usable.
    """

    def _as_dict(value: Any) -> Dict[str, Any]:
        return value if isinstance(value, dict) else {}

    info = _as_dict(item.get("video_info_v2"))
    media_video = _as_dict(_as_dict(info.get("media")).get("video"))
    capa = _as_dict(info.get("capa"))

    candidates = [
        item.get(name)
        for name in ("duration_ms", "duration", "video_duration", "duration_sec")
    ]
    candidates.append(media_video.get("duration"))
    candidates.append(capa.get("duration"))
    for container_name in ("video", "note"):
        container = item.get(container_name)
        if isinstance(container, dict):
            candidates.append(container.get("duration"))

    durations = map(_normalize_duration_ms, candidates)
    return next((duration for duration in durations if duration > 0), 0)
202
+
203
+
157
204
  def _normalize_text_list(value: Any) -> List[str]:
158
205
  values: List[str] = []
159
206
  if isinstance(value, list):
@@ -276,9 +323,9 @@ def _extract_xhs_subtitle_urls(item: Dict[str, Any]) -> List[str]:
276
323
 
277
324
  def _extract_xhs_work_modality(item: Dict[str, Any], *, video_download_url: str, subtitle_inline: str) -> str:
278
325
  content_type_raw = _t(_first(item, ["type", "note_type", "model_type"])).lower()
279
- if content_type_raw in {"video", "0", "normal", "mixed", "mix", "video_note", "note_video"}:
326
+ if content_type_raw in {"video", "0", "mixed", "mix", "video_note", "note_video"}:
280
327
  return "video"
281
- if content_type_raw in {"image", "1", "photo", "album", "note", "text"}:
328
+ if content_type_raw in {"normal", "image", "1", "photo", "album", "note", "text"}:
282
329
  return "text"
283
330
  if video_download_url or subtitle_inline:
284
331
  return "video"
@@ -393,6 +440,9 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
393
440
  if not isinstance(item, dict):
394
441
  continue
395
442
  aweme_id = _t(_first(item, ["aweme_id", "item_id", "id"]))
443
+ video_type_info = normalize_douyin_video_type(item)
444
+ is_video = bool(video_type_info.get("is_video"))
445
+ duration_ms = _extract_douyin_duration_ms(item)
396
446
  author_info = extract_douyin_author(item)
397
447
  metrics = extract_douyin_metrics(item)
398
448
  video_down_url = _extract_douyin_video_down_url(item)
@@ -417,9 +467,9 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
417
467
  subtitle_raw="",
418
468
  subtitle_source="missing",
419
469
  publish_time=_t(_first(item, ["create_time", "publish_time"])),
420
- work_modality="video",
421
- content_type="video",
422
- duration_ms=_i(_first(item, ["duration_ms", "duration"], 0)),
470
+ work_modality="video" if is_video else "text",
471
+ content_type="video" if is_video else "text",
472
+ duration_ms=duration_ms,
423
473
  tags=tags,
424
474
  metrics={
425
475
  "digg_count": int(metrics.get("digg_count") or 0),
@@ -438,10 +488,13 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
438
488
  asr_status="pending",
439
489
  asr_error_reason="",
440
490
  asr_source="fallback_none",
491
+ is_video=is_video,
441
492
  platform_native_refs={
442
493
  "douyin_sec_uid": _t(author_info.get("douyin_sec_uid") or internal_author_id),
443
494
  "douyin_aweme_author_id": _t(author_info.get("douyin_aweme_author_id") or stable_author_id or author_id),
444
495
  "douyin_unique_id": _t(author_info.get("unique_id")),
496
+ "douyin_video_type_reason": _t(video_type_info.get("video_type_reason")),
497
+ "douyin_video_type_field": _t(video_type_info.get("matched_field")),
445
498
  },
446
499
  raw_ref={"aweme_id": aweme_id, "raw_item": item},
447
500
  )
@@ -519,6 +572,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
519
572
  subtitle_inline = _extract_xhs_subtitle_inline(item)
520
573
  subtitle_urls = _extract_xhs_subtitle_urls(item)
521
574
  video_down_url = _extract_xhs_video_down_url(item)
575
+ duration_ms = _extract_xhs_duration_ms(item)
522
576
  content_type_raw = _t(_first(item, ["type", "note_type", "model_type"]))
523
577
  work_modality = _extract_xhs_work_modality(item, video_download_url=video_down_url, subtitle_inline=subtitle_inline)
524
578
  content_type = "video" if work_modality == "video" else (content_type_raw or "text")
@@ -541,7 +595,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
541
595
  publish_time=_t(_first(item, ["publish_time", "time", "create_time", "publishTime", "created_at"])),
542
596
  work_modality=work_modality,
543
597
  content_type=content_type,
544
- duration_ms=_i(_first(item, ["duration_ms", "duration", "video_duration"], 0)),
598
+ duration_ms=duration_ms,
545
599
  tags=_extract_xhs_tags(item),
546
600
  metrics=metrics,
547
601
  cover_image=cover_image,
@@ -551,6 +605,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
551
605
  asr_status="subtitle_ready" if subtitle_inline else "pending",
552
606
  asr_error_reason="",
553
607
  asr_source="native_subtitle" if subtitle_inline else "fallback_none",
608
+ is_video=work_modality == "video",
554
609
  platform_native_refs={"xhs_user_id": author_id, "xhs_red_id": author_handle},
555
610
  raw_ref={
556
611
  "note_id": note_id,
@@ -108,9 +108,9 @@ def _infer_work_modality(*, work_modality: str, is_video: Any, content_type: str
108
108
  return "video"
109
109
 
110
110
  content_type_value = _to_text(content_type).lower()
111
- if content_type_value in {"video", "mixed", "mix", "video_note", "note_video", "normal", "0"}:
111
+ if content_type_value in {"video", "mixed", "mix", "video_note", "note_video", "0"}:
112
112
  return "video"
113
- if content_type_value in {"text", "note", "image", "photo", "album", "1"}:
113
+ if content_type_value in {"normal", "text", "note", "image", "photo", "album", "1"}:
114
114
  return "text"
115
115
 
116
116
  if subtitle_raw or video_download_url:
@@ -62,6 +62,13 @@ def _safe_optional_int(value: Any) -> Optional[int]:
62
62
  return None
63
63
 
64
64
 
65
def _safe_optional_positive_int(value: Any) -> Optional[int]:
    """Parse ``value`` with ``_safe_optional_int`` and keep it only if positive.

    Returns:
        The parsed integer when it is greater than zero, otherwise None.
    """
    number = _safe_optional_int(value)
    return number if number is not None and number > 0 else None
70
+
71
+
65
72
  def _source_dict(payload: Dict[str, Any]) -> Dict[str, Any]:
66
73
  source = payload.get("source")
67
74
  return source if isinstance(source, dict) else {}
@@ -250,6 +257,7 @@ def build_work_fact_card(payload: Dict[str, Any], platform: Optional[str] = None
250
257
  "create_time_sec": payload.get("create_time_sec"),
251
258
  "publish_time_source": _safe_text(payload.get("publish_time_source")),
252
259
  "published_date": _resolve_published_date(payload),
260
+ "duration_ms": _safe_optional_positive_int(payload.get("duration_ms")),
253
261
  "digg_count": _safe_int(payload.get("digg_count")),
254
262
  "comment_count": _safe_int(payload.get("comment_count")),
255
263
  "collect_count": _safe_int(payload.get("collect_count")),
@@ -324,6 +332,7 @@ def _frontmatter_lines(card: Dict[str, Any]) -> List[str]:
324
332
  ("title", card.get("title")),
325
333
  ("published_date", card.get("published_date")),
326
334
  ("work_modality", card.get("work_modality")),
335
+ ("duration_ms", card.get("duration_ms")),
327
336
  ("digg_count", card.get("digg_count")),
328
337
  ("comment_count", card.get("comment_count")),
329
338
  ("collect_count", card.get("collect_count")),