@tikomni/skills 0.1.11 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/references/contracts/work-fact-card-fields.md +2 -0
- package/skills/social-media-crawl/references/schemas/work-fact-card.schema.json +1 -0
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +36 -22
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +276 -40
- package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py +61 -6
- package/skills/social-media-crawl/scripts/pipelines/schema.py +2 -2
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +9 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "0.1.11",
|
|
3
|
+
"version": "1.0.0",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
|
|
25
25
|
## Optional Fields
|
|
26
26
|
|
|
27
|
+
- `duration_ms`
|
|
27
28
|
- `digg_count`
|
|
28
29
|
- `comment_count`
|
|
29
30
|
- `collect_count`
|
|
@@ -37,6 +38,7 @@
|
|
|
37
38
|
## Field Rules
|
|
38
39
|
|
|
39
40
|
- `author` is the display name, not an object.
|
|
41
|
+
- `duration_ms` uses milliseconds. Write `null` when the duration is unavailable or not applicable.
|
|
40
42
|
- Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
|
|
41
43
|
- The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
|
|
42
44
|
- `primary_text` is the text that is best suited for reading and indexing in the current task.
|
|
@@ -32,6 +32,7 @@
|
|
|
32
32
|
"subtitle_raw": { "type": "string" },
|
|
33
33
|
"work_modality": { "type": "string" },
|
|
34
34
|
"published_date": { "type": "string" },
|
|
35
|
+
"duration_ms": { "type": ["integer", "null"] },
|
|
35
36
|
"digg_count": { "type": "integer" },
|
|
36
37
|
"comment_count": { "type": "integer" },
|
|
37
38
|
"collect_count": { "type": "integer" },
|
|
@@ -18,6 +18,7 @@ from scripts.core.asr_pipeline import (
|
|
|
18
18
|
run_u2_asr_candidates_with_timeout_retry,
|
|
19
19
|
)
|
|
20
20
|
from scripts.core.u3_fallback import run_u3_public_url_fallback
|
|
21
|
+
from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
|
|
21
22
|
|
|
22
23
|
DEFAULT_BATCH_SUBMIT_SIZE = 50
|
|
23
24
|
MAX_BATCH_SUBMIT_SIZE = 100
|
|
@@ -76,13 +77,17 @@ def _resolve_is_video(work: Dict[str, Any], *, platform: str) -> bool:
|
|
|
76
77
|
return False
|
|
77
78
|
|
|
78
79
|
if platform == "douyin":
|
|
79
|
-
|
|
80
|
+
raw_ref = work.get("raw_ref") if isinstance(work.get("raw_ref"), dict) else {}
|
|
81
|
+
raw_item = raw_ref.get("raw_item") if isinstance(raw_ref.get("raw_item"), dict) else {}
|
|
82
|
+
if raw_item:
|
|
83
|
+
return bool(normalize_douyin_video_type(raw_item).get("is_video"))
|
|
84
|
+
return False
|
|
80
85
|
|
|
81
86
|
raw_ref = work.get("raw_ref") if isinstance(work.get("raw_ref"), dict) else {}
|
|
82
87
|
xhs_type_hint = normalize_text(raw_ref.get("type") or raw_ref.get("note_type")).lower()
|
|
83
|
-
if xhs_type_hint in {"video", "0", "
|
|
88
|
+
if xhs_type_hint in {"video", "0", "mixed", "mix", "video_note", "note_video"}:
|
|
84
89
|
return True
|
|
85
|
-
if xhs_type_hint in {"image", "1", "note", "photo"}:
|
|
90
|
+
if xhs_type_hint in {"normal", "image", "1", "note", "photo", "text", "album"}:
|
|
86
91
|
return False
|
|
87
92
|
|
|
88
93
|
return False
|
|
@@ -275,7 +280,8 @@ def _run_u2_for_work(
|
|
|
275
280
|
gate = _evaluate_u2_gate(work, platform=platform)
|
|
276
281
|
if not gate.get("can_u2"):
|
|
277
282
|
gate_reason = normalize_text(gate.get("gate_reason")) or "skip:unknown"
|
|
278
|
-
|
|
283
|
+
fallback_result = _video_caption_fallback_result(work, gate_reason) if gate.get("is_video") else _fallback_none_result(gate_reason)
|
|
284
|
+
return fallback_result, {
|
|
279
285
|
"step": "author_home.asr.u2_gate",
|
|
280
286
|
"platform_work_id": work.get("platform_work_id"),
|
|
281
287
|
"ok": False,
|
|
@@ -325,19 +331,7 @@ def _run_u2_for_work(
|
|
|
325
331
|
asr_source="external_asr",
|
|
326
332
|
), trace
|
|
327
333
|
|
|
328
|
-
return
|
|
329
|
-
"subtitle_raw": "",
|
|
330
|
-
"subtitle_source": "missing",
|
|
331
|
-
"asr_raw": "",
|
|
332
|
-
"asr_clean": "",
|
|
333
|
-
"primary_text": "",
|
|
334
|
-
"primary_text_source": "asr_clean",
|
|
335
|
-
"analysis_eligibility": "incomplete",
|
|
336
|
-
"analysis_exclusion_reason": "video_asr_unavailable",
|
|
337
|
-
"asr_status": "failed",
|
|
338
|
-
"asr_error_reason": normalize_text(poll_result.get("error_reason")) or "u2_failed",
|
|
339
|
-
"asr_source": "fallback_none",
|
|
340
|
-
}, trace
|
|
334
|
+
return _video_caption_fallback_result(work, normalize_text(poll_result.get("error_reason")) or "u2_failed"), trace
|
|
341
335
|
|
|
342
336
|
|
|
343
337
|
def _iter_xhs_interface_text_candidates(work: Dict[str, Any]) -> List[Tuple[str, str]]:
|
|
@@ -489,6 +483,23 @@ def _fallback_none_result(reason: str) -> Dict[str, Any]:
|
|
|
489
483
|
}
|
|
490
484
|
|
|
491
485
|
|
|
486
|
+
def _video_caption_fallback_result(work: Dict[str, Any], reason: str) -> Dict[str, Any]:
    """Build the field set for a video whose ASR did not produce text.

    The work's ``caption_raw`` (passed through ``normalize_text``) becomes the
    primary text when it is non-empty; the work is then reported as
    ``eligible``. Without a caption the work is ``incomplete`` and the
    exclusion reason is the normalized ``reason`` (or
    ``"video_asr_unavailable"`` when that is empty).

    Args:
        work: Work mapping; only ``caption_raw`` is read.
        reason: Why ASR was skipped or failed.

    Returns:
        A dict of subtitle/ASR/primary-text fields suitable for
        ``work.update(...)``. ``asr_status`` is always ``"failed"``.
    """
    caption = normalize_text(work.get("caption_raw"))
    reason_text = normalize_text(reason)

    if caption:
        text_source = "caption_raw"
        eligibility = "eligible"
        exclusion_reason = ""
    else:
        text_source = "missing"
        eligibility = "incomplete"
        exclusion_reason = reason_text or "video_asr_unavailable"

    return {
        "subtitle_raw": "",
        "subtitle_source": "missing",
        "asr_raw": "",
        "asr_clean": "",
        "primary_text": caption,
        "primary_text_source": text_source,
        "analysis_eligibility": eligibility,
        "analysis_exclusion_reason": exclusion_reason,
        "asr_status": "failed",
        "asr_error_reason": reason_text or "asr_failed",
        "asr_source": "fallback_none",
    }
|
|
501
|
+
|
|
502
|
+
|
|
492
503
|
def _run_xhs_u3_then_u2_batch_for_entries(
|
|
493
504
|
*,
|
|
494
505
|
batch_id: str,
|
|
@@ -517,7 +528,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
|
|
|
517
528
|
subtitle_invalid = normalize_text(entry.get("subtitle_invalid")) or "subtitle_missing"
|
|
518
529
|
|
|
519
530
|
if not source_url:
|
|
520
|
-
work.update(
|
|
531
|
+
work.update(_video_caption_fallback_result(work, "skip:video_download_url_missing"))
|
|
521
532
|
trace.append(
|
|
522
533
|
{
|
|
523
534
|
"step": "author_home.asr.xhs_u3",
|
|
@@ -554,7 +565,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
|
|
|
554
565
|
)
|
|
555
566
|
|
|
556
567
|
if not u3_result.get("ok") or not public_url:
|
|
557
|
-
work.update(
|
|
568
|
+
work.update(_video_caption_fallback_result(work, normalize_text(u3_result.get("error_reason")) or "u3_bridge_failed"))
|
|
558
569
|
u3_failed_count += 1
|
|
559
570
|
continue
|
|
560
571
|
|
|
@@ -597,7 +608,7 @@ def _run_xhs_u3_then_u2_batch_for_entries(
|
|
|
597
608
|
work = entry.get("work")
|
|
598
609
|
if not isinstance(work, dict):
|
|
599
610
|
continue
|
|
600
|
-
work.update(
|
|
611
|
+
work.update(_video_caption_fallback_result(work, normalize_text(entry.get("fallback_reason")) or "xhs_u3_then_u2_failed"))
|
|
601
612
|
|
|
602
613
|
return {
|
|
603
614
|
"trace": trace,
|
|
@@ -905,7 +916,10 @@ def enrich_author_home_asr(
|
|
|
905
916
|
)
|
|
906
917
|
|
|
907
918
|
if not gate.get("can_u2"):
|
|
908
|
-
|
|
919
|
+
if gate.get("is_video"):
|
|
920
|
+
work.update(_video_caption_fallback_result(work, str(gate.get("gate_reason") or "skip:unknown")))
|
|
921
|
+
else:
|
|
922
|
+
work.update(_mark_text_work_ready(work))
|
|
909
923
|
else:
|
|
910
924
|
batch_u2_entries.append(
|
|
911
925
|
{
|
|
@@ -987,7 +1001,7 @@ def enrich_author_home_asr(
|
|
|
987
1001
|
}
|
|
988
1002
|
)
|
|
989
1003
|
if not gate.get("can_u2"):
|
|
990
|
-
work.update(
|
|
1004
|
+
work.update(_video_caption_fallback_result(work, str(gate.get("gate_reason") or "skip:unknown")))
|
|
991
1005
|
else:
|
|
992
1006
|
batch_xhs_u3_entries.append(
|
|
993
1007
|
{
|
|
@@ -72,6 +72,177 @@ def _pick_list(payload: Any, keys: List[str]) -> List[Any]:
|
|
|
72
72
|
return hit if isinstance(hit, list) else []
|
|
73
73
|
|
|
74
74
|
|
|
75
|
+
def _pick_raw(payload: Any, keys: List[str]) -> Any:
|
|
76
|
+
ordered_keys = [str(key) for key in keys if str(key).strip()]
|
|
77
|
+
|
|
78
|
+
def _walk(node: Any) -> Any:
|
|
79
|
+
if isinstance(node, dict):
|
|
80
|
+
for key in ordered_keys:
|
|
81
|
+
if key in node and node.get(key) is not None:
|
|
82
|
+
return node.get(key)
|
|
83
|
+
for value in node.values():
|
|
84
|
+
hit = _walk(value)
|
|
85
|
+
if hit is not None:
|
|
86
|
+
return hit
|
|
87
|
+
elif isinstance(node, list):
|
|
88
|
+
for item in node:
|
|
89
|
+
hit = _walk(item)
|
|
90
|
+
if hit is not None:
|
|
91
|
+
return hit
|
|
92
|
+
return None
|
|
93
|
+
|
|
94
|
+
return _walk(payload)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _payload_candidates(payload: Any) -> List[Any]:
|
|
98
|
+
candidates: List[Any] = []
|
|
99
|
+
if payload is not None:
|
|
100
|
+
candidates.append(payload)
|
|
101
|
+
if isinstance(payload, dict):
|
|
102
|
+
nested = payload.get("data")
|
|
103
|
+
if nested is not None:
|
|
104
|
+
candidates.append(nested)
|
|
105
|
+
return candidates
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _pick_raw_from_candidates(payload: Any, keys: List[str]) -> Any:
    """Run ``_pick_raw`` over each payload candidate and return the first hit.

    Candidates come from ``_payload_candidates`` and are tried in order; the
    result is ``None`` when no candidate yields a non-``None`` value.
    """
    hits = (_pick_raw(node, keys) for node in _payload_candidates(payload))
    return next((value for value in hits if value is not None), None)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _pick_list_from_candidates(payload: Any, keys: List[str]) -> List[Any]:
    """Run ``_pick_list`` over each payload candidate and return the first non-empty list.

    Candidates come from ``_payload_candidates`` and are tried in order; an
    empty list is returned when every candidate comes back empty.
    """
    lists = (_pick_list(node, keys) for node in _payload_candidates(payload))
    return next((found for found in lists if found), [])
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _unwrap_data_layers(payload: Any, *, max_depth: int = 3) -> Any:
|
|
125
|
+
node = payload
|
|
126
|
+
depth = 0
|
|
127
|
+
while depth < max_depth and isinstance(node, dict) and isinstance(node.get("data"), dict):
|
|
128
|
+
node = node.get("data")
|
|
129
|
+
depth += 1
|
|
130
|
+
return node
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _extract_douyin_posts_page(payload: Any) -> Dict[str, Any]:
    """Return the unwrapped Douyin posts page as a dict.

    The payload is passed through ``_unwrap_data_layers``; a non-dict result
    is replaced by an empty dict so callers can always use ``.get``.
    """
    page = _unwrap_data_layers(payload)
    return page if isinstance(page, dict) else {}
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _extract_douyin_posts_items(payload: Any) -> List[Any]:
    """Return the ``aweme_list`` of a Douyin posts page, or ``[]`` if it is not a list."""
    page = _extract_douyin_posts_page(payload)
    aweme_list = page.get("aweme_list")
    if isinstance(aweme_list, list):
        return aweme_list
    return []
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _extract_douyin_posts_next_cursor(payload: Any) -> Any:
    """Return the raw next-page cursor of a Douyin posts page.

    Looks at ``max_cursor``, ``cursor`` and ``next_cursor`` (in that order) on
    the unwrapped page and returns the first value that is not ``None``.
    Returns ``None`` when no cursor field is present.
    """
    page = _extract_douyin_posts_page(payload)
    if not isinstance(page, dict):
        return None
    values = (page.get(field) for field in ("max_cursor", "cursor", "next_cursor"))
    return next((value for value in values if value is not None), None)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _extract_douyin_posts_has_more(payload: Any) -> Any:
    """Return the raw has-more flag of a Douyin posts page.

    Reads ``has_more`` and then ``hasMore`` from the unwrapped page and returns
    the first value that is not ``None``; the value is not normalized here.
    Returns ``None`` when neither field is present.
    """
    page = _extract_douyin_posts_page(payload)
    if not isinstance(page, dict):
        return None
    values = (page.get(field) for field in ("has_more", "hasMore"))
    return next((value for value in values if value is not None), None)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _extract_xhs_posts_page(payload: Any) -> Dict[str, Any]:
    """Return the unwrapped XHS posts page as a dict.

    The payload is passed through ``_unwrap_data_layers``; a non-dict result
    is replaced by an empty dict so callers can always use ``.get``.
    """
    page = _unwrap_data_layers(payload)
    return page if isinstance(page, dict) else {}
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _extract_xhs_posts_items(payload: Any) -> List[Any]:
    """Return the note list of an XHS posts page.

    Tries ``notes``, ``note_list``, ``noteList``, ``items`` and ``list`` (in
    that order) on the unwrapped page and returns the first value that is a
    list, even an empty one. Returns ``[]`` when none of them is a list.
    """
    page = _extract_xhs_posts_page(payload)
    field_names = ("notes", "note_list", "noteList", "items", "list")
    values = (page.get(name) for name in field_names)
    return next((value for value in values if isinstance(value, list)), [])
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _extract_xhs_response_cursor(payload: Any) -> str:
    """Return the cursor reported by an XHS posts page as text.

    Tries ``cursor``, ``next_cursor``, ``last_cursor`` and ``last_note_id`` (in
    that order) on the unwrapped page, converting each with ``_to_text``, and
    returns the first non-empty result. Returns ``""`` when none is usable.
    """
    page = _extract_xhs_posts_page(payload)
    if not isinstance(page, dict):
        return ""
    field_names = ("cursor", "next_cursor", "last_cursor", "last_note_id")
    texts = (_to_text(page.get(name)) for name in field_names)
    return next((text for text in texts if text), "")
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _extract_xhs_posts_has_more(payload: Any) -> Any:
    """Return the raw has-more flag of an XHS posts page.

    Reads ``has_more`` and then ``hasMore`` from the unwrapped page and returns
    the first value that is not ``None``; the value is not normalized here.
    Returns ``None`` when neither field is present.
    """
    page = _extract_xhs_posts_page(payload)
    if not isinstance(page, dict):
        return None
    values = (page.get(field) for field in ("has_more", "hasMore"))
    return next((value for value in values if value is not None), None)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _normalize_has_more(value: Any) -> Optional[bool]:
|
|
204
|
+
if value is None:
|
|
205
|
+
return None
|
|
206
|
+
if isinstance(value, bool):
|
|
207
|
+
return value
|
|
208
|
+
if isinstance(value, (int, float)):
|
|
209
|
+
return bool(int(value))
|
|
210
|
+
text = _to_text(value).lower()
|
|
211
|
+
if not text:
|
|
212
|
+
return None
|
|
213
|
+
if text in {"1", "true", "yes", "y"}:
|
|
214
|
+
return True
|
|
215
|
+
if text in {"0", "false", "no", "n"}:
|
|
216
|
+
return False
|
|
217
|
+
return None
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _normalize_int_like(value: Any) -> Optional[int]:
|
|
221
|
+
if value is None:
|
|
222
|
+
return None
|
|
223
|
+
try:
|
|
224
|
+
if isinstance(value, bool):
|
|
225
|
+
return int(value)
|
|
226
|
+
if isinstance(value, (int, float)):
|
|
227
|
+
return int(value)
|
|
228
|
+
text = _to_text(value)
|
|
229
|
+
if not text:
|
|
230
|
+
return None
|
|
231
|
+
return int(float(text.replace(",", "")))
|
|
232
|
+
except Exception:
|
|
233
|
+
return None
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _last_xhs_note_id(items: List[Any]) -> str:
|
|
237
|
+
for item in reversed(items):
|
|
238
|
+
if not isinstance(item, dict):
|
|
239
|
+
continue
|
|
240
|
+
note_id = _pick_text(item, ["note_id", "id", "item_id"])
|
|
241
|
+
if note_id:
|
|
242
|
+
return note_id
|
|
243
|
+
return ""
|
|
244
|
+
|
|
245
|
+
|
|
75
246
|
def _looks_like_douyin_sec_user_id(value: str) -> bool:
|
|
76
247
|
return value.startswith("MS4wLjA")
|
|
77
248
|
|
|
@@ -206,10 +377,10 @@ def _pick_first_mapping(items: List[Any]) -> Dict[str, Any]:
|
|
|
206
377
|
|
|
207
378
|
|
|
208
379
|
def _xhs_posts_field_completeness(payload: Any) -> Dict[str, Any]:
|
|
209
|
-
page_items =
|
|
380
|
+
page_items = _extract_xhs_posts_items(payload)
|
|
210
381
|
first_item = _pick_first_mapping(page_items)
|
|
211
|
-
has_more_flag =
|
|
212
|
-
cursor_hit = bool(
|
|
382
|
+
has_more_flag = _extract_xhs_posts_has_more(payload) is not None
|
|
383
|
+
cursor_hit = bool(_extract_xhs_response_cursor(payload))
|
|
213
384
|
cover_hit = bool(_extract_first_url(_first_url_candidate(first_item, ["cover", "cover_url", "cover_image", "image", "image_url"])))
|
|
214
385
|
share_or_source = bool(_pick_text(first_item, ["share_url", "share_link", "url", "note_url"])) or bool(_pick_text(first_item, ["note_id", "id", "item_id"]))
|
|
215
386
|
interaction_values = [
|
|
@@ -466,7 +637,10 @@ def collect_douyin_author_home_raw(
|
|
|
466
637
|
page = 0
|
|
467
638
|
pagination_trace: List[Dict[str, Any]] = []
|
|
468
639
|
|
|
469
|
-
|
|
640
|
+
max_pages = max(pages_max, 1)
|
|
641
|
+
page_limit = min(max(page_size, 1), 20)
|
|
642
|
+
|
|
643
|
+
while has_more and page < max_pages and len(works) < max_items:
|
|
470
644
|
page += 1
|
|
471
645
|
posts_resp = call_json_api(
|
|
472
646
|
base_url=base_url,
|
|
@@ -476,7 +650,7 @@ def collect_douyin_author_home_raw(
|
|
|
476
650
|
timeout_ms=timeout_ms,
|
|
477
651
|
params={
|
|
478
652
|
"sec_user_id": sec_user_id,
|
|
479
|
-
"count":
|
|
653
|
+
"count": page_limit,
|
|
480
654
|
"max_cursor": cursor,
|
|
481
655
|
"sort_type": 0,
|
|
482
656
|
},
|
|
@@ -497,22 +671,12 @@ def collect_douyin_author_home_raw(
|
|
|
497
671
|
)
|
|
498
672
|
request_id_candidates.append(posts_resp)
|
|
499
673
|
response_payload = posts_resp.get("data")
|
|
500
|
-
page_items =
|
|
501
|
-
if not page_items and isinstance(response_payload, dict):
|
|
502
|
-
page_items = _pick_list(response_payload.get("data"), ["aweme_list", "items", "list"])
|
|
674
|
+
page_items = _extract_douyin_posts_items(response_payload)
|
|
503
675
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
{
|
|
509
|
-
"page": page,
|
|
510
|
-
"cursor_in": cursor,
|
|
511
|
-
"cursor_out": next_cursor,
|
|
512
|
-
"has_more": has_more_flag,
|
|
513
|
-
"items": len(page_items),
|
|
514
|
-
}
|
|
515
|
-
)
|
|
676
|
+
next_cursor_raw = _extract_douyin_posts_next_cursor(response_payload)
|
|
677
|
+
has_more_raw = _extract_douyin_posts_has_more(response_payload)
|
|
678
|
+
next_cursor = _normalize_int_like(next_cursor_raw)
|
|
679
|
+
has_more_normalized = _normalize_has_more(has_more_raw)
|
|
516
680
|
|
|
517
681
|
for item in page_items:
|
|
518
682
|
if not isinstance(item, dict):
|
|
@@ -526,8 +690,44 @@ def collect_douyin_author_home_raw(
|
|
|
526
690
|
if len(works) >= max_items:
|
|
527
691
|
break
|
|
528
692
|
|
|
529
|
-
|
|
530
|
-
|
|
693
|
+
next_cursor_changed = next_cursor is not None and next_cursor != cursor
|
|
694
|
+
stop_reason = ""
|
|
695
|
+
should_continue = False
|
|
696
|
+
|
|
697
|
+
if len(works) >= max_items:
|
|
698
|
+
stop_reason = "max_items_reached"
|
|
699
|
+
elif not page_items:
|
|
700
|
+
stop_reason = "page_empty"
|
|
701
|
+
elif has_more_normalized is False:
|
|
702
|
+
stop_reason = "has_more_false"
|
|
703
|
+
elif next_cursor is not None and next_cursor == cursor:
|
|
704
|
+
stop_reason = "cursor_not_advanced"
|
|
705
|
+
elif has_more_normalized is True and next_cursor is None:
|
|
706
|
+
stop_reason = "pagination_contract_incomplete"
|
|
707
|
+
elif has_more_normalized is True or next_cursor_changed:
|
|
708
|
+
should_continue = True
|
|
709
|
+
else:
|
|
710
|
+
stop_reason = "pagination_contract_incomplete"
|
|
711
|
+
|
|
712
|
+
if should_continue and page >= max_pages:
|
|
713
|
+
should_continue = False
|
|
714
|
+
stop_reason = "pages_max_reached"
|
|
715
|
+
|
|
716
|
+
pagination_trace.append(
|
|
717
|
+
{
|
|
718
|
+
"page": page,
|
|
719
|
+
"cursor_in": cursor,
|
|
720
|
+
"cursor_out": next_cursor,
|
|
721
|
+
"has_more_raw": has_more_raw,
|
|
722
|
+
"has_more_normalized": has_more_normalized,
|
|
723
|
+
"items": len(page_items),
|
|
724
|
+
"stop_reason": stop_reason,
|
|
725
|
+
}
|
|
726
|
+
)
|
|
727
|
+
|
|
728
|
+
has_more = should_continue
|
|
729
|
+
if should_continue and next_cursor is not None:
|
|
730
|
+
cursor = next_cursor
|
|
531
731
|
|
|
532
732
|
request_id = _pick_request_id(request_id_candidates, trace)
|
|
533
733
|
if progress is not None:
|
|
@@ -661,7 +861,10 @@ def collect_xhs_author_home_raw(
|
|
|
661
861
|
page = 0
|
|
662
862
|
pagination_trace: List[Dict[str, Any]] = []
|
|
663
863
|
|
|
664
|
-
|
|
864
|
+
max_pages = max(pages_max, 1)
|
|
865
|
+
page_limit = min(max(page_size, 1), 20)
|
|
866
|
+
|
|
867
|
+
while has_more and page < max_pages and len(works) < max_items:
|
|
665
868
|
page += 1
|
|
666
869
|
if progress is not None:
|
|
667
870
|
progress.progress(
|
|
@@ -684,7 +887,7 @@ def collect_xhs_author_home_raw(
|
|
|
684
887
|
"user_id": user_id,
|
|
685
888
|
"share_text": input_value,
|
|
686
889
|
"cursor": cursor or None,
|
|
687
|
-
"num":
|
|
890
|
+
"num": page_limit,
|
|
688
891
|
"xsec_token": xsec_token or None,
|
|
689
892
|
},
|
|
690
893
|
fallback_reason=posts_reason,
|
|
@@ -739,20 +942,14 @@ def collect_xhs_author_home_raw(
|
|
|
739
942
|
)
|
|
740
943
|
|
|
741
944
|
data = posts_resp.get("data")
|
|
742
|
-
page_items =
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
"has_more": has_more_flag,
|
|
751
|
-
"items": len(page_items),
|
|
752
|
-
"route_label": posts_resp.get("_route_label"),
|
|
753
|
-
"request_id": posts_resp.get("request_id"),
|
|
754
|
-
}
|
|
755
|
-
)
|
|
945
|
+
page_items = _extract_xhs_posts_items(data)
|
|
946
|
+
next_cursor_raw = _extract_xhs_response_cursor(data)
|
|
947
|
+
has_more_raw = _extract_xhs_posts_has_more(data)
|
|
948
|
+
has_more_normalized = _normalize_has_more(has_more_raw)
|
|
949
|
+
fallback_cursor = _last_xhs_note_id(page_items)
|
|
950
|
+
explicit_cursor = _to_text(next_cursor_raw)
|
|
951
|
+
cursor_source = "response_cursor" if explicit_cursor else ("last_note_id_fallback" if fallback_cursor else "missing")
|
|
952
|
+
next_cursor = explicit_cursor or fallback_cursor
|
|
756
953
|
|
|
757
954
|
for item in page_items:
|
|
758
955
|
if not isinstance(item, dict):
|
|
@@ -766,8 +963,47 @@ def collect_xhs_author_home_raw(
|
|
|
766
963
|
if len(works) >= max_items:
|
|
767
964
|
break
|
|
768
965
|
|
|
769
|
-
|
|
770
|
-
|
|
966
|
+
next_cursor_changed = bool(next_cursor and str(next_cursor) != str(cursor))
|
|
967
|
+
stop_reason = ""
|
|
968
|
+
should_continue = False
|
|
969
|
+
|
|
970
|
+
if len(works) >= max_items:
|
|
971
|
+
stop_reason = "max_items_reached"
|
|
972
|
+
elif not page_items:
|
|
973
|
+
stop_reason = "page_empty"
|
|
974
|
+
elif has_more_normalized is False:
|
|
975
|
+
stop_reason = "has_more_false"
|
|
976
|
+
elif next_cursor and str(next_cursor) == str(cursor):
|
|
977
|
+
stop_reason = "cursor_not_advanced"
|
|
978
|
+
elif has_more_normalized is True and not next_cursor:
|
|
979
|
+
stop_reason = "pagination_contract_incomplete"
|
|
980
|
+
elif has_more_normalized is True or next_cursor_changed:
|
|
981
|
+
should_continue = True
|
|
982
|
+
else:
|
|
983
|
+
stop_reason = "pagination_contract_incomplete"
|
|
984
|
+
|
|
985
|
+
if should_continue and page >= max_pages:
|
|
986
|
+
should_continue = False
|
|
987
|
+
stop_reason = "pages_max_reached"
|
|
988
|
+
|
|
989
|
+
pagination_trace.append(
|
|
990
|
+
{
|
|
991
|
+
"page": page,
|
|
992
|
+
"cursor_in": cursor,
|
|
993
|
+
"cursor_out": next_cursor,
|
|
994
|
+
"cursor_source": cursor_source,
|
|
995
|
+
"has_more_raw": has_more_raw,
|
|
996
|
+
"has_more_normalized": has_more_normalized,
|
|
997
|
+
"items": len(page_items),
|
|
998
|
+
"route_label": posts_resp.get("_route_label"),
|
|
999
|
+
"request_id": posts_resp.get("request_id"),
|
|
1000
|
+
"stop_reason": stop_reason,
|
|
1001
|
+
}
|
|
1002
|
+
)
|
|
1003
|
+
|
|
1004
|
+
has_more = should_continue
|
|
1005
|
+
if should_continue and next_cursor:
|
|
1006
|
+
cursor = next_cursor
|
|
771
1007
|
|
|
772
1008
|
request_id = _pick_request_id(request_id_candidates, trace)
|
|
773
1009
|
if progress is not None:
|
|
@@ -20,6 +20,7 @@ from scripts.pipelines.douyin_metadata import (
|
|
|
20
20
|
extract_douyin_title,
|
|
21
21
|
normalize_douyin_author_handle,
|
|
22
22
|
)
|
|
23
|
+
from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
|
|
23
24
|
from scripts.pipelines.media_url_rules import is_probable_video_url as is_shared_probable_video_url
|
|
24
25
|
from scripts.core.tikomni_common import deep_find_all, deep_find_first
|
|
25
26
|
from scripts.pipelines.select_low_quality_video_url import select_low_quality_video_url
|
|
@@ -54,6 +55,15 @@ def _optional_i(value: Any) -> int | None:
|
|
|
54
55
|
return None
|
|
55
56
|
|
|
56
57
|
|
|
58
|
+
def _normalize_duration_ms(value: Any) -> int:
    """Normalize a raw duration to a millisecond count.

    The value is parsed with ``_optional_i``. Missing, unparsable or
    non-positive values yield ``0``. Positive values below 10000 are
    multiplied by 1000; larger values are returned unchanged.
    """
    number = _optional_i(value)
    if number is None or number <= 0:
        return 0
    # NOTE(review): values under 10000 are presumably assumed to be seconds.
    # A clip shorter than 10 s that is already expressed in milliseconds would
    # be scaled up too -- confirm against the upstream payloads.
    return number * 1000 if number < 10000 else number
|
|
65
|
+
|
|
66
|
+
|
|
57
67
|
def _first(payload: Any, keys: List[str], default: Any = "") -> Any:
|
|
58
68
|
hit = deep_find_first(payload, keys)
|
|
59
69
|
return default if hit is None else hit
|
|
@@ -134,6 +144,16 @@ def _extract_douyin_video_down_url(item: Dict[str, Any]) -> str:
|
|
|
134
144
|
return _t(selected.get("video_down_url"))
|
|
135
145
|
|
|
136
146
|
|
|
147
|
+
def _extract_douyin_duration_ms(item: Dict[str, Any]) -> int:
    """Return the duration of a Douyin item in milliseconds, or ``0``.

    Candidates are read from ``item["duration_ms"]``, ``item["duration"]`` and
    ``item["video"]["duration"]`` (when ``video`` is a dict), in that order.
    Each is passed through ``_normalize_duration_ms`` and the first positive
    result wins.

    Previously the first candidate that was merely not ``None`` was used, so
    a placeholder such as ``0`` or ``""`` in an earlier field hid a valid
    duration in a later one. Trying every candidate matches how the XHS
    extractor picks its duration.

    Args:
        item: Raw Douyin work item.

    Returns:
        Positive duration in milliseconds, or ``0`` when none is available.
    """
    candidates = [item.get("duration_ms"), item.get("duration")]
    video = item.get("video")
    if isinstance(video, dict):
        candidates.append(video.get("duration"))

    for candidate in candidates:
        normalized = _normalize_duration_ms(candidate)
        if normalized > 0:
            return normalized
    return 0
|
|
155
|
+
|
|
156
|
+
|
|
137
157
|
def _extract_xhs_video_down_url(item: Dict[str, Any]) -> str:
|
|
138
158
|
urls = _pick_http_urls(
|
|
139
159
|
item,
|
|
@@ -154,6 +174,33 @@ def _extract_xhs_video_down_url(item: Dict[str, Any]) -> str:
|
|
|
154
174
|
return ""
|
|
155
175
|
|
|
156
176
|
|
|
177
|
+
def _extract_xhs_duration_ms(item: Dict[str, Any]) -> int:
    """Return the duration of an XHS item in milliseconds, or ``0``.

    Candidates, in priority order: the item's own ``duration_ms``,
    ``duration``, ``video_duration`` and ``duration_sec``; then
    ``video_info_v2.media.video.duration`` and ``video_info_v2.capa.duration``;
    then ``video.duration`` and ``note.duration`` when those are dicts. Each
    candidate is passed through ``_normalize_duration_ms`` and the first
    positive result is returned.
    """

    def _mapping(node: Any) -> Dict[str, Any]:
        # Treat anything that is not a dict as an empty mapping.
        return node if isinstance(node, dict) else {}

    info_v2 = _mapping(item.get("video_info_v2"))
    media_video = _mapping(_mapping(info_v2.get("media")).get("video"))
    capa = _mapping(info_v2.get("capa"))

    candidates = [
        item.get(field)
        for field in ("duration_ms", "duration", "video_duration", "duration_sec")
    ]
    candidates.append(media_video.get("duration"))
    candidates.append(capa.get("duration"))
    for container in ("video", "note"):
        nested = item.get(container)
        if isinstance(nested, dict):
            candidates.append(nested.get("duration"))

    durations = map(_normalize_duration_ms, candidates)
    return next((duration for duration in durations if duration > 0), 0)
|
|
202
|
+
|
|
203
|
+
|
|
157
204
|
def _normalize_text_list(value: Any) -> List[str]:
|
|
158
205
|
values: List[str] = []
|
|
159
206
|
if isinstance(value, list):
|
|
@@ -276,9 +323,9 @@ def _extract_xhs_subtitle_urls(item: Dict[str, Any]) -> List[str]:
|
|
|
276
323
|
|
|
277
324
|
def _extract_xhs_work_modality(item: Dict[str, Any], *, video_download_url: str, subtitle_inline: str) -> str:
|
|
278
325
|
content_type_raw = _t(_first(item, ["type", "note_type", "model_type"])).lower()
|
|
279
|
-
if content_type_raw in {"video", "0", "
|
|
326
|
+
if content_type_raw in {"video", "0", "mixed", "mix", "video_note", "note_video"}:
|
|
280
327
|
return "video"
|
|
281
|
-
if content_type_raw in {"image", "1", "photo", "album", "note", "text"}:
|
|
328
|
+
if content_type_raw in {"normal", "image", "1", "photo", "album", "note", "text"}:
|
|
282
329
|
return "text"
|
|
283
330
|
if video_download_url or subtitle_inline:
|
|
284
331
|
return "video"
|
|
@@ -393,6 +440,9 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
|
|
|
393
440
|
if not isinstance(item, dict):
|
|
394
441
|
continue
|
|
395
442
|
aweme_id = _t(_first(item, ["aweme_id", "item_id", "id"]))
|
|
443
|
+
video_type_info = normalize_douyin_video_type(item)
|
|
444
|
+
is_video = bool(video_type_info.get("is_video"))
|
|
445
|
+
duration_ms = _extract_douyin_duration_ms(item)
|
|
396
446
|
author_info = extract_douyin_author(item)
|
|
397
447
|
metrics = extract_douyin_metrics(item)
|
|
398
448
|
video_down_url = _extract_douyin_video_down_url(item)
|
|
@@ -417,9 +467,9 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
|
|
|
417
467
|
subtitle_raw="",
|
|
418
468
|
subtitle_source="missing",
|
|
419
469
|
publish_time=_t(_first(item, ["create_time", "publish_time"])),
|
|
420
|
-
work_modality="video",
|
|
421
|
-
content_type="video",
|
|
422
|
-
duration_ms=
|
|
470
|
+
work_modality="video" if is_video else "text",
|
|
471
|
+
content_type="video" if is_video else "text",
|
|
472
|
+
duration_ms=duration_ms,
|
|
423
473
|
tags=tags,
|
|
424
474
|
metrics={
|
|
425
475
|
"digg_count": int(metrics.get("digg_count") or 0),
|
|
@@ -438,10 +488,13 @@ def adapt_douyin_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[
|
|
|
438
488
|
asr_status="pending",
|
|
439
489
|
asr_error_reason="",
|
|
440
490
|
asr_source="fallback_none",
|
|
491
|
+
is_video=is_video,
|
|
441
492
|
platform_native_refs={
|
|
442
493
|
"douyin_sec_uid": _t(author_info.get("douyin_sec_uid") or internal_author_id),
|
|
443
494
|
"douyin_aweme_author_id": _t(author_info.get("douyin_aweme_author_id") or stable_author_id or author_id),
|
|
444
495
|
"douyin_unique_id": _t(author_info.get("unique_id")),
|
|
496
|
+
"douyin_video_type_reason": _t(video_type_info.get("video_type_reason")),
|
|
497
|
+
"douyin_video_type_field": _t(video_type_info.get("matched_field")),
|
|
445
498
|
},
|
|
446
499
|
raw_ref={"aweme_id": aweme_id, "raw_item": item},
|
|
447
500
|
)
|
|
@@ -519,6 +572,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
|
|
|
519
572
|
subtitle_inline = _extract_xhs_subtitle_inline(item)
|
|
520
573
|
subtitle_urls = _extract_xhs_subtitle_urls(item)
|
|
521
574
|
video_down_url = _extract_xhs_video_down_url(item)
|
|
575
|
+
duration_ms = _extract_xhs_duration_ms(item)
|
|
522
576
|
content_type_raw = _t(_first(item, ["type", "note_type", "model_type"]))
|
|
523
577
|
work_modality = _extract_xhs_work_modality(item, video_download_url=video_down_url, subtitle_inline=subtitle_inline)
|
|
524
578
|
content_type = "video" if work_modality == "video" else (content_type_raw or "text")
|
|
@@ -541,7 +595,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
|
|
|
541
595
|
publish_time=_t(_first(item, ["publish_time", "time", "create_time", "publishTime", "created_at"])),
|
|
542
596
|
work_modality=work_modality,
|
|
543
597
|
content_type=content_type,
|
|
544
|
-
duration_ms=
|
|
598
|
+
duration_ms=duration_ms,
|
|
545
599
|
tags=_extract_xhs_tags(item),
|
|
546
600
|
metrics=metrics,
|
|
547
601
|
cover_image=cover_image,
|
|
@@ -551,6 +605,7 @@ def adapt_xhs_author_home(raw: Dict[str, Any]) -> Tuple[Dict[str, Any], List[Dic
|
|
|
551
605
|
asr_status="subtitle_ready" if subtitle_inline else "pending",
|
|
552
606
|
asr_error_reason="",
|
|
553
607
|
asr_source="native_subtitle" if subtitle_inline else "fallback_none",
|
|
608
|
+
is_video=work_modality == "video",
|
|
554
609
|
platform_native_refs={"xhs_user_id": author_id, "xhs_red_id": author_handle},
|
|
555
610
|
raw_ref={
|
|
556
611
|
"note_id": note_id,
|
|
@@ -108,9 +108,9 @@ def _infer_work_modality(*, work_modality: str, is_video: Any, content_type: str
|
|
|
108
108
|
return "video"
|
|
109
109
|
|
|
110
110
|
content_type_value = _to_text(content_type).lower()
|
|
111
|
-
if content_type_value in {"video", "mixed", "mix", "video_note", "note_video", "
|
|
111
|
+
if content_type_value in {"video", "mixed", "mix", "video_note", "note_video", "0"}:
|
|
112
112
|
return "video"
|
|
113
|
-
if content_type_value in {"text", "note", "image", "photo", "album", "1"}:
|
|
113
|
+
if content_type_value in {"normal", "text", "note", "image", "photo", "album", "1"}:
|
|
114
114
|
return "text"
|
|
115
115
|
|
|
116
116
|
if subtitle_raw or video_download_url:
|
|
@@ -62,6 +62,13 @@ def _safe_optional_int(value: Any) -> Optional[int]:
|
|
|
62
62
|
return None
|
|
63
63
|
|
|
64
64
|
|
|
65
|
+
def _safe_optional_positive_int(value: Any) -> Optional[int]:
    """Parse ``value`` with ``_safe_optional_int`` and keep it only if positive.

    Returns ``None`` when parsing yields ``None`` or a value that is zero or
    negative.
    """
    number = _safe_optional_int(value)
    if number is not None and number > 0:
        return number
    return None
|
|
70
|
+
|
|
71
|
+
|
|
65
72
|
def _source_dict(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
66
73
|
source = payload.get("source")
|
|
67
74
|
return source if isinstance(source, dict) else {}
|
|
@@ -250,6 +257,7 @@ def build_work_fact_card(payload: Dict[str, Any], platform: Optional[str] = None
|
|
|
250
257
|
"create_time_sec": payload.get("create_time_sec"),
|
|
251
258
|
"publish_time_source": _safe_text(payload.get("publish_time_source")),
|
|
252
259
|
"published_date": _resolve_published_date(payload),
|
|
260
|
+
"duration_ms": _safe_optional_positive_int(payload.get("duration_ms")),
|
|
253
261
|
"digg_count": _safe_int(payload.get("digg_count")),
|
|
254
262
|
"comment_count": _safe_int(payload.get("comment_count")),
|
|
255
263
|
"collect_count": _safe_int(payload.get("collect_count")),
|
|
@@ -324,6 +332,7 @@ def _frontmatter_lines(card: Dict[str, Any]) -> List[str]:
|
|
|
324
332
|
("title", card.get("title")),
|
|
325
333
|
("published_date", card.get("published_date")),
|
|
326
334
|
("work_modality", card.get("work_modality")),
|
|
335
|
+
("duration_ms", card.get("duration_ms")),
|
|
327
336
|
("digg_count", card.get("digg_count")),
|
|
328
337
|
("comment_count", card.get("comment_count")),
|
|
329
338
|
("collect_count", card.get("collect_count")),
|