@tikomni/skills 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/SKILL.md +2 -1
- package/skills/social-media-crawl/references/contracts/work-fact-card-fields.md +1 -0
- package/skills/social-media-crawl/scripts/core/asr_pipeline.py +153 -0
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +167 -14
- package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py +38 -1
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +49 -11
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +4 -2
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "0.1.5",
|
|
3
|
+
"version": "0.1.7",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
@@ -32,7 +32,8 @@ description: Use this skill when the user asks about social media links, posts,
|
|
|
32
32
|
- 先看用户给的对象是什么,再选固定 pipeline 或 MCP 工具。
|
|
33
33
|
- 优先返回事实字段,不补充主观分析。
|
|
34
34
|
- 视频文本优先使用平台原生字幕;拿不到再走 ASR。
|
|
35
|
-
- 默认输出以结构化 JSON
|
|
35
|
+
- 默认输出以结构化 JSON 和Markdown并主动落库。
|
|
36
|
+
- 当固定pipeline时,默认执行抓取并落库(强制)。
|
|
36
37
|
|
|
37
38
|
## Workflow
|
|
38
39
|
|
|
@@ -1014,6 +1014,159 @@ def run_u2_asr_candidates_with_timeout_retry(
|
|
|
1014
1014
|
return final_bundle
|
|
1015
1015
|
|
|
1016
1016
|
|
|
1017
|
+
def run_u3_then_u2_asr_candidates_with_timeout_retry(
    *,
    base_url: str,
    token: str,
    timeout_ms: int,
    candidates: List[str],
    submit_max_retries: int,
    submit_backoff_ms: int,
    poll_interval_sec: float,
    max_polls: int,
    timeout_retry_enabled: bool = True,
    timeout_retry_max_retries: int = 3,
    pending_timeout_sec: int = DEFAULT_U2_PENDING_TIMEOUT_SEC,
    progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
) -> Dict[str, Any]:
    """Bridge each media candidate through U3, then submit the bridged URL to U2 ASR.

    Candidates are normalized with ``normalize_media_candidates`` and tried in
    order. For every candidate accepted by ``is_valid_u2_media_candidate``:

    1. ``run_u3_public_url_fallback`` is called with the candidate as
       ``source_url``.
    2. If U3 reports ``ok`` and yields a non-empty ``public_url``, that URL is
       passed to ``run_u2_asr_with_timeout_retry`` (with
       ``u3_fallback_enabled=False``, since U3 has already run).

    The loop stops at the first U2 success, or at the first U2 failure whose
    ``error_reason`` is not ``"INVALID_SOURCE_URL"``. Invalid candidates, U3
    failures and ``INVALID_SOURCE_URL`` failures move on to the next candidate.

    Args:
        base_url, token, timeout_ms: Forwarded to both the U3 and the U2 call.
        candidates: Raw media URL candidates.
        submit_max_retries, submit_backoff_ms, poll_interval_sec, max_polls,
        timeout_retry_enabled, timeout_retry_max_retries, pending_timeout_sec,
        progress_callback: Forwarded unchanged to
            ``run_u2_asr_with_timeout_retry``.

    Returns:
        The bundle of the last attempt that produced one (or a
        ``"no_candidates"`` placeholder), extended with ``candidate_attempts``,
        ``chosen_candidate``, ``chosen_public_url`` and
        ``normalized_candidates``. Bundles built after a U3 call also carry
        ``u3_bridge``.
    """
    normalized_candidates = normalize_media_candidates(candidates)
    attempts: List[Dict[str, Any]] = []

    def _untriggered_bundle(poll_result: Dict[str, Any]) -> Dict[str, Any]:
        # Shape of a bundle for which no U2 submission happened. The reported
        # retry budget is clamped to the 0..3 range.
        return {
            "submit_bundle": {},
            "poll_result": poll_result,
            "rounds": [],
            "timeout_retry": {
                "enabled": bool(timeout_retry_enabled),
                "configured_max_retries": max(0, min(3, int(timeout_retry_max_retries))),
                "triggered": False,
                "result": "not_triggered",
            },
            "u3_fallback": {
                "enabled": False,
                "triggered": False,
                "ok": False,
                "result": "not_triggered",
                "public_url": "",
                "trace": [],
            },
        }

    final_bundle: Dict[str, Any] = _untriggered_bundle(
        {"ok": False, "task_status": "UNKNOWN", "error_reason": "no_candidates"}
    )
    chosen_url: Optional[str] = None
    chosen_public_url: Optional[str] = None

    for index, candidate in enumerate(normalized_candidates, start=1):
        if not is_valid_u2_media_candidate(candidate):
            attempts.append(
                {
                    "index": index,
                    "candidate": candidate,
                    "valid": False,
                    "result": "skipped_non_media_candidate",
                }
            )
            continue

        u3_result = run_u3_public_url_fallback(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            source_url=candidate,
        )
        u3_ok = bool(u3_result.get("ok"))
        u3_bundle = {
            "enabled": True,
            "triggered": True,
            "ok": u3_ok,
            "result": "u3_completed" if u3_ok else "u3_failed",
            "public_url": normalize_media_url(u3_result.get("public_url")),
            "request_id": u3_result.get("request_id"),
            "error_reason": u3_result.get("error_reason"),
            "trace": u3_result.get("trace", []),
        }
        public_url = u3_bundle["public_url"]

        attempt: Dict[str, Any] = {
            "index": index,
            "candidate": candidate,
            "valid": True,
            "u3_bridge": u3_bundle,
        }
        attempts.append(attempt)

        if not u3_ok or not public_url:
            # NOTE(review): chosen_url / chosen_public_url are intentionally left
            # untouched here (as in the original), so after an earlier
            # INVALID_SOURCE_URL attempt they may refer to a different candidate
            # than this bundle — confirm whether callers rely on that.
            final_bundle = _untriggered_bundle(
                {
                    "ok": False,
                    "task_status": "UNKNOWN",
                    "error_reason": u3_bundle.get("error_reason") or "u3_bridge_failed",
                    "request_id": u3_bundle.get("request_id"),
                    # `or []` keeps a present-but-None trace from crashing list().
                    "trace": list(u3_bundle.get("trace") or []),
                }
            )
            final_bundle["u3_bridge"] = u3_bundle
            continue

        bundle = run_u2_asr_with_timeout_retry(
            base_url=base_url,
            token=token,
            timeout_ms=timeout_ms,
            video_url=str(public_url),
            submit_max_retries=submit_max_retries,
            submit_backoff_ms=submit_backoff_ms,
            poll_interval_sec=poll_interval_sec,
            max_polls=max_polls,
            timeout_retry_enabled=timeout_retry_enabled,
            timeout_retry_max_retries=timeout_retry_max_retries,
            pending_timeout_sec=pending_timeout_sec,
            u3_fallback_enabled=False,
            progress_callback=progress_callback,
        )
        poll_result = bundle.get("poll_result", {})
        error_reason = str(poll_result.get("error_reason") or "")
        ok = bool(poll_result.get("ok"))

        attempt.update(
            {
                "ok": ok,
                "error_reason": error_reason,
                "task_status": poll_result.get("task_status"),
                "u2_public_url": public_url,
            }
        )

        final_bundle = dict(bundle)
        final_bundle["u3_bridge"] = u3_bundle
        chosen_url = candidate
        chosen_public_url = str(public_url or "")

        # Only an INVALID_SOURCE_URL failure justifies trying another candidate;
        # success or any other failure ends the search.
        if not ok and error_reason == "INVALID_SOURCE_URL":
            continue
        break

    final_bundle["candidate_attempts"] = attempts
    final_bundle["chosen_candidate"] = chosen_url
    final_bundle["chosen_public_url"] = chosen_public_url
    final_bundle["normalized_candidates"] = normalized_candidates
    return final_bundle
|
|
1168
|
+
|
|
1169
|
+
|
|
1017
1170
|
def run_u2_asr_batch_with_timeout_retry(
|
|
1018
1171
|
*,
|
|
1019
1172
|
base_url: str,
|
|
@@ -16,9 +16,11 @@ from scripts.core.asr_pipeline import (
|
|
|
16
16
|
run_u2_asr_batch_with_timeout_retry,
|
|
17
17
|
run_u2_asr_candidates_with_timeout_retry,
|
|
18
18
|
)
|
|
19
|
+
from scripts.core.u3_fallback import run_u3_public_url_fallback
|
|
19
20
|
|
|
20
21
|
DEFAULT_BATCH_SUBMIT_SIZE = 50
|
|
21
22
|
MAX_BATCH_SUBMIT_SIZE = 100
|
|
23
|
+
XHS_U3_U2_BATCH_SIZE = 20
|
|
22
24
|
U2_GATE_MIN_DURATION_MS = 13000
|
|
23
25
|
U2_GATE_MAX_DURATION_MS = 1800000
|
|
24
26
|
U2_GATE_RULE = "is_video && 13000<duration_ms<=1800000 && video_download_url_present"
|
|
@@ -406,23 +408,31 @@ def _resolve_xhs_subtitle(work: Dict[str, Any], timeout_ms: int) -> Dict[str, An
|
|
|
406
408
|
}
|
|
407
409
|
invalid_reasons.append({"field": source, "reason": invalid_reason})
|
|
408
410
|
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
411
|
+
subtitle_text = _fetch_subtitle_text(subtitle_urls, timeout_ms)
|
|
412
|
+
if subtitle_text:
|
|
413
|
+
invalid_reason = _invalid_subtitle_reason(subtitle_text)
|
|
414
|
+
if invalid_reason is None:
|
|
415
|
+
return {
|
|
416
|
+
"text": subtitle_text,
|
|
417
|
+
"subtitle_source": "subtitle_url",
|
|
418
|
+
"subtitle_field": "raw_ref.subtitle_urls",
|
|
419
|
+
"subtitle_urls": subtitle_urls,
|
|
420
|
+
"invalid_reasons": invalid_reasons,
|
|
421
|
+
"failure_category": "",
|
|
422
|
+
}
|
|
423
|
+
invalid_reasons.append({"field": "raw_ref.subtitle_urls", "reason": invalid_reason})
|
|
414
424
|
|
|
415
425
|
return {
|
|
416
|
-
"text":
|
|
417
|
-
"subtitle_source": "
|
|
418
|
-
"subtitle_field": "
|
|
426
|
+
"text": "",
|
|
427
|
+
"subtitle_source": "missing",
|
|
428
|
+
"subtitle_field": "",
|
|
419
429
|
"subtitle_urls": subtitle_urls,
|
|
420
430
|
"invalid_reasons": invalid_reasons,
|
|
421
431
|
"failure_category": _classify_xhs_subtitle_failure(
|
|
422
432
|
work=work,
|
|
423
433
|
interface_candidates=interface_candidates,
|
|
424
434
|
subtitle_urls=subtitle_urls,
|
|
425
|
-
invalid_reason=
|
|
435
|
+
invalid_reason="subtitle_empty",
|
|
426
436
|
),
|
|
427
437
|
}
|
|
428
438
|
|
|
@@ -462,6 +472,127 @@ def _fallback_none_result(reason: str) -> Dict[str, Any]:
|
|
|
462
472
|
}
|
|
463
473
|
|
|
464
474
|
|
|
475
|
+
def _run_xhs_u3_then_u2_batch_for_entries(
|
|
476
|
+
*,
|
|
477
|
+
batch_id: str,
|
|
478
|
+
entries: List[Dict[str, Any]],
|
|
479
|
+
base_url: str,
|
|
480
|
+
token: str,
|
|
481
|
+
timeout_ms: int,
|
|
482
|
+
poll_interval_sec: float,
|
|
483
|
+
max_polls: int,
|
|
484
|
+
submit_max_retries: int,
|
|
485
|
+
submit_backoff_ms: int,
|
|
486
|
+
timeout_retry_enabled: bool,
|
|
487
|
+
timeout_retry_max_retries: int,
|
|
488
|
+
) -> Dict[str, Any]:
|
|
489
|
+
trace: List[Dict[str, Any]] = []
|
|
490
|
+
u2_entries: List[Dict[str, Any]] = []
|
|
491
|
+
u3_failed_count = 0
|
|
492
|
+
|
|
493
|
+
for entry in entries:
|
|
494
|
+
work = entry.get("work")
|
|
495
|
+
if not isinstance(work, dict):
|
|
496
|
+
continue
|
|
497
|
+
|
|
498
|
+
source_url = normalize_media_url(entry.get("video_download_url") or work.get("video_download_url") or work.get("video_down_url"))
|
|
499
|
+
work_id = normalize_text(entry.get("work_id") or work.get("platform_work_id"))
|
|
500
|
+
subtitle_invalid = normalize_text(entry.get("subtitle_invalid")) or "subtitle_missing"
|
|
501
|
+
|
|
502
|
+
if not source_url:
|
|
503
|
+
work.update(_fallback_none_result("skip:video_download_url_missing"))
|
|
504
|
+
trace.append(
|
|
505
|
+
{
|
|
506
|
+
"step": "author_home.asr.xhs_u3",
|
|
507
|
+
"batch_id": batch_id,
|
|
508
|
+
"platform_work_id": work_id,
|
|
509
|
+
"ok": False,
|
|
510
|
+
"error_reason": "skip:video_download_url_missing",
|
|
511
|
+
"subtitle_invalid": subtitle_invalid,
|
|
512
|
+
"public_url_present": False,
|
|
513
|
+
}
|
|
514
|
+
)
|
|
515
|
+
u3_failed_count += 1
|
|
516
|
+
continue
|
|
517
|
+
|
|
518
|
+
u3_result = run_u3_public_url_fallback(
|
|
519
|
+
base_url=base_url,
|
|
520
|
+
token=token,
|
|
521
|
+
timeout_ms=timeout_ms,
|
|
522
|
+
source_url=source_url,
|
|
523
|
+
)
|
|
524
|
+
public_url = normalize_media_url(u3_result.get("public_url"))
|
|
525
|
+
trace.append(
|
|
526
|
+
{
|
|
527
|
+
"step": "author_home.asr.xhs_u3",
|
|
528
|
+
"batch_id": batch_id,
|
|
529
|
+
"platform_work_id": work_id,
|
|
530
|
+
"ok": bool(u3_result.get("ok") and public_url),
|
|
531
|
+
"error_reason": u3_result.get("error_reason"),
|
|
532
|
+
"subtitle_invalid": subtitle_invalid,
|
|
533
|
+
"source_url": source_url,
|
|
534
|
+
"public_url_present": bool(public_url),
|
|
535
|
+
"u3_trace": u3_result.get("trace", []),
|
|
536
|
+
}
|
|
537
|
+
)
|
|
538
|
+
|
|
539
|
+
if not u3_result.get("ok") or not public_url:
|
|
540
|
+
work.update(_fallback_none_result(normalize_text(u3_result.get("error_reason")) or "u3_bridge_failed"))
|
|
541
|
+
u3_failed_count += 1
|
|
542
|
+
continue
|
|
543
|
+
|
|
544
|
+
u2_entries.append(
|
|
545
|
+
{
|
|
546
|
+
"work": work,
|
|
547
|
+
"work_id": work_id,
|
|
548
|
+
"video_download_url": public_url,
|
|
549
|
+
"fallback_reason": f"xhs_u3_then_u2_failed:{subtitle_invalid}",
|
|
550
|
+
"u3_public_url": public_url,
|
|
551
|
+
}
|
|
552
|
+
)
|
|
553
|
+
|
|
554
|
+
batch_bundle = {
|
|
555
|
+
"trace": [],
|
|
556
|
+
"submitted": False,
|
|
557
|
+
"completed": False,
|
|
558
|
+
"mapped_count": 0,
|
|
559
|
+
"unmapped_entries": [],
|
|
560
|
+
"batch_progress": {},
|
|
561
|
+
}
|
|
562
|
+
if u2_entries:
|
|
563
|
+
batch_bundle = _run_u2_batch_for_entries(
|
|
564
|
+
batch_id=batch_id,
|
|
565
|
+
entries=u2_entries,
|
|
566
|
+
base_url=base_url,
|
|
567
|
+
token=token,
|
|
568
|
+
timeout_ms=timeout_ms,
|
|
569
|
+
poll_interval_sec=poll_interval_sec,
|
|
570
|
+
max_polls=max_polls,
|
|
571
|
+
submit_max_retries=submit_max_retries,
|
|
572
|
+
submit_backoff_ms=submit_backoff_ms,
|
|
573
|
+
timeout_retry_enabled=timeout_retry_enabled,
|
|
574
|
+
timeout_retry_max_retries=timeout_retry_max_retries,
|
|
575
|
+
)
|
|
576
|
+
trace.extend(batch_bundle.get("trace") if isinstance(batch_bundle.get("trace"), list) else [])
|
|
577
|
+
|
|
578
|
+
unmapped_entries = list(batch_bundle.get("unmapped_entries") or [])
|
|
579
|
+
for entry in unmapped_entries:
|
|
580
|
+
work = entry.get("work")
|
|
581
|
+
if not isinstance(work, dict):
|
|
582
|
+
continue
|
|
583
|
+
work.update(_fallback_none_result(normalize_text(entry.get("fallback_reason")) or "xhs_u3_then_u2_failed"))
|
|
584
|
+
|
|
585
|
+
return {
|
|
586
|
+
"trace": trace,
|
|
587
|
+
"submitted": bool(batch_bundle.get("submitted")),
|
|
588
|
+
"completed": bool(batch_bundle.get("completed")),
|
|
589
|
+
"mapped_count": int(batch_bundle.get("mapped_count") or 0),
|
|
590
|
+
"unmapped_count": len(unmapped_entries),
|
|
591
|
+
"u3_ready_count": len(u2_entries),
|
|
592
|
+
"u3_failed_count": u3_failed_count,
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
|
|
465
596
|
def _mark_text_work_ready(work: Dict[str, Any]) -> Dict[str, Any]:
|
|
466
597
|
caption_raw = normalize_text(work.get("caption_raw"))
|
|
467
598
|
return {
|
|
@@ -669,6 +800,8 @@ def enrich_author_home_asr(
|
|
|
669
800
|
default=DEFAULT_BATCH_SUBMIT_SIZE,
|
|
670
801
|
hard_limit=MAX_BATCH_SUBMIT_SIZE,
|
|
671
802
|
)
|
|
803
|
+
if platform == "xiaohongshu":
|
|
804
|
+
effective_batch = min(effective_batch, XHS_U3_U2_BATCH_SIZE)
|
|
672
805
|
|
|
673
806
|
trace.append(
|
|
674
807
|
{
|
|
@@ -738,6 +871,7 @@ def enrich_author_home_asr(
|
|
|
738
871
|
)
|
|
739
872
|
|
|
740
873
|
batch_u2_entries: List[Dict[str, Any]] = []
|
|
874
|
+
batch_xhs_u3_entries: List[Dict[str, Any]] = []
|
|
741
875
|
|
|
742
876
|
for work in batch:
|
|
743
877
|
work_id = normalize_text(work.get("platform_work_id"))
|
|
@@ -834,7 +968,6 @@ def enrich_author_home_asr(
|
|
|
834
968
|
"invalid_reasons": subtitle_probe.get("invalid_reasons"),
|
|
835
969
|
}
|
|
836
970
|
)
|
|
837
|
-
|
|
838
971
|
gate = _evaluate_u2_gate(work, platform=platform)
|
|
839
972
|
trace.append(
|
|
840
973
|
{
|
|
@@ -848,19 +981,17 @@ def enrich_author_home_asr(
|
|
|
848
981
|
"is_video": gate.get("is_video"),
|
|
849
982
|
"duration_ms": gate.get("duration_ms"),
|
|
850
983
|
"video_download_url_present": gate.get("video_download_url_present"),
|
|
851
|
-
"subtitle_invalid": subtitle_invalid,
|
|
852
984
|
}
|
|
853
985
|
)
|
|
854
|
-
|
|
855
986
|
if not gate.get("can_u2"):
|
|
856
987
|
work.update(_fallback_none_result(str(gate.get("gate_reason") or "skip:unknown")))
|
|
857
988
|
else:
|
|
858
|
-
|
|
989
|
+
batch_xhs_u3_entries.append(
|
|
859
990
|
{
|
|
860
991
|
"work": work,
|
|
861
992
|
"work_id": work_id,
|
|
862
993
|
"video_download_url": gate.get("video_download_url"),
|
|
863
|
-
"
|
|
994
|
+
"subtitle_invalid": subtitle_invalid,
|
|
864
995
|
}
|
|
865
996
|
)
|
|
866
997
|
|
|
@@ -890,6 +1021,28 @@ def enrich_author_home_asr(
|
|
|
890
1021
|
fallback_entries = list(batch_bundle.get("unmapped_entries") or [])
|
|
891
1022
|
batch_unmapped_count += len(fallback_entries)
|
|
892
1023
|
|
|
1024
|
+
if batch_xhs_u3_entries:
|
|
1025
|
+
xhs_batch_bundle = _run_xhs_u3_then_u2_batch_for_entries(
|
|
1026
|
+
batch_id=batch_id,
|
|
1027
|
+
entries=batch_xhs_u3_entries,
|
|
1028
|
+
base_url=base_url,
|
|
1029
|
+
token=token,
|
|
1030
|
+
timeout_ms=timeout_ms,
|
|
1031
|
+
poll_interval_sec=poll_interval_sec,
|
|
1032
|
+
max_polls=max_polls,
|
|
1033
|
+
submit_max_retries=max(0, int(xhs_submit_max_retries)),
|
|
1034
|
+
submit_backoff_ms=max(0, int(xhs_submit_backoff_ms)),
|
|
1035
|
+
timeout_retry_enabled=timeout_retry_enabled,
|
|
1036
|
+
timeout_retry_max_retries=max(0, int(timeout_retry_max_retries)),
|
|
1037
|
+
)
|
|
1038
|
+
trace.extend(xhs_batch_bundle.get("trace") if isinstance(xhs_batch_bundle.get("trace"), list) else [])
|
|
1039
|
+
if xhs_batch_bundle.get("submitted"):
|
|
1040
|
+
submitted_batches += 1
|
|
1041
|
+
if xhs_batch_bundle.get("completed"):
|
|
1042
|
+
completed_batches += 1
|
|
1043
|
+
batch_mapped_count += int(xhs_batch_bundle.get("mapped_count") or 0)
|
|
1044
|
+
batch_unmapped_count += int(xhs_batch_bundle.get("unmapped_count") or 0)
|
|
1045
|
+
|
|
893
1046
|
for fallback_entry in fallback_entries:
|
|
894
1047
|
fallback_work = fallback_entry.get("work")
|
|
895
1048
|
if not isinstance(fallback_work, dict):
|
|
@@ -217,7 +217,8 @@ def _extract_xhs_subtitle_inline(item: Dict[str, Any]) -> str:
|
|
|
217
217
|
|
|
218
218
|
|
|
219
219
|
def _extract_xhs_subtitle_urls(item: Dict[str, Any]) -> List[str]:
|
|
220
|
-
|
|
220
|
+
preferred_language_keys = ("source", "zh-CN", "zh_CN", "zh-Hans", "zh", "zh-Hant", "zh-TW", "zh-HK")
|
|
221
|
+
urls = _pick_http_urls(
|
|
221
222
|
item,
|
|
222
223
|
[
|
|
223
224
|
"subtitle_url",
|
|
@@ -233,6 +234,42 @@ def _extract_xhs_subtitle_urls(item: Dict[str, Any]) -> List[str]:
|
|
|
233
234
|
],
|
|
234
235
|
)
|
|
235
236
|
|
|
237
|
+
def _append(value: Any) -> None:
|
|
238
|
+
text = _t(value)
|
|
239
|
+
if text.startswith("http://") or text.startswith("https://"):
|
|
240
|
+
urls.append(text)
|
|
241
|
+
|
|
242
|
+
def _walk(node: Any) -> None:
|
|
243
|
+
if isinstance(node, dict):
|
|
244
|
+
for key in preferred_language_keys:
|
|
245
|
+
if key in node and isinstance(node.get(key), (dict, list)):
|
|
246
|
+
_walk(node.get(key))
|
|
247
|
+
_append(node.get("url"))
|
|
248
|
+
_append(node.get("src"))
|
|
249
|
+
for key, value in node.items():
|
|
250
|
+
if key in preferred_language_keys:
|
|
251
|
+
continue
|
|
252
|
+
if isinstance(value, (dict, list)):
|
|
253
|
+
_walk(value)
|
|
254
|
+
elif isinstance(node, list):
|
|
255
|
+
for item in node:
|
|
256
|
+
if isinstance(item, (dict, list)):
|
|
257
|
+
_walk(item)
|
|
258
|
+
else:
|
|
259
|
+
_append(item)
|
|
260
|
+
|
|
261
|
+
for container in deep_find_all(item, ["subtitles", "subtitle_list", "subtitleList"]):
|
|
262
|
+
_walk(container)
|
|
263
|
+
|
|
264
|
+
deduped: List[str] = []
|
|
265
|
+
seen = set()
|
|
266
|
+
for url in urls:
|
|
267
|
+
if url in seen:
|
|
268
|
+
continue
|
|
269
|
+
seen.add(url)
|
|
270
|
+
deduped.append(url)
|
|
271
|
+
return deduped
|
|
272
|
+
|
|
236
273
|
|
|
237
274
|
def _extract_xhs_work_modality(item: Dict[str, Any], *, video_download_url: str, subtitle_inline: str) -> str:
|
|
238
275
|
content_type_raw = _t(_first(item, ["type", "note_type", "model_type"])).lower()
|
|
@@ -27,7 +27,7 @@ from datetime import datetime
|
|
|
27
27
|
from pathlib import Path
|
|
28
28
|
from typing import Any, Dict, List, Optional, Tuple
|
|
29
29
|
|
|
30
|
-
from scripts.core.asr_pipeline import derive_asr_clean_text,
|
|
30
|
+
from scripts.core.asr_pipeline import derive_asr_clean_text, run_u3_then_u2_asr_candidates_with_timeout_retry
|
|
31
31
|
from scripts.core.config_loader import config_get, load_tikomni_config
|
|
32
32
|
from scripts.core.progress_report import ProgressReporter, build_progress_reporter
|
|
33
33
|
from scripts.core.extract_pipeline import build_api_trace, resolve_trace_error_context
|
|
@@ -576,6 +576,14 @@ def _extract_xhs_metadata(
|
|
|
576
576
|
["noteList", "publishTime"],
|
|
577
577
|
["noteList", "time"],
|
|
578
578
|
["noteList", "timestamp"],
|
|
579
|
+
["data", "data", "create_time_sec"],
|
|
580
|
+
["data", "data", "create_time"],
|
|
581
|
+
["data", "data", "createTime"],
|
|
582
|
+
["data", "data", "publish_time_sec"],
|
|
583
|
+
["data", "data", "publish_time"],
|
|
584
|
+
["data", "data", "publishTime"],
|
|
585
|
+
["data", "data", "time"],
|
|
586
|
+
["data", "data", "timestamp"],
|
|
579
587
|
]
|
|
580
588
|
create_time_sec, create_time_source = _pick_int_with_source_from_paths(
|
|
581
589
|
payload,
|
|
@@ -930,12 +938,39 @@ def _fetch_note_info(
|
|
|
930
938
|
|
|
931
939
|
def _extract_subtitle_urls(payload: Any) -> List[str]:
|
|
932
940
|
urls: List[str] = []
|
|
941
|
+
preferred_language_keys = ("source", "zh-CN", "zh_CN", "zh-Hans", "zh", "zh-Hant", "zh-TW", "zh-HK")
|
|
942
|
+
|
|
943
|
+
def _append_url(value: Any) -> None:
|
|
944
|
+
if isinstance(value, str):
|
|
945
|
+
text = value.strip()
|
|
946
|
+
if text.startswith("http://") or text.startswith("https://"):
|
|
947
|
+
urls.append(text)
|
|
948
|
+
|
|
949
|
+
def _walk_subtitle_container(node: Any) -> None:
|
|
950
|
+
if isinstance(node, dict):
|
|
951
|
+
for key in preferred_language_keys:
|
|
952
|
+
if key in node and isinstance(node.get(key), (dict, list)):
|
|
953
|
+
_walk_subtitle_container(node.get(key))
|
|
954
|
+
_append_url(node.get("url"))
|
|
955
|
+
_append_url(node.get("src"))
|
|
956
|
+
for key, value in node.items():
|
|
957
|
+
if key in preferred_language_keys:
|
|
958
|
+
continue
|
|
959
|
+
if isinstance(value, (dict, list)):
|
|
960
|
+
_walk_subtitle_container(value)
|
|
961
|
+
elif isinstance(node, list):
|
|
962
|
+
for item in node:
|
|
963
|
+
if isinstance(item, (dict, list)):
|
|
964
|
+
_walk_subtitle_container(item)
|
|
965
|
+
else:
|
|
966
|
+
_append_url(item)
|
|
967
|
+
|
|
933
968
|
for key in ["subtitle_url", "subtitleUrl", "srt_url", "srtUrl", "vtt_url", "vttUrl"]:
|
|
934
969
|
for value in deep_find_all(payload, [key]):
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
970
|
+
_append_url(value)
|
|
971
|
+
|
|
972
|
+
for container in deep_find_all(payload, ["subtitles", "subtitle_list", "subtitleList"]):
|
|
973
|
+
_walk_subtitle_container(container)
|
|
939
974
|
|
|
940
975
|
unique: List[str] = []
|
|
941
976
|
seen = set()
|
|
@@ -1672,8 +1707,9 @@ def run_xiaohongshu_extract(
|
|
|
1672
1707
|
|
|
1673
1708
|
subtitle_inline_text = "" if force_u2_fallback else _extract_subtitle_inline_text(effective_payload)
|
|
1674
1709
|
subtitle_urls = [] if force_u2_fallback else _extract_subtitle_urls(effective_payload)
|
|
1675
|
-
|
|
1676
|
-
|
|
1710
|
+
subtitle_text = subtitle_inline_text
|
|
1711
|
+
if not subtitle_text and subtitle_urls:
|
|
1712
|
+
subtitle_text = _fetch_subtitle_text(subtitle_urls, runtime["timeout_ms"])
|
|
1677
1713
|
|
|
1678
1714
|
app_video_candidates = _extract_video_candidates(note_response.get("data"))
|
|
1679
1715
|
app_image_candidates, image_quality_strategy = _extract_image_candidates_with_strategy(note_response.get("data"))
|
|
@@ -1869,11 +1905,11 @@ def run_xiaohongshu_extract(
|
|
|
1869
1905
|
if progress is not None:
|
|
1870
1906
|
progress.progress(
|
|
1871
1907
|
stage="note.u2",
|
|
1872
|
-
message="starting xiaohongshu u2 flow",
|
|
1908
|
+
message="starting xiaohongshu u3->u2 flow",
|
|
1873
1909
|
data={"candidate_count": len(u2_candidates), "timeout_ms": u2_timeout_ms},
|
|
1874
1910
|
)
|
|
1875
1911
|
u2_started_at = time.perf_counter()
|
|
1876
|
-
u2_bundle =
|
|
1912
|
+
u2_bundle = run_u3_then_u2_asr_candidates_with_timeout_retry(
|
|
1877
1913
|
base_url=runtime["base_url"],
|
|
1878
1914
|
token=runtime["token"],
|
|
1879
1915
|
timeout_ms=u2_timeout_ms,
|
|
@@ -1914,16 +1950,18 @@ def run_xiaohongshu_extract(
|
|
|
1914
1950
|
|
|
1915
1951
|
trace.append(
|
|
1916
1952
|
{
|
|
1917
|
-
"step": "
|
|
1953
|
+
"step": "u3_then_u2_asr",
|
|
1918
1954
|
"endpoint": "/api/u2/v1/services/audio/asr/transcription + /api/u2/v1/tasks/{task_id}",
|
|
1919
1955
|
"selected_video_url": selected_video_url,
|
|
1920
1956
|
"selected_video_candidates": u2_candidates,
|
|
1957
|
+
"chosen_public_url": u2_bundle.get("chosen_public_url"),
|
|
1921
1958
|
"candidate_attempts": u2_bundle.get("candidate_attempts", []),
|
|
1922
1959
|
"submit_retries_config": {
|
|
1923
1960
|
"u2_submit_max_retries": max(0, int(u2_submit_max_retries)),
|
|
1924
1961
|
"u2_submit_backoff_ms": max(0, int(u2_submit_backoff_ms)),
|
|
1925
1962
|
},
|
|
1926
1963
|
"timeout_retry": u2_bundle.get("timeout_retry", {}),
|
|
1964
|
+
"u3_bridge": u2_bundle.get("u3_bridge", {}),
|
|
1927
1965
|
"u3_fallback": u2_bundle.get("u3_fallback", {}),
|
|
1928
1966
|
"rounds": u2_bundle.get("rounds", []),
|
|
1929
1967
|
"final_task_id": poll_result.get("task_id") or task_id,
|
|
@@ -1934,7 +1972,7 @@ def run_xiaohongshu_extract(
|
|
|
1934
1972
|
if progress is not None:
|
|
1935
1973
|
(progress.done if poll_result.get("ok") else progress.failed)(
|
|
1936
1974
|
stage="note.u2",
|
|
1937
|
-
message="xiaohongshu u2 flow finished" if poll_result.get("ok") else "xiaohongshu u2 flow failed",
|
|
1975
|
+
message="xiaohongshu u3->u2 flow finished" if poll_result.get("ok") else "xiaohongshu u3->u2 flow failed",
|
|
1938
1976
|
data={
|
|
1939
1977
|
"task_id": poll_result.get("task_id") or task_id,
|
|
1940
1978
|
"task_status": poll_result.get("task_status"),
|
|
@@ -246,6 +246,9 @@ def build_work_fact_card(payload: Dict[str, Any], platform: Optional[str] = None
|
|
|
246
246
|
"caption_raw": caption_raw,
|
|
247
247
|
"subtitle_raw": _safe_text(payload.get("subtitle_raw")),
|
|
248
248
|
"work_modality": _resolve_work_modality(payload),
|
|
249
|
+
"publish_time": payload.get("publish_time"),
|
|
250
|
+
"create_time_sec": payload.get("create_time_sec"),
|
|
251
|
+
"publish_time_source": _safe_text(payload.get("publish_time_source")),
|
|
249
252
|
"published_date": _resolve_published_date(payload),
|
|
250
253
|
"digg_count": _safe_int(payload.get("digg_count")),
|
|
251
254
|
"comment_count": _safe_int(payload.get("comment_count")),
|
|
@@ -389,7 +392,6 @@ def write_work_fact_card(
|
|
|
389
392
|
fallback_identifier=fallback_identifier,
|
|
390
393
|
)
|
|
391
394
|
|
|
392
|
-
Path(paths["json_path"]).write_text(json.dumps(card, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
393
395
|
Path(paths["markdown_path"]).write_text(
|
|
394
396
|
"\n".join(_markdown_lines(card)).strip() + "\n",
|
|
395
397
|
encoding="utf-8",
|
|
@@ -400,7 +402,7 @@ def write_work_fact_card(
|
|
|
400
402
|
"ok": True,
|
|
401
403
|
"count": 1,
|
|
402
404
|
"path": paths["markdown_path"],
|
|
403
|
-
"json_path":
|
|
405
|
+
"json_path": None,
|
|
404
406
|
"markdown_path": paths["markdown_path"],
|
|
405
407
|
"route": paths["route"],
|
|
406
408
|
"identifier": paths["identifier"],
|