@tikomni/skills 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/skills/single-work-analysis/env.example +3 -3
- package/skills/single-work-analysis/references/config-templates/defaults.yaml +8 -19
- package/skills/single-work-analysis/references/prompt-contracts/{insight.md → analysis-bundle.md} +43 -8
- package/skills/single-work-analysis/scripts/core/analysis_adapter.py +384 -0
- package/skills/single-work-analysis/scripts/core/analysis_pipeline.py +399 -76
- package/skills/single-work-analysis/scripts/core/config_loader.py +18 -42
- package/skills/single-work-analysis/scripts/core/progress_report.py +163 -16
- package/skills/single-work-analysis/scripts/core/storage_router.py +24 -57
- package/skills/single-work-analysis/scripts/core/tikomni_common.py +13 -3
- package/skills/single-work-analysis/scripts/pipeline/asr/asr_pipeline.py +154 -7
- package/skills/single-work-analysis/scripts/pipeline/asr/poll_u2_task.py +3 -1
- package/skills/single-work-analysis/scripts/platform/douyin/run_douyin_single_video.py +243 -44
- package/skills/single-work-analysis/scripts/platform/xiaohongshu/run_xiaohongshu_extract.py +263 -25
- package/skills/single-work-analysis/scripts/writers/write_benchmark_card.py +244 -894
- package/skills/single-work-analysis/references/prompt-contracts/asr-clean.md +0 -28
- package/skills/single-work-analysis/references/prompt-contracts/cta.md +0 -24
- package/skills/single-work-analysis/references/prompt-contracts/hook.md +0 -25
- package/skills/single-work-analysis/references/prompt-contracts/structure.md +0 -25
- package/skills/single-work-analysis/references/prompt-contracts/style.md +0 -27
- package/skills/single-work-analysis/references/prompt-contracts/summary.md +0 -29
- package/skills/single-work-analysis/references/prompt-contracts/topic.md +0 -29
|
@@ -20,15 +20,16 @@ import argparse
|
|
|
20
20
|
import hashlib
|
|
21
21
|
import json
|
|
22
22
|
import re
|
|
23
|
+
import time
|
|
23
24
|
import urllib.parse
|
|
24
25
|
import urllib.request
|
|
25
26
|
from datetime import datetime
|
|
26
27
|
from pathlib import Path
|
|
27
28
|
from typing import Any, Dict, List, Optional, Tuple
|
|
28
29
|
|
|
29
|
-
from scripts.pipeline.asr.asr_pipeline import run_u2_asr_candidates_with_timeout_retry
|
|
30
|
+
from scripts.pipeline.asr.asr_pipeline import derive_asr_clean_text, run_u2_asr_candidates_with_timeout_retry
|
|
30
31
|
from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
|
|
31
|
-
from scripts.core.progress_report import ProgressReporter
|
|
32
|
+
from scripts.core.progress_report import ProgressReporter, build_progress_reporter
|
|
32
33
|
from scripts.core.storage_router import render_output_filename, resolve_json_filename_pattern
|
|
33
34
|
from scripts.core.extract_pipeline import build_api_trace, resolve_trace_error_context
|
|
34
35
|
from scripts.core.tikomni_common import (
|
|
@@ -49,6 +50,7 @@ APP_V1_ENDPOINT = "/api/u1/v1/xiaohongshu/app/get_note_info"
|
|
|
49
50
|
WEB_V2_V2_ENDPOINT = "/api/u1/v1/xiaohongshu/web_v2/fetch_feed_notes_v2"
|
|
50
51
|
WEB_V2_V3_ENDPOINT = "/api/u1/v1/xiaohongshu/web_v2/fetch_feed_notes_v3"
|
|
51
52
|
WEB_ENDPOINT = "/api/u1/v1/xiaohongshu/web/get_note_info_v7"
|
|
53
|
+
U2_REQUEST_TIMEOUT_CAP_MS = 15000
|
|
52
54
|
U2_GATE_MIN_DURATION_MS = 13000
|
|
53
55
|
U2_GATE_MAX_DURATION_MS = 1800000
|
|
54
56
|
U2_GATE_RULE = "is_video && 13000<duration_ms<=1800000 && video_download_url_present"
|
|
@@ -80,6 +82,43 @@ def _to_int_or_none(value: Any) -> Optional[int]:
|
|
|
80
82
|
return None
|
|
81
83
|
|
|
82
84
|
|
|
85
|
+
def _resolve_u2_timeout_ms(timeout_ms: Any) -> int:
    """Return the per-request timeout (in ms) to use for U2 calls.

    The requested value is parsed with ``_to_int_or_none``. When it cannot
    be parsed, or is zero or negative, the cap
    ``U2_REQUEST_TIMEOUT_CAP_MS`` is returned. Otherwise the value is
    limited to the cap from above and raised to at least 5000 ms.
    """
    requested = _to_int_or_none(timeout_ms)
    if requested is not None and requested > 0:
        # Apply the upper cap first, then enforce the 5000 ms floor.
        capped = min(requested, U2_REQUEST_TIMEOUT_CAP_MS)
        return max(5000, capped)
    return U2_REQUEST_TIMEOUT_CAP_MS
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _report_u2_progress(progress: Optional[ProgressReporter], *, stage: str, event: Dict[str, Any], label: str) -> None:
    """Relay one U2 progress event to ``progress``.

    Does nothing when ``progress`` is ``None``. The event's ``phase`` and
    ``state`` are normalized and lower-cased; a fixed set of further event
    fields is copied into the payload unchanged (missing fields become
    ``None``). A ``submit``/``heartbeat`` event is sent through
    ``progress.heartbeat``; every other event goes through
    ``progress.progress``.

    Args:
        progress: Reporter to notify, or ``None`` to disable reporting.
        stage: Stage name passed to the reporter.
        event: Event mapping whose fields are read with ``.get``.
        label: Prefix used in the human-readable message.
    """
    if progress is None:
        return

    phase_name = normalize_text(event.get("phase")).lower()
    state_name = normalize_text(event.get("state")).lower()

    # Event fields forwarded verbatim, in the order they appear in the payload.
    passthrough_keys = (
        "task_id",
        "attempt",
        "task_status",
        "platform_task_status",
        "pending_count",
        "status_code",
        "batch_progress",
        "wait_ms",
        "candidate_count",
        "ok",
        "error_reason",
        "retriable",
        "request_id",
    )
    details: Dict[str, Any] = {
        "phase": phase_name or "poll",
        "state": state_name or "",
    }
    for key in passthrough_keys:
        details[key] = event.get(key)

    text = f"{label} u2 {phase_name or 'poll'} {state_name or 'progress'}"

    is_submit_heartbeat = phase_name == "submit" and state_name == "heartbeat"
    if is_submit_heartbeat:
        progress.heartbeat(stage=stage, message=text, data=details)
    else:
        progress.progress(stage=stage, message=text, data=details)
|
|
120
|
+
|
|
121
|
+
|
|
83
122
|
def _evaluate_u2_gate_for_xhs(*, note_content_type: str, duration_ms: Any, video_down_url: Optional[str]) -> Dict[str, Any]:
|
|
84
123
|
content_type = normalize_text(note_content_type).lower()
|
|
85
124
|
is_video = content_type in {"video", "mixed"}
|
|
@@ -760,6 +799,70 @@ def _append_missing_metadata_fields(missing_fields: List[Dict[str, str]], metada
|
|
|
760
799
|
_append(key)
|
|
761
800
|
|
|
762
801
|
|
|
802
|
+
def _empty_timings() -> Dict[str, int]:
    """Return a new timings mapping with every tracked duration set to 0.

    A fresh dict is created on each call, so callers may mutate the result.
    """
    timing_keys = (
        "url_parse_ms",
        "u1_total_ms",
        "u2_submit_ms",
        "u2_poll_ms",
        "card_write_ms",
        "llm_analysis_ms",
        "total_ms",
    )
    return dict.fromkeys(timing_keys, 0)
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
def _elapsed_ms(started_at: float) -> int:
    """Return whole milliseconds elapsed since ``started_at``.

    ``started_at`` must be a reading of ``time.perf_counter()``; the
    fractional part of the millisecond value is truncated by ``int``.
    """
    elapsed_seconds = time.perf_counter() - started_at
    return int(elapsed_seconds * 1000)
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
def _http_summary_for_note(response: Dict[str, Any], source_input: Dict[str, Optional[str]]) -> Dict[str, Any]:
    """Build a compact summary of a note-fetch response.

    Metadata is extracted with ``_extract_xhs_metadata`` only when the
    response reports ``ok``; otherwise every metadata-derived flag is
    falsy and ``note_id`` falls back to the caller-supplied input.

    Returns:
        A dict with ``note_id``, the boolean flags ``title_hit``,
        ``author_hit`` and ``media_present``, and ``filled_count`` /
        ``ratio`` copied from the response's ``_field_completeness`` dict
        (``None`` when that entry is absent or not a dict).
    """
    raw_completeness = response.get("_field_completeness")
    completeness = raw_completeness if isinstance(raw_completeness, dict) else {}

    if response.get("ok"):
        metadata = _extract_xhs_metadata(
            payload=response.get("data"),
            source_input=source_input,
            selected_video_url=None,
            selected_image_urls=[],
        )
    else:
        metadata = {}

    # Prefer the note id found in the payload over the one from the input.
    note_id = normalize_text(metadata.get("note_id")) or normalize_text(source_input.get("note_id"))
    has_title = bool(normalize_text(metadata.get("title")))
    has_author = bool(normalize_text(metadata.get("author")))
    has_media = bool(normalize_text(metadata.get("video_down_url")) or metadata.get("cover_image"))

    return {
        "note_id": note_id,
        "title_hit": has_title,
        "author_hit": has_author,
        "media_present": has_media,
        "filled_count": completeness.get("filled_count"),
        "ratio": completeness.get("ratio"),
    }
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
def _emit_http_progress(
    progress: Optional[ProgressReporter],
    *,
    stage: str,
    response: Dict[str, Any],
    route_label: str,
    source_input: Dict[str, Optional[str]],
) -> None:
    """Report one HTTP response to ``progress`` via ``http_event``.

    Does nothing when ``progress`` is ``None``. The reported endpoint is
    the response's ``_endpoint`` entry when truthy, else ``route_label``;
    the summary comes from ``_http_summary_for_note``.
    """
    if progress is not None:
        endpoint_name = str(response.get("_endpoint") or route_label)
        note_summary = _http_summary_for_note(response, source_input)
        progress.http_event(
            stage=stage,
            endpoint=endpoint_name,
            response=response,
            route_label=route_label,
            summary=note_summary,
        )
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
def _update_pipeline_status(result: Dict[str, Any]) -> None:
    """Write a ``pipeline_status`` summary into ``result`` in place.

    ``facts_ready`` is always ``True``. ``card_ready`` reflects the
    truthiness of ``result["card_write"]["ok"]``, and ``deep_analysis``
    carries ``result["deep_analysis"]["status"]`` or ``"skipped"`` when
    that status is missing or falsy. Entries that are not dicts are
    treated as empty.
    """
    card_info = result.get("card_write")
    if not isinstance(card_info, dict):
        card_info = {}

    analysis_info = result.get("deep_analysis")
    if not isinstance(analysis_info, dict):
        analysis_info = {}

    status: Dict[str, Any] = {"facts_ready": True}
    status["card_ready"] = bool(card_info.get("ok"))
    status["deep_analysis"] = analysis_info.get("status") or "skipped"
    result["pipeline_status"] = status
|
|
864
|
+
|
|
865
|
+
|
|
763
866
|
def _fetch_sparse_metadata_enrich(
|
|
764
867
|
*,
|
|
765
868
|
base_url: str,
|
|
@@ -767,6 +870,7 @@ def _fetch_sparse_metadata_enrich(
|
|
|
767
870
|
timeout_ms: int,
|
|
768
871
|
source_input: Dict[str, Optional[str]],
|
|
769
872
|
note_id: Optional[str],
|
|
873
|
+
progress: Optional[ProgressReporter] = None,
|
|
770
874
|
) -> Dict[str, Any]:
|
|
771
875
|
share_text = source_input.get("share_text")
|
|
772
876
|
resolved_note_id = note_id or source_input.get("note_id") or _extract_note_id_from_share(share_text)
|
|
@@ -782,6 +886,7 @@ def _fetch_sparse_metadata_enrich(
|
|
|
782
886
|
)
|
|
783
887
|
response["_endpoint"] = WEB_V2_V3_ENDPOINT
|
|
784
888
|
response["_route_label"] = "web_v2_v3_sparse_enrich"
|
|
889
|
+
_emit_http_progress(progress, stage="note.fetch", response=response, route_label="web_v2_v3_sparse_enrich", source_input=source_input)
|
|
785
890
|
return response
|
|
786
891
|
|
|
787
892
|
if resolved_note_id:
|
|
@@ -795,6 +900,7 @@ def _fetch_sparse_metadata_enrich(
|
|
|
795
900
|
)
|
|
796
901
|
response["_endpoint"] = WEB_V2_V2_ENDPOINT
|
|
797
902
|
response["_route_label"] = "web_v2_v2_sparse_enrich"
|
|
903
|
+
_emit_http_progress(progress, stage="note.fetch", response=response, route_label="web_v2_v2_sparse_enrich", source_input=source_input)
|
|
798
904
|
return response
|
|
799
905
|
|
|
800
906
|
return {
|
|
@@ -805,7 +911,14 @@ def _fetch_sparse_metadata_enrich(
|
|
|
805
911
|
}
|
|
806
912
|
|
|
807
913
|
|
|
808
|
-
def _fetch_note_info(
|
|
914
|
+
def _fetch_note_info(
|
|
915
|
+
*,
|
|
916
|
+
base_url: str,
|
|
917
|
+
token: str,
|
|
918
|
+
timeout_ms: int,
|
|
919
|
+
source_input: Dict[str, Optional[str]],
|
|
920
|
+
progress: Optional[ProgressReporter] = None,
|
|
921
|
+
) -> Dict[str, Any]:
|
|
809
922
|
attempts: List[Dict[str, Any]] = []
|
|
810
923
|
|
|
811
924
|
share_text = source_input.get("share_text")
|
|
@@ -832,6 +945,7 @@ def _fetch_note_info(*, base_url: str, token: str, timeout_ms: int, source_input
|
|
|
832
945
|
"missing_core": ["note_id", "title_or_desc", "media"],
|
|
833
946
|
"core_ready": False,
|
|
834
947
|
}
|
|
948
|
+
_emit_http_progress(progress, stage="note.fetch", response=response, route_label=label, source_input=source_input)
|
|
835
949
|
attempts.append({"label": label, "endpoint": path, "response": response})
|
|
836
950
|
return response
|
|
837
951
|
|
|
@@ -1323,6 +1437,7 @@ def _build_result(
|
|
|
1323
1437
|
missing_fields: Optional[List[Dict[str, str]]] = None,
|
|
1324
1438
|
metadata_fields: Optional[Dict[str, Any]] = None,
|
|
1325
1439
|
asr_source: Optional[str] = None,
|
|
1440
|
+
timings: Optional[Dict[str, int]] = None,
|
|
1326
1441
|
) -> Dict[str, Any]:
|
|
1327
1442
|
metadata = metadata_fields or {}
|
|
1328
1443
|
summary_block = summarize_content(raw_content, source=f"xiaohongshu:{text_source}")
|
|
@@ -1344,7 +1459,8 @@ def _build_result(
|
|
|
1344
1459
|
|
|
1345
1460
|
work_modality = "video" if normalize_text(note_content_type).lower() in {"video", "mixed"} else "text"
|
|
1346
1461
|
caption_raw = normalize_text(metadata.get("caption_raw"))
|
|
1347
|
-
|
|
1462
|
+
asr_clean = derive_asr_clean_text(raw_content)
|
|
1463
|
+
primary_text = asr_clean if work_modality == "video" else (caption_raw or raw_content)
|
|
1348
1464
|
primary_text_source = "asr_clean" if work_modality == "video" else "caption_raw"
|
|
1349
1465
|
analysis_eligibility = "eligible" if primary_text else "incomplete"
|
|
1350
1466
|
analysis_exclusion_reason = "" if analysis_eligibility == "eligible" else ("video_asr_unavailable" if work_modality == "video" else "caption_raw_missing")
|
|
@@ -1389,6 +1505,8 @@ def _build_result(
|
|
|
1389
1505
|
"xhs_sec_token": metadata.get("xhs_sec_token"),
|
|
1390
1506
|
"downloaded_assets": downloaded_assets,
|
|
1391
1507
|
"raw_content": raw_content,
|
|
1508
|
+
"asr_raw": raw_content,
|
|
1509
|
+
"asr_clean": asr_clean,
|
|
1392
1510
|
"primary_text": primary_text,
|
|
1393
1511
|
"primary_text_source": primary_text_source,
|
|
1394
1512
|
"analysis_eligibility": analysis_eligibility,
|
|
@@ -1401,6 +1519,7 @@ def _build_result(
|
|
|
1401
1519
|
"extract_trace": extract_trace,
|
|
1402
1520
|
"fallback_trace": fallback_trace,
|
|
1403
1521
|
"request_id": request_id,
|
|
1522
|
+
"timings": dict(timings or {}),
|
|
1404
1523
|
}
|
|
1405
1524
|
|
|
1406
1525
|
|
|
@@ -1421,6 +1540,7 @@ def run_xiaohongshu_extract(
|
|
|
1421
1540
|
u2_timeout_retry_max_retries: int,
|
|
1422
1541
|
force_u2_fallback: bool,
|
|
1423
1542
|
write_card: bool,
|
|
1543
|
+
analysis_mode: str,
|
|
1424
1544
|
card_type: str,
|
|
1425
1545
|
card_root: Optional[str],
|
|
1426
1546
|
storage_config: Optional[Dict[str, Any]] = None,
|
|
@@ -1428,14 +1548,17 @@ def run_xiaohongshu_extract(
|
|
|
1428
1548
|
persist_output: bool = True,
|
|
1429
1549
|
progress: Optional[ProgressReporter] = None,
|
|
1430
1550
|
) -> Dict[str, Any]:
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
)
|
|
1435
|
-
|
|
1551
|
+
workflow_started_at = time.perf_counter()
|
|
1552
|
+
timings = _empty_timings()
|
|
1553
|
+
parse_started_at = time.perf_counter()
|
|
1436
1554
|
source_input = _normalize_input(input_value, share_text, note_id)
|
|
1555
|
+
timings["url_parse_ms"] = _elapsed_ms(parse_started_at)
|
|
1437
1556
|
if progress is not None:
|
|
1438
|
-
progress.started(
|
|
1557
|
+
progress.started(
|
|
1558
|
+
stage="note.workflow",
|
|
1559
|
+
message="xiaohongshu note workflow started",
|
|
1560
|
+
data={"analysis_mode": analysis_mode, "write_card": bool(write_card), "persist_output": bool(persist_output)},
|
|
1561
|
+
)
|
|
1439
1562
|
metadata_fields: Dict[str, Any] = {}
|
|
1440
1563
|
if not source_input["share_text"] and not source_input["note_id"]:
|
|
1441
1564
|
result = _build_result(
|
|
@@ -1452,15 +1575,17 @@ def run_xiaohongshu_extract(
|
|
|
1452
1575
|
u2_task_id=None,
|
|
1453
1576
|
u2_task_status="UNKNOWN",
|
|
1454
1577
|
note_content_type="unknown",
|
|
1455
|
-
analysis_mode=
|
|
1578
|
+
analysis_mode=analysis_mode,
|
|
1456
1579
|
selected_video_url=None,
|
|
1457
1580
|
selected_video_candidates=[],
|
|
1458
1581
|
selected_image_urls=[],
|
|
1459
1582
|
downloaded_assets=[],
|
|
1460
1583
|
missing_fields=[{"field": "share_text_or_note_id", "reason": "missing_input"}],
|
|
1461
1584
|
metadata_fields=metadata_fields,
|
|
1585
|
+
timings=timings,
|
|
1462
1586
|
)
|
|
1463
1587
|
if write_card:
|
|
1588
|
+
card_started_at = time.perf_counter()
|
|
1464
1589
|
result["card_write"] = write_benchmark_card(
|
|
1465
1590
|
payload=result,
|
|
1466
1591
|
platform="xiaohongshu",
|
|
@@ -1468,7 +1593,14 @@ def run_xiaohongshu_extract(
|
|
|
1468
1593
|
card_root=card_root,
|
|
1469
1594
|
content_kind="note",
|
|
1470
1595
|
storage_config=storage_config,
|
|
1596
|
+
analysis_mode=analysis_mode,
|
|
1597
|
+
progress=progress.child(scope="card_write") if progress is not None else None,
|
|
1471
1598
|
)
|
|
1599
|
+
timings["card_write_ms"] = _elapsed_ms(card_started_at)
|
|
1600
|
+
timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
|
|
1601
|
+
timings["total_ms"] = _elapsed_ms(workflow_started_at)
|
|
1602
|
+
result["timings"] = dict(timings)
|
|
1603
|
+
_update_pipeline_status(result)
|
|
1472
1604
|
return _finalize_result(
|
|
1473
1605
|
result=result,
|
|
1474
1606
|
source_input=source_input,
|
|
@@ -1487,6 +1619,7 @@ def run_xiaohongshu_extract(
|
|
|
1487
1619
|
|
|
1488
1620
|
trace: List[Dict[str, Any]] = []
|
|
1489
1621
|
|
|
1622
|
+
u1_started_at = time.perf_counter()
|
|
1490
1623
|
if progress is not None:
|
|
1491
1624
|
progress.progress(stage="note.fetch", message="fetching xiaohongshu note payload")
|
|
1492
1625
|
note_response = _fetch_note_info(
|
|
@@ -1494,7 +1627,9 @@ def run_xiaohongshu_extract(
|
|
|
1494
1627
|
token=runtime["token"],
|
|
1495
1628
|
timeout_ms=runtime["timeout_ms"],
|
|
1496
1629
|
source_input=source_input,
|
|
1630
|
+
progress=progress,
|
|
1497
1631
|
)
|
|
1632
|
+
timings["u1_total_ms"] = _elapsed_ms(u1_started_at)
|
|
1498
1633
|
|
|
1499
1634
|
attempts = note_response.get("_attempts") or []
|
|
1500
1635
|
for index, attempt in enumerate(attempts, start=1):
|
|
@@ -1548,15 +1683,17 @@ def run_xiaohongshu_extract(
|
|
|
1548
1683
|
u2_task_id=None,
|
|
1549
1684
|
u2_task_status="UNKNOWN",
|
|
1550
1685
|
note_content_type="unknown",
|
|
1551
|
-
analysis_mode=
|
|
1686
|
+
analysis_mode=analysis_mode,
|
|
1552
1687
|
selected_video_url=None,
|
|
1553
1688
|
selected_video_candidates=[],
|
|
1554
1689
|
selected_image_urls=[],
|
|
1555
1690
|
downloaded_assets=[],
|
|
1556
1691
|
missing_fields=[{"field": "u1_note_info", "reason": "all_routes_failed"}],
|
|
1557
1692
|
metadata_fields=metadata_fields,
|
|
1693
|
+
timings=timings,
|
|
1558
1694
|
)
|
|
1559
1695
|
if write_card:
|
|
1696
|
+
card_started_at = time.perf_counter()
|
|
1560
1697
|
result["card_write"] = write_benchmark_card(
|
|
1561
1698
|
payload=result,
|
|
1562
1699
|
platform="xiaohongshu",
|
|
@@ -1564,7 +1701,14 @@ def run_xiaohongshu_extract(
|
|
|
1564
1701
|
card_root=card_root,
|
|
1565
1702
|
content_kind="note",
|
|
1566
1703
|
storage_config=storage_config,
|
|
1704
|
+
analysis_mode=analysis_mode,
|
|
1705
|
+
progress=progress.child(scope="card_write") if progress is not None else None,
|
|
1567
1706
|
)
|
|
1707
|
+
timings["card_write_ms"] = _elapsed_ms(card_started_at)
|
|
1708
|
+
timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
|
|
1709
|
+
timings["total_ms"] = _elapsed_ms(workflow_started_at)
|
|
1710
|
+
result["timings"] = dict(timings)
|
|
1711
|
+
_update_pipeline_status(result)
|
|
1568
1712
|
return _finalize_result(
|
|
1569
1713
|
result=result,
|
|
1570
1714
|
source_input=source_input,
|
|
@@ -1589,13 +1733,16 @@ def run_xiaohongshu_extract(
|
|
|
1589
1733
|
enrich_payload: Any = None
|
|
1590
1734
|
|
|
1591
1735
|
if sparse_metadata_detected:
|
|
1736
|
+
enrich_started_at = time.perf_counter()
|
|
1592
1737
|
enrich_response = _fetch_sparse_metadata_enrich(
|
|
1593
1738
|
base_url=runtime["base_url"],
|
|
1594
1739
|
token=runtime["token"],
|
|
1595
1740
|
timeout_ms=runtime["timeout_ms"],
|
|
1596
1741
|
source_input=source_input,
|
|
1597
1742
|
note_id=source_input.get("note_id"),
|
|
1743
|
+
progress=progress,
|
|
1598
1744
|
)
|
|
1745
|
+
timings["u1_total_ms"] += _elapsed_ms(enrich_started_at)
|
|
1599
1746
|
trace.append(
|
|
1600
1747
|
build_api_trace(
|
|
1601
1748
|
step="u1_sparse_metadata_enrich",
|
|
@@ -1710,15 +1857,17 @@ def run_xiaohongshu_extract(
|
|
|
1710
1857
|
u2_task_id=None,
|
|
1711
1858
|
u2_task_status="SKIPPED",
|
|
1712
1859
|
note_content_type=note_content_type,
|
|
1713
|
-
analysis_mode=
|
|
1860
|
+
analysis_mode=analysis_mode,
|
|
1714
1861
|
selected_video_url=selected_video_url,
|
|
1715
1862
|
selected_video_candidates=video_candidates,
|
|
1716
1863
|
selected_image_urls=image_candidates,
|
|
1717
1864
|
downloaded_assets=[],
|
|
1718
1865
|
missing_fields=missing_fields,
|
|
1719
1866
|
metadata_fields=metadata_fields,
|
|
1867
|
+
timings=timings,
|
|
1720
1868
|
)
|
|
1721
1869
|
if write_card:
|
|
1870
|
+
card_started_at = time.perf_counter()
|
|
1722
1871
|
result["card_write"] = write_benchmark_card(
|
|
1723
1872
|
payload=result,
|
|
1724
1873
|
platform="xiaohongshu",
|
|
@@ -1726,7 +1875,14 @@ def run_xiaohongshu_extract(
|
|
|
1726
1875
|
card_root=card_root,
|
|
1727
1876
|
content_kind="single_video",
|
|
1728
1877
|
storage_config=storage_config,
|
|
1878
|
+
analysis_mode=analysis_mode,
|
|
1879
|
+
progress=progress.child(scope="card_write") if progress is not None else None,
|
|
1729
1880
|
)
|
|
1881
|
+
timings["card_write_ms"] = _elapsed_ms(card_started_at)
|
|
1882
|
+
timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
|
|
1883
|
+
timings["total_ms"] = _elapsed_ms(workflow_started_at)
|
|
1884
|
+
result["timings"] = dict(timings)
|
|
1885
|
+
_update_pipeline_status(result)
|
|
1730
1886
|
return _finalize_result(
|
|
1731
1887
|
result=result,
|
|
1732
1888
|
source_input=source_input,
|
|
@@ -1766,15 +1922,17 @@ def run_xiaohongshu_extract(
|
|
|
1766
1922
|
u2_task_id=None,
|
|
1767
1923
|
u2_task_status="SKIPPED",
|
|
1768
1924
|
note_content_type=note_content_type,
|
|
1769
|
-
analysis_mode=
|
|
1925
|
+
analysis_mode=analysis_mode,
|
|
1770
1926
|
selected_video_url=u2_gate.get("video_down_url") or selected_video_url,
|
|
1771
1927
|
selected_video_candidates=video_candidates,
|
|
1772
1928
|
selected_image_urls=image_candidates,
|
|
1773
1929
|
downloaded_assets=[],
|
|
1774
1930
|
missing_fields=missing_fields,
|
|
1775
1931
|
metadata_fields=metadata_fields,
|
|
1932
|
+
timings=timings,
|
|
1776
1933
|
)
|
|
1777
1934
|
if write_card:
|
|
1935
|
+
card_started_at = time.perf_counter()
|
|
1778
1936
|
result["card_write"] = write_benchmark_card(
|
|
1779
1937
|
payload=result,
|
|
1780
1938
|
platform="xiaohongshu",
|
|
@@ -1782,7 +1940,14 @@ def run_xiaohongshu_extract(
|
|
|
1782
1940
|
card_root=card_root,
|
|
1783
1941
|
content_kind="single_video",
|
|
1784
1942
|
storage_config=storage_config,
|
|
1943
|
+
analysis_mode=analysis_mode,
|
|
1944
|
+
progress=progress.child(scope="card_write") if progress is not None else None,
|
|
1785
1945
|
)
|
|
1946
|
+
timings["card_write_ms"] = _elapsed_ms(card_started_at)
|
|
1947
|
+
timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
|
|
1948
|
+
timings["total_ms"] = _elapsed_ms(workflow_started_at)
|
|
1949
|
+
result["timings"] = dict(timings)
|
|
1950
|
+
_update_pipeline_status(result)
|
|
1786
1951
|
return _finalize_result(
|
|
1787
1952
|
result=result,
|
|
1788
1953
|
source_input=source_input,
|
|
@@ -1792,16 +1957,18 @@ def run_xiaohongshu_extract(
|
|
|
1792
1957
|
)
|
|
1793
1958
|
|
|
1794
1959
|
u2_candidates = _dedupe_keep_order([u2_gate.get("video_down_url")] + list(video_candidates))
|
|
1960
|
+
u2_timeout_ms = _resolve_u2_timeout_ms(runtime["timeout_ms"])
|
|
1795
1961
|
if progress is not None:
|
|
1796
1962
|
progress.progress(
|
|
1797
1963
|
stage="note.u2",
|
|
1798
1964
|
message="starting xiaohongshu u2 flow",
|
|
1799
|
-
data={"candidate_count": len(u2_candidates)},
|
|
1965
|
+
data={"candidate_count": len(u2_candidates), "timeout_ms": u2_timeout_ms},
|
|
1800
1966
|
)
|
|
1967
|
+
u2_started_at = time.perf_counter()
|
|
1801
1968
|
u2_bundle = run_u2_asr_candidates_with_timeout_retry(
|
|
1802
1969
|
base_url=runtime["base_url"],
|
|
1803
1970
|
token=runtime["token"],
|
|
1804
|
-
timeout_ms=
|
|
1971
|
+
timeout_ms=u2_timeout_ms,
|
|
1805
1972
|
candidates=u2_candidates,
|
|
1806
1973
|
submit_max_retries=u2_submit_max_retries,
|
|
1807
1974
|
submit_backoff_ms=u2_submit_backoff_ms,
|
|
@@ -1809,7 +1976,12 @@ def run_xiaohongshu_extract(
|
|
|
1809
1976
|
max_polls=max_polls,
|
|
1810
1977
|
timeout_retry_enabled=u2_timeout_retry_enabled,
|
|
1811
1978
|
timeout_retry_max_retries=u2_timeout_retry_max_retries,
|
|
1979
|
+
progress_callback=(
|
|
1980
|
+
lambda event: _report_u2_progress(progress, stage="note.u2", event=event, label="xiaohongshu")
|
|
1981
|
+
) if progress is not None else None,
|
|
1812
1982
|
)
|
|
1983
|
+
timings["u2_submit_ms"] = _to_int_or_none(u2_bundle.get("submit_duration_ms")) or 0
|
|
1984
|
+
timings["u2_poll_ms"] = _to_int_or_none(u2_bundle.get("poll_duration_ms")) or _elapsed_ms(u2_started_at)
|
|
1813
1985
|
submit_bundle = u2_bundle.get("submit_bundle", {})
|
|
1814
1986
|
submit_response = submit_bundle.get("submit_response", {})
|
|
1815
1987
|
task_id = submit_bundle.get("task_id")
|
|
@@ -1818,6 +1990,19 @@ def run_xiaohongshu_extract(
|
|
|
1818
1990
|
if selected_video_url and not normalize_text(metadata_fields.get("video_down_url")):
|
|
1819
1991
|
metadata_fields["video_down_url"] = selected_video_url
|
|
1820
1992
|
|
|
1993
|
+
if progress is not None:
|
|
1994
|
+
progress.http_event(
|
|
1995
|
+
stage="note.u2",
|
|
1996
|
+
endpoint="/api/u2/v1/services/audio/asr/transcription",
|
|
1997
|
+
response=submit_response,
|
|
1998
|
+
route_label="u2_submit",
|
|
1999
|
+
summary={
|
|
2000
|
+
"task_id": task_id,
|
|
2001
|
+
"retry_count": len(submit_bundle.get("retry_chain", [])),
|
|
2002
|
+
"candidate_count": len(u2_candidates),
|
|
2003
|
+
},
|
|
2004
|
+
)
|
|
2005
|
+
|
|
1821
2006
|
trace.append(
|
|
1822
2007
|
{
|
|
1823
2008
|
"step": "u2_asr_timeout_retry",
|
|
@@ -1879,15 +2064,17 @@ def run_xiaohongshu_extract(
|
|
|
1879
2064
|
u2_task_id=poll_result.get("task_id") or task_id,
|
|
1880
2065
|
u2_task_status=poll_result.get("task_status") or "UNKNOWN",
|
|
1881
2066
|
note_content_type=note_content_type,
|
|
1882
|
-
analysis_mode=
|
|
2067
|
+
analysis_mode=analysis_mode,
|
|
1883
2068
|
selected_video_url=selected_video_url,
|
|
1884
2069
|
selected_video_candidates=u2_candidates,
|
|
1885
2070
|
selected_image_urls=image_candidates,
|
|
1886
2071
|
downloaded_assets=[],
|
|
1887
2072
|
missing_fields=missing_fields,
|
|
1888
2073
|
metadata_fields=metadata_fields,
|
|
2074
|
+
timings=timings,
|
|
1889
2075
|
)
|
|
1890
2076
|
if write_card:
|
|
2077
|
+
card_started_at = time.perf_counter()
|
|
1891
2078
|
result["card_write"] = write_benchmark_card(
|
|
1892
2079
|
payload=result,
|
|
1893
2080
|
platform="xiaohongshu",
|
|
@@ -1895,7 +2082,14 @@ def run_xiaohongshu_extract(
|
|
|
1895
2082
|
card_root=card_root,
|
|
1896
2083
|
content_kind="single_video",
|
|
1897
2084
|
storage_config=storage_config,
|
|
2085
|
+
analysis_mode=analysis_mode,
|
|
2086
|
+
progress=progress.child(scope="card_write") if progress is not None else None,
|
|
1898
2087
|
)
|
|
2088
|
+
timings["card_write_ms"] = _elapsed_ms(card_started_at)
|
|
2089
|
+
timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
|
|
2090
|
+
timings["total_ms"] = _elapsed_ms(workflow_started_at)
|
|
2091
|
+
result["timings"] = dict(timings)
|
|
2092
|
+
_update_pipeline_status(result)
|
|
1899
2093
|
return _finalize_result(
|
|
1900
2094
|
result=result,
|
|
1901
2095
|
source_input=source_input,
|
|
@@ -1911,30 +2105,41 @@ def run_xiaohongshu_extract(
|
|
|
1911
2105
|
explicit_error_reason=poll_result.get("error_reason"),
|
|
1912
2106
|
explicit_request_id=poll_result.get("request_id") or submit_response.get("request_id") or note_response.get("request_id"),
|
|
1913
2107
|
)
|
|
2108
|
+
text_source = "u2"
|
|
2109
|
+
confidence = "high" if poll_result.get("ok") and raw_content else "low"
|
|
2110
|
+
error_reason = final_ctx.get("error_reason")
|
|
2111
|
+
if not raw_content and caption_text:
|
|
2112
|
+
missing_fields.append({"field": "asr_transcript", "reason": f"u2_failed:{error_reason or 'u2_poll_timeout'}"})
|
|
2113
|
+
raw_content = caption_text
|
|
2114
|
+
text_source = "caption_fallback"
|
|
2115
|
+
confidence = "medium"
|
|
2116
|
+
error_reason = None
|
|
1914
2117
|
result = _build_result(
|
|
1915
2118
|
source_input=source_input,
|
|
1916
2119
|
raw_content=raw_content,
|
|
1917
|
-
confidence=
|
|
1918
|
-
error_reason=
|
|
2120
|
+
confidence=confidence,
|
|
2121
|
+
error_reason=error_reason,
|
|
1919
2122
|
extract_trace=trace,
|
|
1920
2123
|
fallback_trace=final_ctx.get("fallback_trace", []),
|
|
1921
2124
|
request_id=final_ctx.get("request_id"),
|
|
1922
|
-
text_source=
|
|
2125
|
+
text_source=text_source,
|
|
1923
2126
|
note_id=str(resolved_note_id) if resolved_note_id else source_input.get("note_id"),
|
|
1924
2127
|
subtitle_hit=False,
|
|
1925
2128
|
u2_task_id=poll_result.get("task_id") or task_id,
|
|
1926
2129
|
u2_task_status=poll_result.get("task_status"),
|
|
1927
2130
|
note_content_type=note_content_type,
|
|
1928
|
-
analysis_mode=
|
|
2131
|
+
analysis_mode=analysis_mode,
|
|
1929
2132
|
selected_video_url=selected_video_url,
|
|
1930
2133
|
selected_video_candidates=u2_candidates,
|
|
1931
2134
|
selected_image_urls=image_candidates,
|
|
1932
2135
|
downloaded_assets=[],
|
|
1933
2136
|
missing_fields=missing_fields,
|
|
1934
2137
|
metadata_fields=metadata_fields,
|
|
2138
|
+
timings=timings,
|
|
1935
2139
|
)
|
|
1936
2140
|
|
|
1937
2141
|
if write_card:
|
|
2142
|
+
card_started_at = time.perf_counter()
|
|
1938
2143
|
result["card_write"] = write_benchmark_card(
|
|
1939
2144
|
payload=result,
|
|
1940
2145
|
platform="xiaohongshu",
|
|
@@ -1942,7 +2147,15 @@ def run_xiaohongshu_extract(
|
|
|
1942
2147
|
card_root=card_root,
|
|
1943
2148
|
content_kind="single_video",
|
|
1944
2149
|
storage_config=storage_config,
|
|
2150
|
+
analysis_mode=analysis_mode,
|
|
2151
|
+
progress=progress.child(scope="card_write") if progress is not None else None,
|
|
1945
2152
|
)
|
|
2153
|
+
timings["card_write_ms"] = _elapsed_ms(card_started_at)
|
|
2154
|
+
timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
|
|
2155
|
+
|
|
2156
|
+
timings["total_ms"] = _elapsed_ms(workflow_started_at)
|
|
2157
|
+
result["timings"] = dict(timings)
|
|
2158
|
+
_update_pipeline_status(result)
|
|
1946
2159
|
|
|
1947
2160
|
return _finalize_result(
|
|
1948
2161
|
result=result,
|
|
@@ -1989,16 +2202,18 @@ def run_xiaohongshu_extract(
|
|
|
1989
2202
|
u2_task_id=None,
|
|
1990
2203
|
u2_task_status="SKIPPED",
|
|
1991
2204
|
note_content_type="image" if note_content_type == "unknown" else note_content_type,
|
|
1992
|
-
analysis_mode=
|
|
2205
|
+
analysis_mode=analysis_mode,
|
|
1993
2206
|
selected_video_url=None,
|
|
1994
2207
|
selected_video_candidates=video_candidates,
|
|
1995
2208
|
selected_image_urls=image_candidates,
|
|
1996
2209
|
downloaded_assets=downloaded_assets,
|
|
1997
2210
|
missing_fields=missing_fields,
|
|
1998
2211
|
metadata_fields=metadata_fields,
|
|
2212
|
+
timings=timings,
|
|
1999
2213
|
)
|
|
2000
2214
|
|
|
2001
2215
|
if write_card:
|
|
2216
|
+
card_started_at = time.perf_counter()
|
|
2002
2217
|
result["card_write"] = write_benchmark_card(
|
|
2003
2218
|
payload=result,
|
|
2004
2219
|
platform="xiaohongshu",
|
|
@@ -2006,7 +2221,15 @@ def run_xiaohongshu_extract(
|
|
|
2006
2221
|
card_root=card_root,
|
|
2007
2222
|
content_kind="note",
|
|
2008
2223
|
storage_config=storage_config,
|
|
2224
|
+
analysis_mode=analysis_mode,
|
|
2225
|
+
progress=progress.child(scope="card_write") if progress is not None else None,
|
|
2009
2226
|
)
|
|
2227
|
+
timings["card_write_ms"] = _elapsed_ms(card_started_at)
|
|
2228
|
+
timings["llm_analysis_ms"] = _to_int_or_none((result.get("card_write") or {}).get("llm_analysis_ms")) or 0
|
|
2229
|
+
|
|
2230
|
+
timings["total_ms"] = _elapsed_ms(workflow_started_at)
|
|
2231
|
+
result["timings"] = dict(timings)
|
|
2232
|
+
_update_pipeline_status(result)
|
|
2010
2233
|
|
|
2011
2234
|
finalized = _finalize_result(
|
|
2012
2235
|
result=result,
|
|
@@ -2025,6 +2248,7 @@ def run_xiaohongshu_extract(
|
|
|
2025
2248
|
"card_write_ok": bool((finalized.get("card_write") or {}).get("ok")),
|
|
2026
2249
|
"output_persist_ok": bool((finalized.get("output_persist") or {}).get("ok")),
|
|
2027
2250
|
"text_source": finalized.get("text_source"),
|
|
2251
|
+
"deep_analysis_status": ((finalized.get("deep_analysis") or {}).get("status")),
|
|
2028
2252
|
},
|
|
2029
2253
|
)
|
|
2030
2254
|
return finalized
|
|
@@ -2069,7 +2293,13 @@ def main() -> None:
|
|
|
2069
2293
|
help="Conservative max retries for U2 timeout-only retry (0~3)",
|
|
2070
2294
|
)
|
|
2071
2295
|
parser.add_argument("--force-u2-fallback", action="store_true", help="Skip subtitle usage and force U2 fallback (test)")
|
|
2072
|
-
parser.add_argument("--card-type", choices=["work"
|
|
2296
|
+
parser.add_argument("--card-type", choices=["work"], default="work", help="Primary card type")
|
|
2297
|
+
parser.add_argument("--analysis-mode", choices=["auto", "local"], default="auto", help="Card analysis mode")
|
|
2298
|
+
parser.set_defaults(write_card=True, persist_output=True)
|
|
2299
|
+
parser.add_argument("--write-card", dest="write_card", action="store_true", help="Write final work card")
|
|
2300
|
+
parser.add_argument("--no-write-card", dest="write_card", action="store_false", help="Skip card writing")
|
|
2301
|
+
parser.add_argument("--persist-output", dest="persist_output", action="store_true", help="Persist result JSON")
|
|
2302
|
+
parser.add_argument("--no-persist-output", dest="persist_output", action="store_false", help="Skip result JSON persist")
|
|
2073
2303
|
parser.add_argument("--card-root", default=None, help="Card root (absolute); falls back to TIKOMNI_CARD_ROOT when writing cards")
|
|
2074
2304
|
args = parser.parse_args()
|
|
2075
2305
|
|
|
@@ -2109,6 +2339,12 @@ def main() -> None:
|
|
|
2109
2339
|
if args.u2_timeout_retry_max_retries is not None
|
|
2110
2340
|
else config_get(config, "asr_strategy.u2_timeout_retry.max_retries", 3)
|
|
2111
2341
|
)
|
|
2342
|
+
progress = build_progress_reporter(
|
|
2343
|
+
workflow="single-work-analysis",
|
|
2344
|
+
platform="xiaohongshu",
|
|
2345
|
+
content_kind="note",
|
|
2346
|
+
input_value=args.share_text or args.note_id or args.input,
|
|
2347
|
+
)
|
|
2112
2348
|
|
|
2113
2349
|
try:
|
|
2114
2350
|
result = run_xiaohongshu_extract(
|
|
@@ -2126,12 +2362,14 @@ def main() -> None:
|
|
|
2126
2362
|
u2_timeout_retry_enabled=bool(u2_timeout_retry_enabled),
|
|
2127
2363
|
u2_timeout_retry_max_retries=int(u2_timeout_retry_max_retries),
|
|
2128
2364
|
force_u2_fallback=args.force_u2_fallback,
|
|
2129
|
-
write_card=
|
|
2365
|
+
write_card=bool(args.write_card),
|
|
2366
|
+
analysis_mode=args.analysis_mode,
|
|
2130
2367
|
card_type=args.card_type,
|
|
2131
2368
|
card_root=args.card_root,
|
|
2132
2369
|
storage_config=config,
|
|
2133
2370
|
allow_process_env=args.allow_process_env,
|
|
2134
|
-
persist_output=
|
|
2371
|
+
persist_output=bool(args.persist_output),
|
|
2372
|
+
progress=progress,
|
|
2135
2373
|
)
|
|
2136
2374
|
except ValueError as error:
|
|
2137
2375
|
result = {
|