@tikomni/skills 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/core/extract_pipeline.py +93 -1
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +1066 -102
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +102 -25
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +335 -78
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +8 -1
- package/skills/social-media-crawl/tests/test_fixed_pipeline_fallback.py +169 -0
|
@@ -250,6 +250,8 @@ def run_douyin_creator_home(
|
|
|
250
250
|
normalized_profile["extract_trace"] = extract_trace
|
|
251
251
|
|
|
252
252
|
normalized_works = [build_work_fact_card(work, platform="douyin") for work in works]
|
|
253
|
+
stage_status = raw.get("stage_status") if isinstance(raw.get("stage_status"), dict) else {}
|
|
254
|
+
error_reason = raw.get("error_reason")
|
|
253
255
|
envelope = {
|
|
254
256
|
"object_type": "creator",
|
|
255
257
|
"platform": "douyin",
|
|
@@ -267,7 +269,7 @@ def run_douyin_creator_home(
|
|
|
267
269
|
},
|
|
268
270
|
"completeness": evaluate_collection(profile, normalized_works),
|
|
269
271
|
"missing_fields": normalize_missing_fields(missing),
|
|
270
|
-
"error_reason":
|
|
272
|
+
"error_reason": error_reason,
|
|
271
273
|
"extract_trace": extract_trace,
|
|
272
274
|
"request_id": request_id,
|
|
273
275
|
"card_write": {
|
|
@@ -278,6 +280,8 @@ def run_douyin_creator_home(
|
|
|
278
280
|
},
|
|
279
281
|
"collection_artifacts": collection_artifacts,
|
|
280
282
|
}
|
|
283
|
+
if stage_status:
|
|
284
|
+
envelope["stage_status"] = stage_status
|
|
281
285
|
envelope["output_persist"] = persist_output_envelope(
|
|
282
286
|
envelope=envelope,
|
|
283
287
|
storage_config=config,
|
|
@@ -285,10 +289,11 @@ def run_douyin_creator_home(
|
|
|
285
289
|
fallback_identifier=str(profile.get("platform_author_id") or "author-home"),
|
|
286
290
|
) if persist_output else {"enabled": False, "skipped": True, "reason": "disabled_by_flag"}
|
|
287
291
|
|
|
288
|
-
progress.done
|
|
292
|
+
final_event = progress.failed if envelope.get("error_reason") else progress.done
|
|
293
|
+
final_event(
|
|
289
294
|
stage="author_home.workflow",
|
|
290
|
-
message="douyin author_home workflow finished",
|
|
291
|
-
data={"request_id": request_id, "works_count": len(normalized_works)},
|
|
295
|
+
message="douyin author_home workflow failed" if envelope.get("error_reason") else "douyin author_home workflow finished",
|
|
296
|
+
data={"request_id": request_id, "works_count": len(normalized_works), "error_reason": envelope.get("error_reason")},
|
|
292
297
|
)
|
|
293
298
|
return envelope
|
|
294
299
|
|
|
@@ -28,7 +28,12 @@ from pathlib import Path
|
|
|
28
28
|
from typing import Any, Dict, List, Optional
|
|
29
29
|
|
|
30
30
|
from scripts.core.config_loader import config_get, load_tikomni_config
|
|
31
|
-
from scripts.core.extract_pipeline import
|
|
31
|
+
from scripts.core.extract_pipeline import (
|
|
32
|
+
build_attempted_route,
|
|
33
|
+
build_route_plan_entry,
|
|
34
|
+
build_stage_status,
|
|
35
|
+
resolve_trace_error_context,
|
|
36
|
+
)
|
|
32
37
|
from scripts.core.progress_report import ProgressReporter, build_progress_reporter
|
|
33
38
|
from scripts.pipelines.douyin_video_type_matrix import normalize_douyin_video_type
|
|
34
39
|
from scripts.pipelines.douyin_metadata import (
|
|
@@ -394,6 +399,11 @@ def _u1_fetch_one_video(
|
|
|
394
399
|
app_timeout_ms: int,
|
|
395
400
|
web_timeout_ms: int,
|
|
396
401
|
) -> Dict[str, Any]:
|
|
402
|
+
route_plan = [
|
|
403
|
+
build_route_plan_entry(route_label="app_v3", endpoint=APP_ENDPOINT, method="GET"),
|
|
404
|
+
build_route_plan_entry(route_label="web", endpoint=WEB_ENDPOINT, method="GET"),
|
|
405
|
+
]
|
|
406
|
+
attempts: List[Dict[str, Any]] = []
|
|
397
407
|
app_response = call_json_api(
|
|
398
408
|
base_url=base_url,
|
|
399
409
|
path=APP_ENDPOINT,
|
|
@@ -403,7 +413,24 @@ def _u1_fetch_one_video(
|
|
|
403
413
|
params={"share_url": share_url},
|
|
404
414
|
)
|
|
405
415
|
app_response["_endpoint"] = APP_ENDPOINT
|
|
416
|
+
app_response["_route_label"] = "app_v3"
|
|
417
|
+
attempts.append(
|
|
418
|
+
build_attempted_route(
|
|
419
|
+
route_label="app_v3",
|
|
420
|
+
endpoint=APP_ENDPOINT,
|
|
421
|
+
response=app_response,
|
|
422
|
+
accepted=bool(app_response.get("ok")),
|
|
423
|
+
accept_reason="fetch_response_ok" if app_response.get("ok") else "response_not_ok",
|
|
424
|
+
fallback_reason="" if app_response.get("ok") else (
|
|
425
|
+
"primary_timeout_retry_exhausted" if app_response.get("timeout_retry_exhausted") else "primary_non_timeout_failure"
|
|
426
|
+
),
|
|
427
|
+
extra={"response": app_response},
|
|
428
|
+
)
|
|
429
|
+
)
|
|
406
430
|
if app_response.get("ok"):
|
|
431
|
+
app_response["_attempts"] = attempts
|
|
432
|
+
app_response["_route_plan"] = route_plan
|
|
433
|
+
app_response["_accept_reason"] = "fetch_response_ok"
|
|
407
434
|
return app_response
|
|
408
435
|
|
|
409
436
|
app_response["fallback_trigger_reason"] = (
|
|
@@ -418,8 +445,26 @@ def _u1_fetch_one_video(
|
|
|
418
445
|
params={"share_url": share_url},
|
|
419
446
|
)
|
|
420
447
|
web_response["_endpoint"] = WEB_ENDPOINT
|
|
448
|
+
web_response["_route_label"] = "web"
|
|
421
449
|
web_response["_app_failed"] = app_response
|
|
422
450
|
web_response["fallback_trigger_reason"] = app_response.get("fallback_trigger_reason")
|
|
451
|
+
attempts.append(
|
|
452
|
+
build_attempted_route(
|
|
453
|
+
route_label="web",
|
|
454
|
+
endpoint=WEB_ENDPOINT,
|
|
455
|
+
response=web_response,
|
|
456
|
+
accepted=bool(web_response.get("ok")),
|
|
457
|
+
accept_reason="fetch_response_ok" if web_response.get("ok") else "response_not_ok",
|
|
458
|
+
fallback_reason="" if web_response.get("ok") else (
|
|
459
|
+
"fallback_timeout_retry_exhausted" if web_response.get("timeout_retry_exhausted") else "fallback_non_timeout_failure"
|
|
460
|
+
),
|
|
461
|
+
extra={"response": web_response},
|
|
462
|
+
)
|
|
463
|
+
)
|
|
464
|
+
web_response["_attempts"] = attempts
|
|
465
|
+
web_response["_route_plan"] = route_plan
|
|
466
|
+
if web_response.get("ok"):
|
|
467
|
+
web_response["_accept_reason"] = "fetch_response_ok"
|
|
423
468
|
return web_response
|
|
424
469
|
|
|
425
470
|
|
|
@@ -578,6 +623,7 @@ def _build_result(
|
|
|
578
623
|
asr_source: str = "fallback_none",
|
|
579
624
|
timings: Optional[Dict[str, int]] = None,
|
|
580
625
|
missing_fields: Optional[List[Dict[str, str]]] = None,
|
|
626
|
+
stage_status: Optional[Dict[str, Any]] = None,
|
|
581
627
|
) -> Dict[str, Any]:
|
|
582
628
|
summary_block = summarize_content(raw_content, source="douyin:single-video-low-quality")
|
|
583
629
|
insights = list(summary_block.get("insights", []))
|
|
@@ -656,6 +702,8 @@ def _build_result(
|
|
|
656
702
|
"endpoint_list": endpoint_list,
|
|
657
703
|
"timings": dict(timings or {}),
|
|
658
704
|
}
|
|
705
|
+
if isinstance(stage_status, dict):
|
|
706
|
+
payload["stage_status"] = dict(stage_status)
|
|
659
707
|
return payload
|
|
660
708
|
|
|
661
709
|
|
|
@@ -830,41 +878,67 @@ def run_douyin_single_video(
|
|
|
830
878
|
)
|
|
831
879
|
timings["u1_total_ms"] = _elapsed_ms(u1_started_at)
|
|
832
880
|
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
881
|
+
attempts = one_video_response.get("_attempts") or []
|
|
882
|
+
stage_status = build_stage_status(
|
|
883
|
+
stage="fetch",
|
|
884
|
+
status="succeeded" if one_video_response.get("ok") else "failed",
|
|
885
|
+
route_plan=list(one_video_response.get("_route_plan") or []),
|
|
886
|
+
attempted_routes=list(attempts),
|
|
887
|
+
chosen_route=str(one_video_response.get("_route_label") or ""),
|
|
888
|
+
accept_reason=str(one_video_response.get("_accept_reason") or ""),
|
|
889
|
+
fallback_reason=str(one_video_response.get("fallback_trigger_reason") or ""),
|
|
890
|
+
error_reason=None if one_video_response.get("ok") else "single_fetch_all_routes_failed",
|
|
891
|
+
all_routes_failed=not bool(one_video_response.get("ok")),
|
|
892
|
+
)
|
|
893
|
+
for index, attempt in enumerate(attempts, start=1):
|
|
894
|
+
response = attempt.get("response") if isinstance(attempt, dict) else None
|
|
895
|
+
endpoint = attempt.get("endpoint") if isinstance(attempt, dict) else None
|
|
896
|
+
label = attempt.get("route_label") if isinstance(attempt, dict) else None
|
|
897
|
+
if not isinstance(response, dict):
|
|
898
|
+
if attempt.get("skipped"):
|
|
899
|
+
trace.append(
|
|
900
|
+
{
|
|
901
|
+
"step": f"u1_fetch_one_video_attempt_{index}",
|
|
902
|
+
"route_label": label,
|
|
903
|
+
"endpoint": endpoint,
|
|
904
|
+
"accept_reason": attempt.get("accept_reason"),
|
|
905
|
+
"fallback_reason": attempt.get("fallback_reason"),
|
|
906
|
+
"param_readiness": attempt.get("param_readiness"),
|
|
907
|
+
"param_reason": attempt.get("param_reason"),
|
|
908
|
+
"skipped": True,
|
|
909
|
+
}
|
|
910
|
+
)
|
|
911
|
+
continue
|
|
912
|
+
_emit_http_progress(progress, stage="single_video.fetch", response=response, route_label=str(label or "route"))
|
|
913
|
+
step = "u1_fetch_one_video_effective" if index == len(attempts) else f"u1_fetch_one_video_attempt_{index}"
|
|
836
914
|
trace.append(
|
|
837
915
|
_trace_step(
|
|
838
|
-
step=
|
|
839
|
-
endpoint=
|
|
840
|
-
response=
|
|
841
|
-
extra={
|
|
916
|
+
step=step,
|
|
917
|
+
endpoint=endpoint,
|
|
918
|
+
response=response,
|
|
919
|
+
extra={
|
|
920
|
+
"route_label": label,
|
|
921
|
+
"attempt": index,
|
|
922
|
+
"chosen_route": one_video_response.get("_route_label"),
|
|
923
|
+
"accept_reason": attempt.get("accept_reason"),
|
|
924
|
+
"fallback_reason": attempt.get("fallback_reason"),
|
|
925
|
+
"app_timeout_ms": app_timeout,
|
|
926
|
+
"web_timeout_ms": web_timeout,
|
|
927
|
+
},
|
|
842
928
|
)
|
|
843
929
|
)
|
|
844
|
-
|
|
845
|
-
_emit_http_progress(
|
|
846
|
-
progress,
|
|
847
|
-
stage="single_video.fetch",
|
|
848
|
-
response=one_video_response,
|
|
849
|
-
route_label="effective_route",
|
|
850
|
-
)
|
|
851
930
|
trace.append(
|
|
852
|
-
|
|
853
|
-
step
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
extra={
|
|
857
|
-
"app_timeout_ms": app_timeout,
|
|
858
|
-
"web_timeout_ms": web_timeout,
|
|
859
|
-
},
|
|
860
|
-
)
|
|
931
|
+
{
|
|
932
|
+
"step": "u1_fetch_one_video_route_decision",
|
|
933
|
+
**stage_status,
|
|
934
|
+
}
|
|
861
935
|
)
|
|
862
936
|
|
|
863
937
|
if not one_video_response.get("ok"):
|
|
864
938
|
error_ctx = resolve_trace_error_context(
|
|
865
939
|
responses=[one_video_response],
|
|
866
940
|
extract_trace=trace,
|
|
867
|
-
default_error_reason="
|
|
941
|
+
default_error_reason="single_fetch_all_routes_failed",
|
|
868
942
|
)
|
|
869
943
|
result = _build_result(
|
|
870
944
|
source_input=source_input,
|
|
@@ -889,6 +963,7 @@ def run_douyin_single_video(
|
|
|
889
963
|
u2_gate_reason="u1_failed",
|
|
890
964
|
analysis_mode=analysis_mode,
|
|
891
965
|
timings=timings,
|
|
966
|
+
stage_status={"fetch": stage_status},
|
|
892
967
|
)
|
|
893
968
|
if write_card:
|
|
894
969
|
card_started_at = time.perf_counter()
|
|
@@ -945,6 +1020,7 @@ def run_douyin_single_video(
|
|
|
945
1020
|
u2_gate_reason="aweme_detail_missing",
|
|
946
1021
|
analysis_mode=analysis_mode,
|
|
947
1022
|
timings=timings,
|
|
1023
|
+
stage_status={"fetch": stage_status},
|
|
948
1024
|
)
|
|
949
1025
|
if write_card:
|
|
950
1026
|
card_started_at = time.perf_counter()
|
|
@@ -1200,6 +1276,7 @@ def run_douyin_single_video(
|
|
|
1200
1276
|
analysis_mode=analysis_mode,
|
|
1201
1277
|
asr_source="u2" if raw_content else "fallback_none",
|
|
1202
1278
|
timings=timings,
|
|
1279
|
+
stage_status={"fetch": stage_status},
|
|
1203
1280
|
)
|
|
1204
1281
|
|
|
1205
1282
|
if write_card:
|
|
@@ -249,6 +249,8 @@ def run_xiaohongshu_creator_home(
|
|
|
249
249
|
normalized_profile["request_id"] = request_id
|
|
250
250
|
normalized_profile["extract_trace"] = extract_trace
|
|
251
251
|
normalized_works = [build_work_fact_card(work, platform="xiaohongshu") for work in works]
|
|
252
|
+
stage_status = raw.get("stage_status") if isinstance(raw.get("stage_status"), dict) else {}
|
|
253
|
+
error_reason = raw.get("error_reason")
|
|
252
254
|
|
|
253
255
|
envelope = {
|
|
254
256
|
"object_type": "creator",
|
|
@@ -267,7 +269,7 @@ def run_xiaohongshu_creator_home(
|
|
|
267
269
|
},
|
|
268
270
|
"completeness": evaluate_collection(profile, normalized_works),
|
|
269
271
|
"missing_fields": normalize_missing_fields(missing),
|
|
270
|
-
"error_reason":
|
|
272
|
+
"error_reason": error_reason,
|
|
271
273
|
"extract_trace": extract_trace,
|
|
272
274
|
"request_id": request_id,
|
|
273
275
|
"card_write": {
|
|
@@ -278,6 +280,8 @@ def run_xiaohongshu_creator_home(
|
|
|
278
280
|
},
|
|
279
281
|
"collection_artifacts": collection_artifacts,
|
|
280
282
|
}
|
|
283
|
+
if stage_status:
|
|
284
|
+
envelope["stage_status"] = stage_status
|
|
281
285
|
envelope["output_persist"] = persist_output_envelope(
|
|
282
286
|
envelope=envelope,
|
|
283
287
|
storage_config=config,
|
|
@@ -285,10 +289,11 @@ def run_xiaohongshu_creator_home(
|
|
|
285
289
|
fallback_identifier=str(profile.get("platform_author_id") or "author-home"),
|
|
286
290
|
) if persist_output else {"enabled": False, "skipped": True, "reason": "disabled_by_flag"}
|
|
287
291
|
|
|
288
|
-
progress.done
|
|
292
|
+
final_event = progress.failed if envelope.get("error_reason") else progress.done
|
|
293
|
+
final_event(
|
|
289
294
|
stage="author_home.workflow",
|
|
290
|
-
message="xiaohongshu author_home workflow finished",
|
|
291
|
-
data={"request_id": request_id, "works_count": len(normalized_works)},
|
|
295
|
+
message="xiaohongshu author_home workflow failed" if envelope.get("error_reason") else "xiaohongshu author_home workflow finished",
|
|
296
|
+
data={"request_id": request_id, "works_count": len(normalized_works), "error_reason": envelope.get("error_reason")},
|
|
292
297
|
)
|
|
293
298
|
return envelope
|
|
294
299
|
|