@tikomni/skills 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/scripts/core/extract_pipeline.py +93 -1
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +1066 -102
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_single_work.py +102 -25
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +9 -4
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_single_work.py +335 -78
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +8 -1
- package/skills/social-media-crawl/tests/test_fixed_pipeline_fallback.py +169 -0
|
@@ -19,6 +19,7 @@ bootstrap_for_direct_run(__file__, __package__)
|
|
|
19
19
|
import argparse
|
|
20
20
|
import hashlib
|
|
21
21
|
import json
|
|
22
|
+
import os
|
|
22
23
|
import re
|
|
23
24
|
import time
|
|
24
25
|
import urllib.parse
|
|
@@ -30,7 +31,13 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
30
31
|
from scripts.core.asr_pipeline import derive_asr_clean_text, run_u3_then_u2_asr_candidates_with_timeout_retry
|
|
31
32
|
from scripts.core.config_loader import config_get, load_tikomni_config
|
|
32
33
|
from scripts.core.progress_report import ProgressReporter, build_progress_reporter
|
|
33
|
-
from scripts.core.extract_pipeline import
|
|
34
|
+
from scripts.core.extract_pipeline import (
|
|
35
|
+
build_api_trace,
|
|
36
|
+
build_attempted_route,
|
|
37
|
+
build_route_plan_entry,
|
|
38
|
+
build_stage_status,
|
|
39
|
+
resolve_trace_error_context,
|
|
40
|
+
)
|
|
34
41
|
from scripts.core.tikomni_common import (
|
|
35
42
|
call_json_api,
|
|
36
43
|
deep_find_all,
|
|
@@ -55,10 +62,14 @@ from scripts.writers.write_work_fact_card import (
|
|
|
55
62
|
APP_V2_VIDEO_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_video_note_detail"
|
|
56
63
|
APP_V2_IMAGE_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_image_note_detail"
|
|
57
64
|
APP_V2_MIXED_ENDPOINT = "/api/u1/v1/xiaohongshu/app_v2/get_mixed_note_detail"
|
|
65
|
+
APP_V1_V2_ENDPOINT = "/api/u1/v1/xiaohongshu/app/get_note_info_v2"
|
|
58
66
|
APP_V1_ENDPOINT = "/api/u1/v1/xiaohongshu/app/get_note_info"
|
|
59
67
|
WEB_V2_V2_ENDPOINT = "/api/u1/v1/xiaohongshu/web_v2/fetch_feed_notes_v2"
|
|
60
68
|
WEB_V2_V3_ENDPOINT = "/api/u1/v1/xiaohongshu/web_v2/fetch_feed_notes_v3"
|
|
61
|
-
|
|
69
|
+
WEB_V1_V7_ENDPOINT = "/api/u1/v1/xiaohongshu/web/get_note_info_v7"
|
|
70
|
+
WEB_V1_V5_ENDPOINT = "/api/u1/v1/xiaohongshu/web/get_note_info_v5"
|
|
71
|
+
WEB_V1_V4_ENDPOINT = "/api/u1/v1/xiaohongshu/web/get_note_info_v4"
|
|
72
|
+
WEB_V1_V2_ENDPOINT = "/api/u1/v1/xiaohongshu/web/get_note_info_v2"
|
|
62
73
|
U2_REQUEST_TIMEOUT_CAP_MS = 15000
|
|
63
74
|
U2_GATE_MIN_DURATION_MS = 13000
|
|
64
75
|
U2_GATE_MAX_DURATION_MS = 1800000
|
|
@@ -320,6 +331,218 @@ def _route_success_for_note(response: Dict[str, Any], source_input: Dict[str, Op
|
|
|
320
331
|
return bool(completeness.get("core_ready"))
|
|
321
332
|
|
|
322
333
|
|
|
334
|
+
def _response_failure_reason(response: Dict[str, Any]) -> str:
|
|
335
|
+
if response.get("timeout_retry_exhausted"):
|
|
336
|
+
return "primary_timeout_retry_exhausted"
|
|
337
|
+
if response.get("error_reason"):
|
|
338
|
+
return "primary_non_timeout_failure"
|
|
339
|
+
return "primary_unknown_failure"
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _route_accept_decision(response: Dict[str, Any], source_input: Dict[str, Optional[str]]) -> Dict[str, Any]:
|
|
343
|
+
if not response.get("ok"):
|
|
344
|
+
return {
|
|
345
|
+
"accepted": False,
|
|
346
|
+
"accept_reason": "response_not_ok",
|
|
347
|
+
"fallback_reason": _response_failure_reason(response),
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
completeness = response.get("_field_completeness")
|
|
351
|
+
if not isinstance(completeness, dict):
|
|
352
|
+
completeness = _route_field_completeness(response.get("data"), source_input)
|
|
353
|
+
response["_field_completeness"] = completeness
|
|
354
|
+
|
|
355
|
+
missing_core = list(completeness.get("missing_core") or [])
|
|
356
|
+
if missing_core:
|
|
357
|
+
return {
|
|
358
|
+
"accepted": False,
|
|
359
|
+
"accept_reason": "note_missing_core_fields",
|
|
360
|
+
"fallback_reason": f"note_missing_core:{','.join(missing_core)}",
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
fields = completeness.get("fields") if isinstance(completeness.get("fields"), dict) else {}
|
|
364
|
+
optional_missing = [field_name for field_name in ("author", "subtitle", "metrics") if not fields.get(field_name)]
|
|
365
|
+
accept_reason = "note_core_fields_ready"
|
|
366
|
+
if optional_missing:
|
|
367
|
+
accept_reason = f"note_core_fields_ready_optional_missing:{','.join(optional_missing)}"
|
|
368
|
+
return {
|
|
369
|
+
"accepted": True,
|
|
370
|
+
"accept_reason": accept_reason,
|
|
371
|
+
"fallback_reason": "",
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _extract_xsec_token_from_input(share_text: Optional[str]) -> str:
    """Extract the ``xsec_token`` query parameter from share text.

    The whole normalized text is tried as a URL first, followed by every
    ``http://`` / ``https://`` URL embedded in it, in order of appearance.
    The first candidate carrying a non-empty ``xsec_token`` wins.

    Args:
        share_text: Raw share text or URL; may be ``None`` or empty.

    Returns:
        The token, or ``""`` when no candidate carries one.
    """
    text = normalize_text(share_text)
    if not text:
        return ""

    # The previous pattern r"https?://\\S+" matched a literal backslash
    # followed by "S", so embedded URLs were never picked up. In a raw string
    # the non-whitespace class is spelled with a single backslash.
    candidates = [text, *re.findall(r"https?://\S+", text)]
    for candidate in candidates:
        try:
            query = urllib.parse.parse_qs(urllib.parse.urlparse(candidate).query)
        except ValueError:
            # Malformed URL: skip this candidate and try the next one.
            continue
        token = normalize_text((query.get("xsec_token") or [""])[0])
        if token:
            # NOTE(review): parse_qs already percent-decodes values; the extra
            # unquote is kept from the original, presumably for double-encoded
            # share links — confirm.
            return urllib.parse.unquote(token)
    return ""
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _build_unavailable_attempt(
    *,
    route_label: str,
    endpoint: str,
    method: str,
    reason: str,
) -> Dict[str, Any]:
    """Build the attempt record for a route skipped for unavailable parameters.

    Args:
        route_label: Label of the skipped route.
        endpoint: Endpoint path of the skipped route.
        method: HTTP method; recorded upper-cased under ``extra["method"]``.
        reason: Why the parameters were unavailable; passed as both the
            fallback reason and the parameter reason.

    Returns:
        Whatever ``build_attempted_route`` returns for a skipped,
        non-accepted attempt.
    """
    attempt_fields: Dict[str, Any] = {
        "route_label": route_label,
        "endpoint": endpoint,
        "accepted": False,
        "accept_reason": "skipped_param_unavailable",
        "fallback_reason": reason,
        "param_readiness": "unavailable",
        "param_reason": reason,
        "skipped": True,
        "extra": {"method": method.upper()},
    }
    return build_attempted_route(**attempt_fields)
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def _build_note_fetch_routes(source_input: Dict[str, Optional[str]]) -> List[Dict[str, Any]]:
    """Build the ordered list of candidate routes for fetching a note.

    Each route is a dict with ``route_label``, ``endpoint``, ``method``,
    ``params``, ``body``, ``param_readiness`` (``"ready"`` or
    ``"unavailable"``) and ``param_reason`` (empty when ready).

    Args:
        source_input: Source descriptor; ``share_text`` and ``note_id`` are
            read. When ``note_id`` is missing it is derived from the share
            text.

    Returns:
        Route descriptors: the app routes first, then the web routes.
    """
    share_text = source_input.get("share_text")
    note_id = source_input.get("note_id") or _extract_note_id_from_share(share_text)

    # App routes and generic web routes take the same share_text/note_id pair;
    # every route receives its own copy so they can never alias each other.
    id_params: Dict[str, Any] = {}
    if share_text:
        id_params["share_text"] = share_text
    if note_id:
        id_params["note_id"] = note_id
    has_id_params = bool(id_params)

    short_url_ready = bool(_is_short_share_url(share_text) and share_text)
    xsec_token = _extract_xsec_token_from_input(share_text)
    # The web v5 route additionally needs a cookie supplied via the environment.
    web_cookie = os.getenv("TIKOMNI_XHS_WEB_COOKIE", "").strip()
    web_v5_ready = bool(note_id and xsec_token and web_cookie)

    def _route(
        label: str,
        endpoint: str,
        ready: Any,
        unavailable_reason: str,
        params: Dict[str, Any],
        method: str = "GET",
        body: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        return {
            "route_label": label,
            "endpoint": endpoint,
            "method": method,
            "params": params,
            "body": body,
            "param_readiness": "ready" if ready else "unavailable",
            "param_reason": "" if ready else unavailable_reason,
        }

    def _id_route(label: str, endpoint: str) -> Dict[str, Any]:
        return _route(label, endpoint, has_id_params, "missing_note_id_or_share_text", dict(id_params))

    # Name the first missing prerequisite of the v5 route, in check order.
    if not note_id:
        v5_reason = "missing_note_id"
    elif not xsec_token:
        v5_reason = "missing_xsec_token"
    else:
        v5_reason = "fallback_requires_cookie"

    if web_v5_ready:
        v5_body: Optional[Dict[str, Any]] = {
            "note_id": note_id,
            "xsec_token": xsec_token,
            "cookie": web_cookie,
        }
    else:
        v5_body = None

    routes: List[Dict[str, Any]] = [
        _id_route(label, endpoint)
        for label, endpoint in (
            ("app_v2_video", APP_V2_VIDEO_ENDPOINT),
            ("app_v2_image", APP_V2_IMAGE_ENDPOINT),
            ("app_v2_mixed", APP_V2_MIXED_ENDPOINT),
            ("app_v1_v2", APP_V1_V2_ENDPOINT),
            ("app_v1", APP_V1_ENDPOINT),
        )
    ]
    routes.append(
        _route(
            "web_v2_v3",
            WEB_V2_V3_ENDPOINT,
            short_url_ready,
            "missing_short_share_url",
            {"short_url": share_text} if short_url_ready else {},
        )
    )
    routes.append(
        _route(
            "web_v2_v2",
            WEB_V2_V2_ENDPOINT,
            note_id,
            "missing_note_id",
            {"note_id": note_id} if note_id else {},
        )
    )
    routes.append(_id_route("web_v1_v7", WEB_V1_V7_ENDPOINT))
    routes.append(
        _route(
            "web_v1_v5",
            WEB_V1_V5_ENDPOINT,
            web_v5_ready,
            v5_reason,
            {},
            method="POST",
            body=v5_body,
        )
    )
    routes.append(_id_route("web_v1_v4", WEB_V1_V4_ENDPOINT))
    routes.append(_id_route("web_v1_v2", WEB_V1_V2_ENDPOINT))
    return routes
|
|
544
|
+
|
|
545
|
+
|
|
323
546
|
def _pick_text_from_paths(payload: Any, paths: List[List[str]]) -> str:
|
|
324
547
|
for path in paths:
|
|
325
548
|
raw = deep_find_first(payload, path)
|
|
@@ -817,18 +1040,35 @@ def _fetch_note_info(
|
|
|
817
1040
|
progress: Optional[ProgressReporter] = None,
|
|
818
1041
|
) -> Dict[str, Any]:
|
|
819
1042
|
attempts: List[Dict[str, Any]] = []
|
|
1043
|
+
routes = _build_note_fetch_routes(source_input)
|
|
1044
|
+
route_plan = [
|
|
1045
|
+
build_route_plan_entry(
|
|
1046
|
+
route_label=str(route["route_label"]),
|
|
1047
|
+
endpoint=str(route["endpoint"]),
|
|
1048
|
+
method=str(route["method"]),
|
|
1049
|
+
param_readiness=str(route.get("param_readiness") or "ready"),
|
|
1050
|
+
param_reason=str(route.get("param_reason") or ""),
|
|
1051
|
+
)
|
|
1052
|
+
for route in routes
|
|
1053
|
+
]
|
|
820
1054
|
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
1055
|
+
def _call(
|
|
1056
|
+
*,
|
|
1057
|
+
path: str,
|
|
1058
|
+
method: str,
|
|
1059
|
+
params: Optional[Dict[str, Any]],
|
|
1060
|
+
body: Optional[Dict[str, Any]],
|
|
1061
|
+
label: str,
|
|
1062
|
+
fallback_reason: Optional[str] = None,
|
|
1063
|
+
) -> Dict[str, Any]:
|
|
825
1064
|
response = call_json_api(
|
|
826
1065
|
base_url=base_url,
|
|
827
1066
|
path=path,
|
|
828
1067
|
token=token,
|
|
829
|
-
method=
|
|
1068
|
+
method=method,
|
|
830
1069
|
timeout_ms=timeout_ms,
|
|
831
1070
|
params=params,
|
|
1071
|
+
body=body,
|
|
832
1072
|
)
|
|
833
1073
|
response["_endpoint"] = path
|
|
834
1074
|
response["_route_label"] = label
|
|
@@ -843,78 +1083,64 @@ def _fetch_note_info(
|
|
|
843
1083
|
"core_ready": False,
|
|
844
1084
|
}
|
|
845
1085
|
_emit_http_progress(progress, stage="note.fetch", response=response, route_label=label, source_input=source_input)
|
|
846
|
-
attempts.append({"label": label, "endpoint": path, "response": response})
|
|
847
1086
|
return response
|
|
848
1087
|
|
|
849
|
-
app_params: Dict[str, Any] = {}
|
|
850
|
-
if share_text:
|
|
851
|
-
app_params["share_text"] = share_text
|
|
852
|
-
if note_id:
|
|
853
|
-
app_params["note_id"] = note_id
|
|
854
|
-
|
|
855
|
-
app_v2_attempts = [
|
|
856
|
-
(APP_V2_VIDEO_ENDPOINT, "app_v2_video"),
|
|
857
|
-
(APP_V2_IMAGE_ENDPOINT, "app_v2_image"),
|
|
858
|
-
(APP_V2_MIXED_ENDPOINT, "app_v2_mixed"),
|
|
859
|
-
]
|
|
860
1088
|
next_reason: Optional[str] = None
|
|
1089
|
+
final_response: Dict[str, Any] = {
|
|
1090
|
+
"ok": False,
|
|
1091
|
+
"error_reason": "single_fetch_all_routes_failed",
|
|
1092
|
+
"_endpoint": None,
|
|
1093
|
+
"_route_label": "",
|
|
1094
|
+
}
|
|
861
1095
|
|
|
862
|
-
for
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
1096
|
+
for route in routes:
|
|
1097
|
+
if route.get("param_readiness") != "ready":
|
|
1098
|
+
attempts.append(
|
|
1099
|
+
_build_unavailable_attempt(
|
|
1100
|
+
route_label=str(route["route_label"]),
|
|
1101
|
+
endpoint=str(route["endpoint"]),
|
|
1102
|
+
method=str(route["method"]),
|
|
1103
|
+
reason=str(route.get("param_reason") or "fallback_param_unavailable"),
|
|
1104
|
+
)
|
|
1105
|
+
)
|
|
1106
|
+
continue
|
|
872
1107
|
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
app_fallback_reason = (
|
|
881
|
-
"field_completeness_below_threshold"
|
|
882
|
-
if app_response.get("ok")
|
|
883
|
-
else ("primary_timeout_retry_exhausted" if app_response.get("timeout_retry_exhausted") else "primary_non_timeout_failure")
|
|
884
|
-
)
|
|
885
|
-
is_short = _is_short_share_url(share_text)
|
|
886
|
-
|
|
887
|
-
if is_short and share_text:
|
|
888
|
-
v3_response = _call(
|
|
889
|
-
WEB_V2_V3_ENDPOINT,
|
|
890
|
-
{"short_url": share_text},
|
|
891
|
-
"web_v2_v3_short",
|
|
892
|
-
fallback_reason=app_fallback_reason,
|
|
1108
|
+
response = _call(
|
|
1109
|
+
path=str(route["endpoint"]),
|
|
1110
|
+
method=str(route["method"]),
|
|
1111
|
+
params=dict(route.get("params") or {}),
|
|
1112
|
+
body=dict(route.get("body") or {}) if isinstance(route.get("body"), dict) else None,
|
|
1113
|
+
label=str(route["route_label"]),
|
|
1114
|
+
fallback_reason=next_reason,
|
|
893
1115
|
)
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
1116
|
+
decision = _route_accept_decision(response, source_input)
|
|
1117
|
+
attempts.append(
|
|
1118
|
+
build_attempted_route(
|
|
1119
|
+
route_label=str(route["route_label"]),
|
|
1120
|
+
endpoint=str(route["endpoint"]),
|
|
1121
|
+
response=response,
|
|
1122
|
+
accepted=bool(decision.get("accepted")),
|
|
1123
|
+
accept_reason=str(decision.get("accept_reason") or ""),
|
|
1124
|
+
fallback_reason=str(decision.get("fallback_reason") or ""),
|
|
1125
|
+
extra={
|
|
1126
|
+
"method": str(route["method"]).upper(),
|
|
1127
|
+
"field_completeness": response.get("_field_completeness"),
|
|
1128
|
+
"response": response,
|
|
1129
|
+
},
|
|
1130
|
+
)
|
|
904
1131
|
)
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
web_params["note_id"] = note_id
|
|
1132
|
+
final_response = response
|
|
1133
|
+
if decision.get("accepted"):
|
|
1134
|
+
response["_attempts"] = attempts
|
|
1135
|
+
response["_route_plan"] = route_plan
|
|
1136
|
+
response["_accept_reason"] = decision.get("accept_reason")
|
|
1137
|
+
return response
|
|
1138
|
+
next_reason = str(decision.get("fallback_reason") or "field_completeness_below_threshold")
|
|
1139
|
+
response["fallback_trigger_reason"] = next_reason
|
|
914
1140
|
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
return
|
|
1141
|
+
final_response["_attempts"] = attempts
|
|
1142
|
+
final_response["_route_plan"] = route_plan
|
|
1143
|
+
return final_response
|
|
918
1144
|
|
|
919
1145
|
|
|
920
1146
|
def _extract_subtitle_urls(payload: Any) -> List[str]:
|
|
@@ -1344,6 +1570,7 @@ def _build_result(
|
|
|
1344
1570
|
metadata_fields: Optional[Dict[str, Any]] = None,
|
|
1345
1571
|
asr_source: Optional[str] = None,
|
|
1346
1572
|
timings: Optional[Dict[str, int]] = None,
|
|
1573
|
+
stage_status: Optional[Dict[str, Any]] = None,
|
|
1347
1574
|
) -> Dict[str, Any]:
|
|
1348
1575
|
metadata = metadata_fields or {}
|
|
1349
1576
|
summary_block = summarize_content(raw_content, source=f"xiaohongshu:{text_source}")
|
|
@@ -1371,7 +1598,7 @@ def _build_result(
|
|
|
1371
1598
|
analysis_eligibility = "eligible" if primary_text else "incomplete"
|
|
1372
1599
|
analysis_exclusion_reason = "" if analysis_eligibility == "eligible" else ("video_asr_unavailable" if work_modality == "video" else "caption_raw_missing")
|
|
1373
1600
|
|
|
1374
|
-
|
|
1601
|
+
payload = {
|
|
1375
1602
|
"platform": "xiaohongshu",
|
|
1376
1603
|
"content_kind": "note",
|
|
1377
1604
|
"source": source_input,
|
|
@@ -1427,6 +1654,9 @@ def _build_result(
|
|
|
1427
1654
|
"request_id": request_id,
|
|
1428
1655
|
"timings": dict(timings or {}),
|
|
1429
1656
|
}
|
|
1657
|
+
if isinstance(stage_status, dict):
|
|
1658
|
+
payload["stage_status"] = dict(stage_status)
|
|
1659
|
+
return payload
|
|
1430
1660
|
|
|
1431
1661
|
|
|
1432
1662
|
def run_xiaohongshu_extract(
|
|
@@ -1601,11 +1831,35 @@ def run_xiaohongshu_extract(
|
|
|
1601
1831
|
timings["u1_total_ms"] = _elapsed_ms(u1_started_at)
|
|
1602
1832
|
|
|
1603
1833
|
attempts = note_response.get("_attempts") or []
|
|
1834
|
+
stage_status = build_stage_status(
|
|
1835
|
+
stage="fetch",
|
|
1836
|
+
status="succeeded" if note_response.get("ok") else "failed",
|
|
1837
|
+
route_plan=list(note_response.get("_route_plan") or []),
|
|
1838
|
+
attempted_routes=list(attempts),
|
|
1839
|
+
chosen_route=str(note_response.get("_route_label") or ""),
|
|
1840
|
+
accept_reason=str(note_response.get("_accept_reason") or ""),
|
|
1841
|
+
fallback_reason=str(note_response.get("fallback_trigger_reason") or ""),
|
|
1842
|
+
error_reason=None if note_response.get("ok") else "single_fetch_all_routes_failed",
|
|
1843
|
+
all_routes_failed=not bool(note_response.get("ok")),
|
|
1844
|
+
)
|
|
1604
1845
|
for index, attempt in enumerate(attempts, start=1):
|
|
1605
1846
|
response = attempt.get("response") if isinstance(attempt, dict) else None
|
|
1606
1847
|
endpoint = attempt.get("endpoint") if isinstance(attempt, dict) else None
|
|
1607
|
-
label = attempt.get("
|
|
1848
|
+
label = attempt.get("route_label") if isinstance(attempt, dict) else None
|
|
1608
1849
|
if not isinstance(response, dict):
|
|
1850
|
+
if attempt.get("skipped"):
|
|
1851
|
+
trace.append(
|
|
1852
|
+
{
|
|
1853
|
+
"step": f"u1_get_note_info_attempt_{index}",
|
|
1854
|
+
"route_label": label,
|
|
1855
|
+
"endpoint": endpoint,
|
|
1856
|
+
"accept_reason": attempt.get("accept_reason"),
|
|
1857
|
+
"fallback_reason": attempt.get("fallback_reason"),
|
|
1858
|
+
"param_readiness": attempt.get("param_readiness"),
|
|
1859
|
+
"param_reason": attempt.get("param_reason"),
|
|
1860
|
+
"skipped": True,
|
|
1861
|
+
}
|
|
1862
|
+
)
|
|
1609
1863
|
continue
|
|
1610
1864
|
step = "u1_get_note_info_effective" if index == len(attempts) else f"u1_get_note_info_attempt_{index}"
|
|
1611
1865
|
trace.append(
|
|
@@ -1625,10 +1879,7 @@ def run_xiaohongshu_extract(
|
|
|
1625
1879
|
trace.append(
|
|
1626
1880
|
{
|
|
1627
1881
|
"step": "u1_get_note_info_route_decision",
|
|
1628
|
-
|
|
1629
|
-
"request_id": note_response.get("request_id"),
|
|
1630
|
-
"field_completeness": note_response.get("_field_completeness"),
|
|
1631
|
-
"attempt_count": len(attempts),
|
|
1882
|
+
**stage_status,
|
|
1632
1883
|
}
|
|
1633
1884
|
)
|
|
1634
1885
|
|
|
@@ -1636,7 +1887,7 @@ def run_xiaohongshu_extract(
|
|
|
1636
1887
|
error_ctx = resolve_trace_error_context(
|
|
1637
1888
|
responses=[note_response],
|
|
1638
1889
|
extract_trace=trace,
|
|
1639
|
-
default_error_reason="
|
|
1890
|
+
default_error_reason="single_fetch_all_routes_failed",
|
|
1640
1891
|
)
|
|
1641
1892
|
result = _build_result(
|
|
1642
1893
|
source_input=source_input,
|
|
@@ -1660,6 +1911,7 @@ def run_xiaohongshu_extract(
|
|
|
1660
1911
|
missing_fields=[{"field": "u1_note_info", "reason": "all_routes_failed"}],
|
|
1661
1912
|
metadata_fields=metadata_fields,
|
|
1662
1913
|
timings=timings,
|
|
1914
|
+
stage_status={"fetch": stage_status},
|
|
1663
1915
|
)
|
|
1664
1916
|
if write_card:
|
|
1665
1917
|
card_started_at = time.perf_counter()
|
|
@@ -1835,6 +2087,7 @@ def run_xiaohongshu_extract(
|
|
|
1835
2087
|
missing_fields=missing_fields,
|
|
1836
2088
|
metadata_fields=metadata_fields,
|
|
1837
2089
|
timings=timings,
|
|
2090
|
+
stage_status={"fetch": stage_status},
|
|
1838
2091
|
)
|
|
1839
2092
|
if write_card:
|
|
1840
2093
|
card_started_at = time.perf_counter()
|
|
@@ -1900,6 +2153,7 @@ def run_xiaohongshu_extract(
|
|
|
1900
2153
|
missing_fields=missing_fields,
|
|
1901
2154
|
metadata_fields=metadata_fields,
|
|
1902
2155
|
timings=timings,
|
|
2156
|
+
stage_status={"fetch": stage_status},
|
|
1903
2157
|
)
|
|
1904
2158
|
if write_card:
|
|
1905
2159
|
card_started_at = time.perf_counter()
|
|
@@ -2046,6 +2300,7 @@ def run_xiaohongshu_extract(
|
|
|
2046
2300
|
missing_fields=missing_fields,
|
|
2047
2301
|
metadata_fields=metadata_fields,
|
|
2048
2302
|
timings=timings,
|
|
2303
|
+
stage_status={"fetch": stage_status},
|
|
2049
2304
|
)
|
|
2050
2305
|
if write_card:
|
|
2051
2306
|
card_started_at = time.perf_counter()
|
|
@@ -2110,6 +2365,7 @@ def run_xiaohongshu_extract(
|
|
|
2110
2365
|
missing_fields=missing_fields,
|
|
2111
2366
|
metadata_fields=metadata_fields,
|
|
2112
2367
|
timings=timings,
|
|
2368
|
+
stage_status={"fetch": stage_status},
|
|
2113
2369
|
)
|
|
2114
2370
|
|
|
2115
2371
|
if write_card:
|
|
@@ -2184,6 +2440,7 @@ def run_xiaohongshu_extract(
|
|
|
2184
2440
|
missing_fields=missing_fields,
|
|
2185
2441
|
metadata_fields=metadata_fields,
|
|
2186
2442
|
timings=timings,
|
|
2443
|
+
stage_status={"fetch": stage_status},
|
|
2187
2444
|
)
|
|
2188
2445
|
|
|
2189
2446
|
if write_card:
|
|
@@ -295,7 +295,7 @@ def build_work_output_envelope(payload: Dict[str, Any], platform: Optional[str]
|
|
|
295
295
|
card = build_work_fact_card(payload, platform=platform)
|
|
296
296
|
source = _source_dict(payload)
|
|
297
297
|
input_value = source.get("share_url") or source.get("share_text") or source.get("source_url") or source
|
|
298
|
-
|
|
298
|
+
envelope = {
|
|
299
299
|
"object_type": "work",
|
|
300
300
|
"platform": card["platform"],
|
|
301
301
|
"input": input_value,
|
|
@@ -306,6 +306,13 @@ def build_work_output_envelope(payload: Dict[str, Any], platform: Optional[str]
|
|
|
306
306
|
"extract_trace": card.get("extract_trace", []),
|
|
307
307
|
"request_id": card["request_id"],
|
|
308
308
|
}
|
|
309
|
+
stage_status = payload.get("stage_status")
|
|
310
|
+
if isinstance(stage_status, dict):
|
|
311
|
+
envelope["stage_status"] = stage_status
|
|
312
|
+
pipeline_status = payload.get("pipeline_status")
|
|
313
|
+
if isinstance(pipeline_status, dict):
|
|
314
|
+
envelope["pipeline_status"] = pipeline_status
|
|
315
|
+
return envelope
|
|
309
316
|
|
|
310
317
|
|
|
311
318
|
def _yaml_scalar(value: Any) -> str:
|