@tikomni/skills 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/references/contracts/work-fact-card-fields.md +1 -0
- package/skills/social-media-crawl/scripts/core/asr_pipeline.py +273 -39
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +184 -28
- package/skills/social-media-crawl/scripts/pipelines/homepage_runtime_state.py +173 -0
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +93 -58
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +93 -58
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +0 -8
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
@@ -42,6 +42,7 @@
|
|
|
42
42
|
- Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
|
|
43
43
|
- The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
|
|
44
44
|
- `primary_text` is the text that is best suited for reading and indexing in the current task.
|
|
45
|
+
- `asr_raw` and `subtitle_raw` are internal preserved text fields. Keep them in the normalized card data, but do not render them as standalone sections in the Markdown body.
|
|
45
46
|
- `play_count` may be `null`. Leave it empty when missing, and keep `0` only when the platform explicitly returns `0`.
|
|
46
47
|
- Preferred order for video works:
|
|
47
48
|
- `subtitle_raw`
|
|
@@ -8,7 +8,7 @@ import time
|
|
|
8
8
|
import urllib.error
|
|
9
9
|
import urllib.request
|
|
10
10
|
from urllib.parse import urlparse, urlunparse
|
|
11
|
-
from typing import Any, Callable, Dict, List, Optional
|
|
11
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
12
12
|
|
|
13
13
|
from scripts.core.tikomni_common import (
|
|
14
14
|
call_json_api,
|
|
@@ -23,6 +23,21 @@ from scripts.core.u3_fallback import run_u3_public_url_fallback
|
|
|
23
23
|
|
|
24
24
|
U2_BATCH_SUBMIT_HARD_LIMIT = 100
|
|
25
25
|
DEFAULT_U2_PENDING_TIMEOUT_SEC = 60
|
|
26
|
+
SUMMARY_TEXT_FIELDS = (
|
|
27
|
+
"full_text",
|
|
28
|
+
"transcript_text",
|
|
29
|
+
"transcription_text",
|
|
30
|
+
"result_text",
|
|
31
|
+
"summary_text",
|
|
32
|
+
"transcript",
|
|
33
|
+
"transcription",
|
|
34
|
+
"result",
|
|
35
|
+
"content",
|
|
36
|
+
"text",
|
|
37
|
+
)
|
|
38
|
+
SEGMENT_CONTAINER_FIELDS = ("sentences", "segments", "paragraphs")
|
|
39
|
+
SEGMENT_TEXT_FIELDS = ("text", "sentence", "content", "paragraph", "transcript_text")
|
|
40
|
+
CHAR_SPACED_RUN_RE = re.compile(r"(?:[A-Za-z0-9\u4e00-\u9fff]{1,4}\s+){5,}[A-Za-z0-9\u4e00-\u9fff]{1,4}")
|
|
26
41
|
|
|
27
42
|
|
|
28
43
|
def clamp_u2_batch_submit_size(size: int, *, default: int = 50, hard_limit: int = U2_BATCH_SUBMIT_HARD_LIMIT) -> int:
|
|
@@ -251,6 +266,33 @@ def clean_transcript_text(raw_text: Any) -> str:
|
|
|
251
266
|
return normalize_text(raw_text)
|
|
252
267
|
|
|
253
268
|
|
|
269
|
+
def _text_signature(text: str) -> str:
|
|
270
|
+
return re.sub(r"[\W_]+", "", clean_transcript_text(text)).lower()
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _is_char_spaced_noise_sequence(text: str) -> bool:
|
|
274
|
+
tokens = [token for token in clean_transcript_text(text).split(" ") if token]
|
|
275
|
+
if len(tokens) < 6:
|
|
276
|
+
return False
|
|
277
|
+
single_char_tokens = sum(1 for token in tokens if len(token) == 1)
|
|
278
|
+
short_tokens = sum(1 for token in tokens if len(token) <= 2)
|
|
279
|
+
cjk_tokens = sum(1 for token in tokens if any("\u4e00" <= char <= "\u9fff" for char in token))
|
|
280
|
+
return (
|
|
281
|
+
single_char_tokens >= 4
|
|
282
|
+
and short_tokens / max(len(tokens), 1) >= 0.75
|
|
283
|
+
and cjk_tokens / max(len(tokens), 1) >= 0.5
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _strip_char_spaced_noise_runs(text: str) -> str:
|
|
288
|
+
def _replace(match: re.Match[str]) -> str:
|
|
289
|
+
chunk = match.group(0)
|
|
290
|
+
return " " if _is_char_spaced_noise_sequence(chunk) else chunk
|
|
291
|
+
|
|
292
|
+
cleaned = CHAR_SPACED_RUN_RE.sub(_replace, text)
|
|
293
|
+
return re.sub(r"\s+", " ", cleaned).strip()
|
|
294
|
+
|
|
295
|
+
|
|
254
296
|
def _ensure_sentence_end(text: str) -> str:
|
|
255
297
|
if not text:
|
|
256
298
|
return text
|
|
@@ -264,13 +306,36 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
|
|
|
264
306
|
if not base:
|
|
265
307
|
return ""
|
|
266
308
|
|
|
267
|
-
denoised =
|
|
309
|
+
denoised = _strip_char_spaced_noise_runs(base)
|
|
310
|
+
denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", denoised)
|
|
268
311
|
denoised = re.sub(r"(嗯+|啊+|呃+)", " ", denoised)
|
|
269
312
|
denoised = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", denoised)
|
|
270
313
|
denoised = re.sub(r"\s+", " ", denoised).strip()
|
|
271
314
|
|
|
272
315
|
units = [clean_transcript_text(part) for part in re.split(r"[。!?!?;;\n]+", denoised)]
|
|
273
|
-
sentences = [
|
|
316
|
+
sentences: List[str] = []
|
|
317
|
+
signatures: List[str] = []
|
|
318
|
+
for unit in units:
|
|
319
|
+
if not unit or _is_char_spaced_noise_sequence(unit):
|
|
320
|
+
continue
|
|
321
|
+
sentence = _ensure_sentence_end(unit)
|
|
322
|
+
signature = _text_signature(sentence)
|
|
323
|
+
if not signature:
|
|
324
|
+
continue
|
|
325
|
+
duplicate = False
|
|
326
|
+
for existing in signatures:
|
|
327
|
+
if signature == existing:
|
|
328
|
+
duplicate = True
|
|
329
|
+
break
|
|
330
|
+
smaller = signature if len(signature) <= len(existing) else existing
|
|
331
|
+
larger = existing if len(signature) <= len(existing) else signature
|
|
332
|
+
if len(smaller) >= 12 and smaller in larger:
|
|
333
|
+
duplicate = True
|
|
334
|
+
break
|
|
335
|
+
if duplicate:
|
|
336
|
+
continue
|
|
337
|
+
signatures.append(signature)
|
|
338
|
+
sentences.append(sentence)
|
|
274
339
|
if not sentences:
|
|
275
340
|
fallback = _ensure_sentence_end(denoised)
|
|
276
341
|
return fallback if fallback else ""
|
|
@@ -292,6 +357,94 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
|
|
|
292
357
|
return "\n\n".join(paragraphs)
|
|
293
358
|
|
|
294
359
|
|
|
360
|
+
def _extract_summary_text_from_node(node: Dict[str, Any]) -> Tuple[str, str]:
|
|
361
|
+
for key in SUMMARY_TEXT_FIELDS:
|
|
362
|
+
value = node.get(key)
|
|
363
|
+
if isinstance(value, str):
|
|
364
|
+
cleaned = clean_transcript_text(value)
|
|
365
|
+
if cleaned:
|
|
366
|
+
return cleaned, key
|
|
367
|
+
return "", ""
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _append_segment_lines(node: Any, lines: List[str]) -> None:
|
|
371
|
+
if isinstance(node, str):
|
|
372
|
+
cleaned = clean_transcript_text(node)
|
|
373
|
+
if cleaned:
|
|
374
|
+
lines.append(cleaned)
|
|
375
|
+
return
|
|
376
|
+
if isinstance(node, dict):
|
|
377
|
+
for key in SEGMENT_TEXT_FIELDS:
|
|
378
|
+
value = node.get(key)
|
|
379
|
+
if isinstance(value, str):
|
|
380
|
+
cleaned = clean_transcript_text(value)
|
|
381
|
+
if cleaned:
|
|
382
|
+
lines.append(cleaned)
|
|
383
|
+
break
|
|
384
|
+
return
|
|
385
|
+
if isinstance(node, list):
|
|
386
|
+
for item in node:
|
|
387
|
+
_append_segment_lines(item, lines)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _extract_segment_text_from_node(node: Dict[str, Any]) -> str:
|
|
391
|
+
lines: List[str] = []
|
|
392
|
+
for key in SEGMENT_CONTAINER_FIELDS:
|
|
393
|
+
if key not in node:
|
|
394
|
+
continue
|
|
395
|
+
_append_segment_lines(node.get(key), lines)
|
|
396
|
+
if lines:
|
|
397
|
+
break
|
|
398
|
+
if not lines:
|
|
399
|
+
return ""
|
|
400
|
+
|
|
401
|
+
deduped: List[str] = []
|
|
402
|
+
seen = set()
|
|
403
|
+
for line in lines:
|
|
404
|
+
signature = _text_signature(line)
|
|
405
|
+
if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
|
|
406
|
+
continue
|
|
407
|
+
seen.add(signature)
|
|
408
|
+
deduped.append(line)
|
|
409
|
+
return "\n".join(deduped).strip()
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _extract_canonical_transcript_from_node(node: Dict[str, Any]) -> Dict[str, Any]:
|
|
413
|
+
summary_text, summary_field = _extract_summary_text_from_node(node)
|
|
414
|
+
if summary_text:
|
|
415
|
+
return {
|
|
416
|
+
"transcript_text": summary_text,
|
|
417
|
+
"summary_field_used": summary_field,
|
|
418
|
+
"segment_fallback_used": False,
|
|
419
|
+
"canonical_text_source": f"summary:{summary_field}",
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
segment_text = _extract_segment_text_from_node(node)
|
|
423
|
+
if segment_text:
|
|
424
|
+
return {
|
|
425
|
+
"transcript_text": segment_text,
|
|
426
|
+
"summary_field_used": "",
|
|
427
|
+
"segment_fallback_used": True,
|
|
428
|
+
"canonical_text_source": "segments",
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
fallback_text = clean_transcript_text(extract_transcript_text(node))
|
|
432
|
+
if fallback_text:
|
|
433
|
+
return {
|
|
434
|
+
"transcript_text": fallback_text,
|
|
435
|
+
"summary_field_used": "",
|
|
436
|
+
"segment_fallback_used": True,
|
|
437
|
+
"canonical_text_source": "deep_search_fallback",
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
return {
|
|
441
|
+
"transcript_text": "",
|
|
442
|
+
"summary_field_used": "",
|
|
443
|
+
"segment_fallback_used": False,
|
|
444
|
+
"canonical_text_source": "missing",
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
|
|
295
448
|
def extract_u2_task_metrics(payload: Any) -> Dict[str, Any]:
|
|
296
449
|
metrics = deep_find_first(payload, ["task_metrics", "metrics"])
|
|
297
450
|
return metrics if isinstance(metrics, dict) else {}
|
|
@@ -349,16 +502,8 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
|
|
|
349
502
|
)
|
|
350
503
|
file_url = normalize_media_url(str(raw_file_url or ""))
|
|
351
504
|
if file_url:
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
or node.get("text")
|
|
355
|
-
or node.get("transcript")
|
|
356
|
-
or node.get("transcription")
|
|
357
|
-
or node.get("content")
|
|
358
|
-
or ""
|
|
359
|
-
)
|
|
360
|
-
if not transcript:
|
|
361
|
-
transcript = clean_transcript_text(extract_transcript_text(node))
|
|
505
|
+
canonical = _extract_canonical_transcript_from_node(node)
|
|
506
|
+
transcript = clean_transcript_text(canonical.get("transcript_text"))
|
|
362
507
|
|
|
363
508
|
status = _status_upper(node.get("status") or node.get("task_status") or node.get("state"))
|
|
364
509
|
error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
|
|
@@ -372,6 +517,9 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
|
|
|
372
517
|
"transcription_url": transcription_url,
|
|
373
518
|
"error_reason": error_reason,
|
|
374
519
|
"ok": ok,
|
|
520
|
+
"summary_field_used": canonical.get("summary_field_used", ""),
|
|
521
|
+
"segment_fallback_used": bool(canonical.get("segment_fallback_used")),
|
|
522
|
+
"canonical_text_source": canonical.get("canonical_text_source", "missing"),
|
|
375
523
|
}
|
|
376
524
|
|
|
377
525
|
existing = found.get(file_url)
|
|
@@ -380,12 +528,16 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
|
|
|
380
528
|
else:
|
|
381
529
|
old_score = (
|
|
382
530
|
1 if existing.get("ok") else 0,
|
|
531
|
+
1 if not existing.get("segment_fallback_used") else 0,
|
|
532
|
+
1 if existing.get("summary_field_used") else 0,
|
|
383
533
|
len(str(existing.get("transcript_text") or "")),
|
|
384
534
|
1 if existing.get("transcription_url") else 0,
|
|
385
535
|
1 if not existing.get("error_reason") else 0,
|
|
386
536
|
)
|
|
387
537
|
new_score = (
|
|
388
538
|
1 if candidate.get("ok") else 0,
|
|
539
|
+
1 if not candidate.get("segment_fallback_used") else 0,
|
|
540
|
+
1 if candidate.get("summary_field_used") else 0,
|
|
389
541
|
len(str(candidate.get("transcript_text") or "")),
|
|
390
542
|
1 if candidate.get("transcription_url") else 0,
|
|
391
543
|
1 if not candidate.get("error_reason") else 0,
|
|
@@ -441,16 +593,8 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
|
|
|
441
593
|
item_index_raw = node.get("item_index")
|
|
442
594
|
item_index = _parse_non_negative_item_index(item_index_raw)
|
|
443
595
|
if item_index is not None:
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
or node.get("text")
|
|
447
|
-
or node.get("transcript")
|
|
448
|
-
or node.get("transcription")
|
|
449
|
-
or node.get("content")
|
|
450
|
-
or ""
|
|
451
|
-
)
|
|
452
|
-
if not transcript:
|
|
453
|
-
transcript = clean_transcript_text(extract_transcript_text(node))
|
|
596
|
+
canonical = _extract_canonical_transcript_from_node(node)
|
|
597
|
+
transcript = clean_transcript_text(canonical.get("transcript_text"))
|
|
454
598
|
|
|
455
599
|
status = _status_upper(node.get("task_status") or node.get("status") or node.get("state"))
|
|
456
600
|
error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
|
|
@@ -464,6 +608,9 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
|
|
|
464
608
|
"error_reason": error_reason,
|
|
465
609
|
"transcription_url": transcription_url,
|
|
466
610
|
"ok": ok,
|
|
611
|
+
"summary_field_used": canonical.get("summary_field_used", ""),
|
|
612
|
+
"segment_fallback_used": bool(canonical.get("segment_fallback_used")),
|
|
613
|
+
"canonical_text_source": canonical.get("canonical_text_source", "missing"),
|
|
467
614
|
}
|
|
468
615
|
|
|
469
616
|
existing = mapped.get(item_index)
|
|
@@ -472,12 +619,16 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
|
|
|
472
619
|
else:
|
|
473
620
|
old_score = (
|
|
474
621
|
1 if existing.get("ok") else 0,
|
|
622
|
+
1 if not existing.get("segment_fallback_used") else 0,
|
|
623
|
+
1 if existing.get("summary_field_used") else 0,
|
|
475
624
|
len(str(existing.get("transcript_text") or "")),
|
|
476
625
|
1 if existing.get("transcription_url") else 0,
|
|
477
626
|
1 if not existing.get("error_reason") else 0,
|
|
478
627
|
)
|
|
479
628
|
new_score = (
|
|
480
629
|
1 if candidate.get("ok") else 0,
|
|
630
|
+
1 if not candidate.get("segment_fallback_used") else 0,
|
|
631
|
+
1 if candidate.get("summary_field_used") else 0,
|
|
481
632
|
len(str(candidate.get("transcript_text") or "")),
|
|
482
633
|
1 if candidate.get("transcription_url") else 0,
|
|
483
634
|
1 if not candidate.get("error_reason") else 0,
|
|
@@ -506,29 +657,102 @@ def _extract_transcript_from_transcription_payload(payload: Any) -> str:
|
|
|
506
657
|
except Exception:
|
|
507
658
|
return ""
|
|
508
659
|
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
660
|
+
for key in SUMMARY_TEXT_FIELDS:
|
|
661
|
+
transcript = clean_transcript_text(deep_find_first(payload, [key]))
|
|
662
|
+
if transcript:
|
|
663
|
+
return transcript
|
|
664
|
+
|
|
665
|
+
for key in SEGMENT_CONTAINER_FIELDS:
|
|
666
|
+
segments = deep_find_first(payload, [key])
|
|
667
|
+
if segments is None:
|
|
668
|
+
continue
|
|
669
|
+
lines: List[str] = []
|
|
670
|
+
_append_segment_lines(segments, lines)
|
|
671
|
+
deduped: List[str] = []
|
|
672
|
+
seen = set()
|
|
673
|
+
for line in lines:
|
|
674
|
+
signature = _text_signature(line)
|
|
675
|
+
if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
|
|
676
|
+
continue
|
|
677
|
+
seen.add(signature)
|
|
678
|
+
deduped.append(line)
|
|
679
|
+
if deduped:
|
|
680
|
+
return "\n".join(deduped)
|
|
512
681
|
|
|
513
682
|
transcript = clean_transcript_text(extract_transcript_text(payload))
|
|
514
683
|
if transcript:
|
|
515
684
|
return transcript
|
|
516
685
|
|
|
517
|
-
|
|
518
|
-
|
|
686
|
+
return ""
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
def _extract_transcript_bundle_from_transcription_payload(payload: Any) -> Dict[str, Any]:
|
|
690
|
+
if isinstance(payload, str):
|
|
691
|
+
text = clean_transcript_text(payload)
|
|
692
|
+
if text:
|
|
693
|
+
return {
|
|
694
|
+
"transcript_text": text,
|
|
695
|
+
"summary_field_used": "raw_string",
|
|
696
|
+
"segment_fallback_used": False,
|
|
697
|
+
"canonical_text_source": "summary:raw_string",
|
|
698
|
+
}
|
|
699
|
+
try:
|
|
700
|
+
payload = json.loads(payload)
|
|
701
|
+
except Exception:
|
|
702
|
+
return {
|
|
703
|
+
"transcript_text": "",
|
|
704
|
+
"summary_field_used": "",
|
|
705
|
+
"segment_fallback_used": False,
|
|
706
|
+
"canonical_text_source": "missing",
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
for key in SUMMARY_TEXT_FIELDS:
|
|
710
|
+
transcript = clean_transcript_text(deep_find_first(payload, [key]))
|
|
711
|
+
if transcript:
|
|
712
|
+
return {
|
|
713
|
+
"transcript_text": transcript,
|
|
714
|
+
"summary_field_used": key,
|
|
715
|
+
"segment_fallback_used": False,
|
|
716
|
+
"canonical_text_source": f"summary:{key}",
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
for key in SEGMENT_CONTAINER_FIELDS:
|
|
720
|
+
segments = deep_find_first(payload, [key])
|
|
721
|
+
if segments is None:
|
|
722
|
+
continue
|
|
519
723
|
lines: List[str] = []
|
|
520
|
-
|
|
521
|
-
|
|
724
|
+
_append_segment_lines(segments, lines)
|
|
725
|
+
deduped: List[str] = []
|
|
726
|
+
seen = set()
|
|
727
|
+
for line in lines:
|
|
728
|
+
signature = _text_signature(line)
|
|
729
|
+
if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
|
|
522
730
|
continue
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
731
|
+
seen.add(signature)
|
|
732
|
+
deduped.append(line)
|
|
733
|
+
if deduped:
|
|
734
|
+
return {
|
|
735
|
+
"transcript_text": "\n".join(deduped),
|
|
736
|
+
"summary_field_used": "",
|
|
737
|
+
"segment_fallback_used": True,
|
|
738
|
+
"canonical_text_source": f"segments:{key}",
|
|
739
|
+
}
|
|
530
740
|
|
|
531
|
-
|
|
741
|
+
transcript = clean_transcript_text(extract_transcript_text(payload))
|
|
742
|
+
if transcript:
|
|
743
|
+
return {
|
|
744
|
+
"transcript_text": transcript,
|
|
745
|
+
"summary_field_used": "",
|
|
746
|
+
"segment_fallback_used": True,
|
|
747
|
+
"canonical_text_source": "deep_search_fallback",
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
return {
|
|
751
|
+
"transcript_text": "",
|
|
752
|
+
"summary_field_used": "",
|
|
753
|
+
"segment_fallback_used": False,
|
|
754
|
+
"canonical_text_source": "missing",
|
|
755
|
+
}
|
|
532
756
|
|
|
533
757
|
|
|
534
758
|
def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int) -> Dict[str, Any]:
|
|
@@ -573,13 +797,17 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
|
|
|
573
797
|
except Exception:
|
|
574
798
|
payload = raw_text
|
|
575
799
|
|
|
576
|
-
|
|
800
|
+
transcript_bundle = _extract_transcript_bundle_from_transcription_payload(payload)
|
|
801
|
+
transcript = transcript_bundle.get("transcript_text", "")
|
|
577
802
|
if transcript:
|
|
578
803
|
return {
|
|
579
804
|
"ok": True,
|
|
580
805
|
"transcription_url": url,
|
|
581
806
|
"error_reason": "",
|
|
582
807
|
"transcript_text": transcript,
|
|
808
|
+
"summary_field_used": transcript_bundle.get("summary_field_used", ""),
|
|
809
|
+
"segment_fallback_used": bool(transcript_bundle.get("segment_fallback_used")),
|
|
810
|
+
"canonical_text_source": transcript_bundle.get("canonical_text_source", "missing"),
|
|
583
811
|
}
|
|
584
812
|
|
|
585
813
|
return {
|
|
@@ -587,6 +815,9 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
|
|
|
587
815
|
"transcription_url": url,
|
|
588
816
|
"error_reason": "transcription_payload_empty",
|
|
589
817
|
"transcript_text": "",
|
|
818
|
+
"summary_field_used": "",
|
|
819
|
+
"segment_fallback_used": False,
|
|
820
|
+
"canonical_text_source": "missing",
|
|
590
821
|
}
|
|
591
822
|
|
|
592
823
|
|
|
@@ -620,6 +851,9 @@ def hydrate_u2_batch_results_from_transcription_urls(
|
|
|
620
851
|
if fetched_text:
|
|
621
852
|
transcript = fetched_text
|
|
622
853
|
candidate["transcript_text"] = fetched_text
|
|
854
|
+
candidate["summary_field_used"] = fetch_result.get("summary_field_used", "")
|
|
855
|
+
candidate["segment_fallback_used"] = bool(fetch_result.get("segment_fallback_used"))
|
|
856
|
+
candidate["canonical_text_source"] = fetch_result.get("canonical_text_source", "missing")
|
|
623
857
|
elif not candidate.get("error_reason"):
|
|
624
858
|
candidate["error_reason"] = fetch_result.get("error_reason") or "transcription_payload_empty"
|
|
625
859
|
|
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
|
|
4
4
|
from __future__ import annotations
|
|
5
5
|
|
|
6
|
+
from copy import deepcopy
|
|
6
7
|
import json
|
|
7
8
|
import re
|
|
8
9
|
import urllib.request
|
|
9
|
-
from
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from typing import Any, Callable, Dict, List, Optional, Set, Tuple
|
|
10
12
|
|
|
11
13
|
from scripts.core.progress_report import ProgressReporter
|
|
12
14
|
from scripts.core.tikomni_common import normalize_text
|
|
@@ -26,6 +28,20 @@ XHS_U3_U2_BATCH_SIZE = 20
|
|
|
26
28
|
U2_GATE_MIN_DURATION_MS = 13000
|
|
27
29
|
U2_GATE_MAX_DURATION_MS = 1800000
|
|
28
30
|
U2_GATE_RULE = "is_video && 13000<duration_ms<=1800000 && video_download_url_present"
|
|
31
|
+
CHECKPOINT_WORK_FIELDS = (
|
|
32
|
+
"platform_work_id",
|
|
33
|
+
"subtitle_raw",
|
|
34
|
+
"subtitle_source",
|
|
35
|
+
"asr_raw",
|
|
36
|
+
"asr_clean",
|
|
37
|
+
"primary_text",
|
|
38
|
+
"primary_text_source",
|
|
39
|
+
"analysis_eligibility",
|
|
40
|
+
"analysis_exclusion_reason",
|
|
41
|
+
"asr_status",
|
|
42
|
+
"asr_error_reason",
|
|
43
|
+
"asr_source",
|
|
44
|
+
)
|
|
29
45
|
|
|
30
46
|
|
|
31
47
|
def _to_int_or_none(value: Any) -> Optional[int]:
|
|
@@ -467,6 +483,108 @@ def _dedupe_works_by_platform_id(works: List[Dict[str, Any]]) -> Tuple[List[Dict
|
|
|
467
483
|
return deduped, duplicates
|
|
468
484
|
|
|
469
485
|
|
|
486
|
+
def _snapshot_work_for_checkpoint(work: Dict[str, Any]) -> Dict[str, Any]:
|
|
487
|
+
snapshot: Dict[str, Any] = {}
|
|
488
|
+
for key in CHECKPOINT_WORK_FIELDS:
|
|
489
|
+
if key in work:
|
|
490
|
+
snapshot[key] = deepcopy(work.get(key))
|
|
491
|
+
platform_work_id = normalize_text(work.get("platform_work_id"))
|
|
492
|
+
if platform_work_id:
|
|
493
|
+
snapshot["platform_work_id"] = platform_work_id
|
|
494
|
+
return snapshot
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def _restore_completed_work_payloads(*, works: List[Dict[str, Any]], checkpoint: Dict[str, Any]) -> int:
|
|
498
|
+
completed_payloads = checkpoint.get("completed_work_payloads")
|
|
499
|
+
if not isinstance(completed_payloads, dict):
|
|
500
|
+
return 0
|
|
501
|
+
|
|
502
|
+
restored = 0
|
|
503
|
+
for work in works:
|
|
504
|
+
if not isinstance(work, dict):
|
|
505
|
+
continue
|
|
506
|
+
work_id = normalize_text(work.get("platform_work_id"))
|
|
507
|
+
if not work_id:
|
|
508
|
+
continue
|
|
509
|
+
payload = completed_payloads.get(work_id)
|
|
510
|
+
if not isinstance(payload, dict):
|
|
511
|
+
continue
|
|
512
|
+
work.update(deepcopy(payload))
|
|
513
|
+
restored += 1
|
|
514
|
+
return restored
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def _count_processed_results(*, works: List[Dict[str, Any]], completed_ids: Set[str]) -> Tuple[int, int, List[str]]:
|
|
518
|
+
success_count = 0
|
|
519
|
+
failed_ids: List[str] = []
|
|
520
|
+
completed_id_set = {normalize_text(item) for item in completed_ids if normalize_text(item)}
|
|
521
|
+
|
|
522
|
+
for work in works:
|
|
523
|
+
if not isinstance(work, dict):
|
|
524
|
+
continue
|
|
525
|
+
work_id = normalize_text(work.get("platform_work_id"))
|
|
526
|
+
if not work_id or work_id not in completed_id_set:
|
|
527
|
+
continue
|
|
528
|
+
if str(work.get("analysis_eligibility") or "") == "eligible":
|
|
529
|
+
success_count += 1
|
|
530
|
+
else:
|
|
531
|
+
failed_ids.append(work_id)
|
|
532
|
+
|
|
533
|
+
failed_ids = sorted(set(failed_ids))
|
|
534
|
+
return success_count, len(failed_ids), failed_ids
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _build_checkpoint_snapshot(
|
|
538
|
+
*,
|
|
539
|
+
platform: str,
|
|
540
|
+
works: List[Dict[str, Any]],
|
|
541
|
+
completed_ids: Set[str],
|
|
542
|
+
batch_size: int,
|
|
543
|
+
batches_total: int,
|
|
544
|
+
batches_submitted: int,
|
|
545
|
+
batches_completed: int,
|
|
546
|
+
batch_mapped: int,
|
|
547
|
+
batch_unmapped: int,
|
|
548
|
+
fallback_singles: int,
|
|
549
|
+
request_id: str,
|
|
550
|
+
last_completed_batch_id: str,
|
|
551
|
+
) -> Dict[str, Any]:
|
|
552
|
+
completed_id_set = {normalize_text(item) for item in completed_ids if normalize_text(item)}
|
|
553
|
+
success_count, failed_count, failed_work_ids = _count_processed_results(works=works, completed_ids=completed_id_set)
|
|
554
|
+
completed_work_payloads: Dict[str, Any] = {}
|
|
555
|
+
for work in works:
|
|
556
|
+
if not isinstance(work, dict):
|
|
557
|
+
continue
|
|
558
|
+
work_id = normalize_text(work.get("platform_work_id"))
|
|
559
|
+
if not work_id or work_id not in completed_id_set:
|
|
560
|
+
continue
|
|
561
|
+
completed_work_payloads[work_id] = _snapshot_work_for_checkpoint(work)
|
|
562
|
+
|
|
563
|
+
return {
|
|
564
|
+
"platform": platform,
|
|
565
|
+
"request_id": request_id or None,
|
|
566
|
+
"completed_work_ids": sorted(completed_id_set),
|
|
567
|
+
"failed_work_ids": failed_work_ids,
|
|
568
|
+
"completed_work_payloads": completed_work_payloads,
|
|
569
|
+
"batch_size": batch_size,
|
|
570
|
+
"batches_total": batches_total,
|
|
571
|
+
"batches_submitted": batches_submitted,
|
|
572
|
+
"batches_completed": batches_completed,
|
|
573
|
+
"batch_mapped": batch_mapped,
|
|
574
|
+
"batch_unmapped": batch_unmapped,
|
|
575
|
+
"fallback_singles": fallback_singles,
|
|
576
|
+
"total_works": len(works),
|
|
577
|
+
"processed_works": len(completed_id_set),
|
|
578
|
+
"success_works": success_count,
|
|
579
|
+
"failed_works": failed_count,
|
|
580
|
+
"pending_works": max(0, len(works) - len(completed_id_set)),
|
|
581
|
+
"last_completed_batch_id": last_completed_batch_id,
|
|
582
|
+
"updated_at": datetime.now().isoformat(timespec="seconds"),
|
|
583
|
+
# backward-compatible checkpoint fields
|
|
584
|
+
"refill_attempted": fallback_singles,
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
|
|
470
588
|
def _fallback_none_result(reason: str) -> Dict[str, Any]:
|
|
471
589
|
return {
|
|
472
590
|
"subtitle_raw": "",
|
|
@@ -804,12 +922,15 @@ def enrich_author_home_asr(
|
|
|
804
922
|
timeout_retry_max_retries: int = 3,
|
|
805
923
|
batch_size: int = DEFAULT_BATCH_SUBMIT_SIZE,
|
|
806
924
|
checkpoint: Optional[Dict[str, Any]] = None,
|
|
925
|
+
request_id: str = "",
|
|
926
|
+
on_batch_complete: Optional[Callable[[Dict[str, Any]], None]] = None,
|
|
807
927
|
progress: Optional[ProgressReporter] = None,
|
|
808
928
|
) -> Dict[str, Any]:
|
|
809
929
|
trace: List[Dict[str, Any]] = []
|
|
810
930
|
deduped_works, duplicate_count = _dedupe_works_by_platform_id(works)
|
|
811
931
|
|
|
812
932
|
checkpoint_in = checkpoint if isinstance(checkpoint, dict) else {}
|
|
933
|
+
restored_payloads = _restore_completed_work_payloads(works=deduped_works, checkpoint=checkpoint_in)
|
|
813
934
|
completed_ids = {
|
|
814
935
|
normalize_text(item)
|
|
815
936
|
for item in (checkpoint_in.get("completed_work_ids") or [])
|
|
@@ -833,6 +954,7 @@ def enrich_author_home_asr(
|
|
|
833
954
|
"deduped_count": len(deduped_works),
|
|
834
955
|
"duplicate_count": duplicate_count,
|
|
835
956
|
"resume_completed": len(completed_ids),
|
|
957
|
+
"resume_payloads_restored": restored_payloads,
|
|
836
958
|
"requested_batch_size": requested_batch,
|
|
837
959
|
"batch_size": effective_batch,
|
|
838
960
|
"batch_size_clamped": requested_batch != effective_batch,
|
|
@@ -847,6 +969,7 @@ def enrich_author_home_asr(
|
|
|
847
969
|
"input_count": len(works),
|
|
848
970
|
"deduped_count": len(deduped_works),
|
|
849
971
|
"resume_completed": len(completed_ids),
|
|
972
|
+
"resume_payloads_restored": restored_payloads,
|
|
850
973
|
"batch_size": effective_batch,
|
|
851
974
|
},
|
|
852
975
|
)
|
|
@@ -874,8 +997,9 @@ def enrich_author_home_asr(
|
|
|
874
997
|
data={"queued_count": len(queue), "batch_total": batch_total},
|
|
875
998
|
)
|
|
876
999
|
|
|
877
|
-
|
|
878
|
-
|
|
1000
|
+
restored_success_count, restored_failed_count, _ = _count_processed_results(works=deduped_works, completed_ids=completed_ids)
|
|
1001
|
+
success_count = restored_success_count
|
|
1002
|
+
fallback_none_count = restored_failed_count
|
|
879
1003
|
submitted_batches = 0
|
|
880
1004
|
completed_batches = 0
|
|
881
1005
|
batch_mapped_count = 0
|
|
@@ -1124,34 +1248,66 @@ def enrich_author_home_asr(
|
|
|
1124
1248
|
},
|
|
1125
1249
|
)
|
|
1126
1250
|
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1251
|
+
checkpoint_snapshot = _build_checkpoint_snapshot(
|
|
1252
|
+
platform=platform,
|
|
1253
|
+
works=deduped_works,
|
|
1254
|
+
completed_ids=completed_ids,
|
|
1255
|
+
batch_size=effective_batch,
|
|
1256
|
+
batches_total=batch_total,
|
|
1257
|
+
batches_submitted=submitted_batches,
|
|
1258
|
+
batches_completed=completed_batches,
|
|
1259
|
+
batch_mapped=batch_mapped_count,
|
|
1260
|
+
batch_unmapped=batch_unmapped_count,
|
|
1261
|
+
fallback_singles=fallback_single_count,
|
|
1262
|
+
request_id=request_id,
|
|
1263
|
+
last_completed_batch_id=batch_id,
|
|
1136
1264
|
)
|
|
1265
|
+
if on_batch_complete is not None:
|
|
1266
|
+
on_batch_complete(
|
|
1267
|
+
{
|
|
1268
|
+
"platform": platform,
|
|
1269
|
+
"batch_id": batch_id,
|
|
1270
|
+
"batch_index": batch_index + 1,
|
|
1271
|
+
"batch_total": batch_total,
|
|
1272
|
+
"batch_works": batch,
|
|
1273
|
+
"works": deduped_works,
|
|
1274
|
+
"trace": list(trace),
|
|
1275
|
+
"checkpoint": checkpoint_snapshot,
|
|
1276
|
+
"stats": {
|
|
1277
|
+
"total": len(deduped_works),
|
|
1278
|
+
"success": success_count,
|
|
1279
|
+
"fallback_none": fallback_none_count,
|
|
1280
|
+
"duplicates_dropped": duplicate_count,
|
|
1281
|
+
"submitted_batches": submitted_batches,
|
|
1282
|
+
"completed_batches": completed_batches,
|
|
1283
|
+
"batch_mapped": batch_mapped_count,
|
|
1284
|
+
"batch_unmapped": batch_unmapped_count,
|
|
1285
|
+
"fallback_singles": fallback_single_count,
|
|
1286
|
+
"refill_attempted": fallback_single_count,
|
|
1287
|
+
"refill_failed": checkpoint_snapshot.get("failed_works", 0),
|
|
1288
|
+
},
|
|
1289
|
+
}
|
|
1290
|
+
)
|
|
1291
|
+
|
|
1292
|
+
success_count, fallback_none_count, failed_work_ids = _count_processed_results(
|
|
1293
|
+
works=deduped_works,
|
|
1294
|
+
completed_ids=completed_ids,
|
|
1137
1295
|
)
|
|
1138
1296
|
|
|
1139
|
-
checkpoint_out =
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
"
|
|
1152
|
-
|
|
1153
|
-
"refill_attempted": fallback_single_count,
|
|
1154
|
-
}
|
|
1297
|
+
checkpoint_out = _build_checkpoint_snapshot(
|
|
1298
|
+
platform=platform,
|
|
1299
|
+
works=deduped_works,
|
|
1300
|
+
completed_ids=completed_ids,
|
|
1301
|
+
batch_size=effective_batch,
|
|
1302
|
+
batches_total=batch_total,
|
|
1303
|
+
batches_submitted=submitted_batches,
|
|
1304
|
+
batches_completed=completed_batches,
|
|
1305
|
+
batch_mapped=batch_mapped_count,
|
|
1306
|
+
batch_unmapped=batch_unmapped_count,
|
|
1307
|
+
fallback_singles=fallback_single_count,
|
|
1308
|
+
request_id=request_id,
|
|
1309
|
+
last_completed_batch_id=f"batch-{batch_total:03d}" if batch_total > 0 else normalize_text(checkpoint_in.get("last_completed_batch_id")),
|
|
1310
|
+
)
|
|
1155
1311
|
|
|
1156
1312
|
stats = {
|
|
1157
1313
|
"total": len(deduped_works),
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Shared runtime-state helpers for homepage pipelines."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from scripts.core.storage_router import resolve_author_directory_name
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _safe_text(value: Any) -> str:
|
|
15
|
+
if value is None:
|
|
16
|
+
return ""
|
|
17
|
+
return str(value).strip()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _now_iso() -> str:
|
|
21
|
+
return datetime.now().isoformat(timespec="seconds")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def resolve_homepage_author_dir(*, platform: str, profile: Dict[str, Any], card_root: str) -> Path:
    """Resolve the per-author work-library directory, creating it if missing.

    Args:
        platform: Platform identifier forwarded to ``resolve_author_directory_name``.
        profile: Author profile; ``author_handle``, ``platform_author_id`` and
            ``nickname`` are read from it and normalized with ``_safe_text``.
        card_root: Root directory under which the work library lives.

    Returns:
        ``<card_root>/内容系统/作品库/<author directory name>``. The directory
        (including parents) exists when this function returns.
    """
    identity_keys = ("author_handle", "platform_author_id", "nickname")
    handle, author_id, nickname = (_safe_text(profile.get(key)) for key in identity_keys)
    directory_name = resolve_author_directory_name(platform, handle, author_id, nickname)

    target = Path(card_root).joinpath("内容系统", "作品库", directory_name)
    target.mkdir(parents=True, exist_ok=True)
    return target
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def load_homepage_checkpoint(*, platform: str, profile: Dict[str, Any], card_root: str) -> Dict[str, Any]:
    """Load the saved homepage ASR checkpoint for an author, if one exists.

    The checkpoint is read from ``_homepage_asr_checkpoint.json`` inside the
    directory returned by ``resolve_homepage_author_dir`` (which creates that
    directory when it does not exist yet).

    Args:
        platform: Platform identifier used to resolve the author directory.
        profile: Author profile used to resolve the author directory.
        card_root: Root directory of the card storage.

    Returns:
        The parsed checkpoint dict, or ``{}`` when the file is absent, cannot
        be read or parsed, or its top-level JSON value is not an object.
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    checkpoint_path = author_dir / "_homepage_asr_checkpoint.json"
    if not checkpoint_path.is_file():
        return {}
    try:
        payload = json.loads(checkpoint_path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # Loading stays best-effort, but only for I/O and parse failures:
        # OSError covers unreadable/vanished files, ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError, RecursionError covers
        # pathologically nested input. Anything else is a real bug and should
        # surface instead of silently discarding resume state.
        return {}
    return payload if isinstance(payload, dict) else {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def clear_homepage_checkpoint(*, platform: str, profile: Dict[str, Any], card_root: str) -> Optional[str]:
    """Delete the author's homepage ASR checkpoint file.

    Args:
        platform: Platform identifier used to resolve the author directory.
        profile: Author profile used to resolve the author directory.
        card_root: Root directory of the card storage.

    Returns:
        The path of the removed checkpoint file as a string, or ``None`` when
        there was no checkpoint file to remove.

    Raises:
        OSError: If the file exists but cannot be removed (other than it
            having already disappeared).
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    checkpoint_path = author_dir / "_homepage_asr_checkpoint.json"
    try:
        # Attempt the delete directly instead of exists()+unlink(): the file
        # may vanish between the check and the removal.
        checkpoint_path.unlink()
    except FileNotFoundError:
        return None
    return str(checkpoint_path)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _write_json_atomic(path: Path, payload: Dict[str, Any]) -> None:
    """Write ``payload`` to ``path`` as indented UTF-8 JSON via a temp file.

    The text is written to a sibling ``<name>.tmp`` file and then renamed over
    ``path``, so an interruption during the write does not leave a truncated
    ``path`` behind (on platforms where the rename is atomic).
    """
    text = json.dumps(payload, ensure_ascii=False, indent=2)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(path)
    except OSError:
        # Best-effort removal of the temp file; the original error is re-raised.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise


def persist_homepage_runtime_artifacts(
    *,
    platform: str,
    profile: Dict[str, Any],
    works: List[Dict[str, Any]],
    card_root: str,
    extract_trace: List[Dict[str, Any]],
    request_id: str,
    checkpoint: Optional[Dict[str, Any]],
    run_status: str,
    last_completed_batch_id: str = "",
) -> Dict[str, str]:
    """Persist the creator profile, work collection and checkpoint for a run.

    Three JSON files are managed inside the author directory:
    ``_creator_profile.json``, ``_work_collection.json`` and, only when
    ``checkpoint`` is a non-empty dict, ``_homepage_asr_checkpoint.json``.

    Args:
        platform: Platform identifier; stored in the collection and used to
            resolve the author directory.
        profile: Author profile; copied and extended with run-state fields.
        works: Works of the run; non-dict entries are ignored.
        card_root: Root directory of the card storage.
        extract_trace: Trace entries stored in both the profile and collection.
        request_id: Request identifier stored in every written file.
        checkpoint: Checkpoint snapshot; anything that is not a dict is
            treated as an empty checkpoint.
        run_status: Status label stored in the profile and collection.
        last_completed_batch_id: Batch id to record; when empty, the value
            from ``checkpoint`` is used instead.

    Returns:
        A dict with ``author_dir``, ``creator_profile_path``,
        ``work_collection_path`` and ``checkpoint_path`` as strings. The
        checkpoint path is returned even when no checkpoint file was written.
    """
    author_dir = resolve_homepage_author_dir(platform=platform, profile=profile, card_root=card_root)
    updated_at = _now_iso()

    checkpoint_payload = checkpoint if isinstance(checkpoint, dict) else {}

    # Normalize each id once, drop blanks, de-duplicate, then sort for stable output.
    completed_id_set = {
        text for text in map(_safe_text, checkpoint_payload.get("completed_work_ids") or []) if text
    }
    failed_id_set = {
        text for text in map(_safe_text, checkpoint_payload.get("failed_work_ids") or []) if text
    }
    completed_work_ids = sorted(completed_id_set)
    failed_work_ids = sorted(failed_id_set)

    # An explicit argument wins; otherwise fall back to the checkpoint's own value.
    resolved_batch_id = last_completed_batch_id or _safe_text(checkpoint_payload.get("last_completed_batch_id"))

    collection_items: List[Dict[str, Any]] = []
    for work in works:
        if not isinstance(work, dict):
            continue
        work_id = _safe_text(work.get("platform_work_id"))
        # "failed" takes precedence over "completed" when an id is in both sets.
        if work_id in failed_id_set:
            processing_status = "failed"
        elif work_id in completed_id_set:
            processing_status = "completed"
        else:
            processing_status = "pending"
        collection_items.append(
            {
                "platform_work_id": work_id,
                "title": work.get("title"),
                "published_date": work.get("published_date"),
                "processing_status": processing_status,
            }
        )

    completed_count = len(completed_work_ids)
    failed_count = len(failed_work_ids)
    total_count = len(collection_items)
    # Pending is derived from the completed ids only and clamped at zero.
    pending_count = max(0, total_count - completed_count)

    creator_profile = dict(profile)
    creator_profile.update(
        {
            "request_id": request_id,
            "extract_trace": extract_trace,
            "run_status": run_status,
            "completed_count": completed_count,
            "failed_count": failed_count,
            "pending_count": pending_count,
            "updated_at": updated_at,
        }
    )

    work_collection = {
        "platform": platform,
        "platform_author_id": profile.get("platform_author_id"),
        "count": total_count,
        "items": collection_items,
        "request_id": request_id,
        "extract_trace": extract_trace,
        "run_status": run_status,
        "completed_count": completed_count,
        "failed_count": failed_count,
        "pending_count": pending_count,
        "completed_work_ids": completed_work_ids,
        "failed_work_ids": failed_work_ids,
        "batch_size": checkpoint_payload.get("batch_size"),
        "batches_total": checkpoint_payload.get("batches_total"),
        "batches_completed": checkpoint_payload.get("batches_completed"),
        "batch_mapped": checkpoint_payload.get("batch_mapped"),
        "batch_unmapped": checkpoint_payload.get("batch_unmapped"),
        "fallback_singles": checkpoint_payload.get("fallback_singles"),
        "last_completed_batch_id": resolved_batch_id,
        "updated_at": updated_at,
    }

    profile_path = author_dir / "_creator_profile.json"
    collection_path = author_dir / "_work_collection.json"
    checkpoint_path = author_dir / "_homepage_asr_checkpoint.json"

    _write_json_atomic(profile_path, creator_profile)
    _write_json_atomic(collection_path, work_collection)
    if checkpoint_payload:
        # Only a non-empty checkpoint is persisted; an empty one leaves any
        # existing checkpoint file untouched.
        checkpoint_to_write = dict(checkpoint_payload)
        checkpoint_to_write["request_id"] = request_id
        checkpoint_to_write["updated_at"] = updated_at
        checkpoint_to_write["last_completed_batch_id"] = resolved_batch_id
        _write_json_atomic(checkpoint_path, checkpoint_to_write)

    return {
        "author_dir": str(author_dir),
        "creator_profile_path": str(profile_path),
        "work_collection_path": str(collection_path),
        "checkpoint_path": str(checkpoint_path),
    }
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def resolve_homepage_run_status(stats: Optional[Dict[str, Any]]) -> str:
    """Derive a run-status label from ASR statistics.

    Reads ``total``, ``success`` and ``fallback_none`` from ``stats`` (missing
    or falsy values count as ``0``; a non-dict ``stats`` counts as empty).

    Returns:
        ``"complete"`` when there is nothing to process, or when nothing fell
        back and every work succeeded; ``"partial"`` when at least one work
        succeeded; ``"failed"`` otherwise.
    """
    counters = stats if isinstance(stats, dict) else {}
    total, success, failed = (
        int(counters.get(key) or 0) for key in ("total", "success", "fallback_none")
    )

    nothing_to_do = total <= 0
    all_succeeded = failed <= 0 and success >= total
    if nothing_to_do or all_succeeded:
        return "complete"
    return "partial" if success > 0 else "failed"
|
|
@@ -14,9 +14,7 @@ if __package__ in {None, ""}:
|
|
|
14
14
|
break
|
|
15
15
|
|
|
16
16
|
import argparse
|
|
17
|
-
import
|
|
18
|
-
from pathlib import Path
|
|
19
|
-
from typing import Any, Dict, List
|
|
17
|
+
from typing import Any, Dict, List, Set
|
|
20
18
|
|
|
21
19
|
from scripts.core.bootstrap_env import bootstrap_for_direct_run
|
|
22
20
|
|
|
@@ -25,63 +23,23 @@ bootstrap_for_direct_run(__file__, __package__)
|
|
|
25
23
|
from scripts.core.completeness import ensure_request_id, evaluate_collection, normalize_missing_fields
|
|
26
24
|
from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
|
|
27
25
|
from scripts.core.progress_report import build_progress_reporter
|
|
28
|
-
from scripts.core.storage_router import resolve_author_directory_name
|
|
29
26
|
from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
|
|
30
27
|
from scripts.pipelines.input_contracts import normalize_douyin_creator_input
|
|
31
28
|
from scripts.pipelines.schema import build_author_profile
|
|
32
29
|
from scripts.pipelines.douyin_creator_home_helpers import collect_and_adapt
|
|
33
30
|
from scripts.pipelines.home_asr import enrich_author_home_asr
|
|
31
|
+
from scripts.pipelines.homepage_runtime_state import (
|
|
32
|
+
clear_homepage_checkpoint,
|
|
33
|
+
load_homepage_checkpoint,
|
|
34
|
+
persist_homepage_runtime_artifacts,
|
|
35
|
+
resolve_homepage_run_status,
|
|
36
|
+
)
|
|
34
37
|
from scripts.writers.write_work_fact_card import build_work_fact_card, persist_output_envelope, write_work_fact_card
|
|
35
38
|
|
|
36
39
|
DEFAULT_MAX_ITEMS = 200
|
|
37
40
|
MAX_ITEMS_HARD_LIMIT = 200
|
|
38
41
|
|
|
39
42
|
|
|
40
|
-
def _write_collection_artifacts(
|
|
41
|
-
*,
|
|
42
|
-
profile: Dict[str, Any],
|
|
43
|
-
works: List[Dict[str, Any]],
|
|
44
|
-
card_root: str,
|
|
45
|
-
extract_trace: List[Dict[str, Any]],
|
|
46
|
-
request_id: str,
|
|
47
|
-
) -> Dict[str, str]:
|
|
48
|
-
author_dir_name = resolve_author_directory_name(
|
|
49
|
-
"douyin",
|
|
50
|
-
str(profile.get("author_handle") or ""),
|
|
51
|
-
str(profile.get("platform_author_id") or ""),
|
|
52
|
-
str(profile.get("nickname") or ""),
|
|
53
|
-
)
|
|
54
|
-
author_dir = Path(card_root) / "内容系统" / "作品库" / author_dir_name
|
|
55
|
-
author_dir.mkdir(parents=True, exist_ok=True)
|
|
56
|
-
|
|
57
|
-
creator_profile = dict(profile)
|
|
58
|
-
creator_profile["request_id"] = request_id
|
|
59
|
-
creator_profile["extract_trace"] = extract_trace
|
|
60
|
-
|
|
61
|
-
work_collection = {
|
|
62
|
-
"platform": "douyin",
|
|
63
|
-
"platform_author_id": profile.get("platform_author_id"),
|
|
64
|
-
"count": len(works),
|
|
65
|
-
"items": [
|
|
66
|
-
{
|
|
67
|
-
"platform_work_id": item.get("platform_work_id"),
|
|
68
|
-
"title": item.get("title"),
|
|
69
|
-
"published_date": item.get("published_date"),
|
|
70
|
-
}
|
|
71
|
-
for item in works
|
|
72
|
-
if isinstance(item, dict)
|
|
73
|
-
],
|
|
74
|
-
"request_id": request_id,
|
|
75
|
-
"extract_trace": extract_trace,
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
profile_path = author_dir / "_creator_profile.json"
|
|
79
|
-
collection_path = author_dir / "_work_collection.json"
|
|
80
|
-
profile_path.write_text(json.dumps(creator_profile, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
81
|
-
collection_path.write_text(json.dumps(work_collection, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
82
|
-
return {"creator_profile_path": str(profile_path), "work_collection_path": str(collection_path)}
|
|
83
|
-
|
|
84
|
-
|
|
85
43
|
def run_douyin_creator_home(
|
|
86
44
|
*,
|
|
87
45
|
input_value: str,
|
|
@@ -166,7 +124,76 @@ def run_douyin_creator_home(
|
|
|
166
124
|
progress=progress.child(scope="author_home.collect"),
|
|
167
125
|
)
|
|
168
126
|
|
|
127
|
+
card_root = resolve_storage_paths(config)["card_root"]
|
|
128
|
+
request_id = ensure_request_id(
|
|
129
|
+
raw.get("request_id") or profile.get("request_id"),
|
|
130
|
+
fallback_seed=normalized_input_value or input_value,
|
|
131
|
+
)
|
|
132
|
+
raw_extract_trace = list(raw.get("extract_trace") or [])
|
|
133
|
+
checkpoint = load_homepage_checkpoint(
|
|
134
|
+
platform="douyin",
|
|
135
|
+
profile=profile,
|
|
136
|
+
card_root=card_root,
|
|
137
|
+
)
|
|
138
|
+
if checkpoint:
|
|
139
|
+
progress.progress(
|
|
140
|
+
stage="author_home.workflow.resume",
|
|
141
|
+
message="douyin author_home checkpoint loaded",
|
|
142
|
+
data={
|
|
143
|
+
"completed_work_ids": len(checkpoint.get("completed_work_ids") or []),
|
|
144
|
+
"last_completed_batch_id": checkpoint.get("last_completed_batch_id"),
|
|
145
|
+
},
|
|
146
|
+
)
|
|
147
|
+
|
|
169
148
|
asr_strategy = config_get(config, "asr_strategy", {})
|
|
149
|
+
card_results: List[Dict[str, Any]] = []
|
|
150
|
+
written_work_ids: Set[str] = set()
|
|
151
|
+
|
|
152
|
+
def _persist_batch(event: Dict[str, Any]) -> None:
|
|
153
|
+
batch_id = str(event.get("batch_id") or "")
|
|
154
|
+
batch_works = event.get("batch_works") if isinstance(event.get("batch_works"), list) else []
|
|
155
|
+
all_works = event.get("works") if isinstance(event.get("works"), list) else []
|
|
156
|
+
batch_trace = raw_extract_trace + list(event.get("trace") or [])
|
|
157
|
+
|
|
158
|
+
batch_card_count = 0
|
|
159
|
+
if write_card:
|
|
160
|
+
for work in batch_works:
|
|
161
|
+
if not isinstance(work, dict):
|
|
162
|
+
continue
|
|
163
|
+
result = write_work_fact_card(
|
|
164
|
+
payload=work,
|
|
165
|
+
platform="douyin",
|
|
166
|
+
card_root=card_root,
|
|
167
|
+
storage_config=config,
|
|
168
|
+
)
|
|
169
|
+
card_results.append(result)
|
|
170
|
+
work_id = str(work.get("platform_work_id") or "").strip()
|
|
171
|
+
if work_id:
|
|
172
|
+
written_work_ids.add(work_id)
|
|
173
|
+
batch_card_count += 1
|
|
174
|
+
|
|
175
|
+
persist_homepage_runtime_artifacts(
|
|
176
|
+
platform="douyin",
|
|
177
|
+
profile=profile,
|
|
178
|
+
works=all_works,
|
|
179
|
+
card_root=card_root,
|
|
180
|
+
extract_trace=batch_trace,
|
|
181
|
+
request_id=request_id,
|
|
182
|
+
checkpoint=event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {},
|
|
183
|
+
run_status="in_progress",
|
|
184
|
+
last_completed_batch_id=batch_id,
|
|
185
|
+
)
|
|
186
|
+
progress.progress(
|
|
187
|
+
stage="author_home.persist.batch",
|
|
188
|
+
message="douyin author_home batch persisted",
|
|
189
|
+
data={
|
|
190
|
+
"batch_id": batch_id,
|
|
191
|
+
"batch_cards": batch_card_count,
|
|
192
|
+
"completed_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("processed_works"),
|
|
193
|
+
"pending_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("pending_works"),
|
|
194
|
+
},
|
|
195
|
+
)
|
|
196
|
+
|
|
170
197
|
asr_bundle = enrich_author_home_asr(
|
|
171
198
|
platform="douyin",
|
|
172
199
|
works=works,
|
|
@@ -179,14 +206,18 @@ def run_douyin_creator_home(
|
|
|
179
206
|
douyin_submit_backoff_ms=int(config_get(config, "asr_strategy.submit_retry.douyin_video.backoff_ms", 1500)),
|
|
180
207
|
timeout_retry_enabled=bool(config_get(config, "asr_strategy.u2_timeout_retry.enabled", True)),
|
|
181
208
|
timeout_retry_max_retries=int(config_get(config, "asr_strategy.u2_timeout_retry.max_retries", 0)),
|
|
209
|
+
checkpoint=checkpoint,
|
|
210
|
+
request_id=request_id,
|
|
211
|
+
on_batch_complete=_persist_batch,
|
|
182
212
|
progress=progress.child(scope="author_home.asr"),
|
|
183
213
|
)
|
|
184
214
|
works = list(asr_bundle.get("works") or [])
|
|
185
215
|
|
|
186
|
-
card_root = resolve_storage_paths(config)["card_root"]
|
|
187
|
-
card_results: List[Dict[str, Any]] = []
|
|
188
216
|
if write_card:
|
|
189
217
|
for work in works:
|
|
218
|
+
work_id = str(work.get("platform_work_id") or "").strip()
|
|
219
|
+
if work_id and work_id in written_work_ids:
|
|
220
|
+
continue
|
|
190
221
|
card_results.append(
|
|
191
222
|
write_work_fact_card(
|
|
192
223
|
payload=work,
|
|
@@ -196,19 +227,23 @@ def run_douyin_creator_home(
|
|
|
196
227
|
)
|
|
197
228
|
)
|
|
198
229
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
|
|
204
|
-
|
|
205
|
-
collection_artifacts = _write_collection_artifacts(
|
|
230
|
+
extract_trace = raw_extract_trace + list(asr_bundle.get("trace") or [])
|
|
231
|
+
checkpoint_out = asr_bundle.get("checkpoint") if isinstance(asr_bundle.get("checkpoint"), dict) else {}
|
|
232
|
+
collection_artifacts = persist_homepage_runtime_artifacts(
|
|
233
|
+
platform="douyin",
|
|
206
234
|
profile=profile,
|
|
207
235
|
works=works,
|
|
208
236
|
card_root=card_root,
|
|
209
237
|
extract_trace=extract_trace,
|
|
210
238
|
request_id=request_id,
|
|
239
|
+
checkpoint=checkpoint_out,
|
|
240
|
+
run_status=resolve_homepage_run_status(asr_bundle.get("stats")),
|
|
241
|
+
last_completed_batch_id=str(checkpoint_out.get("last_completed_batch_id") or ""),
|
|
211
242
|
)
|
|
243
|
+
if int(checkpoint_out.get("pending_works") or 0) <= 0:
|
|
244
|
+
cleared_checkpoint_path = clear_homepage_checkpoint(platform="douyin", profile=profile, card_root=card_root)
|
|
245
|
+
if cleared_checkpoint_path:
|
|
246
|
+
collection_artifacts["checkpoint_cleared_path"] = cleared_checkpoint_path
|
|
212
247
|
|
|
213
248
|
normalized_profile = dict(profile)
|
|
214
249
|
normalized_profile["request_id"] = request_id
|
|
@@ -14,9 +14,7 @@ if __package__ in {None, ""}:
|
|
|
14
14
|
break
|
|
15
15
|
|
|
16
16
|
import argparse
|
|
17
|
-
import
|
|
18
|
-
from pathlib import Path
|
|
19
|
-
from typing import Any, Dict, List
|
|
17
|
+
from typing import Any, Dict, List, Set
|
|
20
18
|
|
|
21
19
|
from scripts.core.bootstrap_env import bootstrap_for_direct_run
|
|
22
20
|
|
|
@@ -25,9 +23,14 @@ bootstrap_for_direct_run(__file__, __package__)
|
|
|
25
23
|
from scripts.core.completeness import ensure_request_id, evaluate_collection, normalize_missing_fields
|
|
26
24
|
from scripts.core.config_loader import config_get, load_tikomni_config, resolve_storage_paths
|
|
27
25
|
from scripts.core.progress_report import build_progress_reporter
|
|
28
|
-
from scripts.core.storage_router import resolve_author_directory_name
|
|
29
26
|
from scripts.core.tikomni_common import resolve_runtime, write_json_stdout
|
|
30
27
|
from scripts.pipelines.home_asr import enrich_author_home_asr
|
|
28
|
+
from scripts.pipelines.homepage_runtime_state import (
|
|
29
|
+
clear_homepage_checkpoint,
|
|
30
|
+
load_homepage_checkpoint,
|
|
31
|
+
persist_homepage_runtime_artifacts,
|
|
32
|
+
resolve_homepage_run_status,
|
|
33
|
+
)
|
|
31
34
|
from scripts.pipelines.input_contracts import normalize_xhs_creator_input
|
|
32
35
|
from scripts.pipelines.schema import build_author_profile
|
|
33
36
|
from scripts.pipelines.xiaohongshu_creator_home_helpers import collect_and_adapt
|
|
@@ -37,51 +40,6 @@ DEFAULT_MAX_ITEMS = 200
|
|
|
37
40
|
MAX_ITEMS_HARD_LIMIT = 200
|
|
38
41
|
|
|
39
42
|
|
|
40
|
-
def _write_collection_artifacts(
|
|
41
|
-
*,
|
|
42
|
-
profile: Dict[str, Any],
|
|
43
|
-
works: List[Dict[str, Any]],
|
|
44
|
-
card_root: str,
|
|
45
|
-
extract_trace: List[Dict[str, Any]],
|
|
46
|
-
request_id: str,
|
|
47
|
-
) -> Dict[str, str]:
|
|
48
|
-
author_dir_name = resolve_author_directory_name(
|
|
49
|
-
"xiaohongshu",
|
|
50
|
-
str(profile.get("author_handle") or ""),
|
|
51
|
-
str(profile.get("platform_author_id") or ""),
|
|
52
|
-
str(profile.get("nickname") or ""),
|
|
53
|
-
)
|
|
54
|
-
author_dir = Path(card_root) / "内容系统" / "作品库" / author_dir_name
|
|
55
|
-
author_dir.mkdir(parents=True, exist_ok=True)
|
|
56
|
-
|
|
57
|
-
creator_profile = dict(profile)
|
|
58
|
-
creator_profile["request_id"] = request_id
|
|
59
|
-
creator_profile["extract_trace"] = extract_trace
|
|
60
|
-
|
|
61
|
-
work_collection = {
|
|
62
|
-
"platform": "xiaohongshu",
|
|
63
|
-
"platform_author_id": profile.get("platform_author_id"),
|
|
64
|
-
"count": len(works),
|
|
65
|
-
"items": [
|
|
66
|
-
{
|
|
67
|
-
"platform_work_id": item.get("platform_work_id"),
|
|
68
|
-
"title": item.get("title"),
|
|
69
|
-
"published_date": item.get("published_date"),
|
|
70
|
-
}
|
|
71
|
-
for item in works
|
|
72
|
-
if isinstance(item, dict)
|
|
73
|
-
],
|
|
74
|
-
"request_id": request_id,
|
|
75
|
-
"extract_trace": extract_trace,
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
profile_path = author_dir / "_creator_profile.json"
|
|
79
|
-
collection_path = author_dir / "_work_collection.json"
|
|
80
|
-
profile_path.write_text(json.dumps(creator_profile, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
81
|
-
collection_path.write_text(json.dumps(work_collection, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
82
|
-
return {"creator_profile_path": str(profile_path), "work_collection_path": str(collection_path)}
|
|
83
|
-
|
|
84
|
-
|
|
85
43
|
def run_xiaohongshu_creator_home(
|
|
86
44
|
*,
|
|
87
45
|
input_value: str,
|
|
@@ -166,7 +124,76 @@ def run_xiaohongshu_creator_home(
|
|
|
166
124
|
progress=progress.child(scope="author_home.collect"),
|
|
167
125
|
)
|
|
168
126
|
|
|
127
|
+
card_root = resolve_storage_paths(config)["card_root"]
|
|
128
|
+
request_id = ensure_request_id(
|
|
129
|
+
raw.get("request_id") or profile.get("request_id"),
|
|
130
|
+
fallback_seed=normalized_input_value or input_value,
|
|
131
|
+
)
|
|
132
|
+
raw_extract_trace = list(raw.get("extract_trace") or [])
|
|
133
|
+
checkpoint = load_homepage_checkpoint(
|
|
134
|
+
platform="xiaohongshu",
|
|
135
|
+
profile=profile,
|
|
136
|
+
card_root=card_root,
|
|
137
|
+
)
|
|
138
|
+
if checkpoint:
|
|
139
|
+
progress.progress(
|
|
140
|
+
stage="author_home.workflow.resume",
|
|
141
|
+
message="xiaohongshu author_home checkpoint loaded",
|
|
142
|
+
data={
|
|
143
|
+
"completed_work_ids": len(checkpoint.get("completed_work_ids") or []),
|
|
144
|
+
"last_completed_batch_id": checkpoint.get("last_completed_batch_id"),
|
|
145
|
+
},
|
|
146
|
+
)
|
|
147
|
+
|
|
169
148
|
asr_strategy = config_get(config, "asr_strategy", {})
|
|
149
|
+
card_results: List[Dict[str, Any]] = []
|
|
150
|
+
written_work_ids: Set[str] = set()
|
|
151
|
+
|
|
152
|
+
def _persist_batch(event: Dict[str, Any]) -> None:
|
|
153
|
+
batch_id = str(event.get("batch_id") or "")
|
|
154
|
+
batch_works = event.get("batch_works") if isinstance(event.get("batch_works"), list) else []
|
|
155
|
+
all_works = event.get("works") if isinstance(event.get("works"), list) else []
|
|
156
|
+
batch_trace = raw_extract_trace + list(event.get("trace") or [])
|
|
157
|
+
|
|
158
|
+
batch_card_count = 0
|
|
159
|
+
if write_card:
|
|
160
|
+
for work in batch_works:
|
|
161
|
+
if not isinstance(work, dict):
|
|
162
|
+
continue
|
|
163
|
+
result = write_work_fact_card(
|
|
164
|
+
payload=work,
|
|
165
|
+
platform="xiaohongshu",
|
|
166
|
+
card_root=card_root,
|
|
167
|
+
storage_config=config,
|
|
168
|
+
)
|
|
169
|
+
card_results.append(result)
|
|
170
|
+
work_id = str(work.get("platform_work_id") or "").strip()
|
|
171
|
+
if work_id:
|
|
172
|
+
written_work_ids.add(work_id)
|
|
173
|
+
batch_card_count += 1
|
|
174
|
+
|
|
175
|
+
persist_homepage_runtime_artifacts(
|
|
176
|
+
platform="xiaohongshu",
|
|
177
|
+
profile=profile,
|
|
178
|
+
works=all_works,
|
|
179
|
+
card_root=card_root,
|
|
180
|
+
extract_trace=batch_trace,
|
|
181
|
+
request_id=request_id,
|
|
182
|
+
checkpoint=event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {},
|
|
183
|
+
run_status="in_progress",
|
|
184
|
+
last_completed_batch_id=batch_id,
|
|
185
|
+
)
|
|
186
|
+
progress.progress(
|
|
187
|
+
stage="author_home.persist.batch",
|
|
188
|
+
message="xiaohongshu author_home batch persisted",
|
|
189
|
+
data={
|
|
190
|
+
"batch_id": batch_id,
|
|
191
|
+
"batch_cards": batch_card_count,
|
|
192
|
+
"completed_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("processed_works"),
|
|
193
|
+
"pending_count": (event.get("checkpoint") if isinstance(event.get("checkpoint"), dict) else {}).get("pending_works"),
|
|
194
|
+
},
|
|
195
|
+
)
|
|
196
|
+
|
|
170
197
|
asr_bundle = enrich_author_home_asr(
|
|
171
198
|
platform="xiaohongshu",
|
|
172
199
|
works=works,
|
|
@@ -179,14 +206,18 @@ def run_xiaohongshu_creator_home(
|
|
|
179
206
|
xhs_submit_backoff_ms=int(config_get(config, "asr_strategy.submit_retry.xiaohongshu_note.backoff_ms", 0)),
|
|
180
207
|
timeout_retry_enabled=bool(config_get(config, "asr_strategy.u2_timeout_retry.enabled", True)),
|
|
181
208
|
timeout_retry_max_retries=int(config_get(config, "asr_strategy.u2_timeout_retry.max_retries", 0)),
|
|
209
|
+
checkpoint=checkpoint,
|
|
210
|
+
request_id=request_id,
|
|
211
|
+
on_batch_complete=_persist_batch,
|
|
182
212
|
progress=progress.child(scope="author_home.asr"),
|
|
183
213
|
)
|
|
184
214
|
works = list(asr_bundle.get("works") or [])
|
|
185
215
|
|
|
186
|
-
card_root = resolve_storage_paths(config)["card_root"]
|
|
187
|
-
card_results: List[Dict[str, Any]] = []
|
|
188
216
|
if write_card:
|
|
189
217
|
for work in works:
|
|
218
|
+
work_id = str(work.get("platform_work_id") or "").strip()
|
|
219
|
+
if work_id and work_id in written_work_ids:
|
|
220
|
+
continue
|
|
190
221
|
card_results.append(
|
|
191
222
|
write_work_fact_card(
|
|
192
223
|
payload=work,
|
|
@@ -196,19 +227,23 @@ def run_xiaohongshu_creator_home(
|
|
|
196
227
|
)
|
|
197
228
|
)
|
|
198
229
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
extract_trace = list(raw.get("extract_trace") or []) + list(asr_bundle.get("trace") or [])
|
|
204
|
-
|
|
205
|
-
collection_artifacts = _write_collection_artifacts(
|
|
230
|
+
extract_trace = raw_extract_trace + list(asr_bundle.get("trace") or [])
|
|
231
|
+
checkpoint_out = asr_bundle.get("checkpoint") if isinstance(asr_bundle.get("checkpoint"), dict) else {}
|
|
232
|
+
collection_artifacts = persist_homepage_runtime_artifacts(
|
|
233
|
+
platform="xiaohongshu",
|
|
206
234
|
profile=profile,
|
|
207
235
|
works=works,
|
|
208
236
|
card_root=card_root,
|
|
209
237
|
extract_trace=extract_trace,
|
|
210
238
|
request_id=request_id,
|
|
239
|
+
checkpoint=checkpoint_out,
|
|
240
|
+
run_status=resolve_homepage_run_status(asr_bundle.get("stats")),
|
|
241
|
+
last_completed_batch_id=str(checkpoint_out.get("last_completed_batch_id") or ""),
|
|
211
242
|
)
|
|
243
|
+
if int(checkpoint_out.get("pending_works") or 0) <= 0:
|
|
244
|
+
cleared_checkpoint_path = clear_homepage_checkpoint(platform="xiaohongshu", profile=profile, card_root=card_root)
|
|
245
|
+
if cleared_checkpoint_path:
|
|
246
|
+
collection_artifacts["checkpoint_cleared_path"] = cleared_checkpoint_path
|
|
212
247
|
|
|
213
248
|
normalized_profile = dict(profile)
|
|
214
249
|
normalized_profile["request_id"] = request_id
|
|
@@ -359,18 +359,10 @@ def _markdown_lines(card: Dict[str, Any]) -> List[str]:
|
|
|
359
359
|
lines = _frontmatter_lines(card)
|
|
360
360
|
primary_text = _safe_text(card.get("primary_text"))
|
|
361
361
|
caption_raw = _safe_text(card.get("caption_raw"))
|
|
362
|
-
subtitle_raw = _safe_text(card.get("subtitle_raw"))
|
|
363
|
-
asr_raw = _safe_text(card.get("asr_raw"))
|
|
364
362
|
|
|
365
363
|
lines.extend(["", "## 主文本", primary_text or ""])
|
|
366
364
|
if caption_raw and caption_raw != primary_text:
|
|
367
365
|
lines.extend(["", "## 原始文案", caption_raw])
|
|
368
|
-
if asr_raw and subtitle_raw and asr_raw == subtitle_raw and asr_raw != primary_text:
|
|
369
|
-
lines.extend(["", "## 原始转写", asr_raw])
|
|
370
|
-
elif subtitle_raw and subtitle_raw != primary_text:
|
|
371
|
-
lines.extend(["", "## 原始字幕", subtitle_raw])
|
|
372
|
-
if asr_raw and asr_raw not in {primary_text, subtitle_raw}:
|
|
373
|
-
lines.extend(["", "## 原始转写", asr_raw])
|
|
374
366
|
if card.get("missing_fields"):
|
|
375
367
|
lines.extend(["", "## 缺失字段"])
|
|
376
368
|
for entry in card["missing_fields"]:
|