@tikomni/skills 0.1.11 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/social-media-crawl/references/contracts/work-fact-card-fields.md +2 -0
- package/skills/social-media-crawl/references/schemas/work-fact-card.schema.json +1 -0
- package/skills/social-media-crawl/scripts/core/asr_pipeline.py +273 -39
- package/skills/social-media-crawl/scripts/pipelines/home_asr.py +220 -50
- package/skills/social-media-crawl/scripts/pipelines/homepage_collectors.py +276 -40
- package/skills/social-media-crawl/scripts/pipelines/homepage_runtime_state.py +173 -0
- package/skills/social-media-crawl/scripts/pipelines/platform_adapters.py +61 -6
- package/skills/social-media-crawl/scripts/pipelines/run_douyin_creator_home.py +93 -58
- package/skills/social-media-crawl/scripts/pipelines/run_xiaohongshu_creator_home.py +93 -58
- package/skills/social-media-crawl/scripts/pipelines/schema.py +2 -2
- package/skills/social-media-crawl/scripts/writers/write_work_fact_card.py +9 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tikomni/skills",
|
|
3
|
-
"version": "0.1.11",
|
|
3
|
+
"version": "1.0.1",
|
|
4
4
|
"description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
|
|
25
25
|
## Optional Fields
|
|
26
26
|
|
|
27
|
+
- `duration_ms`
|
|
27
28
|
- `digg_count`
|
|
28
29
|
- `comment_count`
|
|
29
30
|
- `collect_count`
|
|
@@ -37,6 +38,7 @@
|
|
|
37
38
|
## Field Rules
|
|
38
39
|
|
|
39
40
|
- `author` is the display name, not an object.
|
|
41
|
+
- `duration_ms` uses milliseconds. Write `null` when the duration is unavailable or not applicable.
|
|
40
42
|
- Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
|
|
41
43
|
- The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
|
|
42
44
|
- `primary_text` is the text that is best suited for reading and indexing in the current task.
|
|
@@ -32,6 +32,7 @@
|
|
|
32
32
|
"subtitle_raw": { "type": "string" },
|
|
33
33
|
"work_modality": { "type": "string" },
|
|
34
34
|
"published_date": { "type": "string" },
|
|
35
|
+
"duration_ms": { "type": ["integer", "null"] },
|
|
35
36
|
"digg_count": { "type": "integer" },
|
|
36
37
|
"comment_count": { "type": "integer" },
|
|
37
38
|
"collect_count": { "type": "integer" },
|
|
@@ -8,7 +8,7 @@ import time
|
|
|
8
8
|
import urllib.error
|
|
9
9
|
import urllib.request
|
|
10
10
|
from urllib.parse import urlparse, urlunparse
|
|
11
|
-
from typing import Any, Callable, Dict, List, Optional
|
|
11
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
12
12
|
|
|
13
13
|
from scripts.core.tikomni_common import (
|
|
14
14
|
call_json_api,
|
|
@@ -23,6 +23,21 @@ from scripts.core.u3_fallback import run_u3_public_url_fallback
|
|
|
23
23
|
|
|
24
24
|
U2_BATCH_SUBMIT_HARD_LIMIT = 100
|
|
25
25
|
DEFAULT_U2_PENDING_TIMEOUT_SEC = 60
|
|
26
|
+
SUMMARY_TEXT_FIELDS = (
|
|
27
|
+
"full_text",
|
|
28
|
+
"transcript_text",
|
|
29
|
+
"transcription_text",
|
|
30
|
+
"result_text",
|
|
31
|
+
"summary_text",
|
|
32
|
+
"transcript",
|
|
33
|
+
"transcription",
|
|
34
|
+
"result",
|
|
35
|
+
"content",
|
|
36
|
+
"text",
|
|
37
|
+
)
|
|
38
|
+
SEGMENT_CONTAINER_FIELDS = ("sentences", "segments", "paragraphs")
|
|
39
|
+
SEGMENT_TEXT_FIELDS = ("text", "sentence", "content", "paragraph", "transcript_text")
|
|
40
|
+
CHAR_SPACED_RUN_RE = re.compile(r"(?:[A-Za-z0-9\u4e00-\u9fff]{1,4}\s+){5,}[A-Za-z0-9\u4e00-\u9fff]{1,4}")
|
|
26
41
|
|
|
27
42
|
|
|
28
43
|
def clamp_u2_batch_submit_size(size: int, *, default: int = 50, hard_limit: int = U2_BATCH_SUBMIT_HARD_LIMIT) -> int:
|
|
@@ -251,6 +266,33 @@ def clean_transcript_text(raw_text: Any) -> str:
|
|
|
251
266
|
return normalize_text(raw_text)
|
|
252
267
|
|
|
253
268
|
|
|
269
|
+
def _text_signature(text: str) -> str:
|
|
270
|
+
return re.sub(r"[\W_]+", "", clean_transcript_text(text)).lower()
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _is_char_spaced_noise_sequence(text: str) -> bool:
|
|
274
|
+
tokens = [token for token in clean_transcript_text(text).split(" ") if token]
|
|
275
|
+
if len(tokens) < 6:
|
|
276
|
+
return False
|
|
277
|
+
single_char_tokens = sum(1 for token in tokens if len(token) == 1)
|
|
278
|
+
short_tokens = sum(1 for token in tokens if len(token) <= 2)
|
|
279
|
+
cjk_tokens = sum(1 for token in tokens if any("\u4e00" <= char <= "\u9fff" for char in token))
|
|
280
|
+
return (
|
|
281
|
+
single_char_tokens >= 4
|
|
282
|
+
and short_tokens / max(len(tokens), 1) >= 0.75
|
|
283
|
+
and cjk_tokens / max(len(tokens), 1) >= 0.5
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _strip_char_spaced_noise_runs(text: str) -> str:
|
|
288
|
+
def _replace(match: re.Match[str]) -> str:
|
|
289
|
+
chunk = match.group(0)
|
|
290
|
+
return " " if _is_char_spaced_noise_sequence(chunk) else chunk
|
|
291
|
+
|
|
292
|
+
cleaned = CHAR_SPACED_RUN_RE.sub(_replace, text)
|
|
293
|
+
return re.sub(r"\s+", " ", cleaned).strip()
|
|
294
|
+
|
|
295
|
+
|
|
254
296
|
def _ensure_sentence_end(text: str) -> str:
|
|
255
297
|
if not text:
|
|
256
298
|
return text
|
|
@@ -264,13 +306,36 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
|
|
|
264
306
|
if not base:
|
|
265
307
|
return ""
|
|
266
308
|
|
|
267
|
-
denoised =
|
|
309
|
+
denoised = _strip_char_spaced_noise_runs(base)
|
|
310
|
+
denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", denoised)
|
|
268
311
|
denoised = re.sub(r"(嗯+|啊+|呃+)", " ", denoised)
|
|
269
312
|
denoised = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", denoised)
|
|
270
313
|
denoised = re.sub(r"\s+", " ", denoised).strip()
|
|
271
314
|
|
|
272
315
|
units = [clean_transcript_text(part) for part in re.split(r"[。!?!?;;\n]+", denoised)]
|
|
273
|
-
sentences = [
|
|
316
|
+
sentences: List[str] = []
|
|
317
|
+
signatures: List[str] = []
|
|
318
|
+
for unit in units:
|
|
319
|
+
if not unit or _is_char_spaced_noise_sequence(unit):
|
|
320
|
+
continue
|
|
321
|
+
sentence = _ensure_sentence_end(unit)
|
|
322
|
+
signature = _text_signature(sentence)
|
|
323
|
+
if not signature:
|
|
324
|
+
continue
|
|
325
|
+
duplicate = False
|
|
326
|
+
for existing in signatures:
|
|
327
|
+
if signature == existing:
|
|
328
|
+
duplicate = True
|
|
329
|
+
break
|
|
330
|
+
smaller = signature if len(signature) <= len(existing) else existing
|
|
331
|
+
larger = existing if len(signature) <= len(existing) else signature
|
|
332
|
+
if len(smaller) >= 12 and smaller in larger:
|
|
333
|
+
duplicate = True
|
|
334
|
+
break
|
|
335
|
+
if duplicate:
|
|
336
|
+
continue
|
|
337
|
+
signatures.append(signature)
|
|
338
|
+
sentences.append(sentence)
|
|
274
339
|
if not sentences:
|
|
275
340
|
fallback = _ensure_sentence_end(denoised)
|
|
276
341
|
return fallback if fallback else ""
|
|
@@ -292,6 +357,94 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
|
|
|
292
357
|
return "\n\n".join(paragraphs)
|
|
293
358
|
|
|
294
359
|
|
|
360
|
+
def _extract_summary_text_from_node(node: Dict[str, Any]) -> Tuple[str, str]:
|
|
361
|
+
for key in SUMMARY_TEXT_FIELDS:
|
|
362
|
+
value = node.get(key)
|
|
363
|
+
if isinstance(value, str):
|
|
364
|
+
cleaned = clean_transcript_text(value)
|
|
365
|
+
if cleaned:
|
|
366
|
+
return cleaned, key
|
|
367
|
+
return "", ""
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _append_segment_lines(node: Any, lines: List[str]) -> None:
|
|
371
|
+
if isinstance(node, str):
|
|
372
|
+
cleaned = clean_transcript_text(node)
|
|
373
|
+
if cleaned:
|
|
374
|
+
lines.append(cleaned)
|
|
375
|
+
return
|
|
376
|
+
if isinstance(node, dict):
|
|
377
|
+
for key in SEGMENT_TEXT_FIELDS:
|
|
378
|
+
value = node.get(key)
|
|
379
|
+
if isinstance(value, str):
|
|
380
|
+
cleaned = clean_transcript_text(value)
|
|
381
|
+
if cleaned:
|
|
382
|
+
lines.append(cleaned)
|
|
383
|
+
break
|
|
384
|
+
return
|
|
385
|
+
if isinstance(node, list):
|
|
386
|
+
for item in node:
|
|
387
|
+
_append_segment_lines(item, lines)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _extract_segment_text_from_node(node: Dict[str, Any]) -> str:
|
|
391
|
+
lines: List[str] = []
|
|
392
|
+
for key in SEGMENT_CONTAINER_FIELDS:
|
|
393
|
+
if key not in node:
|
|
394
|
+
continue
|
|
395
|
+
_append_segment_lines(node.get(key), lines)
|
|
396
|
+
if lines:
|
|
397
|
+
break
|
|
398
|
+
if not lines:
|
|
399
|
+
return ""
|
|
400
|
+
|
|
401
|
+
deduped: List[str] = []
|
|
402
|
+
seen = set()
|
|
403
|
+
for line in lines:
|
|
404
|
+
signature = _text_signature(line)
|
|
405
|
+
if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
|
|
406
|
+
continue
|
|
407
|
+
seen.add(signature)
|
|
408
|
+
deduped.append(line)
|
|
409
|
+
return "\n".join(deduped).strip()
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _extract_canonical_transcript_from_node(node: Dict[str, Any]) -> Dict[str, Any]:
|
|
413
|
+
summary_text, summary_field = _extract_summary_text_from_node(node)
|
|
414
|
+
if summary_text:
|
|
415
|
+
return {
|
|
416
|
+
"transcript_text": summary_text,
|
|
417
|
+
"summary_field_used": summary_field,
|
|
418
|
+
"segment_fallback_used": False,
|
|
419
|
+
"canonical_text_source": f"summary:{summary_field}",
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
segment_text = _extract_segment_text_from_node(node)
|
|
423
|
+
if segment_text:
|
|
424
|
+
return {
|
|
425
|
+
"transcript_text": segment_text,
|
|
426
|
+
"summary_field_used": "",
|
|
427
|
+
"segment_fallback_used": True,
|
|
428
|
+
"canonical_text_source": "segments",
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
fallback_text = clean_transcript_text(extract_transcript_text(node))
|
|
432
|
+
if fallback_text:
|
|
433
|
+
return {
|
|
434
|
+
"transcript_text": fallback_text,
|
|
435
|
+
"summary_field_used": "",
|
|
436
|
+
"segment_fallback_used": True,
|
|
437
|
+
"canonical_text_source": "deep_search_fallback",
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
return {
|
|
441
|
+
"transcript_text": "",
|
|
442
|
+
"summary_field_used": "",
|
|
443
|
+
"segment_fallback_used": False,
|
|
444
|
+
"canonical_text_source": "missing",
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
|
|
295
448
|
def extract_u2_task_metrics(payload: Any) -> Dict[str, Any]:
|
|
296
449
|
metrics = deep_find_first(payload, ["task_metrics", "metrics"])
|
|
297
450
|
return metrics if isinstance(metrics, dict) else {}
|
|
@@ -349,16 +502,8 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
|
|
|
349
502
|
)
|
|
350
503
|
file_url = normalize_media_url(str(raw_file_url or ""))
|
|
351
504
|
if file_url:
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
or node.get("text")
|
|
355
|
-
or node.get("transcript")
|
|
356
|
-
or node.get("transcription")
|
|
357
|
-
or node.get("content")
|
|
358
|
-
or ""
|
|
359
|
-
)
|
|
360
|
-
if not transcript:
|
|
361
|
-
transcript = clean_transcript_text(extract_transcript_text(node))
|
|
505
|
+
canonical = _extract_canonical_transcript_from_node(node)
|
|
506
|
+
transcript = clean_transcript_text(canonical.get("transcript_text"))
|
|
362
507
|
|
|
363
508
|
status = _status_upper(node.get("status") or node.get("task_status") or node.get("state"))
|
|
364
509
|
error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
|
|
@@ -372,6 +517,9 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
|
|
|
372
517
|
"transcription_url": transcription_url,
|
|
373
518
|
"error_reason": error_reason,
|
|
374
519
|
"ok": ok,
|
|
520
|
+
"summary_field_used": canonical.get("summary_field_used", ""),
|
|
521
|
+
"segment_fallback_used": bool(canonical.get("segment_fallback_used")),
|
|
522
|
+
"canonical_text_source": canonical.get("canonical_text_source", "missing"),
|
|
375
523
|
}
|
|
376
524
|
|
|
377
525
|
existing = found.get(file_url)
|
|
@@ -380,12 +528,16 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
|
|
|
380
528
|
else:
|
|
381
529
|
old_score = (
|
|
382
530
|
1 if existing.get("ok") else 0,
|
|
531
|
+
1 if not existing.get("segment_fallback_used") else 0,
|
|
532
|
+
1 if existing.get("summary_field_used") else 0,
|
|
383
533
|
len(str(existing.get("transcript_text") or "")),
|
|
384
534
|
1 if existing.get("transcription_url") else 0,
|
|
385
535
|
1 if not existing.get("error_reason") else 0,
|
|
386
536
|
)
|
|
387
537
|
new_score = (
|
|
388
538
|
1 if candidate.get("ok") else 0,
|
|
539
|
+
1 if not candidate.get("segment_fallback_used") else 0,
|
|
540
|
+
1 if candidate.get("summary_field_used") else 0,
|
|
389
541
|
len(str(candidate.get("transcript_text") or "")),
|
|
390
542
|
1 if candidate.get("transcription_url") else 0,
|
|
391
543
|
1 if not candidate.get("error_reason") else 0,
|
|
@@ -441,16 +593,8 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
|
|
|
441
593
|
item_index_raw = node.get("item_index")
|
|
442
594
|
item_index = _parse_non_negative_item_index(item_index_raw)
|
|
443
595
|
if item_index is not None:
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
or node.get("text")
|
|
447
|
-
or node.get("transcript")
|
|
448
|
-
or node.get("transcription")
|
|
449
|
-
or node.get("content")
|
|
450
|
-
or ""
|
|
451
|
-
)
|
|
452
|
-
if not transcript:
|
|
453
|
-
transcript = clean_transcript_text(extract_transcript_text(node))
|
|
596
|
+
canonical = _extract_canonical_transcript_from_node(node)
|
|
597
|
+
transcript = clean_transcript_text(canonical.get("transcript_text"))
|
|
454
598
|
|
|
455
599
|
status = _status_upper(node.get("task_status") or node.get("status") or node.get("state"))
|
|
456
600
|
error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
|
|
@@ -464,6 +608,9 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
|
|
|
464
608
|
"error_reason": error_reason,
|
|
465
609
|
"transcription_url": transcription_url,
|
|
466
610
|
"ok": ok,
|
|
611
|
+
"summary_field_used": canonical.get("summary_field_used", ""),
|
|
612
|
+
"segment_fallback_used": bool(canonical.get("segment_fallback_used")),
|
|
613
|
+
"canonical_text_source": canonical.get("canonical_text_source", "missing"),
|
|
467
614
|
}
|
|
468
615
|
|
|
469
616
|
existing = mapped.get(item_index)
|
|
@@ -472,12 +619,16 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
|
|
|
472
619
|
else:
|
|
473
620
|
old_score = (
|
|
474
621
|
1 if existing.get("ok") else 0,
|
|
622
|
+
1 if not existing.get("segment_fallback_used") else 0,
|
|
623
|
+
1 if existing.get("summary_field_used") else 0,
|
|
475
624
|
len(str(existing.get("transcript_text") or "")),
|
|
476
625
|
1 if existing.get("transcription_url") else 0,
|
|
477
626
|
1 if not existing.get("error_reason") else 0,
|
|
478
627
|
)
|
|
479
628
|
new_score = (
|
|
480
629
|
1 if candidate.get("ok") else 0,
|
|
630
|
+
1 if not candidate.get("segment_fallback_used") else 0,
|
|
631
|
+
1 if candidate.get("summary_field_used") else 0,
|
|
481
632
|
len(str(candidate.get("transcript_text") or "")),
|
|
482
633
|
1 if candidate.get("transcription_url") else 0,
|
|
483
634
|
1 if not candidate.get("error_reason") else 0,
|
|
@@ -506,29 +657,102 @@ def _extract_transcript_from_transcription_payload(payload: Any) -> str:
|
|
|
506
657
|
except Exception:
|
|
507
658
|
return ""
|
|
508
659
|
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
660
|
+
for key in SUMMARY_TEXT_FIELDS:
|
|
661
|
+
transcript = clean_transcript_text(deep_find_first(payload, [key]))
|
|
662
|
+
if transcript:
|
|
663
|
+
return transcript
|
|
664
|
+
|
|
665
|
+
for key in SEGMENT_CONTAINER_FIELDS:
|
|
666
|
+
segments = deep_find_first(payload, [key])
|
|
667
|
+
if segments is None:
|
|
668
|
+
continue
|
|
669
|
+
lines: List[str] = []
|
|
670
|
+
_append_segment_lines(segments, lines)
|
|
671
|
+
deduped: List[str] = []
|
|
672
|
+
seen = set()
|
|
673
|
+
for line in lines:
|
|
674
|
+
signature = _text_signature(line)
|
|
675
|
+
if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
|
|
676
|
+
continue
|
|
677
|
+
seen.add(signature)
|
|
678
|
+
deduped.append(line)
|
|
679
|
+
if deduped:
|
|
680
|
+
return "\n".join(deduped)
|
|
512
681
|
|
|
513
682
|
transcript = clean_transcript_text(extract_transcript_text(payload))
|
|
514
683
|
if transcript:
|
|
515
684
|
return transcript
|
|
516
685
|
|
|
517
|
-
|
|
518
|
-
|
|
686
|
+
return ""
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
def _extract_transcript_bundle_from_transcription_payload(payload: Any) -> Dict[str, Any]:
|
|
690
|
+
if isinstance(payload, str):
|
|
691
|
+
text = clean_transcript_text(payload)
|
|
692
|
+
if text:
|
|
693
|
+
return {
|
|
694
|
+
"transcript_text": text,
|
|
695
|
+
"summary_field_used": "raw_string",
|
|
696
|
+
"segment_fallback_used": False,
|
|
697
|
+
"canonical_text_source": "summary:raw_string",
|
|
698
|
+
}
|
|
699
|
+
try:
|
|
700
|
+
payload = json.loads(payload)
|
|
701
|
+
except Exception:
|
|
702
|
+
return {
|
|
703
|
+
"transcript_text": "",
|
|
704
|
+
"summary_field_used": "",
|
|
705
|
+
"segment_fallback_used": False,
|
|
706
|
+
"canonical_text_source": "missing",
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
for key in SUMMARY_TEXT_FIELDS:
|
|
710
|
+
transcript = clean_transcript_text(deep_find_first(payload, [key]))
|
|
711
|
+
if transcript:
|
|
712
|
+
return {
|
|
713
|
+
"transcript_text": transcript,
|
|
714
|
+
"summary_field_used": key,
|
|
715
|
+
"segment_fallback_used": False,
|
|
716
|
+
"canonical_text_source": f"summary:{key}",
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
for key in SEGMENT_CONTAINER_FIELDS:
|
|
720
|
+
segments = deep_find_first(payload, [key])
|
|
721
|
+
if segments is None:
|
|
722
|
+
continue
|
|
519
723
|
lines: List[str] = []
|
|
520
|
-
|
|
521
|
-
|
|
724
|
+
_append_segment_lines(segments, lines)
|
|
725
|
+
deduped: List[str] = []
|
|
726
|
+
seen = set()
|
|
727
|
+
for line in lines:
|
|
728
|
+
signature = _text_signature(line)
|
|
729
|
+
if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
|
|
522
730
|
continue
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
731
|
+
seen.add(signature)
|
|
732
|
+
deduped.append(line)
|
|
733
|
+
if deduped:
|
|
734
|
+
return {
|
|
735
|
+
"transcript_text": "\n".join(deduped),
|
|
736
|
+
"summary_field_used": "",
|
|
737
|
+
"segment_fallback_used": True,
|
|
738
|
+
"canonical_text_source": f"segments:{key}",
|
|
739
|
+
}
|
|
530
740
|
|
|
531
|
-
|
|
741
|
+
transcript = clean_transcript_text(extract_transcript_text(payload))
|
|
742
|
+
if transcript:
|
|
743
|
+
return {
|
|
744
|
+
"transcript_text": transcript,
|
|
745
|
+
"summary_field_used": "",
|
|
746
|
+
"segment_fallback_used": True,
|
|
747
|
+
"canonical_text_source": "deep_search_fallback",
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
return {
|
|
751
|
+
"transcript_text": "",
|
|
752
|
+
"summary_field_used": "",
|
|
753
|
+
"segment_fallback_used": False,
|
|
754
|
+
"canonical_text_source": "missing",
|
|
755
|
+
}
|
|
532
756
|
|
|
533
757
|
|
|
534
758
|
def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int) -> Dict[str, Any]:
|
|
@@ -573,13 +797,17 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
|
|
|
573
797
|
except Exception:
|
|
574
798
|
payload = raw_text
|
|
575
799
|
|
|
576
|
-
|
|
800
|
+
transcript_bundle = _extract_transcript_bundle_from_transcription_payload(payload)
|
|
801
|
+
transcript = transcript_bundle.get("transcript_text", "")
|
|
577
802
|
if transcript:
|
|
578
803
|
return {
|
|
579
804
|
"ok": True,
|
|
580
805
|
"transcription_url": url,
|
|
581
806
|
"error_reason": "",
|
|
582
807
|
"transcript_text": transcript,
|
|
808
|
+
"summary_field_used": transcript_bundle.get("summary_field_used", ""),
|
|
809
|
+
"segment_fallback_used": bool(transcript_bundle.get("segment_fallback_used")),
|
|
810
|
+
"canonical_text_source": transcript_bundle.get("canonical_text_source", "missing"),
|
|
583
811
|
}
|
|
584
812
|
|
|
585
813
|
return {
|
|
@@ -587,6 +815,9 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
|
|
|
587
815
|
"transcription_url": url,
|
|
588
816
|
"error_reason": "transcription_payload_empty",
|
|
589
817
|
"transcript_text": "",
|
|
818
|
+
"summary_field_used": "",
|
|
819
|
+
"segment_fallback_used": False,
|
|
820
|
+
"canonical_text_source": "missing",
|
|
590
821
|
}
|
|
591
822
|
|
|
592
823
|
|
|
@@ -620,6 +851,9 @@ def hydrate_u2_batch_results_from_transcription_urls(
|
|
|
620
851
|
if fetched_text:
|
|
621
852
|
transcript = fetched_text
|
|
622
853
|
candidate["transcript_text"] = fetched_text
|
|
854
|
+
candidate["summary_field_used"] = fetch_result.get("summary_field_used", "")
|
|
855
|
+
candidate["segment_fallback_used"] = bool(fetch_result.get("segment_fallback_used"))
|
|
856
|
+
candidate["canonical_text_source"] = fetch_result.get("canonical_text_source", "missing")
|
|
623
857
|
elif not candidate.get("error_reason"):
|
|
624
858
|
candidate["error_reason"] = fetch_result.get("error_reason") or "transcription_payload_empty"
|
|
625
859
|
|