@tikomni/skills 0.1.11 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tikomni/skills",
3
- "version": "0.1.11",
3
+ "version": "1.0.1",
4
4
  "description": "TikOmni skill installer CLI for structured social media crawling in Codex, Claude Code, and OpenClaw",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/mark-ly-wang/TikOmni-Skills#readme",
@@ -24,6 +24,7 @@
24
24
 
25
25
  ## Optional Fields
26
26
 
27
+ - `duration_ms`
27
28
  - `digg_count`
28
29
  - `comment_count`
29
30
  - `collect_count`
@@ -37,6 +38,7 @@
37
38
  ## Field Rules
38
39
 
39
40
  - `author` is the display name, not an object.
41
+ - `duration_ms` uses milliseconds. Write `null` when the duration is unavailable or not applicable.
40
42
  - Fact fields for the Markdown card go into frontmatter. Do not emit a separate `## Facts` section.
41
43
  - The work-library directory writes only the Markdown card and no extra `.json` sidecar in the same directory.
42
44
  - `primary_text` is the text that is best suited for reading and indexing in the current task.
@@ -32,6 +32,7 @@
32
32
  "subtitle_raw": { "type": "string" },
33
33
  "work_modality": { "type": "string" },
34
34
  "published_date": { "type": "string" },
35
+ "duration_ms": { "type": ["integer", "null"] },
35
36
  "digg_count": { "type": "integer" },
36
37
  "comment_count": { "type": "integer" },
37
38
  "collect_count": { "type": "integer" },
@@ -8,7 +8,7 @@ import time
8
8
  import urllib.error
9
9
  import urllib.request
10
10
  from urllib.parse import urlparse, urlunparse
11
- from typing import Any, Callable, Dict, List, Optional
11
+ from typing import Any, Callable, Dict, List, Optional, Tuple
12
12
 
13
13
  from scripts.core.tikomni_common import (
14
14
  call_json_api,
@@ -23,6 +23,21 @@ from scripts.core.u3_fallback import run_u3_public_url_fallback
23
23
 
24
24
  U2_BATCH_SUBMIT_HARD_LIMIT = 100
25
25
  DEFAULT_U2_PENDING_TIMEOUT_SEC = 60
26
+ SUMMARY_TEXT_FIELDS = (
27
+ "full_text",
28
+ "transcript_text",
29
+ "transcription_text",
30
+ "result_text",
31
+ "summary_text",
32
+ "transcript",
33
+ "transcription",
34
+ "result",
35
+ "content",
36
+ "text",
37
+ )
38
+ SEGMENT_CONTAINER_FIELDS = ("sentences", "segments", "paragraphs")
39
+ SEGMENT_TEXT_FIELDS = ("text", "sentence", "content", "paragraph", "transcript_text")
40
+ CHAR_SPACED_RUN_RE = re.compile(r"(?:[A-Za-z0-9\u4e00-\u9fff]{1,4}\s+){5,}[A-Za-z0-9\u4e00-\u9fff]{1,4}")
26
41
 
27
42
 
28
43
  def clamp_u2_batch_submit_size(size: int, *, default: int = 50, hard_limit: int = U2_BATCH_SUBMIT_HARD_LIMIT) -> int:
@@ -251,6 +266,33 @@ def clean_transcript_text(raw_text: Any) -> str:
251
266
  return normalize_text(raw_text)
252
267
 
253
268
 
269
def _text_signature(text: str) -> str:
    """Return a comparison key for *text*.

    The text is passed through ``clean_transcript_text``, every run of
    non-word characters and underscores is removed, and the remainder is
    lowercased.
    """
    normalized = clean_transcript_text(text)
    word_chars_only = re.sub(r"[\W_]+", "", normalized)
    return word_chars_only.lower()
271
+
272
+
273
def _is_char_spaced_noise_sequence(text: str) -> bool:
    """Report whether *text* looks like a run of space-separated short tokens.

    After cleaning, the text is split on single spaces. It is classified as
    noise only when all of the following hold:

    * there are at least 6 non-empty tokens,
    * at least 4 tokens are exactly one character long,
    * at least 75% of tokens are one or two characters long,
    * at least 50% of tokens contain a character in U+4E00..U+9FFF.
    """
    pieces = [piece for piece in clean_transcript_text(text).split(" ") if piece]
    total = len(pieces)
    if total < 6:
        return False

    lengths = [len(piece) for piece in pieces]
    if lengths.count(1) < 4:
        return False

    # total is at least 6 here, so the divisions below cannot hit zero.
    short_ratio = sum(1 for size in lengths if size <= 2) / total
    if short_ratio < 0.75:
        return False

    cjk_ratio = sum(
        1 for piece in pieces if any("\u4e00" <= ch <= "\u9fff" for ch in piece)
    ) / total
    return cjk_ratio >= 0.5
285
+
286
+
287
def _strip_char_spaced_noise_runs(text: str) -> str:
    """Blank out char-spaced noise runs in *text* and collapse whitespace.

    Every match of ``CHAR_SPACED_RUN_RE`` is checked with
    ``_is_char_spaced_noise_sequence``; matches classified as noise are
    replaced by a single space, all other matches are kept verbatim. The
    result has whitespace runs collapsed to one space and is stripped.
    """

    # The annotation is a string on purpose: a nested ``def`` evaluates its
    # annotations each time the outer function runs, and ``re.Match`` is only
    # subscriptable on Python 3.9+. Quoting it keeps older interpreters from
    # raising TypeError here.
    def _replace(match: "re.Match[str]") -> str:
        chunk = match.group(0)
        return " " if _is_char_spaced_noise_sequence(chunk) else chunk

    cleaned = CHAR_SPACED_RUN_RE.sub(_replace, text)
    return re.sub(r"\s+", " ", cleaned).strip()
294
+
295
+
254
296
  def _ensure_sentence_end(text: str) -> str:
255
297
  if not text:
256
298
  return text
@@ -264,13 +306,36 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
264
306
  if not base:
265
307
  return ""
266
308
 
267
- denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", base)
309
+ denoised = _strip_char_spaced_noise_runs(base)
310
+ denoised = re.sub(r"\b(嗯|啊|呃|额|那个|这个|然后|就是)\b", " ", denoised)
268
311
  denoised = re.sub(r"(嗯+|啊+|呃+)", " ", denoised)
269
312
  denoised = re.sub(r"(就是就是|然后然后|这个这个|那个那个)", " ", denoised)
270
313
  denoised = re.sub(r"\s+", " ", denoised).strip()
271
314
 
272
315
  units = [clean_transcript_text(part) for part in re.split(r"[。!?!?;;\n]+", denoised)]
273
- sentences = [_ensure_sentence_end(unit) for unit in units if unit]
316
+ sentences: List[str] = []
317
+ signatures: List[str] = []
318
+ for unit in units:
319
+ if not unit or _is_char_spaced_noise_sequence(unit):
320
+ continue
321
+ sentence = _ensure_sentence_end(unit)
322
+ signature = _text_signature(sentence)
323
+ if not signature:
324
+ continue
325
+ duplicate = False
326
+ for existing in signatures:
327
+ if signature == existing:
328
+ duplicate = True
329
+ break
330
+ smaller = signature if len(signature) <= len(existing) else existing
331
+ larger = existing if len(signature) <= len(existing) else signature
332
+ if len(smaller) >= 12 and smaller in larger:
333
+ duplicate = True
334
+ break
335
+ if duplicate:
336
+ continue
337
+ signatures.append(signature)
338
+ sentences.append(sentence)
274
339
  if not sentences:
275
340
  fallback = _ensure_sentence_end(denoised)
276
341
  return fallback if fallback else ""
@@ -292,6 +357,94 @@ def derive_asr_clean_text(asr_raw: Any, legacy_clean: Any = None) -> str:
292
357
  return "\n\n".join(paragraphs)
293
358
 
294
359
 
360
def _extract_summary_text_from_node(node: Dict[str, Any]) -> Tuple[str, str]:
    """Pick the first usable summary text stored directly on *node*.

    Keys are tried in ``SUMMARY_TEXT_FIELDS`` order. Only string values are
    considered, and a value counts only if it is non-empty after
    ``clean_transcript_text``.

    Returns:
        ``(cleaned_text, field_name)`` for the first hit, or ``("", "")``
        when no field qualifies.
    """
    for field_name in SUMMARY_TEXT_FIELDS:
        raw = node.get(field_name)
        if not isinstance(raw, str):
            continue
        text = clean_transcript_text(raw)
        if text:
            return text, field_name
    return "", ""
368
+
369
+
370
def _append_segment_lines(node: Any, lines: List[str]) -> None:
    """Collect cleaned segment text from *node* into *lines* (mutated in place).

    * list: every item is processed recursively, in order.
    * str: the cleaned string is appended when non-empty.
    * dict: the first ``SEGMENT_TEXT_FIELDS`` key holding a string that is
      non-empty after cleaning is appended; remaining keys are ignored.
    * anything else: ignored.
    """
    if isinstance(node, list):
        for child in node:
            _append_segment_lines(child, lines)
        return

    if isinstance(node, str):
        candidates = iter((node,))
    elif isinstance(node, dict):
        candidates = (node.get(name) for name in SEGMENT_TEXT_FIELDS)
    else:
        return

    for candidate in candidates:
        if not isinstance(candidate, str):
            continue
        text = clean_transcript_text(candidate)
        if text:
            lines.append(text)
            return
388
+
389
+
390
def _extract_segment_text_from_node(node: Dict[str, Any]) -> str:
    """Build newline-joined transcript text from a segment container on *node*.

    Container keys are tried in ``SEGMENT_CONTAINER_FIELDS`` order and the
    first one that yields any line is used. Lines are then filtered: entries
    with an empty signature, a signature already seen, or that look like
    char-spaced noise are dropped. Returns ``""`` when no container yields
    lines.
    """
    collected: List[str] = []
    for container in SEGMENT_CONTAINER_FIELDS:
        if container in node:
            _append_segment_lines(node.get(container), collected)
            if collected:
                break
    if not collected:
        return ""

    kept: List[str] = []
    known = set()
    for entry in collected:
        key = _text_signature(entry)
        if key and key not in known and not _is_char_spaced_noise_sequence(entry):
            known.add(key)
            kept.append(entry)
    return "\n".join(kept).strip()
410
+
411
+
412
def _extract_canonical_transcript_from_node(node: Dict[str, Any]) -> Dict[str, Any]:
    """Resolve the transcript for a result *node* and record where it came from.

    Sources are tried in order: a direct summary field, then a segment
    container, then ``extract_transcript_text`` over the whole node.

    Returns:
        A dict with ``transcript_text``, ``summary_field_used``,
        ``segment_fallback_used`` and ``canonical_text_source``. When nothing
        is found the text is empty and the source is ``"missing"``.
    """

    def _bundle(text: str, source: str, *, field: str = "", fallback: bool = False) -> Dict[str, Any]:
        return {
            "transcript_text": text,
            "summary_field_used": field,
            "segment_fallback_used": fallback,
            "canonical_text_source": source,
        }

    text, field = _extract_summary_text_from_node(node)
    if text:
        return _bundle(text, f"summary:{field}", field=field)

    text = _extract_segment_text_from_node(node)
    if text:
        return _bundle(text, "segments", fallback=True)

    text = clean_transcript_text(extract_transcript_text(node))
    if text:
        # The deep-search path is also flagged as a fallback.
        return _bundle(text, "deep_search_fallback", fallback=True)

    return _bundle("", "missing")
446
+
447
+
295
448
  def extract_u2_task_metrics(payload: Any) -> Dict[str, Any]:
296
449
  metrics = deep_find_first(payload, ["task_metrics", "metrics"])
297
450
  return metrics if isinstance(metrics, dict) else {}
@@ -349,16 +502,8 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
349
502
  )
350
503
  file_url = normalize_media_url(str(raw_file_url or ""))
351
504
  if file_url:
352
- transcript = clean_transcript_text(
353
- node.get("transcript_text")
354
- or node.get("text")
355
- or node.get("transcript")
356
- or node.get("transcription")
357
- or node.get("content")
358
- or ""
359
- )
360
- if not transcript:
361
- transcript = clean_transcript_text(extract_transcript_text(node))
505
+ canonical = _extract_canonical_transcript_from_node(node)
506
+ transcript = clean_transcript_text(canonical.get("transcript_text"))
362
507
 
363
508
  status = _status_upper(node.get("status") or node.get("task_status") or node.get("state"))
364
509
  error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
@@ -372,6 +517,9 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
372
517
  "transcription_url": transcription_url,
373
518
  "error_reason": error_reason,
374
519
  "ok": ok,
520
+ "summary_field_used": canonical.get("summary_field_used", ""),
521
+ "segment_fallback_used": bool(canonical.get("segment_fallback_used")),
522
+ "canonical_text_source": canonical.get("canonical_text_source", "missing"),
375
523
  }
376
524
 
377
525
  existing = found.get(file_url)
@@ -380,12 +528,16 @@ def extract_u2_batch_result_items(payload: Any) -> List[Dict[str, Any]]:
380
528
  else:
381
529
  old_score = (
382
530
  1 if existing.get("ok") else 0,
531
+ 1 if not existing.get("segment_fallback_used") else 0,
532
+ 1 if existing.get("summary_field_used") else 0,
383
533
  len(str(existing.get("transcript_text") or "")),
384
534
  1 if existing.get("transcription_url") else 0,
385
535
  1 if not existing.get("error_reason") else 0,
386
536
  )
387
537
  new_score = (
388
538
  1 if candidate.get("ok") else 0,
539
+ 1 if not candidate.get("segment_fallback_used") else 0,
540
+ 1 if candidate.get("summary_field_used") else 0,
389
541
  len(str(candidate.get("transcript_text") or "")),
390
542
  1 if candidate.get("transcription_url") else 0,
391
543
  1 if not candidate.get("error_reason") else 0,
@@ -441,16 +593,8 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
441
593
  item_index_raw = node.get("item_index")
442
594
  item_index = _parse_non_negative_item_index(item_index_raw)
443
595
  if item_index is not None:
444
- transcript = clean_transcript_text(
445
- node.get("transcript_text")
446
- or node.get("text")
447
- or node.get("transcript")
448
- or node.get("transcription")
449
- or node.get("content")
450
- or ""
451
- )
452
- if not transcript:
453
- transcript = clean_transcript_text(extract_transcript_text(node))
596
+ canonical = _extract_canonical_transcript_from_node(node)
597
+ transcript = clean_transcript_text(canonical.get("transcript_text"))
454
598
 
455
599
  status = _status_upper(node.get("task_status") or node.get("status") or node.get("state"))
456
600
  error_reason = str(node.get("error_reason") or node.get("error") or "").strip()
@@ -464,6 +608,9 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
464
608
  "error_reason": error_reason,
465
609
  "transcription_url": transcription_url,
466
610
  "ok": ok,
611
+ "summary_field_used": canonical.get("summary_field_used", ""),
612
+ "segment_fallback_used": bool(canonical.get("segment_fallback_used")),
613
+ "canonical_text_source": canonical.get("canonical_text_source", "missing"),
467
614
  }
468
615
 
469
616
  existing = mapped.get(item_index)
@@ -472,12 +619,16 @@ def map_u2_batch_results_by_item_index(payload: Any) -> Dict[int, Dict[str, Any]
472
619
  else:
473
620
  old_score = (
474
621
  1 if existing.get("ok") else 0,
622
+ 1 if not existing.get("segment_fallback_used") else 0,
623
+ 1 if existing.get("summary_field_used") else 0,
475
624
  len(str(existing.get("transcript_text") or "")),
476
625
  1 if existing.get("transcription_url") else 0,
477
626
  1 if not existing.get("error_reason") else 0,
478
627
  )
479
628
  new_score = (
480
629
  1 if candidate.get("ok") else 0,
630
+ 1 if not candidate.get("segment_fallback_used") else 0,
631
+ 1 if candidate.get("summary_field_used") else 0,
481
632
  len(str(candidate.get("transcript_text") or "")),
482
633
  1 if candidate.get("transcription_url") else 0,
483
634
  1 if not candidate.get("error_reason") else 0,
@@ -506,29 +657,102 @@ def _extract_transcript_from_transcription_payload(payload: Any) -> str:
506
657
  except Exception:
507
658
  return ""
508
659
 
509
- transcript = clean_transcript_text(deep_find_first(payload, ["full_text"]))
510
- if transcript:
511
- return transcript
660
+ for key in SUMMARY_TEXT_FIELDS:
661
+ transcript = clean_transcript_text(deep_find_first(payload, [key]))
662
+ if transcript:
663
+ return transcript
664
+
665
+ for key in SEGMENT_CONTAINER_FIELDS:
666
+ segments = deep_find_first(payload, [key])
667
+ if segments is None:
668
+ continue
669
+ lines: List[str] = []
670
+ _append_segment_lines(segments, lines)
671
+ deduped: List[str] = []
672
+ seen = set()
673
+ for line in lines:
674
+ signature = _text_signature(line)
675
+ if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
676
+ continue
677
+ seen.add(signature)
678
+ deduped.append(line)
679
+ if deduped:
680
+ return "\n".join(deduped)
512
681
 
513
682
  transcript = clean_transcript_text(extract_transcript_text(payload))
514
683
  if transcript:
515
684
  return transcript
516
685
 
517
- sentences = deep_find_first(payload, ["sentences"])
518
- if isinstance(sentences, list):
686
+ return ""
687
+
688
+
689
def _extract_transcript_bundle_from_transcription_payload(payload: Any) -> Dict[str, Any]:
    """Extract transcript text and its provenance from a transcription payload.

    Args:
        payload: Either an already-decoded structure or a string. A string
            that decodes to a JSON object/array is searched like a decoded
            payload; any other string is treated as the transcript itself.

    Returns:
        A dict with ``transcript_text``, ``summary_field_used``,
        ``segment_fallback_used`` and ``canonical_text_source``. Sources are
        tried in order: summary fields, segment containers, the deep-search
        helper, and finally the cleaned raw string (string payloads only).
        When nothing is found the text is empty and the source is
        ``"missing"``.
    """

    def _bundle(text: str, source: str, *, field: str = "", fallback: bool = False) -> Dict[str, Any]:
        return {
            "transcript_text": text,
            "summary_field_used": field,
            "segment_fallback_used": fallback,
            "canonical_text_source": source,
        }

    raw_text = ""
    if isinstance(payload, str):
        raw_text = clean_transcript_text(payload)
        # Decode before accepting the string verbatim. Returning any
        # non-blank string first would mean a JSON document passed as text
        # is never parsed.
        try:
            decoded = json.loads(payload)
        except (ValueError, RecursionError):
            decoded = None
        if not isinstance(decoded, (dict, list)):
            # Plain text, or a JSON scalar: the string is the transcript.
            if raw_text:
                return _bundle(raw_text, "summary:raw_string", field="raw_string")
            return _bundle("", "missing")
        payload = decoded

    for key in SUMMARY_TEXT_FIELDS:
        value = deep_find_first(payload, [key])
        # Only strings qualify, matching _extract_summary_text_from_node;
        # keys such as "result" or "content" may hold containers.
        if not isinstance(value, str):
            continue
        transcript = clean_transcript_text(value)
        if transcript:
            return _bundle(transcript, f"summary:{key}", field=key)

    for key in SEGMENT_CONTAINER_FIELDS:
        segments = deep_find_first(payload, [key])
        if segments is None:
            continue
        lines: List[str] = []
        _append_segment_lines(segments, lines)
        deduped: List[str] = []
        seen = set()
        for line in lines:
            signature = _text_signature(line)
            if not signature or signature in seen or _is_char_spaced_noise_sequence(line):
                continue
            seen.add(signature)
            deduped.append(line)
        if deduped:
            return _bundle("\n".join(deduped), f"segments:{key}", fallback=True)

    transcript = clean_transcript_text(extract_transcript_text(payload))
    if transcript:
        return _bundle(transcript, "deep_search_fallback", fallback=True)

    # A JSON string with no recognizable transcript still yields its cleaned
    # text, so callers that passed such strings get the same text they did
    # before the decode-first change.
    if raw_text:
        return _bundle(raw_text, "summary:raw_string", field="raw_string")
    return _bundle("", "missing")
532
756
 
533
757
 
534
758
  def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int) -> Dict[str, Any]:
@@ -573,13 +797,17 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
573
797
  except Exception:
574
798
  payload = raw_text
575
799
 
576
- transcript = _extract_transcript_from_transcription_payload(payload)
800
+ transcript_bundle = _extract_transcript_bundle_from_transcription_payload(payload)
801
+ transcript = transcript_bundle.get("transcript_text", "")
577
802
  if transcript:
578
803
  return {
579
804
  "ok": True,
580
805
  "transcription_url": url,
581
806
  "error_reason": "",
582
807
  "transcript_text": transcript,
808
+ "summary_field_used": transcript_bundle.get("summary_field_used", ""),
809
+ "segment_fallback_used": bool(transcript_bundle.get("segment_fallback_used")),
810
+ "canonical_text_source": transcript_bundle.get("canonical_text_source", "missing"),
583
811
  }
584
812
 
585
813
  return {
@@ -587,6 +815,9 @@ def fetch_transcription_text_by_url(*, transcription_url: str, timeout_ms: int)
587
815
  "transcription_url": url,
588
816
  "error_reason": "transcription_payload_empty",
589
817
  "transcript_text": "",
818
+ "summary_field_used": "",
819
+ "segment_fallback_used": False,
820
+ "canonical_text_source": "missing",
590
821
  }
591
822
 
592
823
 
@@ -620,6 +851,9 @@ def hydrate_u2_batch_results_from_transcription_urls(
620
851
  if fetched_text:
621
852
  transcript = fetched_text
622
853
  candidate["transcript_text"] = fetched_text
854
+ candidate["summary_field_used"] = fetch_result.get("summary_field_used", "")
855
+ candidate["segment_fallback_used"] = bool(fetch_result.get("segment_fallback_used"))
856
+ candidate["canonical_text_source"] = fetch_result.get("canonical_text_source", "missing")
623
857
  elif not candidate.get("error_reason"):
624
858
  candidate["error_reason"] = fetch_result.get("error_reason") or "transcription_payload_empty"
625
859