@they-juanreina/compost-cli 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. package/dist/commands/agreement.d.ts +3 -0
  2. package/dist/commands/agreement.d.ts.map +1 -0
  3. package/dist/commands/agreement.js +35 -0
  4. package/dist/commands/agreement.js.map +1 -0
  5. package/dist/commands/create.d.ts +1 -0
  6. package/dist/commands/create.d.ts.map +1 -1
  7. package/dist/commands/create.js +39 -1
  8. package/dist/commands/create.js.map +1 -1
  9. package/dist/commands/export.d.ts.map +1 -1
  10. package/dist/commands/export.js +47 -4
  11. package/dist/commands/export.js.map +1 -1
  12. package/dist/commands/import.d.ts +3 -0
  13. package/dist/commands/import.d.ts.map +1 -0
  14. package/dist/commands/import.js +59 -0
  15. package/dist/commands/import.js.map +1 -0
  16. package/dist/commands/init.d.ts.map +1 -1
  17. package/dist/commands/init.js +1 -0
  18. package/dist/commands/init.js.map +1 -1
  19. package/dist/commands/jobs.d.ts +3 -0
  20. package/dist/commands/jobs.d.ts.map +1 -0
  21. package/dist/commands/jobs.js +105 -0
  22. package/dist/commands/jobs.js.map +1 -0
  23. package/dist/commands/label.d.ts +3 -0
  24. package/dist/commands/label.d.ts.map +1 -0
  25. package/dist/commands/label.js +67 -0
  26. package/dist/commands/label.js.map +1 -0
  27. package/dist/commands/models.d.ts.map +1 -1
  28. package/dist/commands/models.js +2 -1
  29. package/dist/commands/models.js.map +1 -1
  30. package/dist/commands/recode.d.ts +3 -0
  31. package/dist/commands/recode.d.ts.map +1 -0
  32. package/dist/commands/recode.js +60 -0
  33. package/dist/commands/recode.js.map +1 -0
  34. package/dist/commands/reindex.d.ts.map +1 -1
  35. package/dist/commands/reindex.js +6 -4
  36. package/dist/commands/reindex.js.map +1 -1
  37. package/dist/commands/rerun.d.ts +3 -0
  38. package/dist/commands/rerun.d.ts.map +1 -0
  39. package/dist/commands/rerun.js +91 -0
  40. package/dist/commands/rerun.js.map +1 -0
  41. package/dist/commands/search.d.ts.map +1 -1
  42. package/dist/commands/search.js +2 -1
  43. package/dist/commands/search.js.map +1 -1
  44. package/dist/commands/secrets.d.ts +3 -0
  45. package/dist/commands/secrets.d.ts.map +1 -0
  46. package/dist/commands/secrets.js +143 -0
  47. package/dist/commands/secrets.js.map +1 -0
  48. package/dist/commands/setup.d.ts.map +1 -1
  49. package/dist/commands/setup.js +90 -1
  50. package/dist/commands/setup.js.map +1 -1
  51. package/dist/commands/status.d.ts.map +1 -1
  52. package/dist/commands/status.js +2 -1
  53. package/dist/commands/status.js.map +1 -1
  54. package/dist/commands/transcribe.d.ts.map +1 -1
  55. package/dist/commands/transcribe.js +13 -2
  56. package/dist/commands/transcribe.js.map +1 -1
  57. package/dist/commands/validate.d.ts.map +1 -1
  58. package/dist/commands/validate.js +29 -1
  59. package/dist/commands/validate.js.map +1 -1
  60. package/dist/engine.d.ts +23 -0
  61. package/dist/engine.d.ts.map +1 -0
  62. package/dist/engine.js +32 -0
  63. package/dist/engine.js.map +1 -0
  64. package/dist/exporters/prov.d.ts +11 -0
  65. package/dist/exporters/prov.d.ts.map +1 -0
  66. package/dist/exporters/prov.js +151 -0
  67. package/dist/exporters/prov.js.map +1 -0
  68. package/dist/index.d.ts.map +1 -1
  69. package/dist/index.js +6 -0
  70. package/dist/index.js.map +1 -1
  71. package/dist/lib/agreement.d.ts +77 -0
  72. package/dist/lib/agreement.d.ts.map +1 -0
  73. package/dist/lib/agreement.js +261 -0
  74. package/dist/lib/agreement.js.map +1 -0
  75. package/dist/lib/artifacts.d.ts +32 -1
  76. package/dist/lib/artifacts.d.ts.map +1 -1
  77. package/dist/lib/artifacts.js +156 -22
  78. package/dist/lib/artifacts.js.map +1 -1
  79. package/dist/lib/blame.d.ts.map +1 -1
  80. package/dist/lib/blame.js +3 -2
  81. package/dist/lib/blame.js.map +1 -1
  82. package/dist/lib/config.d.ts +3 -0
  83. package/dist/lib/config.d.ts.map +1 -1
  84. package/dist/lib/config.js.map +1 -1
  85. package/dist/lib/doctor.d.ts +3 -0
  86. package/dist/lib/doctor.d.ts.map +1 -1
  87. package/dist/lib/doctor.js +24 -1
  88. package/dist/lib/doctor.js.map +1 -1
  89. package/dist/lib/events.d.ts +34 -1
  90. package/dist/lib/events.d.ts.map +1 -1
  91. package/dist/lib/events.js +35 -1
  92. package/dist/lib/events.js.map +1 -1
  93. package/dist/lib/importTranscript.d.ts +16 -0
  94. package/dist/lib/importTranscript.d.ts.map +1 -0
  95. package/dist/lib/importTranscript.js +94 -0
  96. package/dist/lib/importTranscript.js.map +1 -0
  97. package/dist/lib/ingest.d.ts.map +1 -1
  98. package/dist/lib/ingest.js +12 -6
  99. package/dist/lib/ingest.js.map +1 -1
  100. package/dist/lib/journal.d.ts +13 -0
  101. package/dist/lib/journal.d.ts.map +1 -1
  102. package/dist/lib/journal.js +58 -2
  103. package/dist/lib/journal.js.map +1 -1
  104. package/dist/lib/legacyNative.d.ts +24 -0
  105. package/dist/lib/legacyNative.d.ts.map +1 -0
  106. package/dist/lib/legacyNative.js +51 -0
  107. package/dist/lib/legacyNative.js.map +1 -0
  108. package/dist/lib/migrate.d.ts.map +1 -1
  109. package/dist/lib/migrate.js +1 -0
  110. package/dist/lib/migrate.js.map +1 -1
  111. package/dist/lib/nativeRuntime.d.ts +6 -3
  112. package/dist/lib/nativeRuntime.d.ts.map +1 -1
  113. package/dist/lib/nativeRuntime.js +6 -3
  114. package/dist/lib/nativeRuntime.js.map +1 -1
  115. package/dist/lib/provisionNative.js +1 -1
  116. package/dist/lib/provisionNative.js.map +1 -1
  117. package/dist/lib/queue.d.ts +25 -0
  118. package/dist/lib/queue.d.ts.map +1 -1
  119. package/dist/lib/queue.js +70 -3
  120. package/dist/lib/queue.js.map +1 -1
  121. package/dist/lib/reads.d.ts +24 -0
  122. package/dist/lib/reads.d.ts.map +1 -0
  123. package/dist/lib/reads.js +115 -0
  124. package/dist/lib/reads.js.map +1 -0
  125. package/dist/lib/recode.d.ts +19 -0
  126. package/dist/lib/recode.d.ts.map +1 -0
  127. package/dist/lib/recode.js +43 -0
  128. package/dist/lib/recode.js.map +1 -0
  129. package/dist/lib/rerun.d.ts +51 -0
  130. package/dist/lib/rerun.d.ts.map +1 -0
  131. package/dist/lib/rerun.js +166 -0
  132. package/dist/lib/rerun.js.map +1 -0
  133. package/dist/lib/retrieve.d.ts +8 -4
  134. package/dist/lib/retrieve.d.ts.map +1 -1
  135. package/dist/lib/retrieve.js +12 -10
  136. package/dist/lib/retrieve.js.map +1 -1
  137. package/dist/lib/schemas.generated.d.ts.map +1 -1
  138. package/dist/lib/schemas.generated.js +28 -0
  139. package/dist/lib/schemas.generated.js.map +1 -1
  140. package/dist/lib/secrets.d.ts +158 -0
  141. package/dist/lib/secrets.d.ts.map +1 -0
  142. package/dist/lib/secrets.js +507 -0
  143. package/dist/lib/secrets.js.map +1 -0
  144. package/dist/lib/seed.d.ts +5 -0
  145. package/dist/lib/seed.d.ts.map +1 -1
  146. package/dist/lib/seed.js +15 -2
  147. package/dist/lib/seed.js.map +1 -1
  148. package/dist/lib/seedResolve.d.ts.map +1 -1
  149. package/dist/lib/seedResolve.js +1 -0
  150. package/dist/lib/seedResolve.js.map +1 -1
  151. package/dist/lib/session.d.ts +14 -0
  152. package/dist/lib/session.d.ts.map +1 -1
  153. package/dist/lib/session.js +47 -0
  154. package/dist/lib/session.js.map +1 -1
  155. package/dist/lib/setup.d.ts +5 -0
  156. package/dist/lib/setup.d.ts.map +1 -1
  157. package/dist/lib/setup.js +78 -14
  158. package/dist/lib/setup.js.map +1 -1
  159. package/dist/lib/setupWizard.d.ts +51 -0
  160. package/dist/lib/setupWizard.d.ts.map +1 -0
  161. package/dist/lib/setupWizard.js +223 -0
  162. package/dist/lib/setupWizard.js.map +1 -0
  163. package/dist/lib/snap.d.ts.map +1 -1
  164. package/dist/lib/snap.js +2 -5
  165. package/dist/lib/snap.js.map +1 -1
  166. package/dist/lib/speakers.d.ts +41 -0
  167. package/dist/lib/speakers.d.ts.map +1 -0
  168. package/dist/lib/speakers.js +78 -0
  169. package/dist/lib/speakers.js.map +1 -0
  170. package/dist/lib/status.d.ts.map +1 -1
  171. package/dist/lib/status.js +21 -0
  172. package/dist/lib/status.js.map +1 -1
  173. package/dist/lib/userConfig.d.ts +22 -0
  174. package/dist/lib/userConfig.d.ts.map +1 -0
  175. package/dist/lib/userConfig.js +67 -0
  176. package/dist/lib/userConfig.js.map +1 -0
  177. package/dist/lib/validate.d.ts +18 -0
  178. package/dist/lib/validate.d.ts.map +1 -1
  179. package/dist/lib/validate.js +70 -1
  180. package/dist/lib/validate.js.map +1 -1
  181. package/dist/lib/version.d.ts +30 -0
  182. package/dist/lib/version.d.ts.map +1 -0
  183. package/dist/lib/version.js +73 -0
  184. package/dist/lib/version.js.map +1 -0
  185. package/dist/llm/adapter.d.ts.map +1 -1
  186. package/dist/llm/adapter.js +2 -0
  187. package/dist/llm/adapter.js.map +1 -1
  188. package/dist/llm/providers/ollama.d.ts.map +1 -1
  189. package/dist/llm/providers/ollama.js +6 -0
  190. package/dist/llm/providers/ollama.js.map +1 -1
  191. package/dist/loops/ingest_watcher.d.ts.map +1 -1
  192. package/dist/loops/ingest_watcher.js +6 -3
  193. package/dist/loops/ingest_watcher.js.map +1 -1
  194. package/dist/loops/legacy_worker.d.ts +28 -1
  195. package/dist/loops/legacy_worker.d.ts.map +1 -1
  196. package/dist/loops/legacy_worker.js +81 -9
  197. package/dist/loops/legacy_worker.js.map +1 -1
  198. package/dist/loops/supervisor.d.ts +3 -0
  199. package/dist/loops/supervisor.d.ts.map +1 -1
  200. package/dist/loops/supervisor.js +12 -0
  201. package/dist/loops/supervisor.js.map +1 -1
  202. package/dist/loops/synthesis.d.ts.map +1 -1
  203. package/dist/loops/synthesis.js +15 -0
  204. package/dist/loops/synthesis.js.map +1 -1
  205. package/dist/loops/transcribe_worker.d.ts.map +1 -1
  206. package/dist/loops/transcribe_worker.js +2 -4
  207. package/dist/loops/transcribe_worker.js.map +1 -1
  208. package/dist/output.d.ts +13 -1
  209. package/dist/output.d.ts.map +1 -1
  210. package/dist/output.js +22 -2
  211. package/dist/output.js.map +1 -1
  212. package/dist/render/human.d.ts +20 -0
  213. package/dist/render/human.d.ts.map +1 -0
  214. package/dist/render/human.js +54 -0
  215. package/dist/render/human.js.map +1 -0
  216. package/dist/router.d.ts.map +1 -1
  217. package/dist/router.js +17 -2
  218. package/dist/router.js.map +1 -1
  219. package/package.json +18 -5
  220. package/templates/config.toml +6 -1
  221. package/transcriber/app/__init__.py +3 -0
  222. package/transcriber/app/asr.py +198 -0
  223. package/transcriber/app/asr_parakeet.py +174 -0
  224. package/transcriber/app/cue_parser.py +110 -0
  225. package/transcriber/app/diarization.py +330 -0
  226. package/transcriber/app/frame_annotation.py +77 -0
  227. package/transcriber/app/frames.py +130 -0
  228. package/transcriber/app/health.py +70 -0
  229. package/transcriber/app/legacy.py +355 -0
  230. package/transcriber/app/legacy_cli.py +90 -0
  231. package/transcriber/app/main.py +30 -0
  232. package/transcriber/app/pipeline.py +210 -0
  233. package/transcriber/app/pptx_export.py +42 -0
  234. package/transcriber/app/prosody.py +128 -0
  235. package/transcriber/app/routes/__init__.py +1 -0
  236. package/transcriber/app/routes/legacy.py +117 -0
  237. package/transcriber/app/routes/transcribe.py +133 -0
  238. package/transcriber/app/shot_change.py +74 -0
  239. package/transcriber/app/silence_typer.py +144 -0
  240. package/transcriber/app/transcribe_cli.py +82 -0
  241. package/transcriber/app/vad.py +216 -0
  242. package/transcriber/pyproject.toml +56 -0
@@ -0,0 +1,110 @@
1
+ """Cue parser (#10).
2
+
3
+ Whisper-large-v3 with event-tag tokens emits inline markers like [laughter],
4
+ [sigh], [cough], [clear_throat], [unintelligible], and code-switching markers.
5
+ This module pulls those out of utterance text into structured cues[] entries
6
+ (schema/cues.taxonomy.json) and returns the cleaned text.
7
+
8
+ Pure and deterministic — no model. The ASR wrapper (asr.py) produces the
9
+ tagged text; this turns it into cues.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from typing import Any
16
+
17
+ # Whisper/Whisper-AT event tag → compost cue kind (cues.taxonomy.json).
18
+ TAG_TO_KIND: dict[str, str] = {
19
+ "laughter": "laughter",
20
+ "laugh": "laughter",
21
+ "laughs": "laughter",
22
+ "sigh": "sigh",
23
+ "sighs": "sigh",
24
+ "cough": "cough",
25
+ "coughs": "cough",
26
+ "clear_throat": "throat-clear",
27
+ "throat_clear": "throat-clear",
28
+ "throat-clear": "throat-clear",
29
+ "unintelligible": "unintelligible",
30
+ "inaudible": "unintelligible",
31
+ "code_switch": "code-switching",
32
+ "code-switch": "code-switching",
33
+ "code_switching": "code-switching",
34
+ }
35
+
36
+ # Default confidence assigned to a tag-derived cue when the ASR gives none.
37
+ DEFAULT_CONFIDENCE = 0.8
38
+
39
+ _TAG_RE = re.compile(r"\[([a-zA-Z_\-]+)\]")
40
+
41
+
42
+ def _clean_text(text: str) -> str:
43
+ # Drop recognized event tags, collapse the resulting double spaces.
44
+ def repl(m: re.Match[str]) -> str:
45
+ return "" if m.group(1).lower() in TAG_TO_KIND else m.group(0)
46
+
47
+ return re.sub(r"\s{2,}", " ", _TAG_RE.sub(repl, text)).strip()
48
+
49
+
50
def parse_cues_from_utterance(
    utterance: dict[str, Any],
    next_cue_index: int = 1,
    confidence: float = DEFAULT_CONFIDENCE,
) -> tuple[str, list[dict[str, Any]]]:
    """Return (cleaned_text, cues) for one utterance.

    Every recognized ``[tag]`` in the utterance text becomes one cue, numbered
    ``CUE-NNN`` starting at ``next_cue_index``. Unrecognized bracketed tokens
    are ignored here and left in the text.

    Cue timing: if a word in ``words[]`` matches the tag, use that word's span;
    otherwise fall back to the utterance span. Each word is used for at most
    one cue, so a tag that occurs several times in the same utterance is timed
    by successive matching words instead of all repeats sharing the first
    word's span.

    A missing or ``None`` ``text`` / ``words`` is treated as empty.
    """
    # `or` (rather than a .get default) also covers keys present with None.
    text = utterance.get("text") or ""
    words = utterance.get("words") or []
    speaker_id = utterance.get("speaker_id")
    cues: list[dict[str, Any]] = []
    claimed: set[int] = set()  # indices of words already used to time a cue
    idx = next_cue_index

    for m in _TAG_RE.finditer(text):
        kind = TAG_TO_KIND.get(m.group(1).lower())
        if kind is None:
            continue
        token = m.group(0)
        hit = next(
            (
                i
                for i, w in enumerate(words)
                if i not in claimed and w.get("w") == token
            ),
            None,
        )
        if hit is None:
            candidates: list[dict[str, Any]] = []  # -> utterance-span fallback
        else:
            claimed.add(hit)
            candidates = [words[hit]]
        start_ms, end_ms = _tag_span(token, candidates, utterance)
        cue: dict[str, Any] = {
            "id": f"CUE-{idx:03d}",
            "kind": kind,
            "start_ms": start_ms,
            "end_ms": end_ms,
            "source": "audio",
            "confidence": confidence,
        }
        if speaker_id is not None:
            cue["speaker_id"] = speaker_id
        cues.append(cue)
        idx += 1

    return _clean_text(text), cues
85
+
86
+
87
+ def _tag_span(
88
+ tag_token: str,
89
+ words: list[dict[str, Any]],
90
+ utterance: dict[str, Any],
91
+ ) -> tuple[int, int]:
92
+ for w in words:
93
+ if w.get("w") == tag_token:
94
+ return int(w["s"]), int(w["e"])
95
+ return int(utterance["start_ms"]), int(utterance["end_ms"])
96
+
97
+
98
def parse_transcript_cues(transcript: dict[str, Any]) -> dict[str, Any]:
    """Move inline event tags of every utterance into the transcript's cues[].

    Each utterance's ``text`` is replaced by its tag-free version and the cues
    found are appended to ``transcript["cues"]`` (created when absent). New cue
    ids are numbered from the current length of cues[] plus one. The transcript
    is modified in place and also returned.
    """
    all_cues = transcript.setdefault("cues", [])
    for utterance in transcript.get("utterances", []):
        # Numbering always resumes right after whatever is already collected.
        stripped, found = parse_cues_from_utterance(
            utterance, next_cue_index=len(all_cues) + 1
        )
        utterance["text"] = stripped
        all_cues.extend(found)
    return transcript
@@ -0,0 +1,330 @@
1
+ """pyannote-audio diarization + word-level alignment (#11).
2
+
3
+ The pyannote pipeline (gated model; needs HUGGINGFACE_TOKEN + torch) is loaded
4
+ lazily. The alignment maths — assigning a stable speaker_id to each utterance
5
+ by maximum temporal overlap with diarization turns, flagging overlap regions,
6
+ and gating low-confidence sessions — is pure and fully unit-tested.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from dataclasses import dataclass
13
+ from functools import lru_cache
14
+ from typing import Any, Protocol
15
+
16
# Sessions whose mean per-utterance overlap confidence falls under this floor
# are queued for human speaker labelling rather than trusted.
DIARIZATION_CONFIDENCE_FLOOR = 0.5


@dataclass(frozen=True)
class Turn:
    """One diarization turn: a speaker label attached to a millisecond span.

    Instances are immutable (frozen dataclass).
    """

    # Span boundaries in milliseconds.
    start_ms: int
    end_ms: int
    # Speaker label as reported by the diarization backend.
    speaker: str
26
+
27
+
28
# A speaker holding less than this share of total speech is treated as an
# over-segmentation fragment and folded into the nearest dominant cluster
# (#178). pyannote 3.1 routinely splits a clean 2-party interview into 5–6
# speakers (~85% / 10% + three 1–3% slivers); those slivers are temporal
# fragments of the dominant pair, not additional speakers.
DEFAULT_MIN_SPEAKER_SHARE = 0.05


class DiarizationBackend(Protocol):
    """Structural interface for anything that can diarize an audio file."""

    def diarize(self, audio_path: str) -> list[dict[str, Any]]:
        """Return the diarization turns found in *audio_path* as dicts."""
        ...


# Identifier of the pretrained pyannote diarization pipeline.
PYANNOTE_MODEL = "pyannote/speaker-diarization-3.1"
41
+
42
+
43
+ def _resolve_diar_device(requested: str) -> str: # pragma: no cover - env-dependent
44
+ """Map 'auto' to the best available device. On Apple Silicon that's MPS
45
+ (Metal) — ~18x faster than CPU for pyannote with identical results on
46
+ torch>=2.12. 'cpu'/'mps'/'cuda' pass through."""
47
+ if requested != "auto":
48
+ return requested
49
+ try:
50
+ import torch # type: ignore
51
+
52
+ if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
53
+ return "mps"
54
+ if torch.cuda.is_available():
55
+ return "cuda"
56
+ except ImportError:
57
+ pass
58
+ return "cpu"
59
+
60
+
61
class PyannoteBackend:  # pragma: no cover - needs gated weights + torch
    """Concrete DiarizationBackend wrapping pyannote-audio.

    The pipeline is loaded once per instance, in ``__init__``. The HuggingFace
    token comes from the HUGGINGFACE_TOKEN or HF_TOKEN env vars (one must be
    set; the user must also have accepted the license at
    hf.co/pyannote/speaker-diarization-3.1).
    """

    def __init__(self, device: str | None = None) -> None:
        """Load the pretrained pipeline and move it to the resolved device.

        ``device`` overrides the COMPOST_DIARIZATION_DEVICE env var, which in
        turn defaults to 'auto'.

        Raises:
            RuntimeError: no token is set, the pyannote/torchaudio imports
                fail, or the pipeline could not be loaded.
        """
        import inspect
        import os

        token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
        if not token:
            raise RuntimeError(
                "pyannote needs HUGGINGFACE_TOKEN to download the gated model. "
                "Set it in .env.local and accept the license at hf.co/pyannote/speaker-diarization-3.1."
            )
        try:
            import torch  # type: ignore
            import torchaudio  # type: ignore
            from pyannote.audio import Pipeline  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "pyannote.audio / torchaudio not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from e

        resolved = _resolve_diar_device(
            device or os.environ.get("COMPOST_DIARIZATION_DEVICE", "auto")
        )
        # On Apple Silicon, MPS runs pyannote ~18x faster than CPU with identical
        # results (verified on torch>=2.12); enable CPU fallback for any op MPS
        # lacks so it can never error out mid-pipeline (#176).
        if resolved == "mps":
            os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

        # diarize() below supports both pyannote 3.x and 4.x outputs, so the
        # loader must too: older releases name the auth keyword
        # `use_auth_token`, newer ones `token`. Pick whichever this install
        # declares, defaulting to `token` when the signature is inconclusive.
        try:
            params = inspect.signature(Pipeline.from_pretrained).parameters
        except (TypeError, ValueError):
            params = {}
        legacy_only = "use_auth_token" in params and "token" not in params
        auth_kwarg = "use_auth_token" if legacy_only else "token"
        pipeline = Pipeline.from_pretrained(PYANNOTE_MODEL, **{auth_kwarg: token})
        # NOTE(review): some pyannote releases appear to return None instead of
        # raising when the gated model cannot be fetched — fail here with a
        # clear message rather than with an AttributeError further down.
        if pipeline is None:
            raise RuntimeError(
                f"pyannote could not load {PYANNOTE_MODEL}. Check that the HuggingFace "
                "token is valid and the license was accepted at "
                "hf.co/pyannote/speaker-diarization-3.1."
            )

        if resolved != "cpu":
            pipeline = pipeline.to(torch.device(resolved))
        self._pipeline = pipeline
        self._device = resolved
        self._torchaudio = torchaudio

    def diarize(self, audio_path: str) -> list[dict[str, Any]]:
        """Run the pipeline on *audio_path*.

        Returns one dict per diarization track with ``start_ms`` / ``end_ms``
        (seconds * 1000, truncated to int) and ``speaker`` (label as str).
        """
        # Preload audio in-memory with torchaudio so pyannote 4.x doesn't hit
        # torchcodec (which requires CUDA runtime libraries we don't ship in
        # the CPU-only container). This is the documented fallback path.
        waveform, sample_rate = self._torchaudio.load(audio_path)
        output = self._pipeline({"waveform": waveform, "sample_rate": sample_rate})
        # pyannote 4.x returns DiarizeOutput; 3.x returned the Annotation directly.
        # Support both by reading .speaker_diarization if present, else the object itself.
        diarization = getattr(output, "speaker_diarization", output)
        return [
            {
                "start_ms": int(segment.start * 1000),
                "end_ms": int(segment.end * 1000),
                "speaker": str(speaker),
            }
            for segment, _, speaker in diarization.itertracks(yield_label=True)
        ]
121
+
122
+
123
+ @lru_cache(maxsize=1)
124
+ def _load_pyannote(token_present: bool) -> DiarizationBackend: # pragma: no cover - needs weights
125
+ if not token_present:
126
+ raise RuntimeError(
127
+ "pyannote needs HUGGINGFACE_TOKEN to download the gated model. "
128
+ "Set it in .env.local and accept the license at hf.co/pyannote/speaker-diarization-3.1."
129
+ )
130
+ return PyannoteBackend()
131
+
132
+
133
# pyannote cluster labels: "SPEAKER_" followed by digits only.
_PYANNOTE_LABEL_RE = re.compile(r"^SPEAKER_(\d+)$")


def normalize_speaker_label(label: str) -> str:
    """Rewrite a pyannote cluster label into the schema's ``^S[0-9]+$`` form.

    pyannote produces labels such as ``SPEAKER_00`` / ``SPEAKER_01`` while the
    transcript schema (schema/transcript.schema.json $defs.speaker.id and
    $defs.utterance.speaker_id) wants ``S{n}``, e.g. ``S0``, ``S1``. The number
    is parsed as an integer, so leading zeros disappear (``SPEAKER_00`` →
    ``S0``). Any label that does not look like a pyannote label — including
    already-canonical ones (``S1``) and the ``S?`` orphan sentinel — is
    returned untouched, which makes the function idempotent.
    """
    found = _PYANNOTE_LABEL_RE.match(label)
    if found is None:
        return label
    return "S" + str(int(found.group(1)))
148
+
149
+
150
+ def _overlap_ms(a_start: int, a_end: int, b_start: int, b_end: int) -> int:
151
+ return max(0, min(a_end, b_end) - max(a_start, b_start))
152
+
153
+
154
def merge_subthreshold_speakers(
    turns: list[Turn], min_share: float = DEFAULT_MIN_SPEAKER_SHARE
) -> list[Turn]:
    """Collapse speakers with sub-threshold airtime into the nearest dominant
    cluster (#178). Pure transformation; safe to skip when nothing's spurious.

    A 60-min 2-party interview routinely diarizes as 6 speakers (~85% / 10%
    + three 1–3% slivers). The slivers are temporal fragments of the real
    pair, not extra speakers — reassign each sliver-turn to whichever
    dominant speaker is temporally closest (gap to the nearest dominant
    turn before vs after, in start-time order; ties go to the one before).

    Conservative: when every speaker meets the threshold the input is
    returned unchanged, and when no speaker meets the threshold (degenerate)
    the input is also returned unchanged rather than zeroing the speaker set.
    The same holds for empty input and for a non-positive total duration.

    When merging does happen the result is a new list sorted by ``start_ms``.
    """
    if not turns:
        return turns
    total = sum(t.end_ms - t.start_ms for t in turns)
    if total <= 0:
        return turns
    by_speaker: dict[str, int] = {}
    for t in turns:
        by_speaker[t.speaker] = by_speaker.get(t.speaker, 0) + (t.end_ms - t.start_ms)
    dominant = {s for s, dur in by_speaker.items() if dur / total >= min_share}
    if not dominant or len(dominant) == len(by_speaker):
        return turns

    ordered = sorted(turns, key=lambda t: t.start_ms)

    # following[i] is the first dominant turn after position i. Filling it in
    # one right-to-left pass (and tracking the previous dominant turn in the
    # main loop) keeps the whole merge linear instead of rescanning the list
    # for every sliver.
    following: list[Turn | None] = [None] * len(ordered)
    upcoming: Turn | None = None
    for i in range(len(ordered) - 1, -1, -1):
        following[i] = upcoming
        if ordered[i].speaker in dominant:
            upcoming = ordered[i]

    out: list[Turn] = []
    prev_dom: Turn | None = None
    for t, next_dom in zip(ordered, following):
        if t.speaker in dominant:
            out.append(t)
            prev_dom = t
            continue
        if prev_dom is None and next_dom is None:
            out.append(t)  # no anchor — leave as-is rather than guess
            continue
        if prev_dom is None:
            chosen = next_dom.speaker  # type: ignore[union-attr]
        elif next_dom is None:
            chosen = prev_dom.speaker
        else:
            gap_prev = t.start_ms - prev_dom.end_ms
            gap_next = next_dom.start_ms - t.end_ms
            chosen = prev_dom.speaker if gap_prev <= gap_next else next_dom.speaker
        out.append(Turn(t.start_ms, t.end_ms, chosen))
    return out
202
+
203
+
204
+ def _nearest_turn_speaker(utt_start_ms: int, utt_end_ms: int, turns: list[Turn]) -> str | None:
205
+ """Pick the speaker of the turn whose nearest edge is closest to the
206
+ utterance's midpoint (#178). Used to rescue 'S?' orphans — utterances
207
+ whose timing didn't overlap any diarization turn (a few-ms sliver
208
+ between turn boundaries). Returns None if turns is empty.
209
+ """
210
+ if not turns:
211
+ return None
212
+ mid = (utt_start_ms + utt_end_ms) // 2
213
+ return min(
214
+ turns,
215
+ key=lambda t: min(abs(t.start_ms - mid), abs(t.end_ms - mid)),
216
+ ).speaker
217
+
218
+
219
def assign_speaker(utterance: dict[str, Any], turns: list[Turn]) -> tuple[str, float]:
    """Return (speaker_id, confidence) for an utterance by max overlap.

    Overlap is summed per speaker across all of that speaker's turns.
    confidence = overlapped duration with the winning speaker / utterance
    duration, capped at 1.0 (a zero-length utterance counts as 1 ms long).

    Ties resolve to the speaker whose earliest overlapping turn starts first,
    regardless of the order of *turns*; if that is equal too, the speaker met
    first in *turns* wins. When nothing overlaps the result is ("S?", 0.0).
    """
    u_start = utterance["start_ms"]
    u_end = utterance["end_ms"]
    u_dur = max(u_end - u_start, 1)

    overlap_by_speaker: dict[str, int] = {}
    first_start: dict[str, int] = {}  # earliest start among overlapping turns
    for t in turns:
        ov = min(u_end, t.end_ms) - max(u_start, t.start_ms)
        if ov <= 0:
            continue
        overlap_by_speaker[t.speaker] = overlap_by_speaker.get(t.speaker, 0) + ov
        first_start[t.speaker] = min(first_start.get(t.speaker, t.start_ms), t.start_ms)

    if not overlap_by_speaker:
        return "S?", 0.0
    # Larger overlap wins; on equal overlap the smaller first_start wins
    # (negated so max() prefers it).
    winner = max(
        overlap_by_speaker,
        key=lambda s: (overlap_by_speaker[s], -first_start[s]),
    )
    return winner, min(overlap_by_speaker[winner] / u_dur, 1.0)
239
+
240
+
241
def detect_overlaps(
    turns: list[Turn], min_overlap_ms: int = 200, start_index: int = 1
) -> list[dict[str, Any]]:
    """Emit an `overlap` cue for each pair of different-speaker turns that
    overlap by at least ``min_overlap_ms``.

    Ids live in the schema's uniform ``CUE-[0-9]{3,}`` space: the cue ``kind``
    already tells overlap cues apart from ASR-tag cues, so a typed ``CUE-OV-``
    prefix would be redundant and would break the id pattern. Numbering starts
    at ``start_index`` so a caller can continue past cues already present in
    cues[] and keep one collision-free sequence.
    """
    by_start = sorted(turns, key=lambda t: t.start_ms)
    count = len(by_start)
    found: list[dict[str, Any]] = []
    next_id = start_index
    for pos, first in enumerate(by_start):
        for other in range(pos + 1, count):
            second = by_start[other]
            if second.start_ms >= first.end_ms:
                # Sorted by start: nothing further along can reach `first`.
                break
            if first.speaker == second.speaker:
                continue
            begin = max(first.start_ms, second.start_ms)
            finish = min(first.end_ms, second.end_ms)
            if finish - begin < min_overlap_ms:
                continue
            found.append(
                {
                    "id": f"CUE-{next_id:03d}",
                    "kind": "overlap",
                    "start_ms": begin,
                    "end_ms": finish,
                    "source": "audio",
                }
            )
            next_id += 1
    return found
276
+
277
+
278
def align(transcript: dict[str, Any], turns: list[Turn]) -> dict[str, Any]:
    """Stamp every utterance with a speaker_id and a diarization confidence,
    append overlap cues, and flag the session when mean confidence is below
    DIARIZATION_CONFIDENCE_FLOOR. The transcript is modified in place and
    returned.

    Post-fix (#178): an "S?" orphan — an utterance that overlaps no
    diarization turn, e.g. a sliver between turn boundaries — takes the
    speaker of the nearest turn. Its confidence deliberately stays 0.0 so the
    assignment is recognizable as a fallback; such utterances still pull the
    mean down and, in numbers, can trip the needs_speaker_labels gate.
    """
    scores: list[float] = []
    for utterance in transcript.get("utterances", []):
        label, score = assign_speaker(utterance, turns)
        if label == "S?":
            fallback = _nearest_turn_speaker(
                utterance["start_ms"], utterance["end_ms"], turns
            )
            if fallback is not None:
                label = fallback  # score is left at 0.0 as the fallback marker
        # Single write point: pyannote's SPEAKER_NN becomes the schema's S{n}
        # here, so speakers[].id (derived from these) and every
        # utterances[].speaker_id agree with ^S[0-9]+$.
        utterance["speaker_id"] = normalize_speaker_label(label)
        diarization_info = utterance.setdefault("diarization", {})
        diarization_info["confidence"] = round(score, 3)
        scores.append(score)

    all_cues = transcript.setdefault("cues", [])
    first_new_id = len(all_cues) + 1
    all_cues.extend(detect_overlaps(turns, start_index=first_new_id))

    if scores:
        mean_score = sum(scores) / len(scores)
    else:
        mean_score = 0.0
    if mean_score < DIARIZATION_CONFIDENCE_FLOOR:
        transcript["status"] = "needs_speaker_labels"
    return transcript
311
+
312
+
313
class Diarizer:
    """Front end that turns a backend's raw dict output into merged Turn objects."""

    def __init__(self, backend: DiarizationBackend | None = None):
        """Keep *backend*; when None, the pyannote backend is loaded on first use."""
        self._backend = backend

    def _get_backend(self) -> DiarizationBackend:
        """Return the injected backend, or load the pyannote one from the environment."""
        if self._backend is None:
            import os

            has_token = bool(
                os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
            )
            return _load_pyannote(has_token)
        return self._backend

    def diarize(self, audio_path: str) -> list[Turn]:
        """Diarize *audio_path* and return its turns after sliver merging."""
        backend = self._get_backend()
        turns = [
            Turn(int(item["start_ms"]), int(item["end_ms"]), str(item["speaker"]))
            for item in backend.diarize(audio_path)
        ]
        # Over-segmentation slivers are folded into the dominant cluster (#178)
        # before align() and detect_overlaps() ever see the turns.
        return merge_subthreshold_speakers(turns)
@@ -0,0 +1,77 @@
1
+ """Optional frame annotation (#50).
2
+
3
+ A one-sentence description of a frame from a vision-capable model. Off by
4
+ default; opt in via config.toml `[frames] annotation = "claude" | "moondream2"`
5
+ (decision #72). The annotation is recorded as an AI-authored event on the frame
6
+ and surfaces as [draft] until a researcher endorses it.
7
+
8
+ The vision models are injected (the Claude path calls the Anthropic API with
9
+ the frame + linked utterance; the Moondream2 path is a lazy-loaded local model)
10
+ so the gate, prompt, and event shape are testable without weights.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from collections.abc import Callable
16
+ from typing import Any, Protocol
17
+
18
# Instruction sent to the vision model. It asks for a literal "null" when the
# frame holds nothing notable; annotate_frame() treats that reply as a decline.
PROMPT = (
    "In one sentence, describe what's visible in this interview frame that a "
    "researcher reviewing the session might find notable. If nothing is notable, "
    "return null."
)


class VisionModel(Protocol):
    """Structural interface for a vision-capable model."""

    def describe(self, frame_path: str, prompt: str, linked_text: str) -> str | None:
        """Return a description of the frame, or None to decline."""
        ...


def build_prompt(linked_text: str) -> str:
    """The standard prompt + the linked utterance text for context."""
    if linked_text:
        return f'{PROMPT}\n\nThe speaker was saying: "{linked_text}"'
    return PROMPT


def annotate_frame(
    frame: dict[str, Any],
    linked_text: str,
    model: VisionModel,
    *,
    enabled: bool,
    actor_id: str,
) -> dict[str, Any] | None:
    """Return an AI-authored `create` event for the frame's annotation, or None
    when annotation is disabled or the model declines (nothing notable).

    `enabled` reflects the per-seed config gate; right-click "annotate this
    frame" passes enabled=True on demand even when the default is off.

    The model counts as having declined when it returns None, blank text, or
    the word "null" (any case, optionally wrapped in quotes/backticks or
    followed by a period) — PROMPT asks for exactly that reply, and a text
    model answers with the string rather than a real None.

    The event reads ``frame["path"]``, ``frame["id"]`` and ``frame["at_ms"]``;
    `actor_id` is recorded both as the actor and as the model.
    """
    if not enabled:
        return None
    description = model.describe(frame["path"], build_prompt(linked_text), linked_text)
    if description is None:
        return None
    annotation = description.strip()
    if not annotation or annotation.strip("\"'`.").strip().lower() == "null":
        return None
    return {
        "artifact_kind": "frame_annotation",
        "action": "create",
        "actor_type": "ai",
        "actor_id": actor_id,
        "model": actor_id,
        "payload": {
            "frame_id": frame["id"],
            "at_ms": frame["at_ms"],
            "annotation": annotation,
            "status": "draft",
        },
    }
68
+
69
+
70
def claude_vision(call: Callable[[str, str], str | None]) -> VisionModel:
    """Adapt a two-argument vision call ``(frame_path, prompt) -> text`` to the
    VisionModel interface.

    The returned object's ``describe`` forwards the frame path and prompt to
    *call* and returns its result; the ``linked_text`` argument is accepted
    but not passed on.
    """

    class _Claude:
        def describe(self, frame_path: str, prompt: str, linked_text: str) -> str | None:
            reply = call(frame_path, prompt)
            return reply

    adapter = _Claude()
    return adapter
@@ -0,0 +1,130 @@
1
+ """ffmpeg-backed frame extractor (#14).
2
+
3
+ Pulls a JPG from the video stream at each requested trigger timestamp and
4
+ writes it to sessions/<sid>/frames/<padded_ms>.jpg (640x360). Returns the
5
+ frames[] index entries for transcript.json. No classification — frames are
6
+ evidence.
7
+
8
+ Triggers (see schema/frames.taxonomy.json): silence_*, audio_cue, shot_change,
9
+ highlight, manual, sampling. The caller supplies (at_ms, trigger,
10
+ linked_utterance_id?) tuples; this module just extracts + indexes.
11
+
12
+ Idempotent: a frame whose target JPG already exists is not re-extracted, and
13
+ the returned id is stable (FR-<padded_ms>).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import subprocess
19
+ from dataclasses import dataclass
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ FRAME_WIDTH = 640
24
+ FRAME_HEIGHT = 360
25
+ _PAD = 9 # zero-pad ms to 9 digits (~277h) for lexical sort
26
+
27
+
28
+ @dataclass(frozen=True)
29
+ class FrameTrigger:
30
+ at_ms: int
31
+ trigger: str
32
+ linked_utterance_id: str | None = None
33
+
34
+
35
+ def _padded(ms: int) -> str:
36
+ return str(ms).zfill(_PAD)
37
+
38
+
39
+ def frame_id(at_ms: int) -> str:
40
+ return f"FR-{_padded(at_ms)}"
41
+
42
+
43
+ def frame_relpath(session_id: str, at_ms: int) -> str:
44
+ return f"sessions/{session_id}/frames/{_padded(at_ms)}.jpg"
45
+
46
+
47
def _extract_one(video_path: Path, at_ms: int, out_path: Path) -> None:
    """Extract the frame at *at_ms* from *video_path* into *out_path* via ffmpeg.

    The parent directory of *out_path* is created when missing and an existing
    file is overwritten (``-y``).

    Raises:
        RuntimeError: ffmpeg exited non-zero, or left no file / an empty file.
            Whatever was written is removed first, so a later run that skips
            existing JPGs does not mistake a broken file for a finished frame.
        FileNotFoundError: raised by ``subprocess.run`` when the ``ffmpeg``
            executable cannot be found.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    ts = at_ms / 1000.0
    # -ss before -i seeks fast; -frames:v 1 grabs a single frame; scale to 640x360.
    cmd = [
        "ffmpeg",
        "-y",
        "-ss",
        f"{ts:.3f}",
        "-i",
        str(video_path),
        "-frames:v",
        "1",
        "-vf",
        f"scale={FRAME_WIDTH}:{FRAME_HEIGHT}",
        "-q:v",
        "4",
        str(out_path),
    ]
    proc = subprocess.run(
        cmd,
        # Keep ffmpeg from reading (and swallowing) the parent's stdin.
        stdin=subprocess.DEVNULL,
        capture_output=True,
        text=True,
        # ffmpeg may echo bytes that are not valid in the locale encoding;
        # don't let decoding its log abort the extraction.
        errors="replace",
    )
    produced = out_path.exists() and out_path.stat().st_size > 0
    if proc.returncode != 0 or not produced:
        out_path.unlink(missing_ok=True)
        raise RuntimeError(f"ffmpeg failed extracting frame at {at_ms}ms: {proc.stderr[-300:]}")
69
+
70
+
71
def extract_frames(
    video_path: str | Path,
    session_id: str,
    triggers: list[FrameTrigger],
    seed_root: str | Path,
) -> list[dict[str, Any]]:
    """Make sure a JPG exists for every trigger and return the frames[] entries.

    Triggers sharing an ``at_ms`` collapse to the first one given. A frame
    whose JPG is already on disk under *seed_root* is not extracted again, so
    re-runs are idempotent. Entries come back ordered by ``at_ms``; ``path``
    is relative to *seed_root*, and ``linked_utterance_id`` appears only when
    the trigger carries one.
    """
    video = Path(video_path)
    root = Path(seed_root)

    first_by_ms: dict[int, FrameTrigger] = {}
    for candidate in triggers:
        if candidate.at_ms not in first_by_ms:
            first_by_ms[candidate.at_ms] = candidate

    index: list[dict[str, Any]] = []
    for at_ms in sorted(first_by_ms):
        chosen = first_by_ms[at_ms]
        relative = frame_relpath(session_id, at_ms)
        target = root / relative
        if not target.exists():
            _extract_one(video, at_ms, target)
        record: dict[str, Any] = {
            "id": frame_id(at_ms),
            "at_ms": at_ms,
            "path": relative,
            "trigger": chosen.trigger,
        }
        if chosen.linked_utterance_id is not None:
            record["linked_utterance_id"] = chosen.linked_utterance_id
        index.append(record)
    return index
106
+
107
+
108
def sampling_triggers(
    duration_ms: int,
    existing_ms: list[int],
    interval_s: int = 60,
) -> list[FrameTrigger]:
    """Emit a `sampling` trigger every `interval_s` only when no other trigger
    already fired within that window (ROADMAP § Descriptive transcription B).

    Windows are ``[t, t + interval)`` starting at 0 and covering
    ``[0, duration_ms)``; an uncovered window yields a trigger at its start.
    *existing_ms* does not need to be sorted.

    Raises:
        ValueError: ``interval_s`` is not positive — the window would never
            advance and the loop below could not terminate.
    """
    if interval_s <= 0:
        raise ValueError(f"interval_s must be positive, got {interval_s}")
    interval_ms = interval_s * 1000
    existing = sorted(existing_ms)
    out: list[FrameTrigger] = []
    t = 0
    ei = 0
    while t < duration_ms:
        window_end = t + interval_ms
        # advance existing pointer past anything before this window
        while ei < len(existing) and existing[ei] < t:
            ei += 1
        covered = ei < len(existing) and existing[ei] < window_end
        if not covered:
            out.append(FrameTrigger(at_ms=t, trigger="sampling"))
        t = window_end
    return out