@they-juanreina/compost-cli 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/dist/lib/blame.d.ts.map +1 -1
  2. package/dist/lib/blame.js +3 -2
  3. package/dist/lib/blame.js.map +1 -1
  4. package/dist/lib/journal.d.ts.map +1 -1
  5. package/dist/lib/journal.js +9 -0
  6. package/dist/lib/journal.js.map +1 -1
  7. package/dist/lib/migrate.d.ts.map +1 -1
  8. package/dist/lib/migrate.js +1 -0
  9. package/dist/lib/migrate.js.map +1 -1
  10. package/dist/lib/nativeRuntime.d.ts +6 -3
  11. package/dist/lib/nativeRuntime.d.ts.map +1 -1
  12. package/dist/lib/nativeRuntime.js +6 -3
  13. package/dist/lib/nativeRuntime.js.map +1 -1
  14. package/dist/lib/retrieve.d.ts.map +1 -1
  15. package/dist/lib/retrieve.js +0 -8
  16. package/dist/lib/retrieve.js.map +1 -1
  17. package/dist/lib/seedResolve.d.ts.map +1 -1
  18. package/dist/lib/seedResolve.js +1 -0
  19. package/dist/lib/seedResolve.js.map +1 -1
  20. package/dist/lib/setup.d.ts.map +1 -1
  21. package/dist/lib/setup.js +9 -8
  22. package/dist/lib/setup.js.map +1 -1
  23. package/dist/lib/snap.d.ts.map +1 -1
  24. package/dist/lib/snap.js +2 -5
  25. package/dist/lib/snap.js.map +1 -1
  26. package/dist/loops/supervisor.d.ts.map +1 -1
  27. package/dist/loops/supervisor.js +1 -0
  28. package/dist/loops/supervisor.js.map +1 -1
  29. package/dist/loops/transcribe_worker.d.ts.map +1 -1
  30. package/dist/loops/transcribe_worker.js +0 -1
  31. package/dist/loops/transcribe_worker.js.map +1 -1
  32. package/dist/router.js +1 -1
  33. package/package.json +10 -4
  34. package/transcriber/app/__init__.py +3 -0
  35. package/transcriber/app/asr.py +198 -0
  36. package/transcriber/app/asr_parakeet.py +174 -0
  37. package/transcriber/app/cue_parser.py +110 -0
  38. package/transcriber/app/diarization.py +300 -0
  39. package/transcriber/app/frame_annotation.py +77 -0
  40. package/transcriber/app/frames.py +130 -0
  41. package/transcriber/app/health.py +70 -0
  42. package/transcriber/app/legacy.py +355 -0
  43. package/transcriber/app/main.py +30 -0
  44. package/transcriber/app/pipeline.py +204 -0
  45. package/transcriber/app/pptx_export.py +42 -0
  46. package/transcriber/app/prosody.py +123 -0
  47. package/transcriber/app/routes/__init__.py +1 -0
  48. package/transcriber/app/routes/legacy.py +117 -0
  49. package/transcriber/app/routes/transcribe.py +133 -0
  50. package/transcriber/app/shot_change.py +74 -0
  51. package/transcriber/app/silence_typer.py +144 -0
  52. package/transcriber/app/transcribe_cli.py +82 -0
  53. package/transcriber/app/vad.py +145 -0
  54. package/transcriber/pyproject.toml +56 -0
@@ -0,0 +1,300 @@
1
+ """pyannote-audio diarization + word-level alignment (#11).
2
+
3
+ The pyannote pipeline (gated model; needs HUGGINGFACE_TOKEN + torch) is loaded
4
+ lazily. The alignment maths — assigning a stable speaker_id to each utterance
5
+ by maximum temporal overlap with diarization turns, flagging overlap regions,
6
+ and gating low-confidence sessions — is pure and fully unit-tested.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from functools import lru_cache
13
+ from typing import Any, Protocol
14
+
15
+ # Below this mean per-utterance overlap confidence, the session is queued for
16
+ # human speaker labelling instead of trusted.
17
+ DIARIZATION_CONFIDENCE_FLOOR = 0.5
18
+
19
+
20
@dataclass(frozen=True)
class Turn:
    """A single diarization segment attributed to one speaker.

    Instances are immutable (frozen dataclass), so they can be shared freely
    between the merge, alignment and overlap-detection steps.
    """

    # Segment start, in milliseconds.
    start_ms: int
    # Segment end, in milliseconds.
    end_ms: int
    # Speaker label as reported by the diarization backend.
    speaker: str
25
+
26
+
27
+ # Speakers below this share of total speech are treated as over-segmentation
28
+ # fragments and merged into the nearest dominant cluster (#178). pyannote 3.1
29
+ # routinely splits a clean 2-party interview into 5–6 speakers (~85% / 10% +
30
+ # three 1–3% slivers); the slivers are temporal fragments of the dominant
31
+ # pair, not extra speakers.
32
+ DEFAULT_MIN_SPEAKER_SHARE = 0.05
33
+
34
+
35
class DiarizationBackend(Protocol):
    """Structural interface for anything that can diarize an audio file."""

    def diarize(self, audio_path: str) -> list[dict[str, Any]]:
        """Return the speaker turns found in ``audio_path`` as a list of dicts."""
        ...
37
+
38
+
39
+ PYANNOTE_MODEL = "pyannote/speaker-diarization-3.1"
40
+
41
+
42
+ def _resolve_diar_device(requested: str) -> str: # pragma: no cover - env-dependent
43
+ """Map 'auto' to the best available device. On Apple Silicon that's MPS
44
+ (Metal) — ~18x faster than CPU for pyannote with identical results on
45
+ torch>=2.12. 'cpu'/'mps'/'cuda' pass through."""
46
+ if requested != "auto":
47
+ return requested
48
+ try:
49
+ import torch # type: ignore
50
+
51
+ if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
52
+ return "mps"
53
+ if torch.cuda.is_available():
54
+ return "cuda"
55
+ except ImportError:
56
+ pass
57
+ return "cpu"
58
+
59
+
60
class PyannoteBackend:  # pragma: no cover - needs gated weights + torch
    """Concrete DiarizationBackend wrapping pyannote-audio.

    The pipeline is loaded when the backend is constructed. The HuggingFace
    token comes from the HUGGINGFACE_TOKEN or HF_TOKEN env vars (one must be
    set; the user must also have accepted the license at
    hf.co/pyannote/speaker-diarization-3.1).
    """

    def __init__(self, device: str | None = None) -> None:
        """Load the pretrained pipeline and move it to the chosen device.

        Args:
            device: Explicit device name, or None to read
                COMPOST_DIARIZATION_DEVICE (default 'auto').

        Raises:
            RuntimeError: if no HuggingFace token is set, or if torch,
                torchaudio or pyannote.audio cannot be imported.
        """
        import os

        token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
        if not token:
            raise RuntimeError(
                "pyannote needs HUGGINGFACE_TOKEN to download the gated model. "
                "Set it in .env.local and accept the license at hf.co/pyannote/speaker-diarization-3.1."
            )
        try:
            import torch  # type: ignore
            import torchaudio  # type: ignore
            from pyannote.audio import Pipeline  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "pyannote.audio / torchaudio not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from e

        resolved = _resolve_diar_device(
            device or os.environ.get("COMPOST_DIARIZATION_DEVICE", "auto")
        )
        # On Apple Silicon, MPS runs pyannote much faster than CPU; enable CPU
        # fallback for any op MPS lacks so it cannot error out mid-pipeline (#176).
        # NOTE(review): torch has already been imported by this point — confirm
        # the variable is still honoured when set after import.
        if resolved == "mps":
            os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

        self._pipeline = Pipeline.from_pretrained(PYANNOTE_MODEL, token=token)
        if resolved != "cpu":
            self._pipeline = self._pipeline.to(torch.device(resolved))
        self._device = resolved
        self._torchaudio = torchaudio

    def diarize(self, audio_path: str) -> list[dict[str, Any]]:
        """Run the pipeline on ``audio_path`` and return its speaker turns.

        Returns:
            One dict per track with integer ``start_ms`` / ``end_ms`` and a
            string ``speaker`` label.
        """
        # Preload audio in-memory with torchaudio so pyannote 4.x doesn't hit
        # torchcodec (which requires CUDA runtime libraries we don't ship in
        # the CPU-only container). This is the documented fallback path.
        waveform, sample_rate = self._torchaudio.load(audio_path)
        output = self._pipeline({"waveform": waveform, "sample_rate": sample_rate})
        # pyannote 4.x returns DiarizeOutput; 3.x returned the Annotation directly.
        # Support both by reading .speaker_diarization if present, else the object itself.
        diarization = getattr(output, "speaker_diarization", output)
        # round() rather than int(): seconds are floats, and truncation turns
        # e.g. 1.005 s (1004.9999999999999 after scaling) into 1004 ms.
        return [
            {
                "start_ms": round(segment.start * 1000),
                "end_ms": round(segment.end * 1000),
                "speaker": str(speaker),
            }
            for segment, _, speaker in diarization.itertracks(yield_label=True)
        ]
120
+
121
+
122
+ @lru_cache(maxsize=1)
123
+ def _load_pyannote(token_present: bool) -> DiarizationBackend: # pragma: no cover - needs weights
124
+ if not token_present:
125
+ raise RuntimeError(
126
+ "pyannote needs HUGGINGFACE_TOKEN to download the gated model. "
127
+ "Set it in .env.local and accept the license at hf.co/pyannote/speaker-diarization-3.1."
128
+ )
129
+ return PyannoteBackend()
130
+
131
+
132
+ def _overlap_ms(a_start: int, a_end: int, b_start: int, b_end: int) -> int:
133
+ return max(0, min(a_end, b_end) - max(a_start, b_start))
134
+
135
+
136
def merge_subthreshold_speakers(
    turns: list[Turn], min_share: float = DEFAULT_MIN_SPEAKER_SHARE
) -> list[Turn]:
    """Collapse speakers with sub-threshold airtime into the nearest dominant
    cluster (#178). Pure transformation; safe to skip when nothing's spurious.

    A speaker is "dominant" when its summed turn duration is at least
    ``min_share`` of the total. Every turn of a non-dominant speaker is
    reassigned to whichever dominant turn is temporally closest: the gap to
    the nearest dominant turn before it (in start order) versus the gap to
    the nearest one after it, with ties going to the earlier turn.

    Conservative: the input list is returned unchanged when it is empty, when
    the total duration is not positive, when every speaker meets the
    threshold, or when no speaker does (degenerate). Otherwise a new list,
    sorted by ``start_ms``, is returned.

    Args:
        turns: Diarization turns, in any order.
        min_share: Minimum fraction of total speech for a speaker to be kept.

    Returns:
        The original list, or a start-sorted list with slivers relabelled.
    """
    if not turns:
        return turns
    total = sum(t.end_ms - t.start_ms for t in turns)
    if total <= 0:
        return turns
    by_speaker: dict[str, int] = {}
    for t in turns:
        by_speaker[t.speaker] = by_speaker.get(t.speaker, 0) + (t.end_ms - t.start_ms)
    dominant = {s for s, dur in by_speaker.items() if dur / total >= min_share}
    if not dominant or len(dominant) == len(by_speaker):
        return turns
    ordered = sorted(turns, key=lambda t: t.start_ms)

    # One backward pass records, for each position, the closest dominant turn
    # that follows it. This replaces a per-turn rescan of the list (O(n^2)).
    next_dominant: list[Turn | None] = [None] * len(ordered)
    upcoming: Turn | None = None
    for idx in range(len(ordered) - 1, -1, -1):
        next_dominant[idx] = upcoming
        if ordered[idx].speaker in dominant:
            upcoming = ordered[idx]

    out: list[Turn] = []
    prev_dom: Turn | None = None  # closest dominant turn seen so far
    for t, next_dom in zip(ordered, next_dominant):
        if t.speaker in dominant:
            out.append(t)
            prev_dom = t
            continue
        # `dominant` is non-empty, so at least one dominant turn exists on one
        # side of every sliver: the two anchors are never both None.
        if prev_dom is None:
            anchor = next_dom
        elif next_dom is None:
            anchor = prev_dom
        else:
            gap_prev = t.start_ms - prev_dom.end_ms
            gap_next = next_dom.start_ms - t.end_ms
            anchor = prev_dom if gap_prev <= gap_next else next_dom
        out.append(Turn(t.start_ms, t.end_ms, anchor.speaker))  # type: ignore[union-attr]
    return out
184
+
185
+
186
+ def _nearest_turn_speaker(utt_start_ms: int, utt_end_ms: int, turns: list[Turn]) -> str | None:
187
+ """Pick the speaker of the turn whose nearest edge is closest to the
188
+ utterance's midpoint (#178). Used to rescue 'S?' orphans — utterances
189
+ whose timing didn't overlap any diarization turn (a few-ms sliver
190
+ between turn boundaries). Returns None if turns is empty.
191
+ """
192
+ if not turns:
193
+ return None
194
+ mid = (utt_start_ms + utt_end_ms) // 2
195
+ return min(
196
+ turns,
197
+ key=lambda t: min(abs(t.start_ms - mid), abs(t.end_ms - mid)),
198
+ ).speaker
199
+
200
+
201
def assign_speaker(utterance: dict[str, Any], turns: list[Turn]) -> tuple[str, float]:
    """Return (speaker_id, confidence) for an utterance by max overlap.

    Overlap is summed per speaker across all of that speaker's turns.
    confidence = overlapped duration with the winning speaker / utterance
    duration, capped to 0..1. Ties resolve to the speaker whose earliest
    overlapping turn starts first, regardless of the order of ``turns``.

    Args:
        utterance: Mapping with integer ``start_ms`` and ``end_ms``.
        turns: Diarization turns, in any order.

    Returns:
        ``("S?", 0.0)`` when no turn overlaps the utterance, otherwise the
        winning speaker label and its confidence.
    """
    u_start = utterance["start_ms"]
    u_end = utterance["end_ms"]
    u_dur = max(u_end - u_start, 1)  # avoid dividing by zero for zero-length utterances

    overlap_by_speaker: dict[str, int] = {}
    earliest_start: dict[str, int] = {}
    for t in turns:
        ov = max(0, min(u_end, t.end_ms) - max(u_start, t.start_ms))
        if ov <= 0:
            continue
        overlap_by_speaker[t.speaker] = overlap_by_speaker.get(t.speaker, 0) + ov
        earliest_start[t.speaker] = min(earliest_start.get(t.speaker, t.start_ms), t.start_ms)

    if not overlap_by_speaker:
        return "S?", 0.0
    # Largest overlap first; among equals, the earlier-starting turn wins.
    winner = min(
        overlap_by_speaker,
        key=lambda speaker: (-overlap_by_speaker[speaker], earliest_start[speaker]),
    )
    return winner, min(overlap_by_speaker[winner] / u_dur, 1.0)
221
+
222
+
223
def detect_overlaps(turns: list[Turn], min_overlap_ms: int = 200) -> list[dict[str, Any]]:
    """Emit an `overlap` cue for each pair of different-speaker turns that
    share at least ``min_overlap_ms`` of time.

    Cue ids are numbered sequentially from ``CUE-OV-001`` in discovery order
    (turns are visited sorted by start time).
    """
    by_start = sorted(turns, key=lambda turn: turn.start_ms)
    found: list[dict[str, Any]] = []
    for pos, first in enumerate(by_start):
        for later in range(pos + 1, len(by_start)):
            second = by_start[later]
            if second.start_ms >= first.end_ms:
                # Sorted by start: nothing further along can reach `first`.
                break
            if first.speaker == second.speaker:
                continue
            begin = max(first.start_ms, second.start_ms)
            finish = min(first.end_ms, second.end_ms)
            if finish - begin < min_overlap_ms:
                continue
            found.append(
                {
                    "id": f"CUE-OV-{len(found) + 1:03d}",
                    "kind": "overlap",
                    "start_ms": begin,
                    "end_ms": finish,
                    "source": "audio",
                }
            )
    return found
249
+
250
+
251
def align(transcript: dict[str, Any], turns: list[Turn]) -> dict[str, Any]:
    """Label every utterance with a speaker and a diarization confidence,
    append overlap cues, and flag the session when the mean confidence falls
    below DIARIZATION_CONFIDENCE_FLOOR. The transcript is modified in place
    and also returned.

    Post-fix (#178): an utterance that overlaps no diarization turn (an "S?"
    orphan, e.g. a sliver between turn boundaries) takes the nearest turn's
    speaker instead. Its confidence stays 0.0 to mark the assignment as a
    fallback, so such utterances still pull the mean down and can trigger the
    needs_speaker_labels status when there are many.
    """
    scores: list[float] = []
    for utterance in transcript.get("utterances", []):
        label, score = assign_speaker(utterance, turns)
        if label == "S?":
            fallback = _nearest_turn_speaker(utterance["start_ms"], utterance["end_ms"], turns)
            if fallback is not None:
                label = fallback  # score deliberately left at 0.0
        utterance["speaker_id"] = label
        utterance.setdefault("diarization", {})["confidence"] = round(score, 3)
        scores.append(score)

    transcript.setdefault("cues", []).extend(detect_overlaps(turns))

    # No utterances at all counts as zero confidence.
    average = sum(scores) / len(scores) if scores else 0.0
    if average < DIARIZATION_CONFIDENCE_FLOOR:
        transcript["status"] = "needs_speaker_labels"
    return transcript
281
+
282
+
283
class Diarizer:
    """Front door for diarization: runs a backend and returns cleaned-up turns."""

    def __init__(self, backend: DiarizationBackend | None = None):
        # None means "load the pyannote backend on first use".
        self._backend = backend

    def _get_backend(self) -> DiarizationBackend:
        """Return the injected backend, or the shared pyannote one if none was given."""
        if self._backend is None:
            import os

            has_token = any(
                os.environ.get(name) for name in ("HUGGINGFACE_TOKEN", "HF_TOKEN")
            )
            return _load_pyannote(has_token)
        return self._backend

    def diarize(self, audio_path: str) -> list[Turn]:
        """Diarize ``audio_path`` and return its turns as ``Turn`` objects."""
        segments = self._get_backend().diarize(audio_path)
        turns = [
            Turn(int(seg["start_ms"]), int(seg["end_ms"]), str(seg["speaker"]))
            for seg in segments
        ]
        # Collapse over-segmentation slivers into the dominant cluster (#178)
        # before align() and detect_overlaps() consume the turns.
        return merge_subthreshold_speakers(turns)
@@ -0,0 +1,77 @@
1
+ """Optional frame annotation (#50).
2
+
3
+ A one-sentence description of a frame from a vision-capable model. Off by
4
+ default; opt in via config.toml `[frames] annotation = "claude" | "moondream2"`
5
+ (decision #72). The annotation is recorded as an AI-authored event on the frame
6
+ and surfaces as [draft] until a researcher endorses it.
7
+
8
+ The vision models are injected (the Claude path calls the Anthropic API with
9
+ the frame + linked utterance; the Moondream2 path is a lazy-loaded local model)
10
+ so the gate, prompt, and event shape are testable without weights.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from collections.abc import Callable
16
+ from typing import Any, Protocol
17
+
18
+ PROMPT = (
19
+ "In one sentence, describe what's visible in this interview frame that a "
20
+ "researcher reviewing the session might find notable. If nothing is notable, "
21
+ "return null."
22
+ )
23
+
24
+
25
class VisionModel(Protocol):
    """Structural interface for a vision-capable model that can caption a frame."""

    def describe(self, frame_path: str, prompt: str, linked_text: str) -> str | None:
        """Return a description of the frame, or None when there is nothing to say."""
        ...
27
+
28
+
29
def build_prompt(linked_text: str) -> str:
    """Return the standard prompt, followed by the linked utterance text when there is any."""
    if not linked_text:
        return PROMPT
    return f'{PROMPT}\n\nThe speaker was saying: "{linked_text}"'
34
+
35
+
36
def annotate_frame(
    frame: dict[str, Any],
    linked_text: str,
    model: VisionModel,
    *,
    enabled: bool,
    actor_id: str,
) -> dict[str, Any] | None:
    """Return an AI-authored `create` event for the frame's annotation, or None
    when annotation is disabled or the model declines (nothing notable).

    `enabled` reflects the per-seed config gate; right-click "annotate this
    frame" passes enabled=True on demand even when the default is off.

    The model is treated as having declined when it returns None, blank text,
    or the literal word "null" (optionally quoted or punctuated). The prompt
    asks for null when nothing is notable, and a text model answers that
    request with the string rather than with a Python None.

    Args:
        frame: Frame index entry; ``path``, ``id`` and ``at_ms`` are read.
        linked_text: Text of the utterance linked to the frame (may be empty).
        model: Vision model used to describe the frame.
        enabled: Whether annotation should run at all.
        actor_id: Recorded as both ``actor_id`` and ``model`` on the event.
    """
    if not enabled:
        return None
    description = model.describe(frame["path"], build_prompt(linked_text), linked_text)
    if description is None:
        return None
    annotation = description.strip()
    if not annotation or annotation.strip("\"'`.").casefold() == "null":
        return None
    return {
        "artifact_kind": "frame_annotation",
        "action": "create",
        "actor_type": "ai",
        "actor_id": actor_id,
        "model": actor_id,
        "payload": {
            "frame_id": frame["id"],
            "at_ms": frame["at_ms"],
            "annotation": annotation,
            "status": "draft",
        },
    }
68
+
69
+
70
def claude_vision(call: Callable[[str, str], str | None]) -> VisionModel:
    """Adapt a two-argument vision call ``(frame_path, prompt) -> text`` to the
    VisionModel interface."""

    class _Claude:
        def describe(self, frame_path: str, prompt: str, linked_text: str) -> str | None:
            # `linked_text` is accepted to satisfy the interface but is not
            # forwarded: the wrapped call only takes the path and the prompt.
            answer = call(frame_path, prompt)
            return answer

    adapter = _Claude()
    return adapter
@@ -0,0 +1,130 @@
1
+ """ffmpeg-backed frame extractor (#14).
2
+
3
+ Pulls a JPG from the video stream at each requested trigger timestamp and
4
+ writes it to sessions/<sid>/frames/<padded_ms>.jpg (640x360). Returns the
5
+ frames[] index entries for transcript.json. No classification — frames are
6
+ evidence.
7
+
8
+ Triggers (see schema/frames.taxonomy.json): silence_*, audio_cue, shot_change,
9
+ highlight, manual, sampling. The caller supplies (at_ms, trigger,
10
+ linked_utterance_id?) tuples; this module just extracts + indexes.
11
+
12
+ Idempotent: a frame whose target JPG already exists is not re-extracted, and
13
+ the returned id is stable (FR-<padded_ms>).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import subprocess
19
+ from dataclasses import dataclass
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ FRAME_WIDTH = 640
24
+ FRAME_HEIGHT = 360
25
+ _PAD = 9 # zero-pad ms to 9 digits (~277h) for lexical sort
26
+
27
+
28
@dataclass(frozen=True)
class FrameTrigger:
    """A request to extract one frame, supplied by the caller. Immutable."""

    # Timestamp of the frame to grab, in milliseconds.
    at_ms: int
    # Name of the trigger that asked for this frame (e.g. "sampling").
    trigger: str
    # Utterance this frame is linked to, when there is one.
    linked_utterance_id: str | None = None
33
+
34
+
35
def _padded(ms: int) -> str:
    """Render ``ms`` as a decimal string zero-filled to ``_PAD`` characters."""
    digits = str(ms)
    return digits.zfill(_PAD)
37
+
38
+
39
def frame_id(at_ms: int) -> str:
    """Return the stable frame id for a timestamp: ``FR-`` plus the padded milliseconds."""
    return "FR-" + _padded(at_ms)
41
+
42
+
43
def frame_relpath(session_id: str, at_ms: int) -> str:
    """Return the frame's JPG path relative to the seed root."""
    filename = f"{_padded(at_ms)}.jpg"
    return f"sessions/{session_id}/frames/{filename}"
45
+
46
+
47
def _extract_one(video_path: Path, at_ms: int, out_path: Path) -> None:
    """Extract one scaled JPG frame from ``video_path`` at ``at_ms`` into ``out_path``.

    ffmpeg renders into a scratch file next to the target, which is renamed
    onto ``out_path`` only after a successful run. Callers treat "the JPG
    exists" as "already extracted", so a failed or interrupted run must never
    leave a partial file at the final path.

    Args:
        video_path: Source video.
        at_ms: Timestamp to grab, in milliseconds.
        out_path: Destination JPG; parent directories are created as needed.

    Raises:
        RuntimeError: if ffmpeg exits non-zero or produces no output file.
        FileNotFoundError: raised by ``subprocess.run`` when the ``ffmpeg``
            executable cannot be found.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    ts = at_ms / 1000.0
    # Keep the original suffix so ffmpeg still infers the output format from it.
    tmp_path = out_path.with_name(f"{out_path.stem}.partial{out_path.suffix}")
    # -ss before -i seeks fast; -frames:v 1 grabs a single frame; scale to 640x360.
    cmd = [
        "ffmpeg",
        "-y",
        "-ss",
        f"{ts:.3f}",
        "-i",
        str(video_path),
        "-frames:v",
        "1",
        "-vf",
        f"scale={FRAME_WIDTH}:{FRAME_HEIGHT}",
        "-q:v",
        "4",
        str(tmp_path),
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0 or not tmp_path.exists():
        # Drop whatever ffmpeg managed to write before failing.
        tmp_path.unlink(missing_ok=True)
        raise RuntimeError(f"ffmpeg failed extracting frame at {at_ms}ms: {proc.stderr[-300:]}")
    tmp_path.replace(out_path)
69
+
70
+
71
def extract_frames(
    video_path: str | Path,
    session_id: str,
    triggers: list[FrameTrigger],
    seed_root: str | Path,
) -> list[dict[str, Any]]:
    """Extract one frame per distinct timestamp and return the frames[] index.

    When several triggers share an ``at_ms`` the first one listed is kept.
    A frame whose JPG is already on disk is indexed but not re-extracted, so
    re-runs are idempotent. Entries are returned in ascending ``at_ms`` order.
    """
    video = Path(video_path)
    root = Path(seed_root)

    first_by_ms: dict[int, FrameTrigger] = {}
    for trigger in triggers:
        if trigger.at_ms not in first_by_ms:
            first_by_ms[trigger.at_ms] = trigger

    index: list[dict[str, Any]] = []
    for at_ms, trigger in sorted(first_by_ms.items(), key=lambda item: item[0]):
        rel = frame_relpath(session_id, at_ms)
        target = root / rel
        if not target.exists():
            _extract_one(video, at_ms, target)
        record: dict[str, Any] = {
            "id": frame_id(at_ms),
            "at_ms": at_ms,
            "path": rel,
            "trigger": trigger.trigger,
        }
        # The link is optional; omit the key entirely when there is none.
        if trigger.linked_utterance_id is not None:
            record["linked_utterance_id"] = trigger.linked_utterance_id
        index.append(record)
    return index
106
+
107
+
108
def sampling_triggers(
    duration_ms: int,
    existing_ms: list[int],
    interval_s: int = 60,
) -> list[FrameTrigger]:
    """Emit a `sampling` trigger every `interval_s` only when no other trigger
    already fired within that window (ROADMAP § Descriptive transcription B).

    The timeline ``[0, duration_ms)`` is cut into consecutive windows of
    ``interval_s`` seconds. A window containing any timestamp from
    ``existing_ms`` is skipped; every other window yields one trigger at the
    window's start.

    Args:
        duration_ms: Length of the recording in milliseconds.
        existing_ms: Timestamps (ms) of triggers that already exist, in any order.
        interval_s: Window length in seconds; must be positive.

    Raises:
        ValueError: if ``interval_s`` is not positive (the window cursor would
            never advance and the loop would not terminate).
    """
    if interval_s <= 0:
        raise ValueError(f"interval_s must be positive, got {interval_s}")
    interval_ms = interval_s * 1000
    existing = sorted(existing_ms)
    out: list[FrameTrigger] = []
    t = 0
    ei = 0
    while t < duration_ms:
        window_end = t + interval_ms
        # advance existing pointer past anything before this window
        while ei < len(existing) and existing[ei] < t:
            ei += 1
        covered = ei < len(existing) and existing[ei] < window_end
        if not covered:
            out.append(FrameTrigger(at_ms=t, trigger="sampling"))
        t = window_end
    return out
@@ -0,0 +1,70 @@
1
+ """Health endpoint for the transcriber service.
2
+
3
+ ROADMAP § Verification — `compost watch` and the CLI probe this on startup to
4
+ confirm the transcriber container is reachable before queuing work.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import platform
10
+ import sys
11
+ from importlib.metadata import PackageNotFoundError, version
12
+
13
+ from fastapi import APIRouter
14
+ from pydantic import BaseModel
15
+
16
+ from . import __version__
17
+
18
+ router = APIRouter()
19
+
20
+
21
class HealthResponse(BaseModel):
    """Response body of /health. The field set is a stable contract: the CLI parses it."""

    # Overall state reported by the service.
    status: str
    # Name of the service answering the probe.
    service: str
    # Component name -> version string, or None when the component is not installed.
    versions: dict[str, str | None]
27
+
28
+
29
+ def _safe_version(pkg: str) -> str | None:
30
+ """Return the installed version of `pkg`, or None if it isn't installed.
31
+
32
+ Model-heavy optional deps (whisperx, pyannote.audio, silero-vad) are
33
+ declared in `pyproject.toml` under the `asr` extra and only installed
34
+ when their respective issues land (#9-#15). Until then, /health
35
+ reports them as `null` so the CLI can tell the user what's missing.
36
+ """
37
+ try:
38
+ return version(pkg)
39
+ except PackageNotFoundError:
40
+ return None
41
+
42
+
43
@router.get("/health", response_model=HealthResponse)
def get_health() -> HealthResponse:
    """Report that the service is up, with the versions of its runtime and dependencies.

    Dependencies that are not installed are reported with a None version.
    """
    probed = ("fastapi", "uvicorn", "whisperx", "pyannote.audio", "silero-vad")
    versions: dict[str, str | None] = {
        "transcriber": __version__,
        "python": platform.python_version(),
    }
    versions.update((name, _safe_version(name)) for name in probed)
    return HealthResponse(
        status="ok",
        service="compost-transcriber",
        versions=versions,
    )
58
+
59
+
60
+ __all__ = ["router", "HealthResponse", "get_health"]
61
+
62
+
63
+ def _python_metadata_check() -> None:
64
+ """Self-check at import time: make sure we're on a supported runtime."""
65
+ major, minor = sys.version_info[:2]
66
+ if (major, minor) < (3, 11):
67
+ raise RuntimeError(f"compost-transcriber requires Python >=3.11, got {major}.{minor}")
68
+
69
+
70
+ _python_metadata_check()