@they-juanreina/compost-cli 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/blame.d.ts.map +1 -1
- package/dist/lib/blame.js +3 -2
- package/dist/lib/blame.js.map +1 -1
- package/dist/lib/journal.d.ts.map +1 -1
- package/dist/lib/journal.js +9 -0
- package/dist/lib/journal.js.map +1 -1
- package/dist/lib/migrate.d.ts.map +1 -1
- package/dist/lib/migrate.js +1 -0
- package/dist/lib/migrate.js.map +1 -1
- package/dist/lib/nativeRuntime.d.ts +6 -3
- package/dist/lib/nativeRuntime.d.ts.map +1 -1
- package/dist/lib/nativeRuntime.js +6 -3
- package/dist/lib/nativeRuntime.js.map +1 -1
- package/dist/lib/retrieve.d.ts.map +1 -1
- package/dist/lib/retrieve.js +0 -8
- package/dist/lib/retrieve.js.map +1 -1
- package/dist/lib/seedResolve.d.ts.map +1 -1
- package/dist/lib/seedResolve.js +1 -0
- package/dist/lib/seedResolve.js.map +1 -1
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +9 -8
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/snap.d.ts.map +1 -1
- package/dist/lib/snap.js +2 -5
- package/dist/lib/snap.js.map +1 -1
- package/dist/loops/supervisor.d.ts.map +1 -1
- package/dist/loops/supervisor.js +1 -0
- package/dist/loops/supervisor.js.map +1 -1
- package/dist/loops/transcribe_worker.d.ts.map +1 -1
- package/dist/loops/transcribe_worker.js +0 -1
- package/dist/loops/transcribe_worker.js.map +1 -1
- package/dist/router.js +1 -1
- package/package.json +10 -4
- package/transcriber/app/__init__.py +3 -0
- package/transcriber/app/asr.py +198 -0
- package/transcriber/app/asr_parakeet.py +174 -0
- package/transcriber/app/cue_parser.py +110 -0
- package/transcriber/app/diarization.py +300 -0
- package/transcriber/app/frame_annotation.py +77 -0
- package/transcriber/app/frames.py +130 -0
- package/transcriber/app/health.py +70 -0
- package/transcriber/app/legacy.py +355 -0
- package/transcriber/app/main.py +30 -0
- package/transcriber/app/pipeline.py +204 -0
- package/transcriber/app/pptx_export.py +42 -0
- package/transcriber/app/prosody.py +123 -0
- package/transcriber/app/routes/__init__.py +1 -0
- package/transcriber/app/routes/legacy.py +117 -0
- package/transcriber/app/routes/transcribe.py +133 -0
- package/transcriber/app/shot_change.py +74 -0
- package/transcriber/app/silence_typer.py +144 -0
- package/transcriber/app/transcribe_cli.py +82 -0
- package/transcriber/app/vad.py +145 -0
- package/transcriber/pyproject.toml +56 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
"""pyannote-audio diarization + word-level alignment (#11).
|
|
2
|
+
|
|
3
|
+
The pyannote pipeline (gated model; needs HUGGINGFACE_TOKEN + torch) is loaded
|
|
4
|
+
lazily. The alignment maths — assigning a stable speaker_id to each utterance
|
|
5
|
+
by maximum temporal overlap with diarization turns, flagging overlap regions,
|
|
6
|
+
and gating low-confidence sessions — is pure and fully unit-tested.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from functools import lru_cache
|
|
13
|
+
from typing import Any, Protocol
|
|
14
|
+
|
|
15
|
+
# Below this mean per-utterance overlap confidence, the session is queued for
|
|
16
|
+
# human speaker labelling instead of trusted.
|
|
17
|
+
DIARIZATION_CONFIDENCE_FLOOR = 0.5
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class Turn:
    """One diarization segment attributed to a single speaker.

    Instances are immutable (frozen dataclass).
    """

    # Segment start, in milliseconds.
    start_ms: int
    # Segment end, in milliseconds.
    end_ms: int
    # Speaker label assigned to this segment.
    speaker: str
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Speakers below this share of total speech are treated as over-segmentation
|
|
28
|
+
# fragments and merged into the nearest dominant cluster (#178). pyannote 3.1
|
|
29
|
+
# routinely splits a clean 2-party interview into 5–6 speakers (~85% / 10% +
|
|
30
|
+
# three 1–3% slivers); the slivers are temporal fragments of the dominant
|
|
31
|
+
# pair, not extra speakers.
|
|
32
|
+
DEFAULT_MIN_SPEAKER_SHARE = 0.05
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DiarizationBackend(Protocol):
    """Structural interface for an object that can diarize an audio file."""

    def diarize(self, audio_path: str) -> list[dict[str, Any]]:
        """Return the speaker turns found in ``audio_path`` as plain dicts.

        ``Diarizer.diarize`` reads the ``start_ms``, ``end_ms`` and
        ``speaker`` keys of each returned dict.
        """
        ...
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
PYANNOTE_MODEL = "pyannote/speaker-diarization-3.1"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _resolve_diar_device(requested: str) -> str:  # pragma: no cover - env-dependent
    """Translate a requested device name into a concrete torch device name.

    Any value other than ``"auto"`` is returned untouched. For ``"auto"`` the
    preference order is MPS (Apple Silicon / Metal), then CUDA, then CPU; CPU
    is also the answer when torch cannot be imported.
    """
    if requested == "auto":
        try:
            import torch  # type: ignore

            mps_backend = getattr(torch.backends, "mps", None)
            if mps_backend is not None and torch.backends.mps.is_available():
                return "mps"
            if torch.cuda.is_available():
                return "cuda"
        except ImportError:
            # torch is an optional dependency; fall through to the CPU default.
            pass
        return "cpu"
    return requested
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class PyannoteBackend:  # pragma: no cover - needs gated weights + torch
    """Concrete DiarizationBackend wrapping pyannote-audio.

    The pipeline is loaded when the instance is constructed. The HuggingFace
    token comes from the HUGGINGFACE_TOKEN or HF_TOKEN env vars (one must be
    set; the user must also have accepted the license at
    hf.co/pyannote/speaker-diarization-3.1).
    """

    def __init__(self, device: str | None = None) -> None:
        """Load the pyannote pipeline and move it to the resolved device.

        Args:
            device: ``"cpu"``, ``"mps"``, ``"cuda"`` or ``"auto"``. When
                omitted, COMPOST_DIARIZATION_DEVICE is consulted and
                defaults to ``"auto"``.

        Raises:
            RuntimeError: if no HuggingFace token is set, if pyannote.audio /
                torchaudio are not installed, or if the pipeline could not be
                loaded.
        """
        import inspect
        import os

        token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
        if not token:
            raise RuntimeError(
                "pyannote needs HUGGINGFACE_TOKEN to download the gated model. "
                "Set it in .env.local and accept the license at hf.co/pyannote/speaker-diarization-3.1."
            )
        try:
            import torch  # type: ignore
            import torchaudio  # type: ignore
            from pyannote.audio import Pipeline  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "pyannote.audio / torchaudio not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from e

        resolved = _resolve_diar_device(
            device or os.environ.get("COMPOST_DIARIZATION_DEVICE", "auto")
        )
        # On Apple Silicon, MPS runs pyannote much faster than CPU; enable CPU
        # fallback for any op MPS lacks so it can never error out
        # mid-pipeline (#176).
        if resolved == "mps":
            os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

        # pyannote 4.x names the auth argument `token`; 3.x named it
        # `use_auth_token`. diarize() already supports both output shapes, so
        # pick whichever keyword the installed version accepts.
        accepted = inspect.signature(Pipeline.from_pretrained).parameters
        if "token" not in accepted and "use_auth_token" in accepted:
            auth_kwargs = {"use_auth_token": token}
        else:
            auth_kwargs = {"token": token}

        pipeline = Pipeline.from_pretrained(PYANNOTE_MODEL, **auth_kwargs)
        if pipeline is None:
            # Older pyannote versions return None instead of raising when the
            # gated model cannot be fetched.
            raise RuntimeError(
                f"pyannote could not load {PYANNOTE_MODEL}. Check that the token is valid "
                "and that the license at hf.co/pyannote/speaker-diarization-3.1 has been accepted."
            )
        if resolved != "cpu":
            pipeline = pipeline.to(torch.device(resolved))
        self._pipeline = pipeline
        self._device = resolved
        self._torchaudio = torchaudio

    def diarize(self, audio_path: str) -> list[dict[str, Any]]:
        """Run diarization on ``audio_path``.

        Returns:
            One dict per speaker track with ``start_ms``, ``end_ms`` (integer
            milliseconds) and ``speaker`` (label as a string).
        """
        # Preload audio in-memory with torchaudio so pyannote 4.x doesn't hit
        # torchcodec (which requires CUDA runtime libraries we don't ship in
        # the CPU-only container). This is the documented fallback path.
        waveform, sample_rate = self._torchaudio.load(audio_path)
        output = self._pipeline({"waveform": waveform, "sample_rate": sample_rate})
        # pyannote 4.x returns DiarizeOutput; 3.x returned the Annotation directly.
        # Support both by reading .speaker_diarization if present, else the object itself.
        diarization = getattr(output, "speaker_diarization", output)
        turns: list[dict[str, Any]] = []
        for segment, _, speaker in diarization.itertracks(yield_label=True):
            turns.append(
                {
                    # round() rather than truncate: seconds * 1000 can land a
                    # hair below the intended integer (e.g. 1000.9999...).
                    "start_ms": int(round(segment.start * 1000)),
                    "end_ms": int(round(segment.end * 1000)),
                    "speaker": str(speaker),
                }
            )
        return turns
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@lru_cache(maxsize=1)
def _load_pyannote(token_present: bool) -> DiarizationBackend:  # pragma: no cover - needs weights
    """Build the pyannote backend, memoised by ``lru_cache``.

    Raises:
        RuntimeError: when ``token_present`` is false, i.e. the caller found
            no HuggingFace token in the environment.
    """
    if token_present:
        return PyannoteBackend()
    raise RuntimeError(
        "pyannote needs HUGGINGFACE_TOKEN to download the gated model. "
        "Set it in .env.local and accept the license at hf.co/pyannote/speaker-diarization-3.1."
    )
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _overlap_ms(a_start: int, a_end: int, b_start: int, b_end: int) -> int:
    """Return how many milliseconds intervals ``a`` and ``b`` share (0 if none)."""
    latest_start = max(a_start, b_start)
    earliest_end = min(a_end, b_end)
    shared = earliest_end - latest_start
    # A non-positive span means the intervals are disjoint or merely touch.
    return shared if shared > 0 else 0
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def merge_subthreshold_speakers(
    turns: list[Turn], min_share: float = DEFAULT_MIN_SPEAKER_SHARE
) -> list[Turn]:
    """Collapse speakers with sub-threshold airtime into the nearest dominant
    cluster (#178). Pure transformation; safe to skip when nothing's spurious.

    A speaker is "dominant" when their summed turn duration is at least
    ``min_share`` of the summed duration of all turns. Every turn of a
    non-dominant speaker is relabelled with the speaker of whichever dominant
    turn is temporally closest: the gap to the nearest dominant turn before
    it versus the gap to the nearest dominant turn after it, with ties going
    to the earlier one.

    Args:
        turns: diarization turns, in any order.
        min_share: minimum fraction of total speech for a speaker to be kept.

    Returns:
        ``turns`` itself, unchanged, when it is empty, has no positive total
        duration, every speaker meets the threshold, or no speaker does.
        Otherwise a new list sorted by ``start_ms`` with sliver turns
        relabelled.
    """
    if not turns:
        return turns
    total = sum(t.end_ms - t.start_ms for t in turns)
    if total <= 0:
        return turns
    by_speaker: dict[str, int] = {}
    for t in turns:
        by_speaker[t.speaker] = by_speaker.get(t.speaker, 0) + (t.end_ms - t.start_ms)
    dominant = {s for s, dur in by_speaker.items() if dur / total >= min_share}
    if not dominant or len(dominant) == len(by_speaker):
        return turns

    ordered = sorted(turns, key=lambda t: t.start_ms)

    # Right-to-left pass: following[i] is the nearest dominant turn strictly
    # after position i. This replaces a per-turn rescan of the list.
    following: list[Turn | None] = [None] * len(ordered)
    upcoming: Turn | None = None
    for i in range(len(ordered) - 1, -1, -1):
        following[i] = upcoming
        if ordered[i].speaker in dominant:
            upcoming = ordered[i]

    out: list[Turn] = []
    prev_dom: Turn | None = None  # nearest dominant turn strictly before t
    for t, next_dom in zip(ordered, following):
        if t.speaker in dominant:
            out.append(t)
            prev_dom = t
            continue
        # `dominant` is non-empty, so at least one dominant turn exists and a
        # sliver always has an anchor on at least one side.
        if prev_dom is None:
            chosen = next_dom.speaker  # type: ignore[union-attr]
        elif next_dom is None:
            chosen = prev_dom.speaker
        else:
            gap_prev = t.start_ms - prev_dom.end_ms
            gap_next = next_dom.start_ms - t.end_ms
            chosen = prev_dom.speaker if gap_prev <= gap_next else next_dom.speaker
        out.append(Turn(t.start_ms, t.end_ms, chosen))
    return out
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _nearest_turn_speaker(utt_start_ms: int, utt_end_ms: int, turns: list[Turn]) -> str | None:
    """Return the speaker of the turn lying closest to the utterance midpoint.

    Distance is measured from the (integer) midpoint of the utterance to the
    nearer of each turn's two edges; the first turn with the smallest distance
    wins. Used to rescue 'S?' orphans (#178), i.e. utterances whose timing did
    not overlap any diarization turn. Returns None when ``turns`` is empty.
    """
    if not turns:
        return None
    midpoint = (utt_start_ms + utt_end_ms) // 2
    best_speaker = None
    best_distance = None
    for turn in turns:
        distance = min(abs(turn.start_ms - midpoint), abs(turn.end_ms - midpoint))
        # Strict comparison keeps the earliest-listed turn on ties.
        if best_distance is None or distance < best_distance:
            best_distance = distance
            best_speaker = turn.speaker
    return best_speaker
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def assign_speaker(utterance: dict[str, Any], turns: list[Turn]) -> tuple[str, float]:
    """Return (speaker_id, confidence) for an utterance by max overlap.

    Overlap is summed per speaker across all of that speaker's turns.
    confidence = overlapped duration with the winning speaker / utterance
    duration, clamped to 0..1 (a zero-length utterance is treated as 1 ms
    long). Ties resolve to the speaker whose earliest overlapping turn starts
    first, regardless of the order of ``turns``.

    Args:
        utterance: mapping with integer ``start_ms`` and ``end_ms`` keys.
        turns: diarization turns, in any order.

    Returns:
        ``("S?", 0.0)`` when no turn overlaps the utterance, otherwise the
        winning speaker label and its confidence.
    """
    u_start = utterance["start_ms"]
    u_end = utterance["end_ms"]
    u_dur = max(u_end - u_start, 1)  # guard against division by zero

    by_speaker: dict[str, int] = {}
    # Earliest start among each speaker's overlapping turns; the tie-breaker.
    first_start: dict[str, int] = {}
    for t in turns:
        ov = _overlap_ms(u_start, u_end, t.start_ms, t.end_ms)
        if ov > 0:
            by_speaker[t.speaker] = by_speaker.get(t.speaker, 0) + ov
            first_start[t.speaker] = min(first_start.get(t.speaker, t.start_ms), t.start_ms)

    if not by_speaker:
        return "S?", 0.0
    # Largest overlap wins; on equal overlap prefer the earlier-starting turn
    # (hence the negated start).
    speaker, overlap = max(
        by_speaker.items(), key=lambda kv: (kv[1], -first_start[kv[0]])
    )
    return speaker, min(overlap / u_dur, 1.0)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def detect_overlaps(turns: list[Turn], min_overlap_ms: int = 200) -> list[dict[str, Any]]:
    """Emit an `overlap` cue for each pair of different-speaker turns that
    overlap by at least ``min_overlap_ms``.

    Cues are numbered ``CUE-OV-001``, ``CUE-OV-002``, ... in discovery order
    (turns are scanned sorted by start time).
    """
    by_start = sorted(turns, key=lambda t: t.start_ms)
    count = len(by_start)
    found: list[dict[str, Any]] = []
    for pos, first in enumerate(by_start):
        nxt = pos + 1
        # Turns are sorted by start, so once one starts at or after `first`
        # ends, no later turn can overlap `first` either.
        while nxt < count and by_start[nxt].start_ms < first.end_ms:
            second = by_start[nxt]
            nxt += 1
            if first.speaker == second.speaker:
                continue
            begin = max(first.start_ms, second.start_ms)
            finish = min(first.end_ms, second.end_ms)
            if finish - begin < min_overlap_ms:
                continue
            found.append(
                {
                    "id": f"CUE-OV-{len(found) + 1:03d}",
                    "kind": "overlap",
                    "start_ms": begin,
                    "end_ms": finish,
                    "source": "audio",
                }
            )
    return found
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def align(transcript: dict[str, Any], turns: list[Turn]) -> dict[str, Any]:
    """Label every utterance with a speaker and a diarization confidence,
    append overlap cues, and flag low-confidence sessions.

    Mutates ``transcript`` in place and returns it.

    An utterance that overlaps no turn (an "S?" orphan, #178) is given the
    speaker of the nearest turn when one exists, but its confidence stays 0.0
    to mark the assignment as a fallback. Those zeros still count towards the
    mean, so many orphans can push the session below
    DIARIZATION_CONFIDENCE_FLOOR and set status to "needs_speaker_labels". A
    transcript with no utterances has a mean of 0.0.
    """
    scores: list[float] = []
    for utterance in transcript.get("utterances", []):
        speaker_id, score = assign_speaker(utterance, turns)
        if speaker_id == "S?":
            fallback = _nearest_turn_speaker(utterance["start_ms"], utterance["end_ms"], turns)
            if fallback is not None:
                # Rescued orphan: take the label, keep the 0.0 score.
                speaker_id = fallback
        utterance["speaker_id"] = speaker_id
        utterance.setdefault("diarization", {})["confidence"] = round(score, 3)
        scores.append(score)

    transcript.setdefault("cues", []).extend(detect_overlaps(turns))

    if scores:
        mean_score = sum(scores) / len(scores)
    else:
        mean_score = 0.0
    if mean_score < DIARIZATION_CONFIDENCE_FLOOR:
        transcript["status"] = "needs_speaker_labels"
    return transcript
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class Diarizer:
    """Front door for diarization: runs a backend and tidies its output."""

    def __init__(self, backend: DiarizationBackend | None = None):
        """Store an explicit backend, or None to load pyannote on first use."""
        self._backend = backend

    def _get_backend(self) -> DiarizationBackend:
        """Return the injected backend, else the cached pyannote backend."""
        if self._backend is None:
            import os

            has_token = any(
                os.environ.get(name) for name in ("HUGGINGFACE_TOKEN", "HF_TOKEN")
            )
            return _load_pyannote(has_token)
        return self._backend

    def diarize(self, audio_path: str) -> list[Turn]:
        """Diarize ``audio_path`` and return the turns as ``Turn`` objects."""
        turns: list[Turn] = []
        for item in self._get_backend().diarize(audio_path):
            turns.append(
                Turn(int(item["start_ms"]), int(item["end_ms"]), str(item["speaker"]))
            )
        # Collapse over-segmentation slivers into the dominant cluster (#178)
        # before align() and detect_overlaps() consume the turns.
        return merge_subthreshold_speakers(turns)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Optional frame annotation (#50).
|
|
2
|
+
|
|
3
|
+
A one-sentence description of a frame from a vision-capable model. Off by
|
|
4
|
+
default; opt in via config.toml `[frames] annotation = "claude" | "moondream2"`
|
|
5
|
+
(decision #72). The annotation is recorded as an AI-authored event on the frame
|
|
6
|
+
and surfaces as [draft] until a researcher endorses it.
|
|
7
|
+
|
|
8
|
+
The vision models are injected (the Claude path calls the Anthropic API with
|
|
9
|
+
the frame + linked utterance; the Moondream2 path is a lazy-loaded local model)
|
|
10
|
+
so the gate, prompt, and event shape are testable without weights.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Callable
|
|
16
|
+
from typing import Any, Protocol
|
|
17
|
+
|
|
18
|
+
PROMPT = (
|
|
19
|
+
"In one sentence, describe what's visible in this interview frame that a "
|
|
20
|
+
"researcher reviewing the session might find notable. If nothing is notable, "
|
|
21
|
+
"return null."
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class VisionModel(Protocol):
    """Structural interface for a vision-capable model that describes a frame."""

    def describe(self, frame_path: str, prompt: str, linked_text: str) -> str | None:
        """Return a description of the frame, or None when the model declines."""
        ...
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_prompt(linked_text: str) -> str:
    """Return the standard prompt, followed by the linked utterance when given.

    An empty ``linked_text`` yields the bare PROMPT.
    """
    if not linked_text:
        return PROMPT
    context = f'\n\nThe speaker was saying: "{linked_text}"'
    return f"{PROMPT}{context}"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def annotate_frame(
    frame: dict[str, Any],
    linked_text: str,
    model: VisionModel,
    *,
    enabled: bool,
    actor_id: str,
) -> dict[str, Any] | None:
    """Build the AI-authored `create` event carrying a frame's annotation.

    Args:
        frame: frame entry; its ``path``, ``id`` and ``at_ms`` keys are read.
        linked_text: text of the linked utterance, added to the prompt.
        model: vision model asked to describe the frame.
        enabled: the per-seed config gate; right-click "annotate this frame"
            passes enabled=True on demand even when the default is off.
        actor_id: recorded as both ``actor_id`` and ``model`` on the event.

    Returns:
        The event dict with the annotation in ``draft`` status, or None when
        annotation is disabled or the model returns None / blank text.
    """
    if not enabled:
        return None

    raw = model.describe(frame["path"], build_prompt(linked_text), linked_text)
    if raw is None:
        return None
    text = raw.strip()
    if not text:
        return None

    payload = {
        "frame_id": frame["id"],
        "at_ms": frame["at_ms"],
        "annotation": text,
        "status": "draft",
    }
    return {
        "artifact_kind": "frame_annotation",
        "action": "create",
        "actor_type": "ai",
        "actor_id": actor_id,
        "model": actor_id,
        "payload": payload,
    }
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def claude_vision(call: Callable[[str, str], str | None]) -> VisionModel:
    """Adapt a two-argument vision call ``(frame_path, prompt) -> text`` to the
    VisionModel interface."""

    class _Claude:
        """Thin adapter that forwards to the wrapped callable."""

        def describe(self, frame_path: str, prompt: str, linked_text: str) -> str | None:
            # linked_text is not forwarded: the wrapped call takes only the
            # frame path and the prompt.
            answer = call(frame_path, prompt)
            return answer

    adapter = _Claude()
    return adapter
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""ffmpeg-backed frame extractor (#14).
|
|
2
|
+
|
|
3
|
+
Pulls a JPG from the video stream at each requested trigger timestamp and
|
|
4
|
+
writes it to sessions/<sid>/frames/<padded_ms>.jpg (640x360). Returns the
|
|
5
|
+
frames[] index entries for transcript.json. No classification — frames are
|
|
6
|
+
evidence.
|
|
7
|
+
|
|
8
|
+
Triggers (see schema/frames.taxonomy.json): silence_*, audio_cue, shot_change,
|
|
9
|
+
highlight, manual, sampling. The caller supplies (at_ms, trigger,
|
|
10
|
+
linked_utterance_id?) tuples; this module just extracts + indexes.
|
|
11
|
+
|
|
12
|
+
Idempotent: a frame whose target JPG already exists is not re-extracted, and
|
|
13
|
+
the returned id is stable (FR-<padded_ms>).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import subprocess
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
FRAME_WIDTH = 640
|
|
24
|
+
FRAME_HEIGHT = 360
|
|
25
|
+
_PAD = 9 # zero-pad ms to 9 digits (~277h) for lexical sort
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True)
class FrameTrigger:
    """A request to extract one frame at a given timestamp.

    Instances are immutable (frozen dataclass).
    """

    # Timestamp of the frame to extract, in milliseconds.
    at_ms: int
    # Name of the trigger that asked for this frame (e.g. "sampling").
    trigger: str
    # Optional id of the utterance this frame is linked to.
    linked_utterance_id: str | None = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _padded(ms: int) -> str:
    """Render ``ms`` as a decimal string zero-padded on the left to ``_PAD`` digits."""
    digits = str(ms)
    return digits.zfill(_PAD)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def frame_id(at_ms: int) -> str:
    """Return the stable frame id for a timestamp: ``FR-`` + zero-padded ms."""
    suffix = _padded(at_ms)
    return f"FR-{suffix}"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def frame_relpath(session_id: str, at_ms: int) -> str:
    """Return the JPG path for a frame, relative to the seed root:
    ``sessions/<session_id>/frames/<padded_ms>.jpg``."""
    filename = f"{_padded(at_ms)}.jpg"
    return f"sessions/{session_id}/frames/{filename}"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _extract_one(video_path: Path, at_ms: int, out_path: Path) -> None:
    """Extract a single scaled JPG frame from ``video_path`` with ffmpeg.

    Args:
        video_path: source video file.
        at_ms: timestamp of the frame, in milliseconds.
        out_path: destination JPG; parent directories are created.

    Raises:
        RuntimeError: if ffmpeg exits non-zero or produces no (or an empty)
            output file. Any partial output is removed first, so a later
            "skip if the JPG already exists" check cannot mistake a failed
            extraction for a finished one.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    ts = at_ms / 1000.0
    # -ss before -i seeks fast; -frames:v 1 grabs a single frame; scale to 640x360.
    cmd = [
        "ffmpeg",
        "-y",
        "-ss",
        f"{ts:.3f}",
        "-i",
        str(video_path),
        "-frames:v",
        "1",
        "-vf",
        f"scale={FRAME_WIDTH}:{FRAME_HEIGHT}",
        "-q:v",
        "4",
        str(out_path),
    ]
    # errors="replace": ffmpeg echoes file names and metadata on stderr, which
    # is not guaranteed to be valid text in the locale encoding.
    proc = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    produced = out_path.exists() and out_path.stat().st_size > 0
    if proc.returncode != 0 or not produced:
        # Never leave a truncated or empty JPG behind.
        out_path.unlink(missing_ok=True)
        raise RuntimeError(f"ffmpeg failed extracting frame at {at_ms}ms: {proc.stderr[-300:]}")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def extract_frames(
    video_path: str | Path,
    session_id: str,
    triggers: list[FrameTrigger],
    seed_root: str | Path,
) -> list[dict[str, Any]]:
    """Extract one frame per distinct timestamp and return the frames[] index.

    When several triggers share an ``at_ms`` the first one wins. A frame whose
    JPG already exists under ``seed_root`` is not re-extracted, so re-runs are
    idempotent. Entries are returned in ascending ``at_ms`` order, and
    ``linked_utterance_id`` is included only when the trigger carries one.
    """
    video = Path(video_path)
    root = Path(seed_root)

    # Keep only the first trigger seen for each timestamp.
    first_by_ms: dict[int, FrameTrigger] = {}
    for candidate in triggers:
        if candidate.at_ms not in first_by_ms:
            first_by_ms[candidate.at_ms] = candidate

    index: list[dict[str, Any]] = []
    for at_ms, trig in sorted(first_by_ms.items(), key=lambda pair: pair[0]):
        rel = frame_relpath(session_id, at_ms)
        target = root / rel
        if not target.exists():
            _extract_one(video, at_ms, target)
        entry: dict[str, Any] = {
            "id": frame_id(at_ms),
            "at_ms": at_ms,
            "path": rel,
            "trigger": trig.trigger,
        }
        if trig.linked_utterance_id is not None:
            entry["linked_utterance_id"] = trig.linked_utterance_id
        index.append(entry)
    return index
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def sampling_triggers(
    duration_ms: int,
    existing_ms: list[int],
    interval_s: int = 60,
) -> list[FrameTrigger]:
    """Emit a `sampling` trigger every `interval_s` only when no other trigger
    already fired within that window (ROADMAP § Descriptive transcription B).

    The timeline ``[0, duration_ms)`` is cut into consecutive windows of
    ``interval_s`` seconds. A window that contains no timestamp from
    ``existing_ms`` gets one trigger at its start.

    Args:
        duration_ms: length of the recording, in milliseconds.
        existing_ms: timestamps of frames already triggered, in any order.
        interval_s: window length in seconds; must be positive.

    Raises:
        ValueError: if ``interval_s`` is not positive (the window would never
            advance and the loop would not terminate).
    """
    if interval_s <= 0:
        raise ValueError(f"interval_s must be positive, got {interval_s}")
    interval_ms = interval_s * 1000
    existing = sorted(existing_ms)
    out: list[FrameTrigger] = []
    t = 0
    ei = 0
    while t < duration_ms:
        window_end = t + interval_ms
        # advance existing pointer past anything before this window
        while ei < len(existing) and existing[ei] < t:
            ei += 1
        covered = ei < len(existing) and existing[ei] < window_end
        if not covered:
            out.append(FrameTrigger(at_ms=t, trigger="sampling"))
        t = window_end
    return out
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Health endpoint for the transcriber service.
|
|
2
|
+
|
|
3
|
+
ROADMAP § Verification — `compost watch` and the CLI probe this on startup to
|
|
4
|
+
confirm the transcriber container is reachable before queuing work.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import platform
|
|
10
|
+
import sys
|
|
11
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
12
|
+
|
|
13
|
+
from fastapi import APIRouter
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
from . import __version__
|
|
17
|
+
|
|
18
|
+
router = APIRouter()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class HealthResponse(BaseModel):
    """Response body of /health. A stable contract: the CLI parses these fields."""

    # Health indicator; the endpoint reports "ok".
    status: str
    # Name of the responding service.
    service: str
    # Component name -> version string, or None when a version is unavailable.
    versions: dict[str, str | None]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _safe_version(pkg: str) -> str | None:
    """Look up the installed version of distribution ``pkg``.

    Returns None instead of raising when the distribution is not installed,
    so /health can report a missing optional dependency (whisperx,
    pyannote.audio, silero-vad) as `null`.
    """
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        installed = None
    return installed
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@router.get("/health", response_model=HealthResponse)
def get_health() -> HealthResponse:
    """Report service liveness plus the versions of the runtime and key packages.

    Packages that are not installed are reported with a None version.
    """
    versions: dict[str, str | None] = {
        "transcriber": __version__,
        "python": platform.python_version(),
    }
    # Same distributions, in the same order, as the stable /health contract.
    for pkg in ("fastapi", "uvicorn", "whisperx", "pyannote.audio", "silero-vad"):
        versions[pkg] = _safe_version(pkg)
    return HealthResponse(
        status="ok",
        service="compost-transcriber",
        versions=versions,
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
__all__ = ["router", "HealthResponse", "get_health"]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _python_metadata_check() -> None:
    """Raise RuntimeError unless the interpreter is Python 3.11 or newer."""
    running = sys.version_info[:2]
    if running >= (3, 11):
        return
    major, minor = running
    raise RuntimeError(f"compost-transcriber requires Python >=3.11, got {major}.{minor}")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
_python_metadata_check()
|