@they-juanreina/compost-cli 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/agreement.d.ts +3 -0
- package/dist/commands/agreement.d.ts.map +1 -0
- package/dist/commands/agreement.js +35 -0
- package/dist/commands/agreement.js.map +1 -0
- package/dist/commands/create.d.ts +1 -0
- package/dist/commands/create.d.ts.map +1 -1
- package/dist/commands/create.js +39 -1
- package/dist/commands/create.js.map +1 -1
- package/dist/commands/export.d.ts.map +1 -1
- package/dist/commands/export.js +47 -4
- package/dist/commands/export.js.map +1 -1
- package/dist/commands/import.d.ts +3 -0
- package/dist/commands/import.d.ts.map +1 -0
- package/dist/commands/import.js +59 -0
- package/dist/commands/import.js.map +1 -0
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +1 -0
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/jobs.d.ts +3 -0
- package/dist/commands/jobs.d.ts.map +1 -0
- package/dist/commands/jobs.js +105 -0
- package/dist/commands/jobs.js.map +1 -0
- package/dist/commands/label.d.ts +3 -0
- package/dist/commands/label.d.ts.map +1 -0
- package/dist/commands/label.js +67 -0
- package/dist/commands/label.js.map +1 -0
- package/dist/commands/models.d.ts.map +1 -1
- package/dist/commands/models.js +2 -1
- package/dist/commands/models.js.map +1 -1
- package/dist/commands/recode.d.ts +3 -0
- package/dist/commands/recode.d.ts.map +1 -0
- package/dist/commands/recode.js +60 -0
- package/dist/commands/recode.js.map +1 -0
- package/dist/commands/reindex.d.ts.map +1 -1
- package/dist/commands/reindex.js +6 -4
- package/dist/commands/reindex.js.map +1 -1
- package/dist/commands/rerun.d.ts +3 -0
- package/dist/commands/rerun.d.ts.map +1 -0
- package/dist/commands/rerun.js +91 -0
- package/dist/commands/rerun.js.map +1 -0
- package/dist/commands/search.d.ts.map +1 -1
- package/dist/commands/search.js +2 -1
- package/dist/commands/search.js.map +1 -1
- package/dist/commands/secrets.d.ts +3 -0
- package/dist/commands/secrets.d.ts.map +1 -0
- package/dist/commands/secrets.js +143 -0
- package/dist/commands/secrets.js.map +1 -0
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +90 -1
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/status.d.ts.map +1 -1
- package/dist/commands/status.js +2 -1
- package/dist/commands/status.js.map +1 -1
- package/dist/commands/transcribe.d.ts.map +1 -1
- package/dist/commands/transcribe.js +13 -2
- package/dist/commands/transcribe.js.map +1 -1
- package/dist/commands/validate.d.ts.map +1 -1
- package/dist/commands/validate.js +29 -1
- package/dist/commands/validate.js.map +1 -1
- package/dist/engine.d.ts +23 -0
- package/dist/engine.d.ts.map +1 -0
- package/dist/engine.js +32 -0
- package/dist/engine.js.map +1 -0
- package/dist/exporters/prov.d.ts +11 -0
- package/dist/exporters/prov.d.ts.map +1 -0
- package/dist/exporters/prov.js +151 -0
- package/dist/exporters/prov.js.map +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -1
- package/dist/lib/agreement.d.ts +77 -0
- package/dist/lib/agreement.d.ts.map +1 -0
- package/dist/lib/agreement.js +261 -0
- package/dist/lib/agreement.js.map +1 -0
- package/dist/lib/artifacts.d.ts +32 -1
- package/dist/lib/artifacts.d.ts.map +1 -1
- package/dist/lib/artifacts.js +156 -22
- package/dist/lib/artifacts.js.map +1 -1
- package/dist/lib/blame.d.ts.map +1 -1
- package/dist/lib/blame.js +3 -2
- package/dist/lib/blame.js.map +1 -1
- package/dist/lib/config.d.ts +3 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/doctor.d.ts +3 -0
- package/dist/lib/doctor.d.ts.map +1 -1
- package/dist/lib/doctor.js +24 -1
- package/dist/lib/doctor.js.map +1 -1
- package/dist/lib/events.d.ts +34 -1
- package/dist/lib/events.d.ts.map +1 -1
- package/dist/lib/events.js +35 -1
- package/dist/lib/events.js.map +1 -1
- package/dist/lib/importTranscript.d.ts +16 -0
- package/dist/lib/importTranscript.d.ts.map +1 -0
- package/dist/lib/importTranscript.js +94 -0
- package/dist/lib/importTranscript.js.map +1 -0
- package/dist/lib/ingest.d.ts.map +1 -1
- package/dist/lib/ingest.js +12 -6
- package/dist/lib/ingest.js.map +1 -1
- package/dist/lib/journal.d.ts +13 -0
- package/dist/lib/journal.d.ts.map +1 -1
- package/dist/lib/journal.js +58 -2
- package/dist/lib/journal.js.map +1 -1
- package/dist/lib/legacyNative.d.ts +24 -0
- package/dist/lib/legacyNative.d.ts.map +1 -0
- package/dist/lib/legacyNative.js +51 -0
- package/dist/lib/legacyNative.js.map +1 -0
- package/dist/lib/migrate.d.ts.map +1 -1
- package/dist/lib/migrate.js +1 -0
- package/dist/lib/migrate.js.map +1 -1
- package/dist/lib/nativeRuntime.d.ts +6 -3
- package/dist/lib/nativeRuntime.d.ts.map +1 -1
- package/dist/lib/nativeRuntime.js +6 -3
- package/dist/lib/nativeRuntime.js.map +1 -1
- package/dist/lib/provisionNative.js +1 -1
- package/dist/lib/provisionNative.js.map +1 -1
- package/dist/lib/queue.d.ts +25 -0
- package/dist/lib/queue.d.ts.map +1 -1
- package/dist/lib/queue.js +70 -3
- package/dist/lib/queue.js.map +1 -1
- package/dist/lib/reads.d.ts +24 -0
- package/dist/lib/reads.d.ts.map +1 -0
- package/dist/lib/reads.js +115 -0
- package/dist/lib/reads.js.map +1 -0
- package/dist/lib/recode.d.ts +19 -0
- package/dist/lib/recode.d.ts.map +1 -0
- package/dist/lib/recode.js +43 -0
- package/dist/lib/recode.js.map +1 -0
- package/dist/lib/rerun.d.ts +51 -0
- package/dist/lib/rerun.d.ts.map +1 -0
- package/dist/lib/rerun.js +166 -0
- package/dist/lib/rerun.js.map +1 -0
- package/dist/lib/retrieve.d.ts +8 -4
- package/dist/lib/retrieve.d.ts.map +1 -1
- package/dist/lib/retrieve.js +12 -10
- package/dist/lib/retrieve.js.map +1 -1
- package/dist/lib/schemas.generated.d.ts.map +1 -1
- package/dist/lib/schemas.generated.js +28 -0
- package/dist/lib/schemas.generated.js.map +1 -1
- package/dist/lib/secrets.d.ts +158 -0
- package/dist/lib/secrets.d.ts.map +1 -0
- package/dist/lib/secrets.js +507 -0
- package/dist/lib/secrets.js.map +1 -0
- package/dist/lib/seed.d.ts +5 -0
- package/dist/lib/seed.d.ts.map +1 -1
- package/dist/lib/seed.js +15 -2
- package/dist/lib/seed.js.map +1 -1
- package/dist/lib/seedResolve.d.ts.map +1 -1
- package/dist/lib/seedResolve.js +1 -0
- package/dist/lib/seedResolve.js.map +1 -1
- package/dist/lib/session.d.ts +14 -0
- package/dist/lib/session.d.ts.map +1 -1
- package/dist/lib/session.js +47 -0
- package/dist/lib/session.js.map +1 -1
- package/dist/lib/setup.d.ts +5 -0
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +78 -14
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/setupWizard.d.ts +51 -0
- package/dist/lib/setupWizard.d.ts.map +1 -0
- package/dist/lib/setupWizard.js +223 -0
- package/dist/lib/setupWizard.js.map +1 -0
- package/dist/lib/snap.d.ts.map +1 -1
- package/dist/lib/snap.js +2 -5
- package/dist/lib/snap.js.map +1 -1
- package/dist/lib/speakers.d.ts +41 -0
- package/dist/lib/speakers.d.ts.map +1 -0
- package/dist/lib/speakers.js +78 -0
- package/dist/lib/speakers.js.map +1 -0
- package/dist/lib/status.d.ts.map +1 -1
- package/dist/lib/status.js +21 -0
- package/dist/lib/status.js.map +1 -1
- package/dist/lib/userConfig.d.ts +22 -0
- package/dist/lib/userConfig.d.ts.map +1 -0
- package/dist/lib/userConfig.js +67 -0
- package/dist/lib/userConfig.js.map +1 -0
- package/dist/lib/validate.d.ts +18 -0
- package/dist/lib/validate.d.ts.map +1 -1
- package/dist/lib/validate.js +70 -1
- package/dist/lib/validate.js.map +1 -1
- package/dist/lib/version.d.ts +30 -0
- package/dist/lib/version.d.ts.map +1 -0
- package/dist/lib/version.js +73 -0
- package/dist/lib/version.js.map +1 -0
- package/dist/llm/adapter.d.ts.map +1 -1
- package/dist/llm/adapter.js +2 -0
- package/dist/llm/adapter.js.map +1 -1
- package/dist/llm/providers/ollama.d.ts.map +1 -1
- package/dist/llm/providers/ollama.js +6 -0
- package/dist/llm/providers/ollama.js.map +1 -1
- package/dist/loops/ingest_watcher.d.ts.map +1 -1
- package/dist/loops/ingest_watcher.js +6 -3
- package/dist/loops/ingest_watcher.js.map +1 -1
- package/dist/loops/legacy_worker.d.ts +28 -1
- package/dist/loops/legacy_worker.d.ts.map +1 -1
- package/dist/loops/legacy_worker.js +81 -9
- package/dist/loops/legacy_worker.js.map +1 -1
- package/dist/loops/supervisor.d.ts +3 -0
- package/dist/loops/supervisor.d.ts.map +1 -1
- package/dist/loops/supervisor.js +12 -0
- package/dist/loops/supervisor.js.map +1 -1
- package/dist/loops/synthesis.d.ts.map +1 -1
- package/dist/loops/synthesis.js +15 -0
- package/dist/loops/synthesis.js.map +1 -1
- package/dist/loops/transcribe_worker.d.ts.map +1 -1
- package/dist/loops/transcribe_worker.js +2 -4
- package/dist/loops/transcribe_worker.js.map +1 -1
- package/dist/output.d.ts +13 -1
- package/dist/output.d.ts.map +1 -1
- package/dist/output.js +22 -2
- package/dist/output.js.map +1 -1
- package/dist/render/human.d.ts +20 -0
- package/dist/render/human.d.ts.map +1 -0
- package/dist/render/human.js +54 -0
- package/dist/render/human.js.map +1 -0
- package/dist/router.d.ts.map +1 -1
- package/dist/router.js +17 -2
- package/dist/router.js.map +1 -1
- package/package.json +18 -5
- package/templates/config.toml +6 -1
- package/transcriber/app/__init__.py +3 -0
- package/transcriber/app/asr.py +198 -0
- package/transcriber/app/asr_parakeet.py +174 -0
- package/transcriber/app/cue_parser.py +110 -0
- package/transcriber/app/diarization.py +330 -0
- package/transcriber/app/frame_annotation.py +77 -0
- package/transcriber/app/frames.py +130 -0
- package/transcriber/app/health.py +70 -0
- package/transcriber/app/legacy.py +355 -0
- package/transcriber/app/legacy_cli.py +90 -0
- package/transcriber/app/main.py +30 -0
- package/transcriber/app/pipeline.py +210 -0
- package/transcriber/app/pptx_export.py +42 -0
- package/transcriber/app/prosody.py +128 -0
- package/transcriber/app/routes/__init__.py +1 -0
- package/transcriber/app/routes/legacy.py +117 -0
- package/transcriber/app/routes/transcribe.py +133 -0
- package/transcriber/app/shot_change.py +74 -0
- package/transcriber/app/silence_typer.py +144 -0
- package/transcriber/app/transcribe_cli.py +82 -0
- package/transcriber/app/vad.py +216 -0
- package/transcriber/pyproject.toml +56 -0
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Cue parser (#10).
|
|
2
|
+
|
|
3
|
+
Whisper-large-v3 with event-tag tokens emits inline markers like [laughter],
|
|
4
|
+
[sigh], [cough], [clear_throat], [unintelligible], and code-switching markers.
|
|
5
|
+
This module pulls those out of utterance text into structured cues[] entries
|
|
6
|
+
(schema/cues.taxonomy.json) and returns the cleaned text.
|
|
7
|
+
|
|
8
|
+
Pure and deterministic — no model. The ASR wrapper (asr.py) produces the
|
|
9
|
+
tagged text; this turns it into cues.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
# Whisper/Whisper-AT event tag → compost cue kind (cues.taxonomy.json).
|
|
18
|
+
TAG_TO_KIND: dict[str, str] = {
|
|
19
|
+
"laughter": "laughter",
|
|
20
|
+
"laugh": "laughter",
|
|
21
|
+
"laughs": "laughter",
|
|
22
|
+
"sigh": "sigh",
|
|
23
|
+
"sighs": "sigh",
|
|
24
|
+
"cough": "cough",
|
|
25
|
+
"coughs": "cough",
|
|
26
|
+
"clear_throat": "throat-clear",
|
|
27
|
+
"throat_clear": "throat-clear",
|
|
28
|
+
"throat-clear": "throat-clear",
|
|
29
|
+
"unintelligible": "unintelligible",
|
|
30
|
+
"inaudible": "unintelligible",
|
|
31
|
+
"code_switch": "code-switching",
|
|
32
|
+
"code-switch": "code-switching",
|
|
33
|
+
"code_switching": "code-switching",
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
# Default confidence assigned to a tag-derived cue when the ASR gives none.
|
|
37
|
+
DEFAULT_CONFIDENCE = 0.8
|
|
38
|
+
|
|
39
|
+
_TAG_RE = re.compile(r"\[([a-zA-Z_\-]+)\]")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _clean_text(text: str) -> str:
    """Return *text* with recognized event tags removed and whitespace tidied.

    Only bracketed tokens whose (lower-cased) name is a key of ``TAG_TO_KIND``
    are dropped; any other bracketed token is left exactly as written.
    """

    def _drop_if_known(match: re.Match[str]) -> str:
        tag_name = match.group(1).lower()
        if tag_name in TAG_TO_KIND:
            return ""
        return match.group(0)

    without_tags = _TAG_RE.sub(_drop_if_known, text)
    # Removing a tag can leave a run of whitespace behind: squash every run of
    # two or more whitespace characters to one space, then trim both ends.
    squashed = re.sub(r"\s{2,}", " ", without_tags)
    return squashed.strip()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def parse_cues_from_utterance(
    utterance: dict[str, Any],
    next_cue_index: int = 1,
    confidence: float = DEFAULT_CONFIDENCE,
) -> tuple[str, list[dict[str, Any]]]:
    """Return (cleaned_text, cues) for one utterance.

    Every recognized ``[tag]`` in the utterance text becomes one cue, numbered
    ``CUE-NNN`` starting at ``next_cue_index``; bracketed tokens that are not
    in ``TAG_TO_KIND`` are ignored. The utterance itself is not modified.

    Cue timing: if a word in `words[]` matches the tag, use that word's span;
    otherwise fall back to the utterance span. Each word is used for at most
    one cue, so a tag that occurs several times in the same utterance is timed
    from successive words instead of every cue repeating the first word's span.

    A missing or ``None`` ``text`` / ``words`` entry is treated as empty.
    """
    text = utterance.get("text") or ""
    # Private copy: matched words are removed from it below, and the caller's
    # words[] list must stay untouched.
    unclaimed = list(utterance.get("words") or [])
    speaker_id = utterance.get("speaker_id")
    cues: list[dict[str, Any]] = []
    idx = next_cue_index

    for m in _TAG_RE.finditer(text):
        kind = TAG_TO_KIND.get(m.group(1).lower())
        if kind is None:
            continue
        token = m.group(0)
        start_ms, end_ms = _tag_span(token, unclaimed, utterance)
        # Retire the first word carrying this tag so the next occurrence of the
        # same tag is timed from its own word rather than this one again.
        for pos, word in enumerate(unclaimed):
            spoken = word.get("w")
            if isinstance(spoken, str) and spoken.strip() == token:
                del unclaimed[pos]
                break
        cue: dict[str, Any] = {
            "id": f"CUE-{idx:03d}",
            "kind": kind,
            "start_ms": start_ms,
            "end_ms": end_ms,
            "source": "audio",
            "confidence": confidence,
        }
        # speaker_id is optional on the utterance; only copy it when present.
        if speaker_id is not None:
            cue["speaker_id"] = speaker_id
        cues.append(cue)
        idx += 1

    return _clean_text(text), cues
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _tag_span(
|
|
88
|
+
tag_token: str,
|
|
89
|
+
words: list[dict[str, Any]],
|
|
90
|
+
utterance: dict[str, Any],
|
|
91
|
+
) -> tuple[int, int]:
|
|
92
|
+
for w in words:
|
|
93
|
+
if w.get("w") == tag_token:
|
|
94
|
+
return int(w["s"]), int(w["e"])
|
|
95
|
+
return int(utterance["start_ms"]), int(utterance["end_ms"])
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def parse_transcript_cues(transcript: dict[str, Any]) -> dict[str, Any]:
    """Move inline event tags of every utterance into ``transcript["cues"]``.

    For each entry of ``utterances`` the tag-free text is written back to
    ``utt["text"]`` and the extracted cues are appended to ``cues`` (created
    as an empty list when absent). New cue numbers start one past the number
    of cues already in the list. The transcript is modified in place and also
    returned.
    """
    collected = transcript.setdefault("cues", [])
    for utterance in transcript.get("utterances", []):
        # Numbering always resumes right after whatever is in the list so far.
        stripped_text, new_cues = parse_cues_from_utterance(
            utterance, next_cue_index=len(collected) + 1
        )
        utterance["text"] = stripped_text
        collected.extend(new_cues)
    return transcript
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
"""pyannote-audio diarization + word-level alignment (#11).
|
|
2
|
+
|
|
3
|
+
The pyannote pipeline (gated model; needs HUGGINGFACE_TOKEN + torch) is loaded
|
|
4
|
+
lazily. The alignment maths — assigning a stable speaker_id to each utterance
|
|
5
|
+
by maximum temporal overlap with diarization turns, flagging overlap regions,
|
|
6
|
+
and gating low-confidence sessions — is pure and fully unit-tested.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from functools import lru_cache
|
|
14
|
+
from typing import Any, Protocol
|
|
15
|
+
|
|
16
|
+
# Below this mean per-utterance overlap confidence, the session is queued for
|
|
17
|
+
# human speaker labelling instead of trusted.
|
|
18
|
+
DIARIZATION_CONFIDENCE_FLOOR = 0.5
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class Turn:
    """One diarization turn: a speaker label attached to a time span.

    Instances are immutable (frozen dataclass).
    """

    # Span boundaries in milliseconds.
    start_ms: int
    end_ms: int
    # Speaker label as produced by the diarization backend.
    speaker: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Speakers below this share of total speech are treated as over-segmentation
|
|
29
|
+
# fragments and merged into the nearest dominant cluster (#178). pyannote 3.1
|
|
30
|
+
# routinely splits a clean 2-party interview into 5–6 speakers (~85% / 10% +
|
|
31
|
+
# three 1–3% slivers); the slivers are temporal fragments of the dominant
|
|
32
|
+
# pair, not extra speakers.
|
|
33
|
+
DEFAULT_MIN_SPEAKER_SHARE = 0.05
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class DiarizationBackend(Protocol):
    """Structural interface for anything that can diarize an audio file."""

    def diarize(self, audio_path: str) -> list[dict[str, Any]]:
        """Return the diarization turns found in the audio at *audio_path*."""
        ...
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
PYANNOTE_MODEL = "pyannote/speaker-diarization-3.1"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _resolve_diar_device(requested: str) -> str: # pragma: no cover - env-dependent
|
|
44
|
+
"""Map 'auto' to the best available device. On Apple Silicon that's MPS
|
|
45
|
+
(Metal) — ~18x faster than CPU for pyannote with identical results on
|
|
46
|
+
torch>=2.12. 'cpu'/'mps'/'cuda' pass through."""
|
|
47
|
+
if requested != "auto":
|
|
48
|
+
return requested
|
|
49
|
+
try:
|
|
50
|
+
import torch # type: ignore
|
|
51
|
+
|
|
52
|
+
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
|
|
53
|
+
return "mps"
|
|
54
|
+
if torch.cuda.is_available():
|
|
55
|
+
return "cuda"
|
|
56
|
+
except ImportError:
|
|
57
|
+
pass
|
|
58
|
+
return "cpu"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class PyannoteBackend:  # pragma: no cover - needs gated weights + torch
    """Concrete DiarizationBackend wrapping pyannote-audio.

    The pipeline is built once in ``__init__`` and reused by every
    ``diarize`` call. The HuggingFace token comes from the HUGGINGFACE_TOKEN
    or HF_TOKEN env vars (one must be set; the user must also have accepted
    the license at hf.co/pyannote/speaker-diarization-3.1).
    """

    def __init__(self, device: str | None = None) -> None:
        """Load the gated pipeline and move it to the resolved device.

        ``device`` takes precedence over the COMPOST_DIARIZATION_DEVICE env
        var; when neither is given the device is resolved from ``"auto"``.

        Raises:
            RuntimeError: no HuggingFace token is set, or torch / torchaudio /
                pyannote.audio cannot be imported.
        """
        import os

        token = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")
        if not token:
            raise RuntimeError(
                "pyannote needs HUGGINGFACE_TOKEN to download the gated model. "
                "Set it in .env.local and accept the license at hf.co/pyannote/speaker-diarization-3.1."
            )
        try:
            import torch  # type: ignore
            import torchaudio  # type: ignore
            from pyannote.audio import Pipeline  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "pyannote.audio / torchaudio not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from e

        resolved = _resolve_diar_device(
            device or os.environ.get("COMPOST_DIARIZATION_DEVICE", "auto")
        )
        # On MPS, enable CPU fallback for any op MPS lacks so the pipeline
        # cannot error out mid-run (#176). setdefault keeps an explicit user
        # setting intact.
        # NOTE(review): torch has already been imported by this point; confirm
        # that PYTORCH_ENABLE_MPS_FALLBACK is still honoured when it is set
        # after the import rather than before it.
        if resolved == "mps":
            os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

        self._pipeline = Pipeline.from_pretrained(PYANNOTE_MODEL, token=token)
        if resolved != "cpu":
            self._pipeline = self._pipeline.to(torch.device(resolved))
        self._device = resolved
        self._torchaudio = torchaudio

    def diarize(self, audio_path: str) -> list[dict[str, Any]]:
        """Diarize one audio file.

        Returns one dict per speaker turn with ``start_ms`` / ``end_ms``
        (integer milliseconds, rounded to the nearest millisecond) and
        ``speaker`` (the backend's label as a string).
        """
        # Preload audio in-memory with torchaudio so pyannote 4.x doesn't hit
        # torchcodec (which requires CUDA runtime libraries we don't ship in
        # the CPU-only container). This is the documented fallback path.
        waveform, sample_rate = self._torchaudio.load(audio_path)
        output = self._pipeline({"waveform": waveform, "sample_rate": sample_rate})
        # pyannote 4.x returns DiarizeOutput; 3.x returned the Annotation directly.
        # Support both by reading .speaker_diarization if present, else the object itself.
        diarization = getattr(output, "speaker_diarization", output)
        turns: list[dict[str, Any]] = []
        for segment, _, speaker in diarization.itertracks(yield_label=True):
            turns.append(
                {
                    # Round instead of truncating: seconds are floats, and a
                    # product such as 1.001 * 1000 evaluates to 1000.9999…,
                    # which int() alone would turn into 1000.
                    "start_ms": int(round(segment.start * 1000)),
                    "end_ms": int(round(segment.end * 1000)),
                    "speaker": str(speaker),
                }
            )
        return turns
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@lru_cache(maxsize=1)
|
|
124
|
+
def _load_pyannote(token_present: bool) -> DiarizationBackend: # pragma: no cover - needs weights
|
|
125
|
+
if not token_present:
|
|
126
|
+
raise RuntimeError(
|
|
127
|
+
"pyannote needs HUGGINGFACE_TOKEN to download the gated model. "
|
|
128
|
+
"Set it in .env.local and accept the license at hf.co/pyannote/speaker-diarization-3.1."
|
|
129
|
+
)
|
|
130
|
+
return PyannoteBackend()
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
_PYANNOTE_LABEL_RE = re.compile(r"^SPEAKER_(\d+)$")


def normalize_speaker_label(label: str) -> str:
    """Rewrite a pyannote cluster label into the schema's ``S{n}`` form.

    ``SPEAKER_<digits>`` becomes ``S<number>`` with leading zeros dropped
    (``SPEAKER_00`` → ``S0``, ``SPEAKER_12`` → ``S12``). Any label that does
    not have that shape — an already-canonical ``S1``, the ``S?`` orphan
    sentinel, anything else — is returned unchanged, so applying the function
    twice gives the same result as applying it once.
    """
    matched = _PYANNOTE_LABEL_RE.match(label)
    if matched is None:
        return label
    cluster_number = int(matched.group(1))
    return f"S{cluster_number}"
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _overlap_ms(a_start: int, a_end: int, b_start: int, b_end: int) -> int:
|
|
151
|
+
return max(0, min(a_end, b_end) - max(a_start, b_start))
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def merge_subthreshold_speakers(
    turns: list[Turn], min_share: float = DEFAULT_MIN_SPEAKER_SHARE
) -> list[Turn]:
    """Collapse speakers with sub-threshold airtime into the nearest dominant
    cluster (#178). Pure transformation; the input turns are never modified.

    A speaker is *dominant* when their summed turn duration is at least
    ``min_share`` of the summed duration of all turns. Each turn of a
    non-dominant speaker is re-labelled with the speaker of the closest
    dominant turn, comparing the gap back to the nearest dominant turn before
    it with the gap forward to the nearest dominant turn after it (in
    start-time order); an equal gap goes to the earlier one.

    Conservative: the input list is returned unchanged when it is empty, when
    the total duration is not positive, when every speaker is dominant, or
    when no speaker is. Otherwise a new list sorted by ``start_ms`` is
    returned.
    """
    if not turns:
        return turns
    airtime: dict[str, int] = {}
    for t in turns:
        airtime[t.speaker] = airtime.get(t.speaker, 0) + (t.end_ms - t.start_ms)
    total = sum(airtime.values())
    if total <= 0:
        return turns
    dominant = {s for s, dur in airtime.items() if dur / total >= min_share}
    if not dominant or len(dominant) == len(airtime):
        return turns

    ordered = sorted(turns, key=lambda t: t.start_ms)

    # Two linear sweeps record, for every position, the nearest dominant turn
    # before it and after it. This replaces rescanning (and re-slicing) the
    # whole list once per sliver turn.
    anchor_before: list[Any] = []
    last_dominant = None
    for t in ordered:
        anchor_before.append(last_dominant)
        if t.speaker in dominant:
            last_dominant = t
    anchor_after: list[Any] = [None] * len(ordered)
    next_dominant = None
    for i in range(len(ordered) - 1, -1, -1):
        anchor_after[i] = next_dominant
        if ordered[i].speaker in dominant:
            next_dominant = ordered[i]

    out: list[Turn] = []
    for t, before, after in zip(ordered, anchor_before, anchor_after):
        if t.speaker in dominant:
            out.append(t)
            continue
        # `dominant` is non-empty here, so a sliver always has a dominant turn
        # on at least one side.
        if before is None:
            chosen = after.speaker
        elif after is None:
            chosen = before.speaker
        else:
            gap_before = t.start_ms - before.end_ms
            gap_after = after.start_ms - t.end_ms
            chosen = before.speaker if gap_before <= gap_after else after.speaker
        out.append(Turn(t.start_ms, t.end_ms, chosen))
    return out
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _nearest_turn_speaker(utt_start_ms: int, utt_end_ms: int, turns: list[Turn]) -> str | None:
|
|
205
|
+
"""Pick the speaker of the turn whose nearest edge is closest to the
|
|
206
|
+
utterance's midpoint (#178). Used to rescue 'S?' orphans — utterances
|
|
207
|
+
whose timing didn't overlap any diarization turn (a few-ms sliver
|
|
208
|
+
between turn boundaries). Returns None if turns is empty.
|
|
209
|
+
"""
|
|
210
|
+
if not turns:
|
|
211
|
+
return None
|
|
212
|
+
mid = (utt_start_ms + utt_end_ms) // 2
|
|
213
|
+
return min(
|
|
214
|
+
turns,
|
|
215
|
+
key=lambda t: min(abs(t.start_ms - mid), abs(t.end_ms - mid)),
|
|
216
|
+
).speaker
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def assign_speaker(utterance: dict[str, Any], turns: list[Turn]) -> tuple[str, float]:
    """Return (speaker_id, confidence) for an utterance by max overlap.

    The winner is the speaker whose turns overlap the utterance's
    ``start_ms``..``end_ms`` span for the longest total time. Ties resolve to
    the speaker whose earliest overlapping turn starts first, whatever order
    ``turns`` is given in.

    confidence = overlapped duration with the winning speaker / utterance
    duration, capped at 1.0 (a zero-length utterance counts as 1 ms long to
    avoid dividing by zero). When no turn overlaps the utterance at all the
    result is ``("S?", 0.0)``.
    """
    u_start = utterance["start_ms"]
    u_end = utterance["end_ms"]
    u_dur = max(u_end - u_start, 1)

    overlap_by_speaker: dict[str, int] = {}
    earliest_start: dict[str, int] = {}
    for t in turns:
        ov = _overlap_ms(u_start, u_end, t.start_ms, t.end_ms)
        if ov <= 0:
            continue
        overlap_by_speaker[t.speaker] = overlap_by_speaker.get(t.speaker, 0) + ov
        if t.speaker not in earliest_start or t.start_ms < earliest_start[t.speaker]:
            earliest_start[t.speaker] = t.start_ms

    if not overlap_by_speaker:
        return "S?", 0.0
    # Largest overlap first; among equals, the earliest-starting turn. Relying
    # on dict insertion order alone would only honour that when `turns` is
    # already sorted by start time.
    winner = min(
        overlap_by_speaker,
        key=lambda s: (-overlap_by_speaker[s], earliest_start[s]),
    )
    return winner, min(overlap_by_speaker[winner] / u_dur, 1.0)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def detect_overlaps(
    turns: list[Turn], min_overlap_ms: int = 200, start_index: int = 1
) -> list[dict[str, Any]]:
    """Emit one ``overlap`` cue per pair of different-speaker turns that share
    at least ``min_overlap_ms`` of time.

    Turns are examined in start-time order. Cue ids are ``CUE-NNN`` (zero
    padded to at least three digits) counting up from ``start_index``, so a
    caller can continue numbering after cues it already holds. Each cue has
    ``kind`` "overlap", ``source`` "audio" and the shared span as
    ``start_ms`` / ``end_ms``.
    """
    by_start = sorted(turns, key=lambda turn: turn.start_ms)
    count = len(by_start)
    found: list[dict[str, Any]] = []
    for i, first in enumerate(by_start):
        j = i + 1
        # Sorted by start: once a later turn begins at or after `first` ends,
        # no turn further along can overlap `first` either.
        while j < count and by_start[j].start_ms < first.end_ms:
            second = by_start[j]
            j += 1
            if second.speaker == first.speaker:
                continue
            begin = max(first.start_ms, second.start_ms)
            finish = min(first.end_ms, second.end_ms)
            if finish - begin < min_overlap_ms:
                continue
            found.append(
                {
                    "id": f"CUE-{start_index + len(found):03d}",
                    "kind": "overlap",
                    "start_ms": begin,
                    "end_ms": finish,
                    "source": "audio",
                }
            )
    return found
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def align(transcript: dict[str, Any], turns: list[Turn]) -> dict[str, Any]:
    """Label every utterance with a speaker and attach overlap cues.

    For each utterance this writes ``speaker_id`` (normalized to the schema's
    ``S{n}`` form) and ``diarization.confidence`` (rounded to 3 decimals).
    Overlap cues computed from ``turns`` are appended to ``cues``, numbered
    after the cues already there. Mutates and returns the transcript.

    Orphans (#178): an utterance overlapping no turn comes back as "S?" and is
    given the nearest turn's speaker instead when there is one. Its
    confidence stays 0.0, marking the label as a fallback, so orphans still
    pull the mean down.

    Gate: when the mean per-utterance confidence is below
    ``DIARIZATION_CONFIDENCE_FLOOR`` the transcript's ``status`` is set to
    "needs_speaker_labels". With no utterances the mean is taken as 0.0.
    """
    confidence_total = 0
    utterance_count = 0
    for utterance in transcript.get("utterances", []):
        label, score = assign_speaker(utterance, turns)
        if label == "S?":
            fallback = _nearest_turn_speaker(
                utterance["start_ms"], utterance["end_ms"], turns
            )
            if fallback is not None:
                label = fallback  # score deliberately left at 0.0
        # Single write point: canonicalizing here keeps every
        # utterances[].speaker_id in the ^S[0-9]+$ form.
        utterance["speaker_id"] = normalize_speaker_label(label)
        diarization_info = utterance.setdefault("diarization", {})
        diarization_info["confidence"] = round(score, 3)
        confidence_total += score
        utterance_count += 1

    cue_list = transcript.setdefault("cues", [])
    first_free_index = len(cue_list) + 1
    cue_list.extend(detect_overlaps(turns, start_index=first_free_index))

    if utterance_count:
        mean_confidence = confidence_total / utterance_count
    else:
        mean_confidence = 0.0
    if mean_confidence < DIARIZATION_CONFIDENCE_FLOOR:
        transcript["status"] = "needs_speaker_labels"
    return transcript
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
class Diarizer:
    """Front end that turns a backend's raw output into ``Turn`` objects."""

    def __init__(self, backend: DiarizationBackend | None = None):
        """Keep the injected backend; ``None`` defers to the pyannote loader."""
        self._backend = backend

    def _get_backend(self) -> DiarizationBackend:
        """Return the injected backend, or load pyannote when none was given."""
        if self._backend is None:
            import os

            has_token = any(
                os.environ.get(name) for name in ("HUGGINGFACE_TOKEN", "HF_TOKEN")
            )
            return _load_pyannote(has_token)
        return self._backend

    def diarize(self, audio_path: str) -> list[Turn]:
        """Diarize *audio_path* and return the turns after sliver merging."""
        raw_turns = self._get_backend().diarize(audio_path)
        turns = [
            Turn(int(entry["start_ms"]), int(entry["end_ms"]), str(entry["speaker"]))
            for entry in raw_turns
        ]
        # Over-segmentation slivers are folded into the dominant speakers
        # (#178) before anything downstream consumes the turns.
        return merge_subthreshold_speakers(turns)
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Optional frame annotation (#50).
|
|
2
|
+
|
|
3
|
+
A one-sentence description of a frame from a vision-capable model. Off by
|
|
4
|
+
default; opt in via config.toml `[frames] annotation = "claude" | "moondream2"`
|
|
5
|
+
(decision #72). The annotation is recorded as an AI-authored event on the frame
|
|
6
|
+
and surfaces as [draft] until a researcher endorses it.
|
|
7
|
+
|
|
8
|
+
The vision models are injected (the Claude path calls the Anthropic API with
|
|
9
|
+
the frame + linked utterance; the Moondream2 path is a lazy-loaded local model)
|
|
10
|
+
so the gate, prompt, and event shape are testable without weights.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from collections.abc import Callable
|
|
16
|
+
from typing import Any, Protocol
|
|
17
|
+
|
|
18
|
+
PROMPT = (
|
|
19
|
+
"In one sentence, describe what's visible in this interview frame that a "
|
|
20
|
+
"researcher reviewing the session might find notable. If nothing is notable, "
|
|
21
|
+
"return null."
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class VisionModel(Protocol):
|
|
26
|
+
def describe(self, frame_path: str, prompt: str, linked_text: str) -> str | None: ...
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_prompt(linked_text: str) -> str:
    """Return PROMPT, followed by the quoted linked utterance when one is given.

    An empty ``linked_text`` yields the bare PROMPT.
    """
    if not linked_text:
        return PROMPT
    return f'{PROMPT}\n\nThe speaker was saying: "{linked_text}"'
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def annotate_frame(
    frame: dict[str, Any],
    linked_text: str,
    model: VisionModel,
    *,
    enabled: bool,
    actor_id: str,
) -> dict[str, Any] | None:
    """Build the AI-authored ``create`` event carrying a frame's annotation.

    Returns None without calling the model when ``enabled`` is false, and
    also when the model returns None or only whitespace (nothing notable).

    ``enabled`` is the per-seed config gate; an on-demand "annotate this
    frame" request passes ``enabled=True`` even when the default is off.
    ``frame`` must provide ``path``, ``id`` and ``at_ms``. ``actor_id`` is
    recorded both as the event's actor and as its model.
    """
    if not enabled:
        return None

    frame_path = frame["path"]
    prompt = build_prompt(linked_text)
    raw = model.describe(frame_path, prompt, linked_text)

    text = raw.strip() if raw is not None else ""
    if not text:
        return None

    payload = {
        "frame_id": frame["id"],
        "at_ms": frame["at_ms"],
        "annotation": text,
        # Recorded as a draft; the module docs say it stays [draft] until a
        # researcher endorses it.
        "status": "draft",
    }
    return {
        "artifact_kind": "frame_annotation",
        "action": "create",
        "actor_type": "ai",
        "actor_id": actor_id,
        "model": actor_id,
        "payload": payload,
    }
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def claude_vision(call: Callable[[str, str], str | None]) -> VisionModel:
    """Adapt an Anthropic vision call to the VisionModel interface.

    ``call`` takes ``(frame_path, prompt)`` and returns the model's text, or
    None. The returned object exposes it through ``describe``.
    """

    class _Claude:
        """VisionModel adapter that forwards to the wrapped callable."""

        def describe(self, frame_path: str, prompt: str, linked_text: str) -> str | None:
            # linked_text is accepted for interface compatibility but not
            # forwarded; only the frame path and the prompt reach the callable.
            return call(frame_path, prompt)

    adapter = _Claude()
    return adapter
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""ffmpeg-backed frame extractor (#14).
|
|
2
|
+
|
|
3
|
+
Pulls a JPG from the video stream at each requested trigger timestamp and
|
|
4
|
+
writes it to sessions/<sid>/frames/<padded_ms>.jpg (640x360). Returns the
|
|
5
|
+
frames[] index entries for transcript.json. No classification — frames are
|
|
6
|
+
evidence.
|
|
7
|
+
|
|
8
|
+
Triggers (see schema/frames.taxonomy.json): silence_*, audio_cue, shot_change,
|
|
9
|
+
highlight, manual, sampling. The caller supplies (at_ms, trigger,
|
|
10
|
+
linked_utterance_id?) tuples; this module just extracts + indexes.
|
|
11
|
+
|
|
12
|
+
Idempotent: a frame whose target JPG already exists is not re-extracted, and
|
|
13
|
+
the returned id is stable (FR-<padded_ms>).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import subprocess
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
# Pixel dimensions every extracted frame is scaled to.
FRAME_WIDTH, FRAME_HEIGHT = 640, 360
# Millisecond timestamps are zero-padded to this many digits (~277h) so that
# the resulting names sort lexically.
_PAD = 9
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True)
class FrameTrigger:
    """Immutable request to capture one frame at a given timestamp."""

    # Position in the video, in milliseconds.
    at_ms: int
    # Name of the trigger that asked for the frame (e.g. "sampling").
    trigger: str
    # Id of the utterance the frame is linked to, when there is one.
    linked_utterance_id: str | None = None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _padded(ms: int) -> str:
    """Return ``ms`` as a decimal string zero-filled to ``_PAD`` characters."""
    digits = str(ms)
    return digits.zfill(_PAD)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def frame_id(at_ms: int) -> str:
    """Return the frame id for a timestamp: ``FR-`` plus the padded milliseconds."""
    return "FR-" + _padded(at_ms)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def frame_relpath(session_id: str, at_ms: int) -> str:
    """Return the frame's JPG path, relative to the seed root.

    The layout is ``sessions/<session_id>/frames/<padded_ms>.jpg``.
    """
    return "sessions/{}/frames/{}.jpg".format(session_id, _padded(at_ms))
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _extract_one(video_path: Path, at_ms: int, out_path: Path) -> None:
    """Extract a single frame at ``at_ms`` from ``video_path`` into ``out_path``.

    Runs ffmpeg to grab one video frame, scaled to FRAME_WIDTH x FRAME_HEIGHT,
    creating the output directory if needed.

    Raises:
        RuntimeError: ffmpeg exited non-zero, or produced no (or an empty)
            output file. Any partial output is removed first so that a later
            "skip if the JPG already exists" check does not mistake a broken
            file for a finished frame.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    ts = at_ms / 1000.0
    # -ss before -i seeks fast; -frames:v 1 grabs a single frame; scale to 640x360.
    cmd = [
        "ffmpeg",
        "-y",
        "-ss",
        f"{ts:.3f}",
        "-i",
        str(video_path),
        "-frames:v",
        "1",
        "-vf",
        f"scale={FRAME_WIDTH}:{FRAME_HEIGHT}",
        "-q:v",
        "4",
        str(out_path),
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)

    # A zero-byte file is never a valid JPG, so it counts as "nothing produced".
    produced = out_path.exists() and out_path.stat().st_size > 0
    if proc.returncode != 0 or not produced:
        # Do not leave a truncated/empty file behind: callers treat an existing
        # target as already extracted and would never retry it.
        out_path.unlink(missing_ok=True)
        raise RuntimeError(f"ffmpeg failed extracting frame at {at_ms}ms: {proc.stderr[-300:]}")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def extract_frames(
    video_path: str | Path,
    session_id: str,
    triggers: list[FrameTrigger],
    seed_root: str | Path,
) -> list[dict[str, Any]]:
    """Extract one frame per distinct trigger timestamp and index the results.

    When several triggers share an ``at_ms``, the first one listed is kept.
    A frame whose JPG is already present under ``seed_root`` is not extracted
    again, which makes re-runs idempotent. The returned entries are ordered
    by ``at_ms`` and hold ``id``, ``at_ms``, ``path`` (relative to
    ``seed_root``) and ``trigger``, plus ``linked_utterance_id`` when the
    trigger carries one.
    """
    video = Path(video_path)
    root = Path(seed_root)

    # First trigger seen for each timestamp wins.
    first_by_ms: dict[int, FrameTrigger] = {}
    for trig in triggers:
        if trig.at_ms not in first_by_ms:
            first_by_ms[trig.at_ms] = trig

    index: list[dict[str, Any]] = []
    for at_ms, trig in sorted(first_by_ms.items(), key=lambda item: item[0]):
        rel = frame_relpath(session_id, at_ms)
        target = root / rel
        if not target.exists():
            _extract_one(video, at_ms, target)

        entry: dict[str, Any] = {
            "id": frame_id(at_ms),
            "at_ms": at_ms,
            "path": rel,
            "trigger": trig.trigger,
        }
        linked = trig.linked_utterance_id
        if linked is not None:
            entry["linked_utterance_id"] = linked
        index.append(entry)
    return index
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def sampling_triggers(
    duration_ms: int,
    existing_ms: list[int],
    interval_s: int = 60,
) -> list[FrameTrigger]:
    """Emit a `sampling` trigger every `interval_s` only when no other trigger
    already fired within that window (ROADMAP § Descriptive transcription B).

    The timeline ``[0, duration_ms)`` is cut into consecutive windows of
    ``interval_s`` seconds. A window that contains no timestamp from
    ``existing_ms`` gets one ``sampling`` trigger at its start.

    Raises:
        ValueError: ``interval_s`` is zero or negative; the window would never
            advance and the scan would not terminate.
    """
    if interval_s <= 0:
        raise ValueError(f"interval_s must be positive, got {interval_s}")

    interval_ms = interval_s * 1000
    existing = sorted(existing_ms)
    out: list[FrameTrigger] = []
    t = 0
    ei = 0
    while t < duration_ms:
        window_end = t + interval_ms
        # Advance the pointer past anything before this window; both the
        # windows and `existing` are ascending, so it never moves backwards.
        while ei < len(existing) and existing[ei] < t:
            ei += 1
        # The first timestamp at or after the window start decides coverage.
        covered = ei < len(existing) and existing[ei] < window_end
        if not covered:
            out.append(FrameTrigger(at_ms=t, trigger="sampling"))
        t = window_end
    return out
|