@they-juanreina/compost-cli 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/blame.d.ts.map +1 -1
- package/dist/lib/blame.js +3 -2
- package/dist/lib/blame.js.map +1 -1
- package/dist/lib/journal.d.ts.map +1 -1
- package/dist/lib/journal.js +9 -0
- package/dist/lib/journal.js.map +1 -1
- package/dist/lib/migrate.d.ts.map +1 -1
- package/dist/lib/migrate.js +1 -0
- package/dist/lib/migrate.js.map +1 -1
- package/dist/lib/nativeRuntime.d.ts +6 -3
- package/dist/lib/nativeRuntime.d.ts.map +1 -1
- package/dist/lib/nativeRuntime.js +6 -3
- package/dist/lib/nativeRuntime.js.map +1 -1
- package/dist/lib/retrieve.d.ts.map +1 -1
- package/dist/lib/retrieve.js +0 -8
- package/dist/lib/retrieve.js.map +1 -1
- package/dist/lib/seedResolve.d.ts.map +1 -1
- package/dist/lib/seedResolve.js +1 -0
- package/dist/lib/seedResolve.js.map +1 -1
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +9 -8
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/snap.d.ts.map +1 -1
- package/dist/lib/snap.js +2 -5
- package/dist/lib/snap.js.map +1 -1
- package/dist/loops/supervisor.d.ts.map +1 -1
- package/dist/loops/supervisor.js +1 -0
- package/dist/loops/supervisor.js.map +1 -1
- package/dist/loops/transcribe_worker.d.ts.map +1 -1
- package/dist/loops/transcribe_worker.js +0 -1
- package/dist/loops/transcribe_worker.js.map +1 -1
- package/dist/router.js +1 -1
- package/package.json +10 -4
- package/transcriber/app/__init__.py +3 -0
- package/transcriber/app/asr.py +198 -0
- package/transcriber/app/asr_parakeet.py +174 -0
- package/transcriber/app/cue_parser.py +110 -0
- package/transcriber/app/diarization.py +300 -0
- package/transcriber/app/frame_annotation.py +77 -0
- package/transcriber/app/frames.py +130 -0
- package/transcriber/app/health.py +70 -0
- package/transcriber/app/legacy.py +355 -0
- package/transcriber/app/main.py +30 -0
- package/transcriber/app/pipeline.py +204 -0
- package/transcriber/app/pptx_export.py +42 -0
- package/transcriber/app/prosody.py +123 -0
- package/transcriber/app/routes/__init__.py +1 -0
- package/transcriber/app/routes/legacy.py +117 -0
- package/transcriber/app/routes/transcribe.py +133 -0
- package/transcriber/app/shot_change.py +74 -0
- package/transcriber/app/silence_typer.py +144 -0
- package/transcriber/app/transcribe_cli.py +82 -0
- package/transcriber/app/vad.py +145 -0
- package/transcriber/pyproject.toml +56 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""ASR wrapper (#10): Whisper-large-v3 with event-tag tokens via WhisperX.
|
|
2
|
+
|
|
3
|
+
The heavy model (whisperx / faster-whisper / torch) is imported lazily so the
|
|
4
|
+
service, the cue parser, and the test suite all work without the multi-GB
|
|
5
|
+
weights installed. Install the `asr` extra and run inside the OrbStack
|
|
6
|
+
container for real transcription:
|
|
7
|
+
|
|
8
|
+
pip install -e '.[asr]'
|
|
9
|
+
|
|
10
|
+
`transcribe()` returns word-aligned utterances whose text may contain event
|
|
11
|
+
tags; cue_parser.parse_transcript_cues() then lifts those into cues[].
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from functools import lru_cache
|
|
18
|
+
from typing import Any, Protocol
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ASRConfig:
    """Settings that select and tune the ASR engine."""

    # Model identifier handed to the selected engine.
    model_name: str = "large-v3"
    # One of "cpu", "cuda", "mps" or "auto".
    device: str = "auto"
    compute_type: str = "int8"
    # Optional language hint; None leaves the language unspecified.
    language: str | None = None
    event_tags: bool = True
    # "whisper" (WhisperX, Docker/CPU) or "parakeet" (parakeet-mlx, native Metal).
    engine: str = "whisper"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class ASRResult:
    """Outcome of a single transcription run."""

    # Utterance dicts; each instance starts with its own empty list.
    utterances: list[dict[str, Any]] = field(default_factory=list)
    # Language reported for the run, when known.
    language: str | None = None
    # Name of the model recorded for the run.
    model: str = ""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class WhisperBackend(Protocol):
    """Structural interface the ASR wrapper depends on.

    The real WhisperX backend implements it; tests supply a fake.
    """

    def transcribe(self, audio_path: str) -> dict[str, Any]:
        """Transcribe the audio file at ``audio_path`` and return the raw result dict."""
        ...
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def build_whisperx_transcribe_kwargs(language: str | None) -> dict[str, Any]:
    """Return the per-call kwargs for `whisperx.Model.transcribe()` (#180).

    Before the fix the language hint only reached ``whisperx.load_model``, so
    ``transcribe`` re-ran auto-detect per file and a request-level
    ``"language":"en"`` was effectively ignored. Keeping the mapping in a pure
    function lets it be tested without the whisperx weights.

    A falsy ``language`` (``None`` or ``""``) yields no ``language`` key.
    """
    if not language:
        return {"batch_size": 16}
    return {"batch_size": 16, "language": language}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class WhisperXBackend:  # pragma: no cover - needs multi-GB weights
    """Concrete WhisperBackend wrapping `whisperx`.

    Imports `whisperx` and `torch` lazily inside `__init__` so this module
    remains importable in environments without the [asr] extra installed.
    The constructor loads the (multi-GB) model; the `_load_whisperx_backend`
    lru_cache keeps instances from being rebuilt on every request.

    The alignment model is language-specific. It is loaded on first use and
    reloaded whenever a later file resolves to a different language.
    """

    def __init__(self, config: ASRConfig):
        """Load the WhisperX model described by ``config``.

        Raises:
            RuntimeError: if `torch` or `whisperx` cannot be imported.
        """
        try:
            import torch  # type: ignore
            import whisperx  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "whisperx is not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from e

        device = _resolve_device(config.device)
        self._model = whisperx.load_model(
            config.model_name,
            device=device,
            compute_type=config.compute_type,
            language=config.language,
            asr_options={"suppress_numerals": False},
        )
        self._align_model = None
        self._align_metadata = None
        # Language the cached alignment model was loaded for (None = not loaded yet).
        self._align_language = None
        self._device = device
        self._whisperx = whisperx
        self._torch = torch
        # Pre-fix (#180): load_model() received the language hint but
        # model.transcribe() didn't — WhisperX re-ran auto-detect per file
        # ("No language specified, language will be detected ... (increases
        # inference time)"). Hold the configured language on the backend and
        # pass it through on every transcribe call so the hint actually skips
        # the per-file detection step.
        self._language = config.language

    def transcribe(self, audio_path: str) -> dict[str, Any]:
        """Transcribe and word-align ``audio_path``.

        Returns:
            ``{"segments": <aligned segments>, "language": <language code>}``.
        """
        audio = self._whisperx.load_audio(audio_path)
        # Forward the configured language so WhisperX skips per-file auto-detect.
        # When None, behavior is unchanged (auto-detect, then we use the result).
        result = self._model.transcribe(audio, **build_whisperx_transcribe_kwargs(self._language))
        language = result.get("language") or self._language or "en"

        # The alignment model depends on the language. Load it lazily, and reload
        # when this file's language differs from the one the cached model was
        # loaded for; otherwise the first file's model would align every file.
        if self._align_model is None or getattr(self, "_align_language", None) != language:
            self._align_model, self._align_metadata = self._whisperx.load_align_model(
                language_code=language, device=self._device
            )
            self._align_language = language

        aligned = self._whisperx.align(
            result["segments"],
            self._align_model,
            self._align_metadata,
            audio,
            self._device,
            return_char_alignments=False,
        )
        return {"segments": aligned["segments"], "language": language}
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _resolve_device(requested: str) -> str:  # pragma: no cover - env-dependent
    """Turn `auto` into a concrete device name; any other value is returned as-is.

    For `auto` the preference order is CUDA, then MPS, then CPU. CPU is also
    the answer when `torch` cannot be imported.
    """
    if requested == "auto":
        chosen = "cpu"
        try:
            import torch  # type: ignore

            if torch.cuda.is_available():
                chosen = "cuda"
            else:
                # Older torch builds have no `mps` backend attribute at all.
                mps_backend = getattr(torch.backends, "mps", None)
                if mps_backend is not None and torch.backends.mps.is_available():
                    chosen = "mps"
        except ImportError:
            chosen = "cpu"
        return chosen
    return requested
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@lru_cache(maxsize=1)
def _load_whisperx_backend(
    config_key: str, language: str | None = None
) -> WhisperBackend:  # pragma: no cover - needs weights
    """Lazily construct the real WhisperX backend, cached per process.

    Args:
        config_key: ``"model_name:device:compute_type"``.
        language: Optional language hint forwarded to the backend. It is part
            of the cache key, as in the Parakeet loader, so a cached backend is
            never reused with a stale hint. With ``maxsize=1`` a different
            hint therefore rebuilds the backend.

    Returns:
        The cached backend, so the multi-GB model loads once per configuration.
    """
    # config_key encodes (model_name, device, compute_type); reconstruct.
    model_name, device, compute_type = config_key.split(":", 2)
    return WhisperXBackend(
        ASRConfig(
            model_name=model_name,
            device=device,
            compute_type=compute_type,
            # Previously dropped here, so the configured hint never reached
            # the backend on this path.
            language=language,
        )
    )


class Transcriber:
    """Engine-agnostic entry point that turns an audio file into an `ASRResult`."""

    def __init__(self, config: ASRConfig | None = None, backend: WhisperBackend | None = None):
        """Store the configuration and an optional pre-built backend.

        Args:
            config: ASR settings; defaults to ``ASRConfig()``.
            backend: Backend to use instead of the lazily loaded real one
                (tests pass a fake here).
        """
        self.config = config or ASRConfig()
        self._backend = backend

    def _get_backend(self) -> WhisperBackend:
        """Return the injected backend, or load the one selected by ``config.engine``."""
        if self._backend is not None:
            return self._backend
        if self.config.engine == "parakeet":
            # Imported lazily: the Parakeet module is only needed on this path.
            from .asr_parakeet import _load_parakeet_backend, resolve_parakeet_model

            return _load_parakeet_backend(
                resolve_parakeet_model(self.config.model_name), self.config.language
            )
        key = f"{self.config.model_name}:{self.config.device}:{self.config.compute_type}"
        # Pass the language hint alongside the key so it reaches the backend.
        return _load_whisperx_backend(key, self.config.language)

    def transcribe(self, audio_path: str) -> ASRResult:
        """Transcribe ``audio_path`` and normalize the backend output.

        Returns:
            An `ASRResult` holding the normalized utterances, the language the
            backend reported (else the configured hint), and the configured
            model name.
        """
        raw = self._get_backend().transcribe(audio_path)
        utterances = _normalize_segments(raw.get("segments") or [])
        return ASRResult(
            utterances=utterances,
            # `or` (not a .get default) so an explicit None from the backend
            # still falls back to the configured hint.
            language=raw.get("language") or self.config.language,
            model=self.config.model_name,
        )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _normalize_segments(segments: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert backend segments into compost utterance dicts (pre-diarization,
    pre-cue-extraction). Speaker ids are filled by the diarizer (#11).

    Args:
        segments: Backend segments with ``start``/``end`` in seconds, plus
            optional ``text``, ``speaker`` and ``words``.

    Returns:
        One utterance dict per segment, numbered from 1, with times in integer
        milliseconds. Words lacking ``start`` or ``end`` are dropped.
    """

    def to_ms(seconds: float) -> int:
        # Round rather than truncate: 1.005 * 1000 is 1004.9999999999999 in
        # binary floating point, which int() alone would cut down to 1004.
        return int(round(seconds * 1000))

    out: list[dict[str, Any]] = []
    for i, seg in enumerate(segments, start=1):
        words = [
            {"w": w["word"], "s": to_ms(w["start"]), "e": to_ms(w["end"]), "conf": w.get("score", 1.0)}
            for w in (seg.get("words") or [])
            if "start" in w and "end" in w
        ]
        out.append(
            {
                "id": f"U-{i:04d}",
                "speaker_id": seg.get("speaker", "S?"),
                "turn": i,
                "start_ms": to_ms(seg["start"]),
                "end_ms": to_ms(seg["end"]),
                # `or ""` tolerates a backend that sets text to None.
                "text": (seg.get("text") or "").strip(),
                "words": words,
            }
        )
    return out
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Parakeet (NVIDIA NeMo TDT) ASR backend via `parakeet-mlx` — the native
|
|
2
|
+
Apple-Silicon (Metal) transcription path (#176).
|
|
3
|
+
|
|
4
|
+
Conforms to the `WhisperBackend` Protocol in `asr.py`: `transcribe()` returns
|
|
5
|
+
`{"segments": [...], "language": ...}` with per-word timestamps, so the rest of
|
|
6
|
+
the pipeline (diarization, cue parsing, silence typing, prosody) is
|
|
7
|
+
engine-agnostic and unchanged.
|
|
8
|
+
|
|
9
|
+
`parakeet-mlx` requires Apple Silicon + MLX and is imported lazily, so this
|
|
10
|
+
module stays importable (and the pure mapping helpers stay unit-testable)
|
|
11
|
+
without the hardware or the ~2.5 GB weights. The heavy backend itself is marked
|
|
12
|
+
`# pragma: no cover`, exactly like `WhisperXBackend`.
|
|
13
|
+
|
|
14
|
+
Why Parakeet-TDT 0.6B v3 by default: it tops the Open ASR Leaderboard's
|
|
15
|
+
convenient (local, Apple-Silicon, word-timestamped) tier — measured ~58.8x
|
|
16
|
+
realtime on an M1 Max vs ~1.3x for WhisperX in the CPU container — with native
|
|
17
|
+
frame-level word timestamps and 25-language (incl. Spanish) coverage.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from functools import lru_cache
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
from .asr import ASRConfig, WhisperBackend
|
|
27
|
+
|
|
28
|
+
# A tiny EN-vs-ES language heuristic for the native Parakeet path (#190).
|
|
29
|
+
# parakeet-mlx auto-detects internally but doesn't surface the detection in its
|
|
30
|
+
# AlignedResult, so without a `--language` hint the transcript previously
|
|
31
|
+
# recorded `language: "und"` (via `_detect_language`'s fallback). For a turnkey
|
|
32
|
+
# zero-config run we'd rather record something sensible. Counts function-word
|
|
33
|
+
# hits in the transcribed text; biased toward English when the signal is weak
|
|
34
|
+
# (the v3 model is English-first multilingual).
|
|
35
|
+
# Spanish function words counted as evidence for "es".
_ES_HEURISTIC_TOKENS = frozenset(
    "que de la los las el en es no una por con para del como pero más".split()
)
# English function words counted as evidence for "en".
_EN_HEURISTIC_TOKENS = frozenset(
    "the and of to in is for with on that this you are was but they".split()
)
_WORD_RE = re.compile(r"[a-zA-Záéíóúñü]+")


def guess_lang_from_text(text: str) -> str:
    """Guess ``"en"`` or ``"es"`` from transcribed text by counting function words.

    Used on the Parakeet path when neither the model nor a `--language` hint
    reveals the language. The answer is ``"es"`` only when the Spanish hits
    exceed 1.2x the English hits (the English count is floored at 1). Empty
    text or a weak signal yields ``"en"`` — never ``"und"`` (#190).
    """
    words = _WORD_RE.findall(text.lower()) if text else []
    es_hits = len([word for word in words if word in _ES_HEURISTIC_TOKENS])
    en_hits = len([word for word in words if word in _EN_HEURISTIC_TOKENS])
    # Spanish needs a clear margin; everything else defaults to English
    # because the v3 model is English-first.
    return "es" if es_hits > max(en_hits, 1) * 1.2 else "en"
|
|
66
|
+
|
|
67
|
+
# Default checkpoint: the multilingual v3 model (English plus 24 European
# languages, Spanish included). The English-only v2 (`...-0.6b-v2`) has
# marginally better English WER.
DEFAULT_PARAKEET_MODEL = "mlx-community/parakeet-tdt-0.6b-v3"

# Unless chunked, parakeet-mlx loads the whole file into a single Metal buffer:
# a 1-hour interview tries to allocate ~131 GB, far past Metal's ~20 GB cap.
# Two-minute chunks (stitched by parakeet via its default 15 s overlap + token
# timestamps) let arbitrarily long audio fit in memory. Tunable via env.
DEFAULT_CHUNK_DURATION_S = 120.0
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def tokens_to_words(tokens: list[Any]) -> list[dict[str, Any]]:
    """Merge parakeet sub-word tokens into words.

    parakeet emits sub-word tokens where a word boundary is marked by a leading
    space (e.g. ``[" If", ... ," we"," me","as"]`` → ``["If","we","measure"]``).
    A token whose text starts with a space (or the first token) begins a new
    word; the rest extend the current one. Timestamps are in **seconds** (the
    pipeline's `_normalize_segments` converts to ms).

    Args:
        tokens: Objects exposing ``start`` and ``end`` plus optional ``text``
            and ``confidence`` attributes.

    Returns:
        ``{"word", "start", "end", "score"}`` dicts. ``score`` is the
        confidence of the word's first token, or 1.0 when that token has none.
        Words that are empty after stripping are dropped.
    """
    words: list[dict[str, Any]] = []
    cur: dict[str, Any] | None = None
    for t in tokens:
        # `or ""` guards against a token whose text is None.
        txt = getattr(t, "text", "") or ""
        if cur is None or txt.startswith(" "):
            if cur is not None:
                words.append(cur)
            confidence = getattr(t, "confidence", None)
            cur = {
                "word": txt,
                "start": float(t.start),
                "end": float(t.end),
                # Default only when confidence is missing; `conf or 1.0` would
                # turn a genuine 0.0 into full confidence.
                "score": 1.0 if confidence is None else float(confidence),
            }
        else:
            cur["word"] += txt
            cur["end"] = float(t.end)
    if cur is not None:
        words.append(cur)
    for w in words:
        w["word"] = w["word"].strip()
    return [w for w in words if w["word"]]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def result_to_segments(result: Any) -> list[dict[str, Any]]:
    """Translate a parakeet-mlx ``AlignedResult`` into `WhisperBackend` segments.

    Each sentence becomes ``{start, end, text, words}`` with times in seconds;
    its tokens are merged into words by `tokens_to_words`. A result with a
    missing or empty ``sentences`` attribute yields an empty list.
    """
    sentences = getattr(result, "sentences", None) or []
    return [
        {
            "start": float(sentence.start),
            "end": float(sentence.end),
            "text": (getattr(sentence, "text", "") or "").strip(),
            "words": tokens_to_words(getattr(sentence, "tokens", None) or []),
        }
        for sentence in sentences
    ]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def resolve_parakeet_model(model_name: str | None) -> str:
    """Pick the parakeet model id to load.

    An id containing ``"parakeet"`` is returned unchanged. Anything else —
    including the whisper-style ASRConfig default, ``None`` or an empty
    string — resolves to `DEFAULT_PARAKEET_MODEL`.
    """
    is_parakeet_id = bool(model_name) and "parakeet" in model_name
    return model_name if is_parakeet_id else DEFAULT_PARAKEET_MODEL
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class ParakeetMLXBackend:  # pragma: no cover - needs MLX + weights
    """`WhisperBackend` implementation backed by `parakeet-mlx` (Apple Silicon / Metal)."""

    def __init__(self, config: ASRConfig):
        """Load the parakeet model and read the chunk size.

        The chunk duration comes from the ``COMPOST_PARAKEET_CHUNK_S``
        environment variable, defaulting to `DEFAULT_CHUNK_DURATION_S`.

        Raises:
            RuntimeError: if `parakeet_mlx` cannot be imported.
        """
        import os

        try:
            import parakeet_mlx  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "parakeet-mlx is not installed (native Apple-Silicon ASR). "
                "Install it in the native transcriber venv: pip install parakeet-mlx"
            ) from e
        model_id = resolve_parakeet_model(config.model_name)
        self._model = parakeet_mlx.from_pretrained(model_id)
        self._language = config.language
        chunk_setting = os.environ.get("COMPOST_PARAKEET_CHUNK_S", DEFAULT_CHUNK_DURATION_S)
        self._chunk_s = float(chunk_setting)

    def transcribe(self, audio_path: str) -> dict[str, Any]:
        """Transcribe ``audio_path`` and return ``{"segments", "language"}``.

        The language is resolved in priority order (#190):
          1. whatever parakeet-mlx surfaces on the result (the API does not
             expose it today, but a future release might);
          2. the configured `--language` hint;
          3. a small EN/ES heuristic over the transcribed text, so the
             zero-config path does not record "und".
        """
        # chunk_duration keeps long files within Metal's buffer cap.
        aligned = self._model.transcribe(audio_path, chunk_duration=self._chunk_s)
        segments = result_to_segments(aligned)

        language = getattr(aligned, "language", None)
        if not language:
            language = self._language
        if not language:
            joined = " ".join((seg.get("text") or "") for seg in segments)
            language = guess_lang_from_text(joined)
        return {"segments": segments, "language": language}
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@lru_cache(maxsize=1)
def _load_parakeet_backend(model_id: str, language: str | None) -> WhisperBackend:  # pragma: no cover
    """Build the Parakeet backend for ``(model_id, language)``.

    The lru_cache keeps the most recent backend, so repeated calls with the
    same arguments reuse one loaded model.
    """
    config = ASRConfig(model_name=model_id, language=language, engine="parakeet")
    return ParakeetMLXBackend(config)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Cue parser (#10).
|
|
2
|
+
|
|
3
|
+
Whisper-large-v3 with event-tag tokens emits inline markers like [laughter],
|
|
4
|
+
[sigh], [cough], [clear_throat], [unintelligible], and code-switching markers.
|
|
5
|
+
This module pulls those out of utterance text into structured cues[] entries
|
|
6
|
+
(schema/cues.taxonomy.json) and returns the cleaned text.
|
|
7
|
+
|
|
8
|
+
Pure and deterministic — no model. The ASR wrapper (asr.py) produces the
|
|
9
|
+
tagged text; this turns it into cues.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
# Whisper/Whisper-AT event tag → compost cue kind (cues.taxonomy.json).
|
|
18
|
+
TAG_TO_KIND: dict[str, str] = {
    "laughter": "laughter",
    "laugh": "laughter",
    "laughs": "laughter",
    "sigh": "sigh",
    "sighs": "sigh",
    "cough": "cough",
    "coughs": "cough",
    "clear_throat": "throat-clear",
    "throat_clear": "throat-clear",
    "throat-clear": "throat-clear",
    "unintelligible": "unintelligible",
    "inaudible": "unintelligible",
    "code_switch": "code-switching",
    "code-switch": "code-switching",
    "code_switching": "code-switching",
}

# Default confidence assigned to a tag-derived cue when the ASR gives none.
DEFAULT_CONFIDENCE = 0.8

# A bracketed tag made of letters, underscores and hyphens, e.g. "[laughter]".
_TAG_RE = re.compile(r"\[([a-zA-Z_\-]+)\]")


def _clean_text(text: str) -> str:
    """Remove recognized event tags from ``text`` and tidy the whitespace.

    Bracketed tokens whose name is not in `TAG_TO_KIND` are left in place.
    Runs of two or more whitespace characters collapse to one space and the
    result is stripped.
    """

    def drop_known(match: re.Match[str]) -> str:
        return "" if match.group(1).lower() in TAG_TO_KIND else match.group(0)

    without_tags = _TAG_RE.sub(drop_known, text)
    return re.sub(r"\s{2,}", " ", without_tags).strip()


def parse_cues_from_utterance(
    utterance: dict[str, Any],
    next_cue_index: int = 1,
    confidence: float = DEFAULT_CONFIDENCE,
) -> tuple[str, list[dict[str, Any]]]:
    """Return (cleaned_text, cues) for one utterance.

    Cue timing: if a word in `words[]` matches the tag, use that word's span;
    otherwise fall back to the utterance span. When the same tag occurs more
    than once, each occurrence takes the next unused matching word, so
    repeated tags no longer all receive the first word's span.

    Args:
        utterance: Utterance dict; reads ``text``, ``words``, ``speaker_id``
            and, for the timing fallback, ``start_ms``/``end_ms``.
        next_cue_index: Number used for the first cue id (``CUE-NNN``).
        confidence: Confidence recorded on every cue produced here.

    Returns:
        The text with recognized tags removed, and one cue dict per
        recognized tag in order of appearance.
    """
    # `or` fallbacks tolerate keys that are present but set to None.
    text = utterance.get("text") or ""
    words = utterance.get("words") or []
    speaker_id = utterance.get("speaker_id")
    cues: list[dict[str, Any]] = []
    # Positions in `words` already assigned to an earlier cue of this utterance.
    consumed: set[int] = set()
    idx = next_cue_index

    for m in _TAG_RE.finditer(text):
        kind = TAG_TO_KIND.get(m.group(1).lower())
        if kind is None:
            # Unknown bracketed token: not a cue, and _clean_text keeps it.
            continue
        start_ms, end_ms = _tag_span(m.group(0), words, utterance, consumed)
        cue: dict[str, Any] = {
            "id": f"CUE-{idx:03d}",
            "kind": kind,
            "start_ms": start_ms,
            "end_ms": end_ms,
            "source": "audio",
            "confidence": confidence,
        }
        if speaker_id is not None:
            cue["speaker_id"] = speaker_id
        cues.append(cue)
        idx += 1

    return _clean_text(text), cues


def _tag_span(
    tag_token: str,
    words: list[dict[str, Any]],
    utterance: dict[str, Any],
    consumed: set[int] | None = None,
) -> tuple[int, int]:
    """Return the ``(start_ms, end_ms)`` span for one tag occurrence.

    Args:
        tag_token: The tag exactly as it appears in the text, e.g. ``"[sigh]"``.
        words: Word dicts with ``w``/``s``/``e`` keys.
        utterance: Supplies the ``start_ms``/``end_ms`` fallback span.
        consumed: Optional set of word positions already used. When given,
            those positions are skipped and the chosen one is added to it.
            When omitted, the first matching word is always returned.

    Returns:
        The span of the matching word, or the utterance span if none matches.
    """
    for position, w in enumerate(words):
        if w.get("w") != tag_token:
            continue
        if consumed is not None:
            if position in consumed:
                continue
            consumed.add(position)
        return int(w["s"]), int(w["e"])
    return int(utterance["start_ms"]), int(utterance["end_ms"])
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def parse_transcript_cues(transcript: dict[str, Any]) -> dict[str, Any]:
    """Move event tags out of every utterance's text and into ``cues[]``.

    New cue ids are numbered from ``len(cues) + 1``, so they continue after
    any cues already present. The transcript is modified in place (a
    ``cues`` list is created if missing) and the same object is returned.
    """
    all_cues = transcript.setdefault("cues", [])
    for utterance in transcript.get("utterances", []):
        # The next free cue number is always one past the current cue count.
        stripped_text, found = parse_cues_from_utterance(
            utterance, next_cue_index=len(all_cues) + 1
        )
        utterance["text"] = stripped_text
        all_cues.extend(found)
    return transcript
|