@they-juanreina/compost-cli 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/dist/lib/blame.d.ts.map +1 -1
  2. package/dist/lib/blame.js +3 -2
  3. package/dist/lib/blame.js.map +1 -1
  4. package/dist/lib/journal.d.ts.map +1 -1
  5. package/dist/lib/journal.js +9 -0
  6. package/dist/lib/journal.js.map +1 -1
  7. package/dist/lib/migrate.d.ts.map +1 -1
  8. package/dist/lib/migrate.js +1 -0
  9. package/dist/lib/migrate.js.map +1 -1
  10. package/dist/lib/nativeRuntime.d.ts +6 -3
  11. package/dist/lib/nativeRuntime.d.ts.map +1 -1
  12. package/dist/lib/nativeRuntime.js +6 -3
  13. package/dist/lib/nativeRuntime.js.map +1 -1
  14. package/dist/lib/retrieve.d.ts.map +1 -1
  15. package/dist/lib/retrieve.js +0 -8
  16. package/dist/lib/retrieve.js.map +1 -1
  17. package/dist/lib/seedResolve.d.ts.map +1 -1
  18. package/dist/lib/seedResolve.js +1 -0
  19. package/dist/lib/seedResolve.js.map +1 -1
  20. package/dist/lib/setup.d.ts.map +1 -1
  21. package/dist/lib/setup.js +9 -8
  22. package/dist/lib/setup.js.map +1 -1
  23. package/dist/lib/snap.d.ts.map +1 -1
  24. package/dist/lib/snap.js +2 -5
  25. package/dist/lib/snap.js.map +1 -1
  26. package/dist/loops/supervisor.d.ts.map +1 -1
  27. package/dist/loops/supervisor.js +1 -0
  28. package/dist/loops/supervisor.js.map +1 -1
  29. package/dist/loops/transcribe_worker.d.ts.map +1 -1
  30. package/dist/loops/transcribe_worker.js +0 -1
  31. package/dist/loops/transcribe_worker.js.map +1 -1
  32. package/dist/router.js +1 -1
  33. package/package.json +10 -4
  34. package/transcriber/app/__init__.py +3 -0
  35. package/transcriber/app/asr.py +198 -0
  36. package/transcriber/app/asr_parakeet.py +174 -0
  37. package/transcriber/app/cue_parser.py +110 -0
  38. package/transcriber/app/diarization.py +300 -0
  39. package/transcriber/app/frame_annotation.py +77 -0
  40. package/transcriber/app/frames.py +130 -0
  41. package/transcriber/app/health.py +70 -0
  42. package/transcriber/app/legacy.py +355 -0
  43. package/transcriber/app/main.py +30 -0
  44. package/transcriber/app/pipeline.py +204 -0
  45. package/transcriber/app/pptx_export.py +42 -0
  46. package/transcriber/app/prosody.py +123 -0
  47. package/transcriber/app/routes/__init__.py +1 -0
  48. package/transcriber/app/routes/legacy.py +117 -0
  49. package/transcriber/app/routes/transcribe.py +133 -0
  50. package/transcriber/app/shot_change.py +74 -0
  51. package/transcriber/app/silence_typer.py +144 -0
  52. package/transcriber/app/transcribe_cli.py +82 -0
  53. package/transcriber/app/vad.py +145 -0
  54. package/transcriber/pyproject.toml +56 -0
@@ -0,0 +1,198 @@
1
+ """ASR wrapper (#10): Whisper-large-v3 with event-tag tokens via WhisperX.
2
+
3
+ The heavy model (whisperx / faster-whisper / torch) is imported lazily so the
4
+ service, the cue parser, and the test suite all work without the multi-GB
5
+ weights installed. Install the `asr` extra and run inside the OrbStack
6
+ container for real transcription:
7
+
8
+ pip install -e '.[asr]'
9
+
10
+ `transcribe()` returns word-aligned utterances whose text may contain event
11
+ tags; cue_parser.parse_transcript_cues() then lifts those into cues[].
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass, field
17
+ from functools import lru_cache
18
+ from typing import Any, Protocol
19
+
20
+
21
@dataclass
class ASRConfig:
    """Settings that select and configure the ASR engine."""

    # Model identifier handed to the selected engine.
    model_name: str = "large-v3"
    # "cpu" | "cuda" | "mps" | "auto"
    device: str = "auto"
    compute_type: str = "int8"
    # Optional language hint; None leaves detection to the backend.
    language: str | None = None
    event_tags: bool = True
    # "whisper" (WhisperX, Docker/CPU) | "parakeet" (parakeet-mlx, native Metal)
    engine: str = "whisper"
29
+
30
+
31
@dataclass
class ASRResult:
    """Outcome of one transcription call."""

    # Normalized utterance dicts; each instance gets its own fresh list.
    utterances: list[dict[str, Any]] = field(default_factory=list)
    # Language reported for the audio, when known.
    language: str | None = None
    # Name of the model that produced the result.
    model: str = ""
36
+
37
+
38
class WhisperBackend(Protocol):
    """Structural interface the ASR wrapper relies on.

    The real WhisperX backend implements it; tests supply a fake.
    """

    def transcribe(self, audio_path: str) -> dict[str, Any]:
        """Transcribe the audio file at `audio_path` and return the raw result dict."""
        ...
43
+
44
+
45
def build_whisperx_transcribe_kwargs(language: str | None) -> dict[str, Any]:
    """Assemble the per-call keyword arguments for `whisperx.Model.transcribe()`.

    The configured language hint has to reach ``transcribe`` itself, not only
    ``whisperx.load_model`` (#180): before that fix ``transcribe`` re-ran
    auto-detect per file, so a request-level ``"language":"en"`` was
    effectively ignored. Kept as a pure mapping so it can be tested without
    the whisperx weights.

    A falsy `language` (``None`` or ``""``) leaves the ``language`` key out.
    """
    base: dict[str, Any] = {"batch_size": 16}
    return {**base, "language": language} if language else base
57
+
58
+
59
class WhisperXBackend:  # pragma: no cover - needs multi-GB weights
    """Concrete WhisperBackend wrapping `whisperx`.

    `whisperx` and `torch` are imported lazily inside `__init__` so this module
    remains importable in environments without the [asr] extra installed.
    Constructing an instance loads the multi-GB model, so callers should obtain
    instances through the cached `_load_whisperx_backend` factory instead of
    building one per request.
    """

    def __init__(self, config: ASRConfig):
        """Load the WhisperX model described by `config`.

        Raises:
            RuntimeError: if `whisperx` / `torch` cannot be imported.
        """
        try:
            import torch  # type: ignore
            import whisperx  # type: ignore
        except ImportError as e:
            raise RuntimeError(
                "whisperx is not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from e

        device = _resolve_device(config.device)
        self._model = whisperx.load_model(
            config.model_name,
            device=device,
            compute_type=config.compute_type,
            language=config.language,
            asr_options={"suppress_numerals": False},
        )
        # The alignment model is language-specific and loaded on demand in
        # transcribe(); remember which language the loaded one belongs to.
        self._align_model = None
        self._align_metadata = None
        self._align_language: str | None = None
        self._device = device
        self._whisperx = whisperx
        self._torch = torch
        # Pre-fix (#180): load_model() received the language hint but
        # model.transcribe() didn't — WhisperX re-ran auto-detect per file
        # ("No language specified, language will be detected ... (increases
        # inference time)"). Hold the configured language on the backend and
        # pass it through on every transcribe call so the hint actually skips
        # the per-file detection step.
        self._language = config.language

    def transcribe(self, audio_path: str) -> dict[str, Any]:
        """Transcribe and word-align one audio file.

        Returns:
            ``{"segments": <aligned segments>, "language": <language code>}``.
        """
        audio = self._whisperx.load_audio(audio_path)
        # Forward the configured language so WhisperX skips per-file auto-detect.
        # When None, behavior is unchanged (auto-detect, then we use the result).
        result = self._model.transcribe(audio, **build_whisperx_transcribe_kwargs(self._language))
        language = result.get("language") or self._language or "en"

        # Load the alignment model on first use, and reload it whenever the
        # language differs from the one it was loaded for. Without the reload,
        # a later file in another language (auto-detect path) would be aligned
        # with the first file's model.
        if self._align_model is None or self._align_language != language:
            self._align_model, self._align_metadata = self._whisperx.load_align_model(
                language_code=language, device=self._device
            )
            self._align_language = language

        aligned = self._whisperx.align(
            result["segments"],
            self._align_model,
            self._align_metadata,
            audio,
            self._device,
            return_char_alignments=False,
        )
        return {"segments": aligned["segments"], "language": language}
121
+
122
+
123
def _resolve_device(requested: str) -> str:  # pragma: no cover - env-dependent
    """Turn `auto` into a concrete device name; any other value is returned as-is.

    With `auto`, CUDA is preferred, then MPS, and `cpu` is the fallback,
    including when `torch` cannot be imported.
    """
    if requested != "auto":
        return requested

    chosen = "cpu"
    try:
        import torch  # type: ignore

        if torch.cuda.is_available():
            chosen = "cuda"
        else:
            # Older torch builds have no `backends.mps` attribute at all.
            mps = getattr(torch.backends, "mps", None)
            if mps is not None and mps.is_available():
                chosen = "mps"
    except ImportError:
        pass
    return chosen
137
+
138
+
139
@lru_cache(maxsize=1)
def _load_whisperx_backend(
    config_key: str, language: str | None = None
) -> WhisperBackend:  # pragma: no cover - needs weights
    """Lazily construct the real WhisperX backend, cached per process.

    Args:
        config_key: ``"model_name:device:compute_type"``.
        language: Optional language hint forwarded to the backend. It is part
            of the cache key, so a different hint builds a fresh backend
            instead of silently reusing one created without it.

    The cache holds a single entry (``maxsize=1``): repeating the same
    arguments reuses the loaded multi-GB model, changing them replaces it.
    """
    # config_key encodes (model_name, device, compute_type); reconstruct.
    model_name, device, compute_type = config_key.split(":", 2)
    return WhisperXBackend(
        ASRConfig(
            model_name=model_name,
            device=device,
            compute_type=compute_type,
            language=language,
        )
    )


class Transcriber:
    """Engine-agnostic entry point: picks a backend and normalizes its output."""

    def __init__(self, config: ASRConfig | None = None, backend: WhisperBackend | None = None):
        """Store the configuration and an optional pre-built backend.

        Args:
            config: ASR settings; a default `ASRConfig` is used when omitted.
            backend: Backend to use instead of loading a real one (tests pass
                a fake here).
        """
        self.config = config or ASRConfig()
        self._backend = backend

    def _get_backend(self) -> WhisperBackend:
        """Return the injected backend, or load the one selected by `config.engine`."""
        if self._backend is not None:
            return self._backend
        if self.config.engine == "parakeet":
            from .asr_parakeet import _load_parakeet_backend, resolve_parakeet_model

            return _load_parakeet_backend(
                resolve_parakeet_model(self.config.model_name), self.config.language
            )
        key = f"{self.config.model_name}:{self.config.device}:{self.config.compute_type}"
        # Pass the language hint along: the key alone does not carry it, and
        # without it the WhisperX backend would always fall back to auto-detect.
        return _load_whisperx_backend(key, self.config.language)

    def transcribe(self, audio_path: str) -> ASRResult:
        """Run the backend on `audio_path` and wrap the normalized utterances."""
        raw = self._get_backend().transcribe(audio_path)
        utterances = _normalize_segments(raw.get("segments", []))
        return ASRResult(
            utterances=utterances,
            language=raw.get("language", self.config.language),
            model=self.config.model_name,
        )
175
+
176
+
177
def _normalize_segments(segments: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert backend segments into compost utterance dicts (pre-diarization,
    pre-cue-extraction). Speaker ids are filled by the diarizer (#11).

    Backend timestamps are seconds; the output uses integer milliseconds,
    rounded to the nearest millisecond. Words lacking a ``start`` or ``end``
    key are skipped. A missing or ``None`` ``words`` / ``text`` is treated as
    empty.

    Raises:
        KeyError: if a segment has no ``start`` or ``end``.
    """

    def _to_ms(seconds: Any) -> int:
        # Round rather than truncate: 1.001 * 1000 is 1000.9999999999999 in
        # binary floating point, which int() alone would turn into 1000.
        return int(round(float(seconds) * 1000))

    out: list[dict[str, Any]] = []
    for i, seg in enumerate(segments, start=1):
        words = [
            {
                "w": w["word"],
                "s": _to_ms(w["start"]),
                "e": _to_ms(w["end"]),
                "conf": w.get("score", 1.0),
            }
            for w in seg.get("words") or []
            if "start" in w and "end" in w
        ]
        out.append(
            {
                "id": f"U-{i:04d}",
                "speaker_id": seg.get("speaker", "S?"),
                "turn": i,
                "start_ms": _to_ms(seg["start"]),
                "end_ms": _to_ms(seg["end"]),
                "text": (seg.get("text") or "").strip(),
                "words": words,
            }
        )
    return out
@@ -0,0 +1,174 @@
1
+ """Parakeet (NVIDIA NeMo TDT) ASR backend via `parakeet-mlx` — the native
2
+ Apple-Silicon (Metal) transcription path (#176).
3
+
4
+ Conforms to the `WhisperBackend` Protocol in `asr.py`: `transcribe()` returns
5
+ `{"segments": [...], "language": ...}` with per-word timestamps, so the rest of
6
+ the pipeline (diarization, cue parsing, silence typing, prosody) is
7
+ engine-agnostic and unchanged.
8
+
9
+ `parakeet-mlx` requires Apple Silicon + MLX and is imported lazily, so this
10
+ module stays importable (and the pure mapping helpers stay unit-testable)
11
+ without the hardware or the ~2.5 GB weights. The heavy backend itself is marked
12
+ `# pragma: no cover`, exactly like `WhisperXBackend`.
13
+
14
+ Why Parakeet-TDT 0.6B v3 by default: it tops the Open ASR Leaderboard's
15
+ convenient (local, Apple-Silicon, word-timestamped) tier — measured ~58.8x
16
+ realtime on an M1 Max vs ~1.3x for WhisperX in the CPU container — with native
17
+ frame-level word timestamps and 25-language (incl. Spanish) coverage.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import re
23
+ from functools import lru_cache
24
+ from typing import Any
25
+
26
+ from .asr import ASRConfig, WhisperBackend
27
+
28
+ # A tiny EN-vs-ES language heuristic for the native Parakeet path (#190).
29
+ # parakeet-mlx auto-detects internally but doesn't surface the detection in its
30
+ # AlignedResult, so without a `--language` hint the transcript previously
31
+ # recorded `language: "und"` (via `_detect_language`'s fallback). For a turnkey
32
+ # zero-config run we'd rather record something sensible. Counts function-word
33
+ # hits in the transcribed text; biased toward English when the signal is weak
34
+ # (the v3 model is English-first multilingual).
35
_ES_HEURISTIC_TOKENS = frozenset(
    {
        "que", "de", "la", "los", "las", "el", "en", "es", "no",
        "una", "por", "con", "para", "del", "como", "pero", "más",
    }
)
_EN_HEURISTIC_TOKENS = frozenset(
    {
        "the", "and", "of", "to", "in", "is", "for", "with",
        "on", "that", "this", "you", "are", "was", "but", "they",
    }
)
_WORD_RE = re.compile(r"[a-zA-Záéíóúñü]+")


def guess_lang_from_text(text: str) -> str:
    """Guess ``"en"`` or ``"es"`` from function-word counts in `text`.

    Used on the Parakeet path when neither the model nor a `--language` hint
    reveals the language. Empty text, text without any word characters, and
    weak signals all resolve to ``"en"``; ``"und"`` is never returned (#190).
    """
    words = _WORD_RE.findall(text.lower()) if text else []
    es_hits = len([w for w in words if w in _ES_HEURISTIC_TOKENS])
    en_hits = len([w for w in words if w in _EN_HEURISTIC_TOKENS])
    # Spanish must win by a clear margin; anything less stays English
    # (v3 is EN-first).
    return "es" if es_hits > max(en_hits, 1) * 1.2 else "en"
66
+
67
+ # Multilingual v3 (English + 24 European languages incl. Spanish) is the default;
68
+ # v2 (`...-0.6b-v2`) is English-only with marginally better English WER.
69
+ DEFAULT_PARAKEET_MODEL = "mlx-community/parakeet-tdt-0.6b-v3"
70
+
71
+ # parakeet-mlx loads the whole file into a single Metal buffer unless chunked: a
72
+ # 1-hour interview tries to allocate ~131 GB and blows past Metal's ~20 GB cap.
73
+ # Chunk at 2 minutes (parakeet stitches chunks via its default 15 s overlap +
74
+ # token timestamps) so arbitrarily long audio fits in memory. Tunable via the
+ # COMPOST_PARAKEET_CHUNK_S environment variable (seconds).
75
+ DEFAULT_CHUNK_DURATION_S = 120.0
76
+
77
+
78
def tokens_to_words(tokens: list[Any]) -> list[dict[str, Any]]:
    """Merge parakeet sub-word tokens into words.

    parakeet emits sub-word tokens where a word boundary is marked by a leading
    space (e.g. ``[" If", ... ," we"," me","as"]`` → ``["If","we","measure"]``).
    A token whose text starts with a space (or the first token) begins a new
    word; the rest extend the current one. Timestamps are in **seconds** (the
    pipeline's `_normalize_segments` converts to ms).

    Returns:
        ``{"word", "start", "end", "score"}`` dicts. ``score`` is the
        confidence of the token that opened the word, or 1.0 when that token
        has none. Words that are empty after stripping are dropped.
    """
    words: list[dict[str, Any]] = []
    cur: dict[str, Any] | None = None
    for t in tokens:
        txt = getattr(t, "text", "") or ""
        if cur is None or txt.startswith(" "):
            if cur is not None:
                words.append(cur)
            # Default only when the confidence is absent: `confidence or 1.0`
            # would also turn a genuine 0.0 into full confidence.
            confidence = getattr(t, "confidence", None)
            cur = {
                "word": txt,
                "start": float(t.start),
                "end": float(t.end),
                "score": 1.0 if confidence is None else float(confidence),
            }
        else:
            cur["word"] += txt
            cur["end"] = float(t.end)
    if cur is not None:
        words.append(cur)
    for w in words:
        w["word"] = w["word"].strip()
    return [w for w in words if w["word"]]
108
+
109
+
110
def result_to_segments(result: Any) -> list[dict[str, Any]]:
    """Translate a parakeet-mlx ``AlignedResult`` (sentences → tokens) into the
    `WhisperBackend` segment shape: ``{start, end, text, words}`` (seconds).

    A result with no ``sentences`` attribute (or a falsy one) yields ``[]``.
    """
    sentences = getattr(result, "sentences", None) or []
    return [
        {
            "start": float(sentence.start),
            "end": float(sentence.end),
            "text": (getattr(sentence, "text", "") or "").strip(),
            "words": tokens_to_words(getattr(sentence, "tokens", None) or []),
        }
        for sentence in sentences
    ]
124
+
125
+
126
def resolve_parakeet_model(model_name: str | None) -> str:
    """Pick the parakeet model id for `model_name`.

    A name containing ``"parakeet"`` is an explicit parakeet id and passes
    through; anything else (a whisper-style name such as the ASRConfig
    default, an empty string, or ``None``) selects the Parakeet default.
    """
    is_parakeet_id = bool(model_name) and "parakeet" in model_name
    return model_name if is_parakeet_id else DEFAULT_PARAKEET_MODEL
132
+
133
+
134
class ParakeetMLXBackend:  # pragma: no cover - needs MLX + weights
    """Concrete `WhisperBackend` wrapping `parakeet-mlx` (Apple Silicon / Metal)."""

    def __init__(self, config: ASRConfig):
        """Load the parakeet model and read the chunk size.

        The chunk duration comes from the ``COMPOST_PARAKEET_CHUNK_S``
        environment variable, falling back to `DEFAULT_CHUNK_DURATION_S`.

        Raises:
            RuntimeError: if `parakeet_mlx` cannot be imported.
        """
        import os

        try:
            import parakeet_mlx  # type: ignore
        except ImportError as exc:
            raise RuntimeError(
                "parakeet-mlx is not installed (native Apple-Silicon ASR). "
                "Install it in the native transcriber venv: pip install parakeet-mlx"
            ) from exc

        model_id = resolve_parakeet_model(config.model_name)
        self._model = parakeet_mlx.from_pretrained(model_id)
        self._language = config.language
        chunk_setting = os.environ.get("COMPOST_PARAKEET_CHUNK_S", DEFAULT_CHUNK_DURATION_S)
        self._chunk_s = float(chunk_setting)

    def transcribe(self, audio_path: str) -> dict[str, Any]:
        """Transcribe `audio_path` and return ``{"segments", "language"}``.

        Language resolution priority (#190):
          1. Whatever parakeet-mlx surfaces on the result (future-proof: the
             API doesn't expose it today, but a future minor might).
          2. The configured `--language` hint, if any.
          3. A tiny EN/ES heuristic on the transcribed text, which is better
             than letting `_detect_language` fall back to "und" on the
             zero-config turnkey path.
        """
        # chunk_duration keeps long files within Metal's buffer cap.
        aligned = self._model.transcribe(audio_path, chunk_duration=self._chunk_s)
        segments = result_to_segments(aligned)

        language = getattr(aligned, "language", None) or self._language
        if language:
            return {"segments": segments, "language": language}

        joined_text = " ".join((seg.get("text") or "") for seg in segments)
        return {"segments": segments, "language": guess_lang_from_text(joined_text)}
167
+
168
+
169
@lru_cache(maxsize=1)
def _load_parakeet_backend(model_id: str, language: str | None) -> WhisperBackend:  # pragma: no cover
    """Build the Parakeet backend on first use and cache it.

    The single-entry cache is keyed on ``(model_id, language)``, so repeated
    calls with the same arguments reuse the loaded model.
    """
    parakeet_config = ASRConfig(model_name=model_id, language=language, engine="parakeet")
    return ParakeetMLXBackend(parakeet_config)
@@ -0,0 +1,110 @@
1
+ """Cue parser (#10).
2
+
3
+ Whisper-large-v3 with event-tag tokens emits inline markers like [laughter],
4
+ [sigh], [cough], [clear_throat], [unintelligible], and code-switching markers.
5
+ This module pulls those out of utterance text into structured cues[] entries
6
+ (schema/cues.taxonomy.json) and returns the cleaned text.
7
+
8
+ Pure and deterministic — no model. The ASR wrapper (asr.py) produces the
9
+ tagged text; this turns it into cues.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from typing import Any
16
+
17
+ # Whisper/Whisper-AT event tag → compost cue kind (cues.taxonomy.json).
18
TAG_TO_KIND: dict[str, str] = {
    "laughter": "laughter",
    "laugh": "laughter",
    "laughs": "laughter",
    "sigh": "sigh",
    "sighs": "sigh",
    "cough": "cough",
    "coughs": "cough",
    "clear_throat": "throat-clear",
    "throat_clear": "throat-clear",
    "throat-clear": "throat-clear",
    "unintelligible": "unintelligible",
    "inaudible": "unintelligible",
    "code_switch": "code-switching",
    "code-switch": "code-switching",
    "code_switching": "code-switching",
}

# Default confidence assigned to a tag-derived cue when the ASR gives none.
DEFAULT_CONFIDENCE = 0.8

_TAG_RE = re.compile(r"\[([a-zA-Z_\-]+)\]")


def _clean_text(text: str) -> str:
    """Remove recognized event tags from `text` and tidy the whitespace.

    Bracketed tokens that are not in `TAG_TO_KIND` are left in place.
    """

    # Drop recognized event tags, collapse the resulting double spaces.
    def repl(m: re.Match[str]) -> str:
        return "" if m.group(1).lower() in TAG_TO_KIND else m.group(0)

    return re.sub(r"\s{2,}", " ", _TAG_RE.sub(repl, text)).strip()


def _find_tag_word(
    tag_token: str,
    words: list[dict[str, Any]],
    start: int = 0,
) -> int | None:
    """Return the index of the first word at or after `start` whose ``"w"``
    equals `tag_token`, or ``None`` when there is no such word."""
    for i in range(start, len(words)):
        if words[i].get("w") == tag_token:
            return i
    return None


def parse_cues_from_utterance(
    utterance: dict[str, Any],
    next_cue_index: int = 1,
    confidence: float = DEFAULT_CONFIDENCE,
) -> tuple[str, list[dict[str, Any]]]:
    """Return (cleaned_text, cues) for one utterance.

    Cue timing: if a word in `words[]` matches the tag, use that word's span;
    otherwise fall back to the utterance span. Tags are matched to words in
    order, so a tag that occurs several times in one utterance gets the span
    of its own occurrence rather than the first one each time.

    Args:
        utterance: Utterance dict; reads ``text``, ``words``, ``speaker_id``
            and, for the fallback span, ``start_ms`` / ``end_ms``.
        next_cue_index: Number used for the first cue id (``CUE-NNN``).
        confidence: Confidence recorded on every cue produced here.
    """
    text = utterance.get("text", "")
    words = utterance.get("words") or []
    speaker_id = utterance.get("speaker_id")
    cues: list[dict[str, Any]] = []
    idx = next_cue_index
    # Words before this index are already claimed by an earlier tag.
    cursor = 0

    for m in _TAG_RE.finditer(text):
        kind = TAG_TO_KIND.get(m.group(1).lower())
        if kind is None:
            continue
        hit = _find_tag_word(m.group(0), words, cursor)
        if hit is None:
            start_ms, end_ms = int(utterance["start_ms"]), int(utterance["end_ms"])
        else:
            start_ms, end_ms = int(words[hit]["s"]), int(words[hit]["e"])
            cursor = hit + 1
        cue: dict[str, Any] = {
            "id": f"CUE-{idx:03d}",
            "kind": kind,
            "start_ms": start_ms,
            "end_ms": end_ms,
            "source": "audio",
            "confidence": confidence,
        }
        if speaker_id is not None:
            cue["speaker_id"] = speaker_id
        cues.append(cue)
        idx += 1

    return _clean_text(text), cues


def _tag_span(
    tag_token: str,
    words: list[dict[str, Any]],
    utterance: dict[str, Any],
) -> tuple[int, int]:
    """Return the (start_ms, end_ms) of the first word equal to `tag_token`,
    or the utterance's own span when no word matches."""
    hit = _find_tag_word(tag_token, words)
    if hit is not None:
        return int(words[hit]["s"]), int(words[hit]["e"])
    return int(utterance["start_ms"]), int(utterance["end_ms"])
96
+
97
+
98
def parse_transcript_cues(transcript: dict[str, Any]) -> dict[str, Any]:
    """Lift event tags out of every utterance into the transcript's cues[].

    Each utterance's ``text`` is replaced by its tag-free version and the cues
    found are appended to ``transcript["cues"]`` (created when absent). New cue
    ids continue after the cues already present. The transcript is modified in
    place and also returned.
    """
    cue_list = transcript.setdefault("cues", [])
    for utterance in transcript.get("utterances", []):
        # Numbering picks up right after everything collected so far.
        text, found = parse_cues_from_utterance(utterance, next_cue_index=len(cue_list) + 1)
        cue_list.extend(found)
        utterance["text"] = text
    return transcript