@they-juanreina/compost-cli 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/dist/lib/blame.d.ts.map +1 -1
  2. package/dist/lib/blame.js +3 -2
  3. package/dist/lib/blame.js.map +1 -1
  4. package/dist/lib/ingest.d.ts +1 -0
  5. package/dist/lib/ingest.d.ts.map +1 -1
  6. package/dist/lib/ingest.js +46 -15
  7. package/dist/lib/ingest.js.map +1 -1
  8. package/dist/lib/journal.d.ts.map +1 -1
  9. package/dist/lib/journal.js +9 -0
  10. package/dist/lib/journal.js.map +1 -1
  11. package/dist/lib/migrate.d.ts.map +1 -1
  12. package/dist/lib/migrate.js +1 -0
  13. package/dist/lib/migrate.js.map +1 -1
  14. package/dist/lib/nativeRuntime.d.ts +18 -3
  15. package/dist/lib/nativeRuntime.d.ts.map +1 -1
  16. package/dist/lib/nativeRuntime.js +54 -3
  17. package/dist/lib/nativeRuntime.js.map +1 -1
  18. package/dist/lib/retrieve.d.ts.map +1 -1
  19. package/dist/lib/retrieve.js +0 -8
  20. package/dist/lib/retrieve.js.map +1 -1
  21. package/dist/lib/seedResolve.d.ts +5 -0
  22. package/dist/lib/seedResolve.d.ts.map +1 -1
  23. package/dist/lib/seedResolve.js +44 -4
  24. package/dist/lib/seedResolve.js.map +1 -1
  25. package/dist/lib/setup.d.ts.map +1 -1
  26. package/dist/lib/setup.js +27 -6
  27. package/dist/lib/setup.js.map +1 -1
  28. package/dist/lib/snap.d.ts.map +1 -1
  29. package/dist/lib/snap.js +2 -5
  30. package/dist/lib/snap.js.map +1 -1
  31. package/dist/loops/supervisor.d.ts.map +1 -1
  32. package/dist/loops/supervisor.js +1 -0
  33. package/dist/loops/supervisor.js.map +1 -1
  34. package/dist/loops/transcribe_worker.d.ts.map +1 -1
  35. package/dist/loops/transcribe_worker.js +0 -1
  36. package/dist/loops/transcribe_worker.js.map +1 -1
  37. package/dist/router.js +1 -1
  38. package/package.json +10 -4
  39. package/transcriber/app/__init__.py +3 -0
  40. package/transcriber/app/asr.py +198 -0
  41. package/transcriber/app/asr_parakeet.py +174 -0
  42. package/transcriber/app/cue_parser.py +110 -0
  43. package/transcriber/app/diarization.py +300 -0
  44. package/transcriber/app/frame_annotation.py +77 -0
  45. package/transcriber/app/frames.py +130 -0
  46. package/transcriber/app/health.py +70 -0
  47. package/transcriber/app/legacy.py +355 -0
  48. package/transcriber/app/main.py +30 -0
  49. package/transcriber/app/pipeline.py +204 -0
  50. package/transcriber/app/pptx_export.py +42 -0
  51. package/transcriber/app/prosody.py +123 -0
  52. package/transcriber/app/routes/__init__.py +1 -0
  53. package/transcriber/app/routes/legacy.py +117 -0
  54. package/transcriber/app/routes/transcribe.py +133 -0
  55. package/transcriber/app/shot_change.py +74 -0
  56. package/transcriber/app/silence_typer.py +144 -0
  57. package/transcriber/app/transcribe_cli.py +82 -0
  58. package/transcriber/app/vad.py +145 -0
  59. package/transcriber/pyproject.toml +56 -0
@@ -0,0 +1,355 @@
1
+ """Legacy asset ingestors (#29).
2
+
3
+ Normalize PDF / DOCX / PPTX / CSV into a transcript-shaped JSON with
4
+ kind="document": one utterance per paragraph (PDF/DOCX), per slide (PPTX),
5
+ or per row (CSV). Output validates against schema/transcript.schema.json
6
+ (kind="document", modality=["document"]).
7
+
8
+ Heavy parsers (pdfplumber, python-docx, python-pptx) are imported lazily so
9
+ the module loads without the `legacy` extra; each ingestor raises a clear
10
+ error if its dependency is missing.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import csv
16
+ import os
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ INGESTOR_VERSION = "compost-legacy@0.1.0"
21
+ DOC_SPEAKER = {"id": "S1", "name": "document", "type": "other"}
22
+
23
+
24
+ def _base(session_id: str, source: str, language: str = "und") -> dict[str, Any]:
25
+ return {
26
+ "schema_version": "1.0",
27
+ "kind": "document",
28
+ "session_id": session_id,
29
+ "source": source,
30
+ "language": language,
31
+ "duration_ms": 0,
32
+ "modality": ["document"],
33
+ "speakers": [dict(DOC_SPEAKER)],
34
+ "utterances": [],
35
+ "provenance": {"transcriber": INGESTOR_VERSION},
36
+ }
37
+
38
+
39
def _utt(idx: int, text: str, source_page: int | None = None, annotation: str | None = None) -> dict[str, Any]:
    """Build one document utterance attributed to ``DOC_SPEAKER``.

    ``idx`` becomes both the zero-padded id (``U-0001``) and the ``turn``.
    Timings are fixed at 0. ``source_page`` and ``annotation`` are only
    written when they are not ``None``.
    """
    utterance: dict[str, Any] = {
        "id": f"U-{idx:04d}",
        "speaker_id": DOC_SPEAKER["id"],
        "turn": idx,
        "start_ms": 0,
        "end_ms": 0,
        "text": text,
    }
    # Optional keys are appended in a fixed order so the output layout is stable.
    optional_fields = (("source_page", source_page), ("annotation", annotation))
    utterance.update((key, value) for key, value in optional_fields if value is not None)
    return utterance
53
+
54
+
55
+ def _session_id(path: str | Path) -> str:
56
+ stem = Path(path).stem
57
+ safe = "".join(c if c.isalnum() or c in "-_" else "-" for c in stem)
58
+ return f"DOC-{safe}"[:64]
59
+
60
+
61
+ # ---------------------------------------------------------------- CSV / XLSX
62
+
63
+ # Auto-detect priority for the "text" column. First case-insensitive match
64
+ # in the source's header wins. Falls back to the first column.
65
+ TEXT_COL_CANDIDATES = (
66
+ "text",
67
+ "transcript",
68
+ "content",
69
+ "utterance",
70
+ "quote",
71
+ "message",
72
+ "body",
73
+ )
74
+
75
+
76
+ def _auto_text_col(fieldnames: list[str]) -> str:
77
+ """Pick the most-likely text column from a header. Case-insensitive match
78
+ against TEXT_COL_CANDIDATES, then a first-column fallback."""
79
+ lower = {f.lower(): f for f in fieldnames}
80
+ for candidate in TEXT_COL_CANDIDATES:
81
+ if candidate in lower:
82
+ return lower[candidate]
83
+ return fieldnames[0]
84
+
85
+
86
def ingest_csv(
    path: str | Path,
    text_col: str | None = None,
    speaker_col: str | None = None,
) -> dict[str, Any]:
    """One utterance per row.

    `text_col=None` triggers auto-detect: text → transcript → content →
    utterance → quote → message → body (case-insensitive). Falls back to
    the first column. The resolved column is recorded on the output's
    `provenance.text_col_resolved` for caller visibility.

    Rows whose text cell is empty after stripping are skipped. When
    `speaker_col` is given and the row has a non-empty value there, it is
    attached as a `[speaker: ...]` annotation. `source_page` is the running
    utterance number, not the physical row number.

    Raises:
        ValueError: if the file has no header row or the resolved text
            column is not in the header.
    """
    path = str(path)
    doc = _base(_session_id(path), path)
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark. With plain "utf-8" the BOM stays glued to the first header name
    # ("\ufefftext"), so neither auto-detect nor an explicit text_col matches it.
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            raise ValueError(f"CSV has no header row: {path}")
        fields = list(reader.fieldnames)
        resolved = text_col if text_col is not None else _auto_text_col(fields)
        if resolved not in fields:
            raise ValueError(f"CSV has no column '{resolved}' (columns: {fields})")
        doc["provenance"]["text_col_resolved"] = resolved
        idx = 1
        for row in reader:
            text = (row.get(resolved) or "").strip()
            if not text:
                continue
            ann = None
            if speaker_col is not None and row.get(speaker_col):
                ann = f"[speaker: {row[speaker_col]}]"
            doc["utterances"].append(_utt(idx, text, source_page=idx, annotation=ann))
            idx += 1
    return doc
120
+
121
+
122
+ # ---------------------------------------------------------------- DOCX
123
+
124
+
125
def ingest_docx(path: str | Path) -> dict[str, Any]:
    """Ingest a DOCX file: one utterance per non-empty body paragraph.

    Paragraphs whose style name starts with "heading" are not emitted as
    utterances; their text is remembered and attached as a
    ``[section: ...]`` annotation to the paragraphs that follow.

    Raises:
        RuntimeError: if python-docx is not installed.
    """
    try:
        import docx  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-docx not installed (pip install -e '.[legacy]')") from e

    source = str(path)
    result = _base(_session_id(source), source)
    section: str | None = None
    next_idx = 1
    for paragraph in docx.Document(source).paragraphs:
        body = paragraph.text.strip()
        if not body:
            continue
        style_name = ""
        if paragraph.style:
            style_name = (paragraph.style.name or "").lower()
        if style_name.startswith("heading"):
            # Headings become section anchors for later paragraphs, not utterances.
            section = body
            continue
        annotation = None
        if section:
            annotation = f"[section: {section}]"
        result["utterances"].append(_utt(next_idx, body, annotation=annotation))
        next_idx += 1
    return result
149
+
150
+
151
+ # ---------------------------------------------------------------- PPTX
152
+
153
+
154
def ingest_pptx(path: str | Path, thumbnails_dir: str | Path | None = None) -> dict[str, Any]:
    """Ingest a PPTX deck: one utterance per slide that has any text.

    A slide's utterance is the newline-joined text of every non-empty
    paragraph in its text frames, followed by its speaker notes (prefixed
    with ``(notes) ``) when present. ``source_page`` is the 1-based slide
    number. If ``thumbnails_dir`` is given the directory is created, but no
    thumbnails are rendered by this function.

    Raises:
        RuntimeError: if python-pptx is not installed.
    """
    try:
        from pptx import Presentation  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-pptx not installed (pip install -e '.[legacy]')") from e

    def slide_lines(slide: Any):
        # Shape text first, in shape order, then the notes line last.
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                joined = "".join(run.text for run in paragraph.runs).strip()
                if joined:
                    yield joined
        if slide.has_notes_slide:
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is not None:
                speaker_notes = notes_frame.text.strip()
                if speaker_notes:
                    yield f"(notes) {speaker_notes}"

    source = str(path)
    result = _base(_session_id(source), source)
    deck = Presentation(source)
    sequence = 1
    for slide_no, slide in enumerate(deck.slides, start=1):
        combined = "\n".join(slide_lines(slide))
        if not combined:
            continue
        result["utterances"].append(_utt(sequence, combined, source_page=slide_no))
        sequence += 1
    # Thumbnail rendering requires LibreOffice/unoconv (not bundled); only the
    # target directory is prepared. The slide text is the load-bearing evidence.
    if thumbnails_dir is not None:
        os.makedirs(thumbnails_dir, exist_ok=True)
    return result
186
+
187
+
188
+ # ---------------------------------------------------------------- PDF
189
+
190
+
191
def ingest_pdf(path: str | Path) -> dict[str, Any]:
    """Ingest a PDF: one utterance per paragraph, tagged with its page number.

    Pages with no extractable text go through the best-effort ``_ocr_page``
    fallback before being split into paragraphs.

    Raises:
        RuntimeError: if pdfplumber is not installed.
    """
    try:
        import pdfplumber  # type: ignore
    except ImportError as e:
        raise RuntimeError("pdfplumber not installed (pip install -e '.[legacy]')") from e

    source = str(path)
    result = _base(_session_id(source), source)
    emitted = 0
    with pdfplumber.open(source) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text() or ""
            if not page_text.strip():
                # Likely a scanned page: try OCR, which yields "" when unavailable.
                page_text = _ocr_page(page)
            for paragraph in _paragraphs(page_text):
                emitted += 1
                result["utterances"].append(_utt(emitted, paragraph, source_page=page_no))
    return result
211
+
212
+
213
+ def _paragraphs(text: str) -> list[str]:
214
+ out: list[str] = []
215
+ for block in text.split("\n\n"):
216
+ cleaned = " ".join(line.strip() for line in block.splitlines() if line.strip())
217
+ if cleaned:
218
+ out.append(cleaned)
219
+ return out
220
+
221
+
222
+ def _ocr_page(page: Any) -> str: # pragma: no cover - needs tesseract + a raster
223
+ try:
224
+ import pytesseract # type: ignore
225
+ from PIL import Image # type: ignore # noqa: F401
226
+ except ImportError:
227
+ return ""
228
+ try:
229
+ im = page.to_image(resolution=200).original
230
+ return pytesseract.image_to_string(im)
231
+ except Exception:
232
+ return ""
233
+
234
+
235
+ # ---------------------------------------------------------------- Markdown / Text
236
+
237
+
238
def ingest_text(path: str | Path) -> dict[str, Any]:
    """Read a plain-text or Markdown file and split into paragraph utterances.

    Both `.txt` (Otter / Zoom exports) and `.md` land here. ATX headings
    (`# ` through `###### `) are recorded as section annotations on
    subsequent utterances, mirroring the docx behavior, and are not emitted
    as utterances themselves. Setext (==== / ---- underline) headings are
    not supported.

    A heading line is always treated as its own paragraph, so body text
    written directly under a heading (no blank line between) is kept as an
    utterance instead of being absorbed into the heading.
    """
    path = str(path)
    doc = _base(_session_id(path), path)
    # "utf-8-sig" drops a leading byte-order mark; with plain "utf-8" a BOM
    # in front of a first-line "# Title" would hide the heading marker.
    with open(path, encoding="utf-8-sig") as f:
        body = f.read()

    heading_prefixes = tuple("#" * level + " " for level in range(1, 7))

    # Surround every unindented heading line with blank lines so the
    # paragraph splitter cannot merge it with neighbouring text.
    isolated: list[str] = []
    for line in body.split("\n"):
        if line.startswith(heading_prefixes):
            isolated.extend(("", line, ""))
        else:
            isolated.append(line)

    current_heading: str | None = None
    idx = 1
    for para in _paragraphs("\n".join(isolated)):
        if para.startswith(heading_prefixes):
            # Strip only the leading run of '#'; the title itself may begin
            # with '#' (e.g. "# #1 priority").
            current_heading = para.lstrip("#").strip()
            continue
        ann = f"[section: {current_heading}]" if current_heading else None
        doc["utterances"].append(_utt(idx, para, annotation=ann))
        idx += 1
    return doc
263
+
264
+
265
+ # ---------------------------------------------------------------- XLSX
266
+
267
+
268
def ingest_xlsx(
    path: str | Path,
    text_col: str | None = None,
    speaker_col: str | None = None,
    sheet: str | None = None,
) -> dict[str, Any]:
    """One utterance per row of a spreadsheet.

    `text_col=None` triggers the same auto-detect as `ingest_csv`. The
    resolved column lands on `provenance.text_col_resolved`. Use `sheet`
    to pick a non-default tab.

    The first row is the header. Rows whose text cell is empty are skipped;
    when such a row has data in other cells it is counted, and the count is
    reported on `provenance.xlsx_rows_skipped_empty_text`. A sheet with no
    rows, or with an empty header row, yields a document with no utterances.

    Raises:
        RuntimeError: if openpyxl is not installed.
        ValueError: if the workbook has no active worksheet or the resolved
            text column is not in the header.
    """
    try:
        from openpyxl import load_workbook  # type: ignore
    except ImportError as e:
        raise RuntimeError("openpyxl not installed (pip install -e '.[legacy]')") from e

    path = str(path)
    doc = _base(_session_id(path), path)
    wb = load_workbook(path, read_only=True, data_only=True)
    # A read-only workbook keeps the underlying file open until close() is
    # called, so every exit path (return or raise) goes through the finally.
    try:
        ws = wb[sheet] if sheet is not None else wb.active
        if ws is None:
            raise ValueError(f"XLSX has no worksheets: {path}")

        rows = ws.iter_rows(values_only=True)
        header_row = next(rows, None)
        if not header_row:
            return doc  # empty sheet (no rows, or a zero-column header)
        header = [str(c) if c is not None else "" for c in header_row]
        resolved = text_col if text_col is not None else _auto_text_col(header)
        if resolved not in header:
            raise ValueError(f"XLSX has no column '{resolved}' (columns: {header})")
        doc["provenance"]["text_col_resolved"] = resolved
        text_idx = header.index(resolved)
        speaker_idx = header.index(speaker_col) if speaker_col in header else -1

        utt_idx = 1
        # Track rows where the text column is empty but the row has other data —
        # a strong proxy for "Excel never evaluated this formula so openpyxl
        # returned None". Researchers seeing this should open the file in Excel
        # once or pre-export to CSV.
        rows_with_data_but_empty_text = 0
        for row in rows:
            if row is None:
                continue
            cell = row[text_idx] if text_idx < len(row) else None
            text = str(cell).strip() if cell is not None else ""
            if not text:
                # Row has data elsewhere → likely an un-evaluated formula in the text column.
                if any(c is not None and str(c).strip() for c in row):
                    rows_with_data_but_empty_text += 1
                continue
            ann = None
            if speaker_idx >= 0 and speaker_idx < len(row) and row[speaker_idx] is not None:
                ann = f"[speaker: {row[speaker_idx]}]"
            doc["utterances"].append(_utt(utt_idx, text, source_page=utt_idx, annotation=ann))
            utt_idx += 1
        if rows_with_data_but_empty_text > 0:
            doc["provenance"]["xlsx_rows_skipped_empty_text"] = rows_with_data_but_empty_text
        return doc
    finally:
        wb.close()
328
+
329
+
330
def ingest(path: str | Path, **kwargs: Any) -> dict[str, Any]:
    """Route ``path`` to the ingestor that matches its (lower-cased) extension.

    Recognized keyword options are ``text_col`` and ``speaker_col`` (CSV and
    XLSX), ``sheet`` (XLSX) and ``thumbnails_dir`` (PPTX); a missing option
    is passed through as ``None``. ``text_col=None`` triggers auto-detect.

    Raises:
        ValueError: if the extension has no ingestor.
    """
    suffix = Path(path).suffix.lower()
    option = kwargs.get

    # Each entry defers the call so only the selected ingestor is evaluated.
    handlers = {
        ".csv": lambda: ingest_csv(
            path,
            text_col=option("text_col"),
            speaker_col=option("speaker_col"),
        ),
        ".docx": lambda: ingest_docx(path),
        ".pptx": lambda: ingest_pptx(path, thumbnails_dir=option("thumbnails_dir")),
        ".pdf": lambda: ingest_pdf(path),
        ".txt": lambda: ingest_text(path),
        ".md": lambda: ingest_text(path),
        ".markdown": lambda: ingest_text(path),
        ".xlsx": lambda: ingest_xlsx(
            path,
            text_col=option("text_col"),
            speaker_col=option("speaker_col"),
            sheet=option("sheet"),
        ),
    }
    handler = handlers.get(suffix)
    if handler is None:
        raise ValueError(f"Unsupported legacy asset: {suffix}")
    return handler()
@@ -0,0 +1,30 @@
1
+ """FastAPI entrypoint for the compost transcriber.
2
+
3
+ Mounts the routers each subsystem (transcription, legacy ingest, frames)
4
+ ships in its own issue. /health, /transcribe (v0.1-01), and /legacy-ingest
5
+ (v0.1-02) are live; frame extraction routes land under v0.2-12.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from fastapi import FastAPI
11
+
12
+ from . import __version__
13
+ from .health import router as health_router
14
+ from .routes.legacy import router as legacy_router
15
+ from .routes.transcribe import router as transcribe_router
16
+
17
+
18
def create_app() -> FastAPI:
    """Build the FastAPI application and mount the health, transcribe and legacy routers."""
    application = FastAPI(
        title="compost-transcriber",
        version=__version__,
        description="Descriptive audio transcription + frame extraction + legacy ingest.",
    )
    # Registration order is kept: health first, then transcribe, then legacy.
    for router in (health_router, transcribe_router, legacy_router):
        application.include_router(router)
    return application


# Module-level instance, created at import time.
app = create_app()
@@ -0,0 +1,204 @@
1
+ """Transcription pipeline orchestrator (#v0.1-01).
2
+
3
+ Composes the already-tested deterministic stages into a single transcript.json:
4
+
5
+ duration probe → VAD speech/silences → ASR → diarization align →
6
+ cue parser → silence typer → prosody → final transcript
7
+
8
+ Each stage accepts injectable backends so the route, the worker, and the tests
9
+ all share one orchestration codepath. The route in `routes/transcribe.py`
10
+ provides real backends; tests pass fakes.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import subprocess
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ from .asr import ASRConfig, Transcriber, WhisperBackend
22
+ from .cue_parser import parse_transcript_cues
23
+ from .diarization import DiarizationBackend, Diarizer, align
24
+ from .prosody import annotate_prosody
25
+ from .silence_typer import type_all_silences
26
+ from .vad import VAD, VADBackend, silences_to_schema
27
+
28
# Written to transcript["schema_version"] by run_pipeline.
SCHEMA_VERSION = "1.0"
# Default value for PipelineConfig.transcriber_version.
DEFAULT_TRANSCRIBER_VERSION = "compost-transcriber@0.1.0"


@dataclass
class PipelineConfig:
    """Settings for one pipeline run.

    ``asr`` is handed to the transcriber; the remaining string fields are
    labels copied into the transcript's ``provenance`` section.
    """

    # ASR settings; required, no default.
    asr: ASRConfig
    # Recorded as provenance.transcriber.
    transcriber_version: str = DEFAULT_TRANSCRIBER_VERSION
    # Recorded as provenance.asr_model.
    asr_model_tag: str = "whisper-large-v3-turbo-event-tags"
    # Recorded as provenance.diarizer.
    diarizer_tag: str = "pyannote-audio@3.3"
    # Prefix of provenance.audio_cues.
    vad_tag: str = "silero-vad@5.0"
39
+
40
+
41
@dataclass
class PipelineBackends:
    """Bundle of the backends a pipeline run uses; each may be real or a fake.

    Every field defaults to ``None`` and is passed as-is to the matching
    stage wrapper (VAD, Transcriber, Diarizer).
    """

    # Voice-activity-detection backend.
    vad: VADBackend | None = None
    # Speech-recognition backend.
    asr: WhisperBackend | None = None
    # Speaker-diarization backend.
    diarization: DiarizationBackend | None = None
48
+
49
+
50
def probe_duration_ms(source_path: str) -> int:
    """Return the duration of an audio/video file in milliseconds via ffprobe.

    Falls back to 0 whenever a duration cannot be obtained: ffprobe is
    missing or cannot be launched, it times out after 30 seconds, it exits
    non-zero, or its output is not a finite number. The caller can decide
    whether to error or proceed (silence segmentation against duration=0
    produces no trailing silence, which is fine).
    """
    command = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        source_path,
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing binary (FileNotFoundError) as well as one
        # that exists but cannot be executed (PermissionError).
        return 0
    if result.returncode != 0:
        return 0
    try:
        return int(float(result.stdout.strip()) * 1000)
    except (ValueError, OverflowError):
        # ValueError: empty / "N/A" / NaN output; OverflowError: infinite duration.
        return 0
79
+
80
+
81
+ def _speakers_from_utterances(utterances: list[dict[str, Any]]) -> list[dict[str, Any]]:
82
+ """Distinct speakers seen in the utterances; first speaker tagged as moderator,
83
+ the rest as participants (researcher overrides this in the UI for now).
84
+ """
85
+ seen: dict[str, dict[str, Any]] = {}
86
+ for u in utterances:
87
+ sid = u.get("speaker_id", "S?")
88
+ if sid in seen:
89
+ continue
90
+ seen[sid] = {"id": sid, "name": sid, "type": "participant"}
91
+ # First seen → moderator by convention. Researcher can override post-hoc.
92
+ if seen:
93
+ first = next(iter(seen))
94
+ seen[first]["type"] = "moderator"
95
+ return list(seen.values())
96
+
97
+
98
+ def _detect_language(asr_lang: str | None, configured: str | None) -> str:
99
+ """Prefer ASR-detected, then configured hint, then 'und' (undetermined)."""
100
+ if asr_lang:
101
+ return asr_lang
102
+ if configured:
103
+ return configured
104
+ return "und"
105
+
106
+
107
def run_pipeline(
    seed_path: str,
    session_id: str,
    source_path: str,
    config: PipelineConfig,
    backends: PipelineBackends,
) -> dict[str, Any]:
    """Run every stage in order and return the finished transcript dict.

    Order: duration probe, VAD, ASR, transcript shell, diarization + align,
    speaker list, cue parsing, silence typing, prosody. Nothing is written
    to disk here; persisting the result is left to the caller.

    Raises:
        FileNotFoundError: if ``source_path`` does not exist.
    """
    if not Path(source_path).exists():
        raise FileNotFoundError(f"source not found: {source_path}")

    total_ms = probe_duration_ms(source_path)

    # Stage 1: VAD. Only the silences are used; the speech segments are dropped.
    _speech, detected_silences = VAD(backend=backends.vad).segment(source_path, total_ms)

    # Stage 2: ASR yields the utterances and, possibly, a detected language.
    recognized = Transcriber(config=config.asr, backend=backends.asr).transcribe(source_path)

    # Stage 3: assemble the transcript shell that the later stages work on.
    provenance = {
        "transcriber": config.transcriber_version,
        "asr_model": config.asr_model_tag,
        "diarizer": config.diarizer_tag,
        "audio_cues": f"{config.vad_tag} + whisper-events",
        "frame_capture": None,
        "frame_annotation": None,
    }
    result: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "kind": "session",
        "session_id": session_id,
        "source": _relative_source(seed_path, source_path),
        "language": _detect_language(recognized.language, config.asr.language),
        "duration_ms": total_ms,
        "modality": _modality(source_path),
        "speakers": [],
        "utterances": recognized.utterances,
        "silences": silences_to_schema(detected_silences),
        "cues": [],
        "frames": [],
        "glossary_refs": [],
        "provenance": provenance,
    }

    # Stage 4: diarization turns are aligned onto the transcript.
    speaker_turns = Diarizer(backend=backends.diarization).diarize(source_path)
    align(result, speaker_turns)

    # Stage 5: the speakers list is derived from the utterances after alignment.
    result["speakers"] = _speakers_from_utterances(result["utterances"])

    # Stages 6-8: cue parsing, silence typing, prosody. Their return values
    # are ignored, so they presumably update the transcript in place.
    for stage in (parse_transcript_cues, type_all_silences, annotate_prosody):
        stage(result)

    return result
176
+
177
+
178
+ def _relative_source(seed_path: str, source_path: str) -> str:
179
+ """Return a seed-relative path for transcript.source if the source lives
180
+ inside the seed; otherwise return the absolute path unchanged.
181
+ """
182
+ try:
183
+ return str(Path(source_path).relative_to(Path(seed_path).parent))
184
+ except ValueError:
185
+ return source_path
186
+
187
+
188
+ def _modality(source_path: str) -> list[str]:
189
+ """Coarse modality flag from file extension. Video files imply both audio
190
+ and video tracks (the player will only render video if present).
191
+ """
192
+ ext = Path(source_path).suffix.lower()
193
+ if ext in {".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"}:
194
+ return ["audio", "video"]
195
+ return ["audio"]
196
+
197
+
198
def write_transcript(seed_path: str, session_id: str, transcript: dict[str, Any]) -> str:
    """Serialize ``transcript`` to ``<seed_path>/sessions/<session_id>/transcript.json``.

    Missing directories are created. The JSON is written as UTF-8 with a
    2-space indent, non-ASCII characters left unescaped, and a trailing
    newline. Returns the path of the written file as a string.
    """
    target = Path(seed_path, "sessions", session_id, "transcript.json")
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(transcript, indent=2, ensure_ascii=False)
    target.write_text(f"{payload}\n", encoding="utf-8")
    return str(target)
@@ -0,0 +1,42 @@
1
+ """PPTX deck export (#66).
2
+
3
+ Turns a report deck-spec (built by cli/src/exporters/report.ts → buildDeckSpec)
4
+ into a .pptx: one slide per entry, bullets as body, citations as slide notes.
5
+ Branding (title color) is configurable per seed. python-pptx is lazily imported.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Any
11
+
12
+
13
def export_deck(spec: list[dict[str, Any]], out_path: str, branding: dict[str, Any] | None = None) -> str:
    """Render a deck spec to a .pptx file and return ``out_path``.

    Each entry of ``spec`` becomes one slide: ``title`` fills the slide
    title, ``bullets`` become paragraphs of a text box, and ``notes`` (if
    non-empty) becomes the slide notes. A missing or ``None`` ``title``,
    ``bullets`` or ``notes`` is treated as empty rather than raising.

    NOTE(review): ``branding`` is accepted for interface compatibility but
    is not applied to the output yet.

    Raises:
        RuntimeError: if python-pptx is not installed.
    """
    try:
        from pptx import Presentation  # type: ignore
        from pptx.util import Pt  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-pptx not installed (pip install -e '.[legacy]')") from e

    prs = Presentation()
    # Presumably the "Title Only" layout of the default template — TODO confirm.
    # Bullets go into an explicitly added text box, not a body placeholder.
    title_only = prs.slide_layouts[5]

    for slide_spec in spec:
        slide = prs.slides.add_slide(title_only)
        # `or ""` guards an explicit `"title": None`, which would otherwise
        # be handed to the text setter.
        slide.shapes.title.text = str(slide_spec.get("title") or "")
        tb = slide.shapes.add_textbox(Pt(40), Pt(120), Pt(640), Pt(360)).text_frame
        tb.word_wrap = True
        for i, bullet in enumerate(slide_spec.get("bullets") or []):
            # A new text frame already holds one empty paragraph; reuse it
            # for the first bullet so no blank line is left at the top.
            p = tb.paragraphs[0] if i == 0 else tb.add_paragraph()
            p.text = str(bullet)
        # citations → slide notes
        notes = slide_spec.get("notes") or ""
        if notes:
            slide.notes_slide.notes_text_frame.text = str(notes)

    prs.save(out_path)
    return out_path