@they-juanreina/compost-cli 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/blame.d.ts.map +1 -1
- package/dist/lib/blame.js +3 -2
- package/dist/lib/blame.js.map +1 -1
- package/dist/lib/journal.d.ts.map +1 -1
- package/dist/lib/journal.js +9 -0
- package/dist/lib/journal.js.map +1 -1
- package/dist/lib/migrate.d.ts.map +1 -1
- package/dist/lib/migrate.js +1 -0
- package/dist/lib/migrate.js.map +1 -1
- package/dist/lib/nativeRuntime.d.ts +6 -3
- package/dist/lib/nativeRuntime.d.ts.map +1 -1
- package/dist/lib/nativeRuntime.js +6 -3
- package/dist/lib/nativeRuntime.js.map +1 -1
- package/dist/lib/retrieve.d.ts.map +1 -1
- package/dist/lib/retrieve.js +0 -8
- package/dist/lib/retrieve.js.map +1 -1
- package/dist/lib/seedResolve.d.ts.map +1 -1
- package/dist/lib/seedResolve.js +1 -0
- package/dist/lib/seedResolve.js.map +1 -1
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +9 -8
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/snap.d.ts.map +1 -1
- package/dist/lib/snap.js +2 -5
- package/dist/lib/snap.js.map +1 -1
- package/dist/loops/supervisor.d.ts.map +1 -1
- package/dist/loops/supervisor.js +1 -0
- package/dist/loops/supervisor.js.map +1 -1
- package/dist/loops/transcribe_worker.d.ts.map +1 -1
- package/dist/loops/transcribe_worker.js +0 -1
- package/dist/loops/transcribe_worker.js.map +1 -1
- package/dist/router.js +1 -1
- package/package.json +10 -4
- package/transcriber/app/__init__.py +3 -0
- package/transcriber/app/asr.py +198 -0
- package/transcriber/app/asr_parakeet.py +174 -0
- package/transcriber/app/cue_parser.py +110 -0
- package/transcriber/app/diarization.py +300 -0
- package/transcriber/app/frame_annotation.py +77 -0
- package/transcriber/app/frames.py +130 -0
- package/transcriber/app/health.py +70 -0
- package/transcriber/app/legacy.py +355 -0
- package/transcriber/app/main.py +30 -0
- package/transcriber/app/pipeline.py +204 -0
- package/transcriber/app/pptx_export.py +42 -0
- package/transcriber/app/prosody.py +123 -0
- package/transcriber/app/routes/__init__.py +1 -0
- package/transcriber/app/routes/legacy.py +117 -0
- package/transcriber/app/routes/transcribe.py +133 -0
- package/transcriber/app/shot_change.py +74 -0
- package/transcriber/app/silence_typer.py +144 -0
- package/transcriber/app/transcribe_cli.py +82 -0
- package/transcriber/app/vad.py +145 -0
- package/transcriber/pyproject.toml +56 -0
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
"""Legacy asset ingestors (#29).
|
|
2
|
+
|
|
3
|
+
Normalize PDF / DOCX / PPTX / CSV into a transcript-shaped JSON with
|
|
4
|
+
kind="document": one utterance per paragraph (PDF/DOCX), per slide (PPTX),
|
|
5
|
+
or per row (CSV). Output validates against schema/transcript.schema.json
|
|
6
|
+
(kind="document", modality=["document"]).
|
|
7
|
+
|
|
8
|
+
Heavy parsers (pdfplumber, python-docx, python-pptx) are imported lazily so
|
|
9
|
+
the module loads without the `legacy` extra; each ingestor raises a clear
|
|
10
|
+
error if its dependency is missing.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import csv
|
|
16
|
+
import os
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
INGESTOR_VERSION = "compost-legacy@0.1.0"
DOC_SPEAKER = {"id": "S1", "name": "document", "type": "other"}


def _base(session_id: str, source: str, language: str = "und") -> dict[str, Any]:
    """Build an empty document-kind transcript shell.

    The shell has zero duration, a single synthetic speaker (a fresh copy of
    DOC_SPEAKER so callers may mutate it safely), no utterances, and a
    provenance entry naming this ingestor version.
    """
    shell: dict[str, Any] = dict(
        schema_version="1.0",
        kind="document",
        session_id=session_id,
        source=source,
        language=language,
        duration_ms=0,
        modality=["document"],
        speakers=[DOC_SPEAKER.copy()],
        utterances=[],
        provenance={"transcriber": INGESTOR_VERSION},
    )
    return shell
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _utt(idx: int, text: str, source_page: int | None = None, annotation: str | None = None) -> dict[str, Any]:
    """Build one utterance record for a document transcript.

    The id is ``U-`` plus the zero-padded index, the speaker is always the
    synthetic document speaker, and both timestamps are 0. ``source_page``
    and ``annotation`` are only included when they are not None.
    """
    utterance: dict[str, Any] = dict(
        id=f"U-{idx:04d}",
        speaker_id=DOC_SPEAKER["id"],
        turn=idx,
        start_ms=0,
        end_ms=0,
        text=text,
    )
    # Optional keys are appended in a fixed order: source_page, then annotation.
    optional = (("source_page", source_page), ("annotation", annotation))
    utterance.update((key, value) for key, value in optional if value is not None)
    return utterance
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _session_id(path: str | Path) -> str:
    """Derive a session id from a file path.

    Takes the file stem, replaces every character that is not alphanumeric,
    ``-`` or ``_`` with ``-``, prefixes ``DOC-`` and truncates to 64 chars.
    """

    def keep(ch: str) -> str:
        return ch if (ch.isalnum() or ch in "-_") else "-"

    slug = "".join(map(keep, Path(path).stem))
    return ("DOC-" + slug)[:64]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------- CSV / XLSX
|
|
62
|
+
|
|
63
|
+
# Auto-detect priority for the "text" column. Candidates are tried in this
# order; the first header that matches a candidate (case-insensitively,
# ignoring surrounding whitespace) wins. Falls back to the first column.
TEXT_COL_CANDIDATES = (
    "text",
    "transcript",
    "content",
    "utterance",
    "quote",
    "message",
    "body",
)


def _auto_text_col(fieldnames: list[str]) -> str:
    """Pick the most likely text column from a header row.

    Each header is compared to TEXT_COL_CANDIDATES after stripping whitespace
    and lower-casing. The returned value is always the header exactly as it
    appears in ``fieldnames``, so callers can look it up unchanged.

    Raises:
        ValueError: if ``fieldnames`` is empty (there is no column to fall
            back to).
    """
    if not fieldnames:
        raise ValueError("cannot auto-detect a text column: header is empty")

    # setdefault keeps the FIRST header for each normalized name, so
    # "Text" beats a later "TEXT" duplicate.
    by_normalized: dict[str, str] = {}
    for name in fieldnames:
        by_normalized.setdefault(name.strip().lower(), name)

    for candidate in TEXT_COL_CANDIDATES:
        if candidate in by_normalized:
            return by_normalized[candidate]
    return fieldnames[0]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def ingest_csv(
    path: str | Path,
    text_col: str | None = None,
    speaker_col: str | None = None,
) -> dict[str, Any]:
    """Ingest a CSV file: one utterance per row with non-empty text.

    `text_col=None` triggers auto-detect: text → transcript → content →
    utterance → quote → message → body (case-insensitive). Falls back to
    the first column. The resolved column is recorded on the output's
    `provenance.text_col_resolved` for caller visibility.

    When `speaker_col` is given and a row has a non-empty value in it, the
    utterance carries a `[speaker: ...]` annotation. Rows whose text cell is
    empty are skipped; `source_page` is the running utterance index.

    Raises:
        ValueError: if the file has no header row or the resolved text
            column is not in the header.
    """
    path = str(path)
    doc = _base(_session_id(path), path)
    # utf-8-sig transparently drops a leading BOM (common in Excel exports);
    # with plain utf-8 the BOM sticks to the first header name and breaks
    # both auto-detect and explicit text_col lookup. BOM-less files decode
    # exactly as before.
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is None:
            raise ValueError(f"CSV has no header row: {path}")
        fields = list(reader.fieldnames)
        resolved = text_col if text_col is not None else _auto_text_col(fields)
        if resolved not in fields:
            raise ValueError(f"CSV has no column '{resolved}' (columns: {fields})")
        doc["provenance"]["text_col_resolved"] = resolved
        idx = 1
        for row in reader:
            # Short rows yield None for missing cells; treat as empty text.
            text = (row.get(resolved) or "").strip()
            if not text:
                continue
            ann = None
            if speaker_col is not None and row.get(speaker_col):
                ann = f"[speaker: {row[speaker_col]}]"
            doc["utterances"].append(_utt(idx, text, source_page=idx, annotation=ann))
            idx += 1
    return doc
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------- DOCX
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def ingest_docx(path: str | Path) -> dict[str, Any]:
    """Ingest a .docx file: one utterance per non-empty, non-heading paragraph.

    Paragraphs whose style name starts with "heading" (case-insensitive) are
    not emitted as utterances; their text becomes a `[section: ...]`
    annotation on the utterances that follow, until the next heading.

    Raises:
        RuntimeError: if python-docx is not installed.
    """
    try:
        import docx  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-docx not installed (pip install -e '.[legacy]')") from e

    path = str(path)
    doc = _base(_session_id(path), path)
    section: str | None = None
    next_idx = 1
    for paragraph in docx.Document(path).paragraphs:
        body = paragraph.text.strip()
        if not body:
            continue
        style_name = ""
        if paragraph.style:
            style_name = (paragraph.style.name or "").lower()
        if style_name.startswith("heading"):
            # Headings are kept only as section anchors for later utterances.
            section = body
            continue
        annotation = None if not section else f"[section: {section}]"
        doc["utterances"].append(_utt(next_idx, body, annotation=annotation))
        next_idx += 1
    return doc
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ---------------------------------------------------------------- PPTX
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def ingest_pptx(path: str | Path, thumbnails_dir: str | Path | None = None) -> dict[str, Any]:
    """Ingest a .pptx deck: one utterance per slide that has any text.

    A slide's utterance text is its non-empty text-frame paragraphs joined by
    newlines, followed by the speaker notes (prefixed with ``(notes) ``) when
    present. `source_page` is the 1-based slide number.

    If `thumbnails_dir` is given, the directory is created but nothing is
    rendered into it by this function.

    Raises:
        RuntimeError: if python-pptx is not installed.
    """
    try:
        from pptx import Presentation  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-pptx not installed (pip install -e '.[legacy]')") from e

    path = str(path)
    doc = _base(_session_id(path), path)
    deck = Presentation(path)
    next_idx = 1
    for slide_no, slide in enumerate(deck.slides, start=1):
        lines: list[str] = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                joined = "".join(run.text for run in paragraph.runs).strip()
                if joined:
                    lines.append(joined)

        if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None:
            notes = slide.notes_slide.notes_text_frame.text.strip()
            if notes:
                lines.append(f"(notes) {notes}")

        # Every collected line is non-empty, so "any lines" == "non-empty text".
        if lines:
            doc["utterances"].append(_utt(next_idx, "\n".join(lines), source_page=slide_no))
            next_idx += 1

    # Thumbnail rendering requires LibreOffice/unoconv (not bundled); skipped
    # gracefully. The slide text above is the load-bearing evidence.
    if thumbnails_dir is not None:
        os.makedirs(thumbnails_dir, exist_ok=True)
    return doc
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ---------------------------------------------------------------- PDF
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def ingest_pdf(path: str | Path) -> dict[str, Any]:
    """Ingest a PDF: one utterance per paragraph, tagged with its page number.

    Each page's extracted text is split with `_paragraphs`. Pages with no
    extractable text (e.g. scans) go through the best-effort `_ocr_page`
    fallback, which yields an empty string when OCR is unavailable.

    Raises:
        RuntimeError: if pdfplumber is not installed.
    """
    try:
        import pdfplumber  # type: ignore
    except ImportError as e:
        raise RuntimeError("pdfplumber not installed (pip install -e '.[legacy]')") from e

    path = str(path)
    doc = _base(_session_id(path), path)
    next_idx = 1
    with pdfplumber.open(path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            extracted = page.extract_text() or ""
            # OCR only when the text layer is empty or whitespace-only.
            page_text = extracted if extracted.strip() else _ocr_page(page)
            for paragraph in _paragraphs(page_text):
                doc["utterances"].append(_utt(next_idx, paragraph, source_page=page_no))
                next_idx += 1
    return doc
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _paragraphs(text: str) -> list[str]:
    """Split text into paragraphs separated by blank lines.

    A blank line is any line that is empty after stripping whitespace, so
    separators such as ``"\\n   \\n"`` or CRLF pairs break paragraphs just
    like ``"\\n\\n"`` does. Within a paragraph, lines are stripped and joined
    with single spaces. Empty paragraphs are never returned.
    """
    out: list[str] = []
    pending: list[str] = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            pending.append(stripped)
        elif pending:
            # Blank line closes the paragraph collected so far.
            out.append(" ".join(pending))
            pending = []
    if pending:
        out.append(" ".join(pending))
    return out
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _ocr_page(page: Any) -> str:  # pragma: no cover - needs tesseract + a raster
    """Best-effort OCR of one page; returns "" whenever OCR cannot run.

    Returns an empty string if pytesseract or PIL cannot be imported, or if
    rasterizing / recognizing the page raises for any reason. The broad
    catch is deliberate: OCR is an optional fallback and must not abort the
    surrounding PDF ingest.
    """
    try:
        import pytesseract  # type: ignore
        from PIL import Image  # type: ignore  # noqa: F401
    except ImportError:
        return ""

    recognized = ""
    try:
        raster = page.to_image(resolution=200).original
        recognized = pytesseract.image_to_string(raster)
    except Exception:
        recognized = ""
    return recognized
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# ---------------------------------------------------------------- Markdown / Text
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def ingest_text(path: str | Path) -> dict[str, Any]:
    """Read a plain-text or Markdown file and split into paragraph utterances.

    Both `.txt` (Otter / Zoom exports) and `.md` land here. Markdown ATX
    heading lines (`# ` through `###### `) are not emitted as utterances;
    the heading text is recorded as a `[section: ...]` annotation on the
    utterances that follow, mirroring the docx behavior.

    A heading line always ends the paragraph before it and starts a new one
    after it, so body text directly under a heading (no blank line between)
    is kept as its own utterance instead of being absorbed into the heading.
    Setext headings (==== / ---- underline) are not supported, and `# ` lines
    inside fenced code blocks are not distinguished from real headings.
    """
    path = str(path)
    doc = _base(_session_id(path), path)
    # utf-8-sig drops a leading BOM so a heading on the first line is still
    # recognized; BOM-less files decode exactly as with utf-8.
    with open(path, encoding="utf-8-sig") as f:
        body = f.read()

    heading_prefixes = tuple("#" * level + " " for level in range(1, 7))
    current_heading: str | None = None
    idx = 1
    pending: list[str] = []  # raw lines collected since the last heading

    def flush() -> None:
        """Emit the pending lines as utterances under the current heading."""
        nonlocal idx
        ann = f"[section: {current_heading}]" if current_heading else None
        for para in _paragraphs("\n".join(pending)):
            doc["utterances"].append(_utt(idx, para, annotation=ann))
            idx += 1
        pending.clear()

    for line in body.splitlines():
        stripped = line.strip()
        if stripped.startswith(heading_prefixes):
            # Flush BEFORE switching headings so earlier text keeps its section.
            flush()
            # Strip only the leading '#' run, so "# #tag" keeps "#tag".
            current_heading = stripped.lstrip("#").strip()
            continue
        pending.append(line)
    flush()
    return doc
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# ---------------------------------------------------------------- XLSX
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def ingest_xlsx(
    path: str | Path,
    text_col: str | None = None,
    speaker_col: str | None = None,
    sheet: str | None = None,
) -> dict[str, Any]:
    """Ingest a spreadsheet: one utterance per row with non-empty text.

    `text_col=None` triggers the same auto-detect as `ingest_csv`. The
    resolved column lands on `provenance.text_col_resolved`. Use `sheet`
    to pick a non-default tab. The first row is treated as the header; an
    empty sheet yields a document with no utterances.

    Rows whose text cell is empty but which carry data elsewhere are counted
    in `provenance.xlsx_rows_skipped_empty_text` (only set when non-zero).

    Raises:
        RuntimeError: if openpyxl is not installed.
        ValueError: if the workbook has no worksheet or the resolved text
            column is not in the header.
    """
    try:
        from openpyxl import load_workbook  # type: ignore
    except ImportError as e:
        raise RuntimeError("openpyxl not installed (pip install -e '.[legacy]')") from e

    path = str(path)
    doc = _base(_session_id(path), path)
    wb = load_workbook(path, read_only=True, data_only=True)
    # Read-only workbooks keep the underlying file open until close() is
    # called, so release it on every exit path (return or raise).
    try:
        ws = wb[sheet] if sheet is not None else wb.active
        if ws is None:
            raise ValueError(f"XLSX has no worksheets: {path}")

        rows = ws.iter_rows(values_only=True)
        header_row = next(rows, None)
        if header_row is None:
            return doc  # empty sheet
        header = [str(c) if c is not None else "" for c in header_row]
        resolved = text_col if text_col is not None else _auto_text_col(header)
        if resolved not in header:
            raise ValueError(f"XLSX has no column '{resolved}' (columns: {header})")
        doc["provenance"]["text_col_resolved"] = resolved
        text_idx = header.index(resolved)
        # -1 means "no speaker column" (not given, or not present in header).
        speaker_idx = header.index(speaker_col) if speaker_col in header else -1

        utt_idx = 1
        # Track rows where the text column is empty but the row has other data —
        # a strong proxy for "Excel never evaluated this formula so openpyxl
        # returned None". Researchers seeing this should open the file in Excel
        # once or pre-export to CSV.
        rows_with_data_but_empty_text = 0
        for row in rows:
            if row is None:
                continue
            cell = row[text_idx] if text_idx < len(row) else None
            text = str(cell).strip() if cell is not None else ""
            if not text:
                # Row has data elsewhere → likely an un-evaluated formula in the text column.
                if any(c is not None and str(c).strip() for c in row):
                    rows_with_data_but_empty_text += 1
                continue
            ann = None
            if 0 <= speaker_idx < len(row) and row[speaker_idx] is not None:
                ann = f"[speaker: {row[speaker_idx]}]"
            doc["utterances"].append(_utt(utt_idx, text, source_page=utt_idx, annotation=ann))
            utt_idx += 1
        if rows_with_data_but_empty_text > 0:
            doc["provenance"]["xlsx_rows_skipped_empty_text"] = rows_with_data_but_empty_text
        return doc
    finally:
        wb.close()
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def ingest(path: str | Path, **kwargs: Any) -> dict[str, Any]:
    """Route a legacy asset to its ingestor based on the file extension.

    Recognized keyword arguments are `text_col` and `speaker_col` (CSV and
    XLSX), `sheet` (XLSX) and `thumbnails_dir` (PPTX); any others are
    ignored. `text_col=None` (the default) triggers auto-detect on CSV/XLSX
    inputs.

    Raises:
        ValueError: if the extension is not a supported legacy asset type.
    """
    suffix = Path(path).suffix.lower()
    text_col = kwargs.get("text_col")
    speaker_col = kwargs.get("speaker_col")

    if suffix == ".csv":
        return ingest_csv(path, text_col=text_col, speaker_col=speaker_col)
    elif suffix == ".xlsx":
        return ingest_xlsx(
            path,
            text_col=text_col,
            speaker_col=speaker_col,
            sheet=kwargs.get("sheet"),
        )
    elif suffix == ".docx":
        return ingest_docx(path)
    elif suffix == ".pptx":
        return ingest_pptx(path, thumbnails_dir=kwargs.get("thumbnails_dir"))
    elif suffix == ".pdf":
        return ingest_pdf(path)
    elif suffix in (".txt", ".md", ".markdown"):
        return ingest_text(path)

    raise ValueError(f"Unsupported legacy asset: {suffix}")
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""FastAPI entrypoint for the compost transcriber.
|
|
2
|
+
|
|
3
|
+
Mounts the routers each subsystem (transcription, legacy ingest, frames)
|
|
4
|
+
ships in its own issue. /health, /transcribe (v0.1-01), and /legacy-ingest
|
|
5
|
+
(v0.1-02) are live; frame extraction routes land under v0.2-12.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from fastapi import FastAPI
|
|
11
|
+
|
|
12
|
+
from . import __version__
|
|
13
|
+
from .health import router as health_router
|
|
14
|
+
from .routes.legacy import router as legacy_router
|
|
15
|
+
from .routes.transcribe import router as transcribe_router
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def create_app() -> FastAPI:
    """Build the FastAPI application and mount every router.

    Routers are mounted in a fixed order: health, transcribe, legacy.
    """
    application = FastAPI(
        title="compost-transcriber",
        version=__version__,
        description="Descriptive audio transcription + frame extraction + legacy ingest.",
    )
    for router in (health_router, transcribe_router, legacy_router):
        application.include_router(router)
    return application


# Module-level instance, built once at import time.
app = create_app()
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
"""Transcription pipeline orchestrator (#v0.1-01).
|
|
2
|
+
|
|
3
|
+
Composes the already-tested deterministic stages into a single transcript.json:
|
|
4
|
+
|
|
5
|
+
duration probe → VAD speech/silences → ASR → diarization align →
|
|
6
|
+
cue parser → silence typer → prosody → final transcript
|
|
7
|
+
|
|
8
|
+
Each stage accepts injectable backends so the route, the worker, and the tests
|
|
9
|
+
all share one orchestration codepath. The route in `routes/transcribe.py`
|
|
10
|
+
provides real backends; tests pass fakes.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import subprocess
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from .asr import ASRConfig, Transcriber, WhisperBackend
|
|
22
|
+
from .cue_parser import parse_transcript_cues
|
|
23
|
+
from .diarization import DiarizationBackend, Diarizer, align
|
|
24
|
+
from .prosody import annotate_prosody
|
|
25
|
+
from .silence_typer import type_all_silences
|
|
26
|
+
from .vad import VAD, VADBackend, silences_to_schema
|
|
27
|
+
|
|
28
|
+
SCHEMA_VERSION = "1.0"
DEFAULT_TRANSCRIBER_VERSION = "compost-transcriber@0.1.0"


@dataclass
class PipelineConfig:
    """Settings for one pipeline run.

    `asr` is the ASR configuration handed to the transcriber. The remaining
    fields are free-form tags that `run_pipeline` copies into the
    transcript's `provenance` section.
    """

    # ASR settings (required; no default).
    asr: ASRConfig
    # Recorded as provenance.transcriber.
    transcriber_version: str = DEFAULT_TRANSCRIBER_VERSION
    # Recorded as provenance.asr_model.
    asr_model_tag: str = "whisper-large-v3-turbo-event-tags"
    # Recorded as provenance.diarizer.
    diarizer_tag: str = "pyannote-audio@3.3"
    # Used as the prefix of provenance.audio_cues.
    vad_tag: str = "silero-vad@5.0"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class PipelineBackends:
    """Bundle of injectable stage backends; every slot defaults to None.

    The route wires real backends and tests inject fakes. Each value is
    passed straight through to the matching stage wrapper by `run_pipeline`.
    """

    # Voice-activity-detection backend.
    vad: VADBackend | None = None
    # Speech-recognition backend.
    asr: WhisperBackend | None = None
    # Speaker-diarization backend.
    diarization: DiarizationBackend | None = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def probe_duration_ms(source_path: str) -> int:
    """Return the duration of an audio/video file in milliseconds via ffprobe.

    Falls back to 0 when ffprobe cannot be launched (missing, not
    executable, or any other OS-level failure), times out after 30 s, exits
    non-zero, or prints something that is not a finite number. The caller
    decides whether a 0 duration is an error or acceptable.

    The value is rounded to the nearest millisecond rather than truncated,
    so binary float error cannot shave a millisecond off (for example
    "1.001" seconds is 1001 ms, not 1000).
    """
    command = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        source_path,
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError (no ffprobe) and PermissionError.
        return 0

    if result.returncode != 0:
        return 0

    try:
        # round() raises ValueError for NaN and OverflowError for infinity.
        return round(float(result.stdout.strip()) * 1000)
    except (ValueError, OverflowError):
        return 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _speakers_from_utterances(utterances: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """List the distinct speakers in first-appearance order.

    Utterances without a `speaker_id` count as speaker "S?". The first
    speaker seen is typed "moderator" by convention and everyone else
    "participant" (the researcher can override this post-hoc). Each entry's
    name is initialised to its id.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    ordered_ids = list(dict.fromkeys(u.get("speaker_id", "S?") for u in utterances))
    return [
        {
            "id": speaker_id,
            "name": speaker_id,
            "type": "moderator" if position == 0 else "participant",
        }
        for position, speaker_id in enumerate(ordered_ids)
    ]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _detect_language(asr_lang: str | None, configured: str | None) -> str:
    """Pick the transcript language.

    Order of preference: the ASR-detected language, then the configured
    hint, then "und" (undetermined). Empty strings count as missing.
    """
    return asr_lang or configured or "und"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def run_pipeline(
    seed_path: str,
    session_id: str,
    source_path: str,
    config: PipelineConfig,
    backends: PipelineBackends,
) -> dict[str, Any]:
    """Run every stage in order and return the final transcript dict.

    Stages: duration probe, VAD, ASR, transcript shell, diarization + align,
    speaker list, cue parsing, silence typing, prosody. Nothing is written
    to disk here; the later stages mutate the transcript dict in place.

    Raises:
        FileNotFoundError: if `source_path` does not exist.
    """
    if not Path(source_path).exists():
        raise FileNotFoundError(f"source not found: {source_path}")

    duration_ms = probe_duration_ms(source_path)

    # 1. VAD: only the silences are used; speech segments are discarded.
    _speech, silences = VAD(backend=backends.vad).segment(source_path, duration_ms)

    # 2. ASR: utterances with word timings, may contain event tags inline.
    asr_result = Transcriber(config=config.asr, backend=backends.asr).transcribe(source_path)

    # 3. Initial transcript shell (key order is the serialized order).
    source = _relative_source(seed_path, source_path)
    language = _detect_language(asr_result.language, config.asr.language)
    modality = _modality(source_path)
    silence_records = silences_to_schema(silences)
    provenance = {
        "transcriber": config.transcriber_version,
        "asr_model": config.asr_model_tag,
        "diarizer": config.diarizer_tag,
        "audio_cues": f"{config.vad_tag} + whisper-events",
        "frame_capture": None,
        "frame_annotation": None,
    }
    transcript: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "kind": "session",
        "session_id": session_id,
        "source": source,
        "language": language,
        "duration_ms": duration_ms,
        "modality": modality,
        "speakers": [],
        "utterances": asr_result.utterances,
        "silences": silence_records,
        "cues": [],
        "frames": [],
        "glossary_refs": [],
        "provenance": provenance,
    }

    # 4. Diarization: assign speaker_id per utterance + overlap cues.
    turns = Diarizer(backend=backends.diarization).diarize(source_path)
    align(transcript, turns)

    # 5. Speakers list, derived from the diarized utterances.
    transcript["speakers"] = _speakers_from_utterances(transcript["utterances"])

    # 6-8. In-place post-processing, in this order: inline event tags move
    # from utterance text into cues[], silences get semantic types, then
    # utterances get prosody hints.
    for stage in (parse_transcript_cues, type_all_silences, annotate_prosody):
        stage(transcript)

    return transcript
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _relative_source(seed_path: str, source_path: str) -> str:
    """Express `source_path` relative to the directory containing the seed.

    The anchor is the parent of `seed_path`. When the source does not live
    under that directory, `source_path` is returned unchanged.
    """
    # NOTE(review): the anchor is seed_path's PARENT, while write_transcript
    # treats seed_path itself as a directory; confirm this is intended.
    anchor = Path(seed_path).parent
    try:
        relative = Path(source_path).relative_to(anchor)
    except ValueError:
        return source_path
    return str(relative)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _modality(source_path: str) -> list[str]:
    """Return the coarse modality list for a source, judged by extension.

    Known video container extensions give ["audio", "video"]; anything else
    gives ["audio"]. The extension check is case-insensitive.
    """
    video_suffixes = (".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v")
    is_video = Path(source_path).suffix.lower() in video_suffixes
    return ["audio", "video"] if is_video else ["audio"]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def write_transcript(seed_path: str, session_id: str, transcript: dict[str, Any]) -> str:
    """Write the transcript as JSON under the seed and return the file path.

    The file is `<seed_path>/sessions/<session_id>/transcript.json`; missing
    directories are created. Output is UTF-8, 2-space indented, keeps
    non-ASCII characters as-is and ends with a newline.
    """
    session_dir = Path(seed_path) / "sessions" / session_id
    session_dir.mkdir(parents=True, exist_ok=True)

    payload = json.dumps(transcript, indent=2, ensure_ascii=False) + "\n"
    target = session_dir / "transcript.json"
    with target.open("w", encoding="utf-8") as handle:
        handle.write(payload)
    return str(target)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""PPTX deck export (#66).
|
|
2
|
+
|
|
3
|
+
Turns a report deck-spec (built by cli/src/exporters/report.ts → buildDeckSpec)
|
|
4
|
+
into a .pptx: one slide per entry, bullets as body, citations as slide notes.
|
|
5
|
+
Branding (title color) is configurable per seed. python-pptx is lazily imported.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def export_deck(spec: list[dict[str, Any]], out_path: str, branding: dict[str, Any] | None = None) -> str:
    """Render a deck spec to a .pptx file and return `out_path`.

    Each entry of `spec` becomes one slide: `title` is the slide title,
    `bullets` become paragraphs in a text box, and `notes` (if non-empty)
    become the slide notes. Missing or None values are treated as empty.

    `branding` is accepted for interface compatibility but is not applied
    to the output by this function yet.

    Raises:
        RuntimeError: if python-pptx is not installed.
    """
    try:
        from pptx import Presentation  # type: ignore
        from pptx.util import Pt  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-pptx not installed (pip install -e '.[legacy]')") from e

    prs = Presentation()
    # Index 5 is presumably the "Title Only" layout of python-pptx's default
    # template (hence the name); bullets go in a text box added below.
    title_only = prs.slide_layouts[5]

    for slide_spec in spec:
        slide = prs.slides.add_slide(title_only)
        # `or ""` also covers an explicit None title, which .get's default misses.
        slide.shapes.title.text = str(slide_spec.get("title") or "")

        # bullets in a textbox
        frame = slide.shapes.add_textbox(Pt(40), Pt(120), Pt(640), Pt(360)).text_frame
        frame.word_wrap = True
        for i, bullet in enumerate(slide_spec.get("bullets") or []):
            # A new text frame already has one empty paragraph; reuse it first.
            p = frame.paragraphs[0] if i == 0 else frame.add_paragraph()
            p.text = str(bullet)

        # citations → slide notes
        notes = slide_spec.get("notes") or ""
        if notes:
            slide.notes_slide.notes_text_frame.text = str(notes)

    prs.save(out_path)
    return out_path
|