@they-juanreina/compost-cli 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. package/dist/commands/agreement.d.ts +3 -0
  2. package/dist/commands/agreement.d.ts.map +1 -0
  3. package/dist/commands/agreement.js +35 -0
  4. package/dist/commands/agreement.js.map +1 -0
  5. package/dist/commands/create.d.ts +1 -0
  6. package/dist/commands/create.d.ts.map +1 -1
  7. package/dist/commands/create.js +39 -1
  8. package/dist/commands/create.js.map +1 -1
  9. package/dist/commands/export.d.ts.map +1 -1
  10. package/dist/commands/export.js +47 -4
  11. package/dist/commands/export.js.map +1 -1
  12. package/dist/commands/import.d.ts +3 -0
  13. package/dist/commands/import.d.ts.map +1 -0
  14. package/dist/commands/import.js +59 -0
  15. package/dist/commands/import.js.map +1 -0
  16. package/dist/commands/init.d.ts.map +1 -1
  17. package/dist/commands/init.js +1 -0
  18. package/dist/commands/init.js.map +1 -1
  19. package/dist/commands/jobs.d.ts +3 -0
  20. package/dist/commands/jobs.d.ts.map +1 -0
  21. package/dist/commands/jobs.js +105 -0
  22. package/dist/commands/jobs.js.map +1 -0
  23. package/dist/commands/label.d.ts +3 -0
  24. package/dist/commands/label.d.ts.map +1 -0
  25. package/dist/commands/label.js +67 -0
  26. package/dist/commands/label.js.map +1 -0
  27. package/dist/commands/models.d.ts.map +1 -1
  28. package/dist/commands/models.js +2 -1
  29. package/dist/commands/models.js.map +1 -1
  30. package/dist/commands/recode.d.ts +3 -0
  31. package/dist/commands/recode.d.ts.map +1 -0
  32. package/dist/commands/recode.js +60 -0
  33. package/dist/commands/recode.js.map +1 -0
  34. package/dist/commands/reindex.d.ts.map +1 -1
  35. package/dist/commands/reindex.js +6 -4
  36. package/dist/commands/reindex.js.map +1 -1
  37. package/dist/commands/rerun.d.ts +3 -0
  38. package/dist/commands/rerun.d.ts.map +1 -0
  39. package/dist/commands/rerun.js +91 -0
  40. package/dist/commands/rerun.js.map +1 -0
  41. package/dist/commands/search.d.ts.map +1 -1
  42. package/dist/commands/search.js +2 -1
  43. package/dist/commands/search.js.map +1 -1
  44. package/dist/commands/secrets.d.ts +3 -0
  45. package/dist/commands/secrets.d.ts.map +1 -0
  46. package/dist/commands/secrets.js +143 -0
  47. package/dist/commands/secrets.js.map +1 -0
  48. package/dist/commands/setup.d.ts.map +1 -1
  49. package/dist/commands/setup.js +90 -1
  50. package/dist/commands/setup.js.map +1 -1
  51. package/dist/commands/status.d.ts.map +1 -1
  52. package/dist/commands/status.js +2 -1
  53. package/dist/commands/status.js.map +1 -1
  54. package/dist/commands/transcribe.d.ts.map +1 -1
  55. package/dist/commands/transcribe.js +13 -2
  56. package/dist/commands/transcribe.js.map +1 -1
  57. package/dist/commands/validate.d.ts.map +1 -1
  58. package/dist/commands/validate.js +29 -1
  59. package/dist/commands/validate.js.map +1 -1
  60. package/dist/engine.d.ts +23 -0
  61. package/dist/engine.d.ts.map +1 -0
  62. package/dist/engine.js +32 -0
  63. package/dist/engine.js.map +1 -0
  64. package/dist/exporters/prov.d.ts +11 -0
  65. package/dist/exporters/prov.d.ts.map +1 -0
  66. package/dist/exporters/prov.js +151 -0
  67. package/dist/exporters/prov.js.map +1 -0
  68. package/dist/index.d.ts.map +1 -1
  69. package/dist/index.js +6 -0
  70. package/dist/index.js.map +1 -1
  71. package/dist/lib/agreement.d.ts +77 -0
  72. package/dist/lib/agreement.d.ts.map +1 -0
  73. package/dist/lib/agreement.js +261 -0
  74. package/dist/lib/agreement.js.map +1 -0
  75. package/dist/lib/artifacts.d.ts +32 -1
  76. package/dist/lib/artifacts.d.ts.map +1 -1
  77. package/dist/lib/artifacts.js +156 -22
  78. package/dist/lib/artifacts.js.map +1 -1
  79. package/dist/lib/blame.d.ts.map +1 -1
  80. package/dist/lib/blame.js +3 -2
  81. package/dist/lib/blame.js.map +1 -1
  82. package/dist/lib/config.d.ts +3 -0
  83. package/dist/lib/config.d.ts.map +1 -1
  84. package/dist/lib/config.js.map +1 -1
  85. package/dist/lib/doctor.d.ts +3 -0
  86. package/dist/lib/doctor.d.ts.map +1 -1
  87. package/dist/lib/doctor.js +24 -1
  88. package/dist/lib/doctor.js.map +1 -1
  89. package/dist/lib/events.d.ts +34 -1
  90. package/dist/lib/events.d.ts.map +1 -1
  91. package/dist/lib/events.js +35 -1
  92. package/dist/lib/events.js.map +1 -1
  93. package/dist/lib/importTranscript.d.ts +16 -0
  94. package/dist/lib/importTranscript.d.ts.map +1 -0
  95. package/dist/lib/importTranscript.js +94 -0
  96. package/dist/lib/importTranscript.js.map +1 -0
  97. package/dist/lib/ingest.d.ts.map +1 -1
  98. package/dist/lib/ingest.js +12 -6
  99. package/dist/lib/ingest.js.map +1 -1
  100. package/dist/lib/journal.d.ts +13 -0
  101. package/dist/lib/journal.d.ts.map +1 -1
  102. package/dist/lib/journal.js +58 -2
  103. package/dist/lib/journal.js.map +1 -1
  104. package/dist/lib/legacyNative.d.ts +24 -0
  105. package/dist/lib/legacyNative.d.ts.map +1 -0
  106. package/dist/lib/legacyNative.js +51 -0
  107. package/dist/lib/legacyNative.js.map +1 -0
  108. package/dist/lib/migrate.d.ts.map +1 -1
  109. package/dist/lib/migrate.js +1 -0
  110. package/dist/lib/migrate.js.map +1 -1
  111. package/dist/lib/nativeRuntime.d.ts +6 -3
  112. package/dist/lib/nativeRuntime.d.ts.map +1 -1
  113. package/dist/lib/nativeRuntime.js +6 -3
  114. package/dist/lib/nativeRuntime.js.map +1 -1
  115. package/dist/lib/provisionNative.js +1 -1
  116. package/dist/lib/provisionNative.js.map +1 -1
  117. package/dist/lib/queue.d.ts +25 -0
  118. package/dist/lib/queue.d.ts.map +1 -1
  119. package/dist/lib/queue.js +70 -3
  120. package/dist/lib/queue.js.map +1 -1
  121. package/dist/lib/reads.d.ts +24 -0
  122. package/dist/lib/reads.d.ts.map +1 -0
  123. package/dist/lib/reads.js +115 -0
  124. package/dist/lib/reads.js.map +1 -0
  125. package/dist/lib/recode.d.ts +19 -0
  126. package/dist/lib/recode.d.ts.map +1 -0
  127. package/dist/lib/recode.js +43 -0
  128. package/dist/lib/recode.js.map +1 -0
  129. package/dist/lib/rerun.d.ts +51 -0
  130. package/dist/lib/rerun.d.ts.map +1 -0
  131. package/dist/lib/rerun.js +166 -0
  132. package/dist/lib/rerun.js.map +1 -0
  133. package/dist/lib/retrieve.d.ts +8 -4
  134. package/dist/lib/retrieve.d.ts.map +1 -1
  135. package/dist/lib/retrieve.js +12 -10
  136. package/dist/lib/retrieve.js.map +1 -1
  137. package/dist/lib/schemas.generated.d.ts.map +1 -1
  138. package/dist/lib/schemas.generated.js +28 -0
  139. package/dist/lib/schemas.generated.js.map +1 -1
  140. package/dist/lib/secrets.d.ts +158 -0
  141. package/dist/lib/secrets.d.ts.map +1 -0
  142. package/dist/lib/secrets.js +507 -0
  143. package/dist/lib/secrets.js.map +1 -0
  144. package/dist/lib/seed.d.ts +5 -0
  145. package/dist/lib/seed.d.ts.map +1 -1
  146. package/dist/lib/seed.js +15 -2
  147. package/dist/lib/seed.js.map +1 -1
  148. package/dist/lib/seedResolve.d.ts.map +1 -1
  149. package/dist/lib/seedResolve.js +1 -0
  150. package/dist/lib/seedResolve.js.map +1 -1
  151. package/dist/lib/session.d.ts +14 -0
  152. package/dist/lib/session.d.ts.map +1 -1
  153. package/dist/lib/session.js +47 -0
  154. package/dist/lib/session.js.map +1 -1
  155. package/dist/lib/setup.d.ts +5 -0
  156. package/dist/lib/setup.d.ts.map +1 -1
  157. package/dist/lib/setup.js +78 -14
  158. package/dist/lib/setup.js.map +1 -1
  159. package/dist/lib/setupWizard.d.ts +51 -0
  160. package/dist/lib/setupWizard.d.ts.map +1 -0
  161. package/dist/lib/setupWizard.js +223 -0
  162. package/dist/lib/setupWizard.js.map +1 -0
  163. package/dist/lib/snap.d.ts.map +1 -1
  164. package/dist/lib/snap.js +2 -5
  165. package/dist/lib/snap.js.map +1 -1
  166. package/dist/lib/speakers.d.ts +41 -0
  167. package/dist/lib/speakers.d.ts.map +1 -0
  168. package/dist/lib/speakers.js +78 -0
  169. package/dist/lib/speakers.js.map +1 -0
  170. package/dist/lib/status.d.ts.map +1 -1
  171. package/dist/lib/status.js +21 -0
  172. package/dist/lib/status.js.map +1 -1
  173. package/dist/lib/userConfig.d.ts +22 -0
  174. package/dist/lib/userConfig.d.ts.map +1 -0
  175. package/dist/lib/userConfig.js +67 -0
  176. package/dist/lib/userConfig.js.map +1 -0
  177. package/dist/lib/validate.d.ts +18 -0
  178. package/dist/lib/validate.d.ts.map +1 -1
  179. package/dist/lib/validate.js +70 -1
  180. package/dist/lib/validate.js.map +1 -1
  181. package/dist/lib/version.d.ts +30 -0
  182. package/dist/lib/version.d.ts.map +1 -0
  183. package/dist/lib/version.js +73 -0
  184. package/dist/lib/version.js.map +1 -0
  185. package/dist/llm/adapter.d.ts.map +1 -1
  186. package/dist/llm/adapter.js +2 -0
  187. package/dist/llm/adapter.js.map +1 -1
  188. package/dist/llm/providers/ollama.d.ts.map +1 -1
  189. package/dist/llm/providers/ollama.js +6 -0
  190. package/dist/llm/providers/ollama.js.map +1 -1
  191. package/dist/loops/ingest_watcher.d.ts.map +1 -1
  192. package/dist/loops/ingest_watcher.js +6 -3
  193. package/dist/loops/ingest_watcher.js.map +1 -1
  194. package/dist/loops/legacy_worker.d.ts +28 -1
  195. package/dist/loops/legacy_worker.d.ts.map +1 -1
  196. package/dist/loops/legacy_worker.js +81 -9
  197. package/dist/loops/legacy_worker.js.map +1 -1
  198. package/dist/loops/supervisor.d.ts +3 -0
  199. package/dist/loops/supervisor.d.ts.map +1 -1
  200. package/dist/loops/supervisor.js +12 -0
  201. package/dist/loops/supervisor.js.map +1 -1
  202. package/dist/loops/synthesis.d.ts.map +1 -1
  203. package/dist/loops/synthesis.js +15 -0
  204. package/dist/loops/synthesis.js.map +1 -1
  205. package/dist/loops/transcribe_worker.d.ts.map +1 -1
  206. package/dist/loops/transcribe_worker.js +2 -4
  207. package/dist/loops/transcribe_worker.js.map +1 -1
  208. package/dist/output.d.ts +13 -1
  209. package/dist/output.d.ts.map +1 -1
  210. package/dist/output.js +22 -2
  211. package/dist/output.js.map +1 -1
  212. package/dist/render/human.d.ts +20 -0
  213. package/dist/render/human.d.ts.map +1 -0
  214. package/dist/render/human.js +54 -0
  215. package/dist/render/human.js.map +1 -0
  216. package/dist/router.d.ts.map +1 -1
  217. package/dist/router.js +17 -2
  218. package/dist/router.js.map +1 -1
  219. package/package.json +18 -5
  220. package/templates/config.toml +6 -1
  221. package/transcriber/app/__init__.py +3 -0
  222. package/transcriber/app/asr.py +198 -0
  223. package/transcriber/app/asr_parakeet.py +174 -0
  224. package/transcriber/app/cue_parser.py +110 -0
  225. package/transcriber/app/diarization.py +330 -0
  226. package/transcriber/app/frame_annotation.py +77 -0
  227. package/transcriber/app/frames.py +130 -0
  228. package/transcriber/app/health.py +70 -0
  229. package/transcriber/app/legacy.py +355 -0
  230. package/transcriber/app/legacy_cli.py +90 -0
  231. package/transcriber/app/main.py +30 -0
  232. package/transcriber/app/pipeline.py +210 -0
  233. package/transcriber/app/pptx_export.py +42 -0
  234. package/transcriber/app/prosody.py +128 -0
  235. package/transcriber/app/routes/__init__.py +1 -0
  236. package/transcriber/app/routes/legacy.py +117 -0
  237. package/transcriber/app/routes/transcribe.py +133 -0
  238. package/transcriber/app/shot_change.py +74 -0
  239. package/transcriber/app/silence_typer.py +144 -0
  240. package/transcriber/app/transcribe_cli.py +82 -0
  241. package/transcriber/app/vad.py +216 -0
  242. package/transcriber/pyproject.toml +56 -0
@@ -0,0 +1,70 @@
1
+ """Health endpoint for the transcriber service.
2
+
3
+ ROADMAP § Verification — `compost watch` and the CLI probe this on startup to
4
+ confirm the transcriber container is reachable before queuing work.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import platform
10
+ import sys
11
+ from importlib.metadata import PackageNotFoundError, version
12
+
13
+ from fastapi import APIRouter
14
+ from pydantic import BaseModel
15
+
16
+ from . import __version__
17
+
18
+ router = APIRouter()
19
+
20
+
21
class HealthResponse(BaseModel):
    """Stable contract for /health. CLI parses these fields."""

    # Overall service state; the /health handler in this module sends "ok".
    status: str
    # Service identifier; the /health handler sends "compost-transcriber".
    service: str
    # Component name -> version string, or None when that package is not installed.
    versions: dict[str, str | None]
27
+
28
+
29
+ def _safe_version(pkg: str) -> str | None:
30
+ """Return the installed version of `pkg`, or None if it isn't installed.
31
+
32
+ Model-heavy optional deps (whisperx, pyannote.audio, silero-vad) are
33
+ declared in `pyproject.toml` under the `asr` extra and only installed
34
+ when their respective issues land (#9-#15). Until then, /health
35
+ reports them as `null` so the CLI can tell the user what's missing.
36
+ """
37
+ try:
38
+ return version(pkg)
39
+ except PackageNotFoundError:
40
+ return None
41
+
42
+
43
@router.get("/health", response_model=HealthResponse)
def get_health() -> HealthResponse:
    """Serve GET /health: a fixed "ok" status plus component versions."""
    # Entries that need no metadata lookup go first; insertion order is the
    # order the keys appear in the response.
    versions: dict[str, str | None] = {
        "transcriber": __version__,
        "python": platform.python_version(),
    }
    # Looked up by distribution name; anything not installed maps to None.
    for dist in ("fastapi", "uvicorn", "whisperx", "pyannote.audio", "silero-vad"):
        versions[dist] = _safe_version(dist)
    return HealthResponse(status="ok", service="compost-transcriber", versions=versions)
58
+
59
+
60
+ __all__ = ["router", "HealthResponse", "get_health"]
61
+
62
+
63
def _python_metadata_check() -> None:
    """Refuse to run on an interpreter older than Python 3.11.

    Raises:
        RuntimeError: when the running interpreter's (major, minor) version
            is below (3, 11).
    """
    running = sys.version_info[:2]
    if running >= (3, 11):
        return
    raise RuntimeError(
        "compost-transcriber requires Python >=3.11, got {}.{}".format(*running)
    )
68
+
69
+
70
+ _python_metadata_check()
@@ -0,0 +1,355 @@
1
+ """Legacy asset ingestors (#29).
2
+
3
+ Normalize PDF / DOCX / PPTX / CSV / XLSX / TXT / Markdown into a transcript-shaped JSON with
4
+ kind="document": one utterance per paragraph (PDF/DOCX), per slide (PPTX),
5
+ or per row (CSV). Output validates against schema/transcript.schema.json
6
+ (kind="document", modality=["document"]).
7
+
8
+ Heavy parsers (pdfplumber, python-docx, python-pptx) are imported lazily so
9
+ the module loads without the `legacy` extra; each ingestor raises a clear
10
+ error if its dependency is missing.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import csv
16
+ import os
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
# Recorded as `provenance.transcriber` on every document this module emits.
INGESTOR_VERSION = "compost-legacy@0.1.0"
# The single synthetic speaker that all document utterances are attributed to.
DOC_SPEAKER = {"id": "S1", "name": "document", "type": "other"}


def _base(session_id: str, source: str, language: str = "und") -> dict[str, Any]:
    """Build the empty document-kind transcript envelope that ingestors fill in.

    The envelope has `kind="document"`, a zero duration, a speaker list
    holding a fresh copy of DOC_SPEAKER, an empty `utterances` list, and a
    `provenance` dict carrying INGESTOR_VERSION.
    """
    envelope: dict[str, Any] = {"schema_version": "1.0", "kind": "document"}
    # Keyword order below is the key order of the resulting dict.
    envelope.update(
        session_id=session_id,
        source=source,
        language=language,
        duration_ms=0,
        modality=["document"],
        speakers=[DOC_SPEAKER.copy()],
        utterances=[],
        provenance={"transcriber": INGESTOR_VERSION},
    )
    return envelope
37
+
38
+
39
def _utt(idx: int, text: str, source_page: int | None = None, annotation: str | None = None) -> dict[str, Any]:
    """Build one utterance record attributed to the document speaker.

    `idx` is used both for the zero-padded id (`U-0001`) and for `turn`.
    Start and end times are always 0. The `source_page` and `annotation`
    keys are present only when the corresponding argument is not None.
    """
    optional = {"source_page": source_page, "annotation": annotation}
    return {
        "id": f"U-{idx:04d}",
        "speaker_id": DOC_SPEAKER["id"],
        "turn": idx,
        "start_ms": 0,
        "end_ms": 0,
        "text": text,
        **{key: value for key, value in optional.items() if value is not None},
    }
53
+
54
+
55
+ def _session_id(path: str | Path) -> str:
56
+ stem = Path(path).stem
57
+ safe = "".join(c if c.isalnum() or c in "-_" else "-" for c in stem)
58
+ return f"DOC-{safe}"[:64]
59
+
60
+
61
+ # ---------------------------------------------------------------- CSV / XLSX
62
+
63
+ # Auto-detect priority for the "text" column. First case-insensitive match
64
+ # in the source's header wins. Falls back to the first column.
65
TEXT_COL_CANDIDATES = (
    "text",
    "transcript",
    "content",
    "utterance",
    "quote",
    "message",
    "body",
)


def _auto_text_col(fieldnames: list[str]) -> str:
    """Pick the most likely text column from a header row.

    Candidates in TEXT_COL_CANDIDATES are tried in priority order against the
    lower-cased header names; when none matches, the first column is used.
    If two header names differ only by case, the leftmost one is returned.

    Raises:
        ValueError: if `fieldnames` is empty, since there is no column to
            fall back to.
    """
    if not fieldnames:
        raise ValueError("cannot auto-detect a text column: header row is empty")
    by_lower: dict[str, str] = {}
    for name in fieldnames:
        # setdefault keeps the leftmost header when names collide
        # case-insensitively (e.g. "Text" and "TEXT").
        by_lower.setdefault(name.lower(), name)
    for candidate in TEXT_COL_CANDIDATES:
        if candidate in by_lower:
            return by_lower[candidate]
    return fieldnames[0]
84
+
85
+
86
def ingest_csv(
    path: str | Path,
    text_col: str | None = None,
    speaker_col: str | None = None,
) -> dict[str, Any]:
    """One utterance per row.

    `text_col=None` triggers auto-detect: text → transcript → content →
    utterance → quote → message → body (case-insensitive). Falls back to
    the first column. The resolved column is recorded on the output's
    `provenance.text_col_resolved` for caller visibility.

    Rows whose text cell is empty after stripping are skipped. When
    `speaker_col` is given and the row has a non-empty value there, the
    utterance carries a `[speaker: ...]` annotation.

    Raises:
        ValueError: if the file has no header row, or the resolved text
            column is not in the header.
    """
    path = str(path)
    doc = _base(_session_id(path), path)
    # utf-8-sig drops a leading UTF-8 byte-order mark if one is present, so the
    # first header name is not prefixed with U+FEFF (which would defeat both
    # auto-detect and an explicit text_col). Files without a BOM decode exactly
    # as they do with plain utf-8.
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file and [] when the first line is
        # blank; neither is a usable header.
        if not reader.fieldnames:
            raise ValueError(f"CSV has no header row: {path}")
        fields = list(reader.fieldnames)
        resolved = text_col if text_col is not None else _auto_text_col(fields)
        if resolved not in fields:
            raise ValueError(f"CSV has no column '{resolved}' (columns: {fields})")
        doc["provenance"]["text_col_resolved"] = resolved
        idx = 1
        for row in reader:
            text = (row.get(resolved) or "").strip()
            if not text:
                continue
            ann = None
            if speaker_col is not None and row.get(speaker_col):
                ann = f"[speaker: {row[speaker_col]}]"
            # source_page mirrors the utterance index, not the raw CSV line.
            doc["utterances"].append(_utt(idx, text, source_page=idx, annotation=ann))
            idx += 1
    return doc
120
+
121
+
122
+ # ---------------------------------------------------------------- DOCX
123
+
124
+
125
def ingest_docx(path: str | Path) -> dict[str, Any]:
    """Turn a DOCX file into a document transcript, one utterance per paragraph.

    Empty paragraphs are skipped. Paragraphs whose style name starts with
    "heading" (case-insensitive) are not emitted; their text is attached as a
    `[section: ...]` annotation to the utterances that follow.

    Raises:
        RuntimeError: if python-docx is not installed.
    """
    try:
        import docx  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-docx not installed (pip install -e '.[legacy]')") from e

    source = str(path)
    result = _base(_session_id(source), source)
    utterances = result["utterances"]
    section: str | None = None
    for paragraph in docx.Document(source).paragraphs:
        body = paragraph.text.strip()
        if not body:
            continue
        style_name = ""
        if paragraph.style:
            style_name = (paragraph.style.name or "").lower()
        if style_name.startswith("heading"):
            # Headings become the section anchor for later paragraphs.
            section = body
            continue
        note = None
        if section:
            note = f"[section: {section}]"
        # Utterance numbers are 1-based and count emitted paragraphs only.
        utterances.append(_utt(len(utterances) + 1, body, annotation=note))
    return result
149
+
150
+
151
+ # ---------------------------------------------------------------- PPTX
152
+
153
+
154
def ingest_pptx(path: str | Path, thumbnails_dir: str | Path | None = None) -> dict[str, Any]:
    """Turn a PPTX deck into a document transcript, one utterance per slide.

    A slide's utterance is the newline-joined text of its non-empty
    text-frame paragraphs, followed by `(notes) ...` when the slide has
    non-empty speaker notes. Slides with no text at all are skipped;
    `source_page` is the 1-based slide number.

    `thumbnails_dir`, when given, is created but no thumbnails are rendered.

    Raises:
        RuntimeError: if python-pptx is not installed.
    """
    try:
        from pptx import Presentation  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-pptx not installed (pip install -e '.[legacy]')") from e

    source = str(path)
    result = _base(_session_id(source), source)
    deck = Presentation(source)
    utterances = result["utterances"]
    for slide_no, slide in enumerate(deck.slides, start=1):
        lines: list[str] = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                joined = "".join(run.text for run in paragraph.runs).strip()
                if joined:
                    lines.append(joined)
        if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None:
            speaker_notes = slide.notes_slide.notes_text_frame.text.strip()
            if speaker_notes:
                lines.append(f"(notes) {speaker_notes}")
        # Every collected line is non-empty, so a non-empty list means a
        # non-empty joined text.
        if lines:
            utterances.append(_utt(len(utterances) + 1, "\n".join(lines), source_page=slide_no))
    # Thumbnail rendering requires LibreOffice/unoconv (not bundled); skipped
    # gracefully. The slide text above is the load-bearing evidence.
    if thumbnails_dir is not None:
        os.makedirs(thumbnails_dir, exist_ok=True)
    return result
186
+
187
+
188
+ # ---------------------------------------------------------------- PDF
189
+
190
+
191
def ingest_pdf(path: str | Path) -> dict[str, Any]:
    """Turn a PDF into a document transcript, one utterance per paragraph.

    Each page's extracted text is split into paragraphs; `source_page` is the
    1-based page number. Pages with no extractable text go through the
    best-effort OCR helper instead.

    Raises:
        RuntimeError: if pdfplumber is not installed.
    """
    try:
        import pdfplumber  # type: ignore
    except ImportError as e:
        raise RuntimeError("pdfplumber not installed (pip install -e '.[legacy]')") from e

    source = str(path)
    result = _base(_session_id(source), source)
    utterances = result["utterances"]
    with pdfplumber.open(source) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text() or ""
            if not page_text.strip():
                # Scanned page: OCR needs pytesseract + the page raster and
                # yields "" when that is unavailable.
                page_text = _ocr_page(page)
            for paragraph in _paragraphs(page_text):
                # Numbering runs continuously across pages.
                utterances.append(_utt(len(utterances) + 1, paragraph, source_page=page_no))
    return result
211
+
212
+
213
def _paragraphs(text: str) -> list[str]:
    """Split `text` into paragraphs separated by blank lines.

    A blank line is any line that is empty or whitespace-only, so
    `"a\\n \\nb"` and CRLF input such as `"a\\r\\n\\r\\nb"` both yield two
    paragraphs. Within a paragraph each line is stripped and the lines are
    joined with single spaces. Returns an empty list when `text` has no
    non-blank line.
    """
    out: list[str] = []
    current: list[str] = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            current.append(stripped)
        elif current:
            # Blank line closes the paragraph being accumulated.
            out.append(" ".join(current))
            current = []
    if current:
        out.append(" ".join(current))
    return out
220
+
221
+
222
def _ocr_page(page: Any) -> str:  # pragma: no cover - needs tesseract + a raster
    """Best-effort OCR of one PDF page.

    Returns the recognized text, or "" when pytesseract/Pillow cannot be
    imported or when rasterizing or recognizing the page fails for any reason.
    """
    try:
        import pytesseract  # type: ignore
        from PIL import Image  # type: ignore  # noqa: F401
    except ImportError:
        return ""
    try:
        raster = page.to_image(resolution=200).original
        recognized = pytesseract.image_to_string(raster)
    except Exception:
        # Deliberate swallow: OCR is optional and must never fail the ingest.
        recognized = ""
    return recognized
233
+
234
+
235
+ # ---------------------------------------------------------------- Markdown / Text
236
+
237
+
238
def ingest_text(path: str | Path) -> dict[str, Any]:
    """Read a plain-text or Markdown file and split into paragraph utterances.

    Both `.txt` (Otter / Zoom exports) and `.md` land here. ATX headings
    (`# ` through `###### `) that start a paragraph are not emitted as
    utterances; they are recorded as `[section: ...]` annotations on
    subsequent utterances, mirroring the docx behavior. Body text that
    follows a heading without a blank line in between is kept as its own
    utterance rather than being absorbed into the heading.
    """
    path = str(path)
    doc = _base(_session_id(path), path)
    with open(path, encoding="utf-8") as f:
        body = f.read()

    atx_prefixes = tuple("#" * level + " " for level in range(1, 7))

    # Put a blank line after every heading line that starts a paragraph, so
    # the paragraph splitter sees the heading and the text under it as
    # separate paragraphs. Without this, "# Title\nbody" would be read as one
    # long heading and the body would never become an utterance.
    isolated: list[str] = []
    at_paragraph_start = True
    for line in body.split("\n"):
        isolated.append(line)
        stripped = line.strip()
        if not stripped:
            at_paragraph_start = True
        elif at_paragraph_start and stripped.startswith(atx_prefixes):
            # Stay at "paragraph start" so stacked headings are each isolated.
            isolated.append("")
        else:
            at_paragraph_start = False

    current_heading: str | None = None
    idx = 1
    for para in _paragraphs("\n".join(isolated)):
        # Markdown ATX heading line → record as section anchor, skip the
        # utterance. Setext (==== / ---- underline) is not supported.
        if para.startswith(atx_prefixes):
            # Strip only the leading '#' markers; lstrip("# ") would also eat
            # '#' characters that belong to the title (e.g. "## # of users").
            current_heading = para.lstrip("#").strip()
            continue
        ann = f"[section: {current_heading}]" if current_heading else None
        doc["utterances"].append(_utt(idx, para, annotation=ann))
        idx += 1
    return doc
263
+
264
+
265
+ # ---------------------------------------------------------------- XLSX
266
+
267
+
268
def ingest_xlsx(
    path: str | Path,
    text_col: str | None = None,
    speaker_col: str | None = None,
    sheet: str | None = None,
) -> dict[str, Any]:
    """One utterance per row of a spreadsheet.

    `text_col=None` triggers the same auto-detect as `ingest_csv`. The
    resolved column lands on `provenance.text_col_resolved`. Use `sheet`
    to pick a non-default tab. The first row is treated as the header.

    Rows whose text cell is empty are skipped; when such a row has data in
    other cells it is counted in `provenance.xlsx_rows_skipped_empty_text`.

    Raises:
        RuntimeError: if openpyxl is not installed.
        ValueError: if the requested sheet or the resolved text column does
            not exist, or the workbook has no worksheets.
    """
    try:
        from openpyxl import load_workbook  # type: ignore
    except ImportError as e:
        raise RuntimeError("openpyxl not installed (pip install -e '.[legacy]')") from e

    path = str(path)
    doc = _base(_session_id(path), path)
    wb = load_workbook(path, read_only=True, data_only=True)
    # A read-only workbook keeps the file open until close() is called, so
    # every exit path (including the error raises) goes through `finally`.
    try:
        if sheet is None:
            ws = wb.active
        elif sheet in wb.sheetnames:
            ws = wb[sheet]
        else:
            # wb[sheet] would raise KeyError; ValueError matches the other
            # invalid-input errors raised by this module.
            raise ValueError(f"XLSX has no sheet '{sheet}' (sheets: {wb.sheetnames})")
        if ws is None:
            raise ValueError(f"XLSX has no worksheets: {path}")

        rows = ws.iter_rows(values_only=True)
        header_row = next(rows, None)
        if not header_row:
            return doc  # empty sheet (no rows, or a header row with no cells)
        header = [str(c) if c is not None else "" for c in header_row]
        resolved = text_col if text_col is not None else _auto_text_col(header)
        if resolved not in header:
            raise ValueError(f"XLSX has no column '{resolved}' (columns: {header})")
        doc["provenance"]["text_col_resolved"] = resolved
        text_idx = header.index(resolved)
        speaker_idx = header.index(speaker_col) if speaker_col in header else -1

        utt_idx = 1
        # Track rows where the text column is empty but the row has other data —
        # a strong proxy for "Excel never evaluated this formula so openpyxl
        # returned None". Researchers seeing this should open the file in Excel
        # once or pre-export to CSV.
        rows_with_data_but_empty_text = 0
        for row in rows:
            if row is None:
                continue
            cell = row[text_idx] if text_idx < len(row) else None
            text = str(cell).strip() if cell is not None else ""
            if not text:
                # Row has data elsewhere → likely an un-evaluated formula in the text column.
                if any(c is not None and str(c).strip() for c in row):
                    rows_with_data_but_empty_text += 1
                continue
            ann = None
            if 0 <= speaker_idx < len(row) and row[speaker_idx] is not None:
                ann = f"[speaker: {row[speaker_idx]}]"
            doc["utterances"].append(_utt(utt_idx, text, source_page=utt_idx, annotation=ann))
            utt_idx += 1
        if rows_with_data_but_empty_text > 0:
            doc["provenance"]["xlsx_rows_skipped_empty_text"] = rows_with_data_but_empty_text
        return doc
    finally:
        wb.close()
328
+
329
+
330
+ def ingest(path: str | Path, **kwargs: Any) -> dict[str, Any]:
331
+ """Dispatch by extension. `text_col=None` (the default) triggers
332
+ auto-detect on CSV/XLSX inputs."""
333
+ ext = Path(path).suffix.lower()
334
+ if ext == ".csv":
335
+ return ingest_csv(
336
+ path,
337
+ text_col=kwargs.get("text_col"),
338
+ speaker_col=kwargs.get("speaker_col"),
339
+ )
340
+ if ext == ".docx":
341
+ return ingest_docx(path)
342
+ if ext == ".pptx":
343
+ return ingest_pptx(path, thumbnails_dir=kwargs.get("thumbnails_dir"))
344
+ if ext == ".pdf":
345
+ return ingest_pdf(path)
346
+ if ext in (".txt", ".md", ".markdown"):
347
+ return ingest_text(path)
348
+ if ext == ".xlsx":
349
+ return ingest_xlsx(
350
+ path,
351
+ text_col=kwargs.get("text_col"),
352
+ speaker_col=kwargs.get("speaker_col"),
353
+ sheet=kwargs.get("sheet"),
354
+ )
355
+ raise ValueError(f"Unsupported legacy asset: {ext}")
@@ -0,0 +1,90 @@
1
+ """Native (host) legacy-ingest entrypoint (#184).
2
+
3
+ Mirrors `app.transcribe_cli` for documents: runs the pure ingestors in
4
+ `app.legacy.ingest` in a host Python venv so PDF/DOCX/PPTX/CSV/XLSX/TXT ingest
5
+ works WITHOUT the Docker transcriber (demoted to a fallback). Shares the exact
6
+ write + response shape as the `/legacy-ingest` route so the Node legacy-worker
7
+ treats native and Docker results identically.
8
+
9
+ Usage:
10
+ python -m app.legacy_cli --seed-path <seed> --source-path <file> \
11
+ [--text-col COL] [--speaker-col COL] [--sheet NAME]
12
+ Prints exactly one JSON line; exit 0 on ok/empty, 1 on failure.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ from pathlib import Path
20
+
21
+ from .legacy import ingest as ingest_legacy
22
+
23
+
24
+ def main(argv: list[str] | None = None) -> int:
25
+ p = argparse.ArgumentParser(prog="compost-legacy-native")
26
+ p.add_argument("--seed-path", required=True)
27
+ p.add_argument("--source-path", required=True)
28
+ p.add_argument("--text-col", default=None)
29
+ p.add_argument("--speaker-col", default=None)
30
+ p.add_argument("--sheet", default=None)
31
+ args = p.parse_args(argv)
32
+
33
+ src = Path(args.source_path)
34
+ seed = Path(args.seed_path)
35
+ if not src.exists():
36
+ print(json.dumps({"status": "failed", "kind": "invalid_input", "error": f"source not found: {src}"}))
37
+ return 1
38
+ if not seed.exists():
39
+ print(json.dumps({"status": "failed", "kind": "invalid_input", "error": f"seed not found: {seed}"}))
40
+ return 1
41
+
42
+ kwargs: dict[str, str] = {}
43
+ if args.text_col is not None:
44
+ kwargs["text_col"] = args.text_col
45
+ if args.speaker_col is not None:
46
+ kwargs["speaker_col"] = args.speaker_col
47
+ if args.sheet is not None:
48
+ kwargs["sheet"] = args.sheet
49
+
50
+ try:
51
+ doc = ingest_legacy(src, **kwargs)
52
+ except ValueError as e: # unsupported ext / missing column
53
+ print(json.dumps({"status": "failed", "kind": "invalid_input", "error": str(e)}))
54
+ return 1
55
+ except RuntimeError as e: # missing optional dep (python-docx, openpyxl, …)
56
+ print(json.dumps({"status": "failed", "kind": "dep_missing", "error": str(e)}))
57
+ return 1
58
+
59
+ legacy_dir = seed / "legacy"
60
+ legacy_dir.mkdir(parents=True, exist_ok=True)
61
+ out_path = legacy_dir / f"{src.stem}.json"
62
+ out_path.write_text(json.dumps(doc, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
63
+
64
+ utt_count = len(doc.get("utterances", []))
65
+ prov = doc.get("provenance", {})
66
+ warnings: list[str] = []
67
+ skipped = prov.get("xlsx_rows_skipped_empty_text", 0)
68
+ if skipped > 0:
69
+ warnings.append(
70
+ f"{skipped} XLSX row(s) had data in other columns but an empty text cell — "
71
+ "likely an un-evaluated formula. Open the file in Excel once, or export to CSV."
72
+ )
73
+
74
+ print(
75
+ json.dumps(
76
+ {
77
+ "status": "ok" if utt_count > 0 else "empty",
78
+ "source_path": str(src),
79
+ "normalized_path": str(out_path),
80
+ "utterance_count": utt_count,
81
+ "text_col_resolved": prov.get("text_col_resolved"),
82
+ "warnings": warnings,
83
+ }
84
+ )
85
+ )
86
+ return 0
87
+
88
+
89
+ if __name__ == "__main__":
90
+ raise SystemExit(main())
@@ -0,0 +1,30 @@
1
+ """FastAPI entrypoint for the compost transcriber.
2
+
3
+ Mounts the routers each subsystem (transcription, legacy ingest, frames)
4
+ ships in its own issue. /health, /transcribe (v0.1-01), and /legacy-ingest
5
+ (v0.1-02) are live; frame extraction routes land under v0.2-12.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from fastapi import FastAPI
11
+
12
+ from . import __version__
13
+ from .health import router as health_router
14
+ from .routes.legacy import router as legacy_router
15
+ from .routes.transcribe import router as transcribe_router
16
+
17
+
18
def create_app() -> FastAPI:
    """Build the FastAPI application and mount the service's routers.

    Routers are included in the order health, transcribe, legacy.
    """
    application = FastAPI(
        title="compost-transcriber",
        version=__version__,
        description="Descriptive audio transcription + frame extraction + legacy ingest.",
    )
    for subrouter in (health_router, transcribe_router, legacy_router):
        application.include_router(subrouter)
    return application
28
+
29
+
30
+ app = create_app()