@they-juanreina/compost-cli 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/agreement.d.ts +3 -0
- package/dist/commands/agreement.d.ts.map +1 -0
- package/dist/commands/agreement.js +35 -0
- package/dist/commands/agreement.js.map +1 -0
- package/dist/commands/create.d.ts +1 -0
- package/dist/commands/create.d.ts.map +1 -1
- package/dist/commands/create.js +39 -1
- package/dist/commands/create.js.map +1 -1
- package/dist/commands/export.d.ts.map +1 -1
- package/dist/commands/export.js +47 -4
- package/dist/commands/export.js.map +1 -1
- package/dist/commands/import.d.ts +3 -0
- package/dist/commands/import.d.ts.map +1 -0
- package/dist/commands/import.js +59 -0
- package/dist/commands/import.js.map +1 -0
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +1 -0
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/jobs.d.ts +3 -0
- package/dist/commands/jobs.d.ts.map +1 -0
- package/dist/commands/jobs.js +105 -0
- package/dist/commands/jobs.js.map +1 -0
- package/dist/commands/label.d.ts +3 -0
- package/dist/commands/label.d.ts.map +1 -0
- package/dist/commands/label.js +67 -0
- package/dist/commands/label.js.map +1 -0
- package/dist/commands/models.d.ts.map +1 -1
- package/dist/commands/models.js +2 -1
- package/dist/commands/models.js.map +1 -1
- package/dist/commands/recode.d.ts +3 -0
- package/dist/commands/recode.d.ts.map +1 -0
- package/dist/commands/recode.js +60 -0
- package/dist/commands/recode.js.map +1 -0
- package/dist/commands/reindex.d.ts.map +1 -1
- package/dist/commands/reindex.js +6 -4
- package/dist/commands/reindex.js.map +1 -1
- package/dist/commands/rerun.d.ts +3 -0
- package/dist/commands/rerun.d.ts.map +1 -0
- package/dist/commands/rerun.js +91 -0
- package/dist/commands/rerun.js.map +1 -0
- package/dist/commands/search.d.ts.map +1 -1
- package/dist/commands/search.js +2 -1
- package/dist/commands/search.js.map +1 -1
- package/dist/commands/secrets.d.ts +3 -0
- package/dist/commands/secrets.d.ts.map +1 -0
- package/dist/commands/secrets.js +143 -0
- package/dist/commands/secrets.js.map +1 -0
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +90 -1
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/status.d.ts.map +1 -1
- package/dist/commands/status.js +2 -1
- package/dist/commands/status.js.map +1 -1
- package/dist/commands/transcribe.d.ts.map +1 -1
- package/dist/commands/transcribe.js +13 -2
- package/dist/commands/transcribe.js.map +1 -1
- package/dist/commands/validate.d.ts.map +1 -1
- package/dist/commands/validate.js +29 -1
- package/dist/commands/validate.js.map +1 -1
- package/dist/engine.d.ts +23 -0
- package/dist/engine.d.ts.map +1 -0
- package/dist/engine.js +32 -0
- package/dist/engine.js.map +1 -0
- package/dist/exporters/prov.d.ts +11 -0
- package/dist/exporters/prov.d.ts.map +1 -0
- package/dist/exporters/prov.js +151 -0
- package/dist/exporters/prov.js.map +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -1
- package/dist/lib/agreement.d.ts +77 -0
- package/dist/lib/agreement.d.ts.map +1 -0
- package/dist/lib/agreement.js +261 -0
- package/dist/lib/agreement.js.map +1 -0
- package/dist/lib/artifacts.d.ts +32 -1
- package/dist/lib/artifacts.d.ts.map +1 -1
- package/dist/lib/artifacts.js +156 -22
- package/dist/lib/artifacts.js.map +1 -1
- package/dist/lib/blame.d.ts.map +1 -1
- package/dist/lib/blame.js +3 -2
- package/dist/lib/blame.js.map +1 -1
- package/dist/lib/config.d.ts +3 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/doctor.d.ts +3 -0
- package/dist/lib/doctor.d.ts.map +1 -1
- package/dist/lib/doctor.js +24 -1
- package/dist/lib/doctor.js.map +1 -1
- package/dist/lib/events.d.ts +34 -1
- package/dist/lib/events.d.ts.map +1 -1
- package/dist/lib/events.js +35 -1
- package/dist/lib/events.js.map +1 -1
- package/dist/lib/importTranscript.d.ts +16 -0
- package/dist/lib/importTranscript.d.ts.map +1 -0
- package/dist/lib/importTranscript.js +94 -0
- package/dist/lib/importTranscript.js.map +1 -0
- package/dist/lib/ingest.d.ts.map +1 -1
- package/dist/lib/ingest.js +12 -6
- package/dist/lib/ingest.js.map +1 -1
- package/dist/lib/journal.d.ts +13 -0
- package/dist/lib/journal.d.ts.map +1 -1
- package/dist/lib/journal.js +58 -2
- package/dist/lib/journal.js.map +1 -1
- package/dist/lib/legacyNative.d.ts +24 -0
- package/dist/lib/legacyNative.d.ts.map +1 -0
- package/dist/lib/legacyNative.js +51 -0
- package/dist/lib/legacyNative.js.map +1 -0
- package/dist/lib/migrate.d.ts.map +1 -1
- package/dist/lib/migrate.js +1 -0
- package/dist/lib/migrate.js.map +1 -1
- package/dist/lib/nativeRuntime.d.ts +6 -3
- package/dist/lib/nativeRuntime.d.ts.map +1 -1
- package/dist/lib/nativeRuntime.js +6 -3
- package/dist/lib/nativeRuntime.js.map +1 -1
- package/dist/lib/provisionNative.js +1 -1
- package/dist/lib/provisionNative.js.map +1 -1
- package/dist/lib/queue.d.ts +25 -0
- package/dist/lib/queue.d.ts.map +1 -1
- package/dist/lib/queue.js +70 -3
- package/dist/lib/queue.js.map +1 -1
- package/dist/lib/reads.d.ts +24 -0
- package/dist/lib/reads.d.ts.map +1 -0
- package/dist/lib/reads.js +115 -0
- package/dist/lib/reads.js.map +1 -0
- package/dist/lib/recode.d.ts +19 -0
- package/dist/lib/recode.d.ts.map +1 -0
- package/dist/lib/recode.js +43 -0
- package/dist/lib/recode.js.map +1 -0
- package/dist/lib/rerun.d.ts +51 -0
- package/dist/lib/rerun.d.ts.map +1 -0
- package/dist/lib/rerun.js +166 -0
- package/dist/lib/rerun.js.map +1 -0
- package/dist/lib/retrieve.d.ts +8 -4
- package/dist/lib/retrieve.d.ts.map +1 -1
- package/dist/lib/retrieve.js +12 -10
- package/dist/lib/retrieve.js.map +1 -1
- package/dist/lib/schemas.generated.d.ts.map +1 -1
- package/dist/lib/schemas.generated.js +28 -0
- package/dist/lib/schemas.generated.js.map +1 -1
- package/dist/lib/secrets.d.ts +158 -0
- package/dist/lib/secrets.d.ts.map +1 -0
- package/dist/lib/secrets.js +507 -0
- package/dist/lib/secrets.js.map +1 -0
- package/dist/lib/seed.d.ts +5 -0
- package/dist/lib/seed.d.ts.map +1 -1
- package/dist/lib/seed.js +15 -2
- package/dist/lib/seed.js.map +1 -1
- package/dist/lib/seedResolve.d.ts.map +1 -1
- package/dist/lib/seedResolve.js +1 -0
- package/dist/lib/seedResolve.js.map +1 -1
- package/dist/lib/session.d.ts +14 -0
- package/dist/lib/session.d.ts.map +1 -1
- package/dist/lib/session.js +47 -0
- package/dist/lib/session.js.map +1 -1
- package/dist/lib/setup.d.ts +5 -0
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +78 -14
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/setupWizard.d.ts +51 -0
- package/dist/lib/setupWizard.d.ts.map +1 -0
- package/dist/lib/setupWizard.js +223 -0
- package/dist/lib/setupWizard.js.map +1 -0
- package/dist/lib/snap.d.ts.map +1 -1
- package/dist/lib/snap.js +2 -5
- package/dist/lib/snap.js.map +1 -1
- package/dist/lib/speakers.d.ts +41 -0
- package/dist/lib/speakers.d.ts.map +1 -0
- package/dist/lib/speakers.js +78 -0
- package/dist/lib/speakers.js.map +1 -0
- package/dist/lib/status.d.ts.map +1 -1
- package/dist/lib/status.js +21 -0
- package/dist/lib/status.js.map +1 -1
- package/dist/lib/userConfig.d.ts +22 -0
- package/dist/lib/userConfig.d.ts.map +1 -0
- package/dist/lib/userConfig.js +67 -0
- package/dist/lib/userConfig.js.map +1 -0
- package/dist/lib/validate.d.ts +18 -0
- package/dist/lib/validate.d.ts.map +1 -1
- package/dist/lib/validate.js +70 -1
- package/dist/lib/validate.js.map +1 -1
- package/dist/lib/version.d.ts +30 -0
- package/dist/lib/version.d.ts.map +1 -0
- package/dist/lib/version.js +73 -0
- package/dist/lib/version.js.map +1 -0
- package/dist/llm/adapter.d.ts.map +1 -1
- package/dist/llm/adapter.js +2 -0
- package/dist/llm/adapter.js.map +1 -1
- package/dist/llm/providers/ollama.d.ts.map +1 -1
- package/dist/llm/providers/ollama.js +6 -0
- package/dist/llm/providers/ollama.js.map +1 -1
- package/dist/loops/ingest_watcher.d.ts.map +1 -1
- package/dist/loops/ingest_watcher.js +6 -3
- package/dist/loops/ingest_watcher.js.map +1 -1
- package/dist/loops/legacy_worker.d.ts +28 -1
- package/dist/loops/legacy_worker.d.ts.map +1 -1
- package/dist/loops/legacy_worker.js +81 -9
- package/dist/loops/legacy_worker.js.map +1 -1
- package/dist/loops/supervisor.d.ts +3 -0
- package/dist/loops/supervisor.d.ts.map +1 -1
- package/dist/loops/supervisor.js +12 -0
- package/dist/loops/supervisor.js.map +1 -1
- package/dist/loops/synthesis.d.ts.map +1 -1
- package/dist/loops/synthesis.js +15 -0
- package/dist/loops/synthesis.js.map +1 -1
- package/dist/loops/transcribe_worker.d.ts.map +1 -1
- package/dist/loops/transcribe_worker.js +2 -4
- package/dist/loops/transcribe_worker.js.map +1 -1
- package/dist/output.d.ts +13 -1
- package/dist/output.d.ts.map +1 -1
- package/dist/output.js +22 -2
- package/dist/output.js.map +1 -1
- package/dist/render/human.d.ts +20 -0
- package/dist/render/human.d.ts.map +1 -0
- package/dist/render/human.js +54 -0
- package/dist/render/human.js.map +1 -0
- package/dist/router.d.ts.map +1 -1
- package/dist/router.js +17 -2
- package/dist/router.js.map +1 -1
- package/package.json +18 -5
- package/templates/config.toml +6 -1
- package/transcriber/app/__init__.py +3 -0
- package/transcriber/app/asr.py +198 -0
- package/transcriber/app/asr_parakeet.py +174 -0
- package/transcriber/app/cue_parser.py +110 -0
- package/transcriber/app/diarization.py +330 -0
- package/transcriber/app/frame_annotation.py +77 -0
- package/transcriber/app/frames.py +130 -0
- package/transcriber/app/health.py +70 -0
- package/transcriber/app/legacy.py +355 -0
- package/transcriber/app/legacy_cli.py +90 -0
- package/transcriber/app/main.py +30 -0
- package/transcriber/app/pipeline.py +210 -0
- package/transcriber/app/pptx_export.py +42 -0
- package/transcriber/app/prosody.py +128 -0
- package/transcriber/app/routes/__init__.py +1 -0
- package/transcriber/app/routes/legacy.py +117 -0
- package/transcriber/app/routes/transcribe.py +133 -0
- package/transcriber/app/shot_change.py +74 -0
- package/transcriber/app/silence_typer.py +144 -0
- package/transcriber/app/transcribe_cli.py +82 -0
- package/transcriber/app/vad.py +216 -0
- package/transcriber/pyproject.toml +56 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Health endpoint for the transcriber service.
|
|
2
|
+
|
|
3
|
+
ROADMAP § Verification — `compost watch` and the CLI probe this on startup to
|
|
4
|
+
confirm the transcriber container is reachable before queuing work.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import platform
|
|
10
|
+
import sys
|
|
11
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
12
|
+
|
|
13
|
+
from fastapi import APIRouter
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
from . import __version__
|
|
17
|
+
|
|
18
|
+
# Router on which this module's /health endpoint is registered.
router = APIRouter()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class HealthResponse(BaseModel):
    """Response body of the /health endpoint.

    The CLI parses these fields, so their names and types form a stable
    contract.
    """

    # Overall service state.
    status: str
    # Identifier of the service that answered.
    service: str
    # Component name -> version string, or None when no version is available.
    versions: dict[str, str | None]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _safe_version(pkg: str) -> str | None:
    """Look up the installed version of the distribution `pkg`.

    The model-heavy optional dependencies (whisperx, pyannote.audio,
    silero-vad) may legitimately be absent. In that case /health reports
    them as `null` so the CLI can tell the user what is missing.

    Returns:
        The version string, or None when `pkg` is not installed.
    """
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        installed = None
    return installed
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@router.get("/health", response_model=HealthResponse)
def get_health() -> HealthResponse:
    """Report that the service is up, plus the versions of its components."""
    # Keys are inserted in the same order as the response layout: the
    # service and interpreter first, then the probed packages.
    versions: dict[str, str | None] = {
        "transcriber": __version__,
        "python": platform.python_version(),
    }
    for pkg in ("fastapi", "uvicorn", "whisperx", "pyannote.audio", "silero-vad"):
        versions[pkg] = _safe_version(pkg)
    return HealthResponse(
        status="ok",
        service="compost-transcriber",
        versions=versions,
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Names exported by `from <this module> import *`.
__all__ = ["router", "HealthResponse", "get_health"]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _python_metadata_check() -> None:
|
|
64
|
+
"""Self-check at import time: make sure we're on a supported runtime."""
|
|
65
|
+
major, minor = sys.version_info[:2]
|
|
66
|
+
if (major, minor) < (3, 11):
|
|
67
|
+
raise RuntimeError(f"compost-transcriber requires Python >=3.11, got {major}.{minor}")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
_python_metadata_check()
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
"""Legacy asset ingestors (#29).
|
|
2
|
+
|
|
3
|
+
Normalize PDF / DOCX / PPTX / CSV into a transcript-shaped JSON with
|
|
4
|
+
kind="document": one utterance per paragraph (PDF/DOCX), per slide (PPTX),
|
|
5
|
+
or per row (CSV). Output validates against schema/transcript.schema.json
|
|
6
|
+
(kind="document", modality=["document"]).
|
|
7
|
+
|
|
8
|
+
Heavy parsers (pdfplumber, python-docx, python-pptx) are imported lazily so
|
|
9
|
+
the module loads without the `legacy` extra; each ingestor raises a clear
|
|
10
|
+
error if its dependency is missing.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import csv
|
|
16
|
+
import os
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
# Identifier recorded in every document's provenance.
INGESTOR_VERSION = "compost-legacy@0.1.0"
# The single speaker entry to which every document utterance is attributed.
DOC_SPEAKER = {"id": "S1", "name": "document", "type": "other"}


def _base(session_id: str, source: str, language: str = "und") -> dict[str, Any]:
    """Build the empty transcript-shaped skeleton for one document.

    Args:
        session_id: Identifier stored under "session_id".
        source: Source path stored under "source".
        language: Language tag stored under "language" (default "und").

    Returns:
        A new dict with kind="document", no utterances yet, and a fresh copy
        of the document speaker so callers can mutate it safely.
    """
    skeleton: dict[str, Any] = {}
    skeleton["schema_version"] = "1.0"
    skeleton["kind"] = "document"
    skeleton["session_id"] = session_id
    skeleton["source"] = source
    skeleton["language"] = language
    skeleton["duration_ms"] = 0
    skeleton["modality"] = ["document"]
    skeleton["speakers"] = [DOC_SPEAKER.copy()]
    skeleton["utterances"] = []
    skeleton["provenance"] = {"transcriber": INGESTOR_VERSION}
    return skeleton
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _utt(idx: int, text: str, source_page: int | None = None, annotation: str | None = None) -> dict[str, Any]:
    """Build one utterance record attributed to the document speaker.

    Args:
        idx: 1-based position; used for both the zero-padded id and "turn".
        text: Utterance text.
        source_page: Optional page/slide/row number; omitted when None.
        annotation: Optional annotation string; omitted when None.

    Returns:
        The utterance dict. Documents carry no timing, so both timestamps
        are 0.
    """
    utterance: dict[str, Any] = {
        "id": f"U-{idx:04d}",
        "speaker_id": DOC_SPEAKER["id"],
        "turn": idx,
        "start_ms": 0,
        "end_ms": 0,
        "text": text,
    }
    # Optional keys are added only when a value was supplied.
    extras = {"source_page": source_page, "annotation": annotation}
    utterance.update({key: value for key, value in extras.items() if value is not None})
    return utterance
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _session_id(path: str | Path) -> str:
    """Derive a session id from a file name.

    Takes the file stem, replaces every character that is not alphanumeric,
    "-" or "_" with "-", prefixes "DOC-", and truncates to 64 characters.
    """
    stem = Path(path).stem
    sanitized = [ch if (ch.isalnum() or ch in "-_") else "-" for ch in stem]
    session = "DOC-" + "".join(sanitized)
    return session[:64]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------- CSV / XLSX
|
|
62
|
+
|
|
63
|
+
# Auto-detect priority for the "text" column. Candidates are tried in this
# order and compared case-insensitively (ignoring surrounding whitespace)
# against the header; when none matches, the first column is used.
TEXT_COL_CANDIDATES = (
    "text",
    "transcript",
    "content",
    "utterance",
    "quote",
    "message",
    "body",
)


def _auto_text_col(fieldnames: list[str]) -> str:
    """Pick the most likely text column from a header row.

    Args:
        fieldnames: Header names in column order.

    Returns:
        The header name, exactly as it appears in `fieldnames`, of the
        highest-priority candidate from TEXT_COL_CANDIDATES. Falls back to
        the first column when no candidate matches.

    Raises:
        ValueError: if `fieldnames` is empty, so there is nothing to pick.
    """
    if not fieldnames:
        raise ValueError("cannot auto-detect a text column: header is empty")
    # setdefault keeps the FIRST column for each normalized name, so
    # duplicate headers such as "Text" / "text" resolve to the leftmost one.
    by_normalized: dict[str, str] = {}
    for name in fieldnames:
        by_normalized.setdefault(name.strip().lower(), name)
    for candidate in TEXT_COL_CANDIDATES:
        if candidate in by_normalized:
            return by_normalized[candidate]
    return fieldnames[0]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def ingest_csv(
    path: str | Path,
    text_col: str | None = None,
    speaker_col: str | None = None,
) -> dict[str, Any]:
    """One utterance per row.

    `text_col=None` triggers auto-detect: text → transcript → content →
    utterance → quote → message → body (case-insensitive). Falls back to
    the first column. The resolved column is recorded on the output's
    `provenance.text_col_resolved` for caller visibility.

    Args:
        path: CSV file to read (UTF-8, with or without a byte-order mark).
        text_col: Column holding the utterance text, or None to auto-detect.
        speaker_col: Optional column whose value is recorded as a
            `[speaker: …]` annotation on each utterance.

    Returns:
        The transcript-shaped document. Rows whose text cell is empty are
        skipped.

    Raises:
        ValueError: if the file has no header row or lacks the text column.
    """
    path = str(path)
    doc = _base(_session_id(path), path)
    # utf-8-sig drops the BOM that Excel writes when saving "CSV UTF-8";
    # with plain utf-8 the first header would be "\ufefftext" and never
    # match. BOM-less UTF-8 decodes identically.
    with open(path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        # `fieldnames` is None for an empty file and [] when the first line
        # is blank; neither is a usable header.
        if not reader.fieldnames:
            raise ValueError(f"CSV has no header row: {path}")
        fields = list(reader.fieldnames)
        resolved = text_col if text_col is not None else _auto_text_col(fields)
        if resolved not in fields:
            raise ValueError(f"CSV has no column '{resolved}' (columns: {fields})")
        doc["provenance"]["text_col_resolved"] = resolved
        idx = 1
        for row in reader:
            # Short rows yield None for missing cells; treat them as empty.
            text = (row.get(resolved) or "").strip()
            if not text:
                continue
            ann = None
            if speaker_col is not None and row.get(speaker_col):
                ann = f"[speaker: {row[speaker_col]}]"
            doc["utterances"].append(_utt(idx, text, source_page=idx, annotation=ann))
            idx += 1
    return doc
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------- DOCX
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def ingest_docx(path: str | Path) -> dict[str, Any]:
    """Turn a DOCX file into a document with one utterance per paragraph.

    Empty paragraphs are skipped. A paragraph whose style name starts with
    "heading" is not emitted; its text is attached to the following
    utterances as a `[section: …]` annotation.

    Raises:
        RuntimeError: if python-docx is not installed.
    """
    try:
        import docx  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-docx not installed (pip install -e '.[legacy]')") from e

    source = str(path)
    result = _base(_session_id(source), source)
    utterances = result["utterances"]
    section: str | None = None
    for paragraph in docx.Document(source).paragraphs:
        body = paragraph.text.strip()
        if not body:
            continue
        style_name = (paragraph.style.name or "").lower() if paragraph.style else ""
        if style_name.startswith("heading"):
            # Headings become the section anchor for what follows.
            section = body
            continue
        note = f"[section: {section}]" if section else None
        # Utterance numbers are consecutive, starting at 1.
        utterances.append(_utt(len(utterances) + 1, body, annotation=note))
    return result
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ---------------------------------------------------------------- PPTX
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def ingest_pptx(path: str | Path, thumbnails_dir: str | Path | None = None) -> dict[str, Any]:
    """Turn a PPTX deck into a document with one utterance per slide.

    A slide's utterance is the newline-joined text of its text-frame
    paragraphs, followed by its speaker notes prefixed with "(notes) ".
    Slides without any text are skipped. `source_page` is the 1-based slide
    number.

    Args:
        path: PPTX file to read.
        thumbnails_dir: When given, the directory is created. No thumbnails
            are rendered here.

    Raises:
        RuntimeError: if python-pptx is not installed.
    """
    try:
        from pptx import Presentation  # type: ignore
    except ImportError as e:
        raise RuntimeError("python-pptx not installed (pip install -e '.[legacy]')") from e

    source = str(path)
    result = _base(_session_id(source), source)
    deck = Presentation(source)
    next_idx = 1
    for slide_no, slide in enumerate(deck.slides, start=1):
        lines: list[str] = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                joined = "".join(run.text for run in paragraph.runs).strip()
                if joined:
                    lines.append(joined)
        notes = ""
        if slide.has_notes_slide and slide.notes_slide.notes_text_frame is not None:
            notes = slide.notes_slide.notes_text_frame.text.strip()
        if notes:
            lines.append(f"(notes) {notes}")
        slide_text = "\n".join(lines)
        if not slide_text:
            continue
        result["utterances"].append(_utt(next_idx, slide_text, source_page=slide_no))
        next_idx += 1
    # Thumbnail rendering requires LibreOffice/unoconv (not bundled), so only
    # the output directory is prepared. The slide text is the evidence.
    if thumbnails_dir is not None:
        os.makedirs(thumbnails_dir, exist_ok=True)
    return result
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# ---------------------------------------------------------------- PDF
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def ingest_pdf(path: str | Path) -> dict[str, Any]:
    """Turn a PDF into a document with one utterance per paragraph.

    Each page's text is extracted and split into paragraphs. A page with no
    extractable text (e.g. a scan) gets a best-effort OCR pass instead.
    `source_page` is the 1-based page number.

    Raises:
        RuntimeError: if pdfplumber is not installed.
    """
    try:
        import pdfplumber  # type: ignore
    except ImportError as e:
        raise RuntimeError("pdfplumber not installed (pip install -e '.[legacy]')") from e

    source = str(path)
    result = _base(_session_id(source), source)
    utterances = result["utterances"]
    with pdfplumber.open(source) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            extracted = page.extract_text() or ""
            # Fall back to OCR only when the text layer is empty.
            page_text = extracted if extracted.strip() else _ocr_page(page)
            for paragraph in _paragraphs(page_text):
                # Utterance numbers run consecutively across pages.
                utterances.append(_utt(len(utterances) + 1, paragraph, source_page=page_no))
    return result
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _paragraphs(text: str) -> list[str]:
    """Split text into paragraphs separated by blank lines.

    A line counts as blank when it is empty or whitespace-only, so
    separators such as "\\n \\n" or Windows "\\r\\n\\r\\n" split paragraphs
    just like "\\n\\n". Within a paragraph, lines are stripped and joined
    with single spaces.

    Returns:
        The non-empty paragraphs in document order. Empty or blank input
        gives [].
    """
    out: list[str] = []
    current: list[str] = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            current.append(stripped)
        elif current:
            # A blank line closes the paragraph collected so far.
            out.append(" ".join(current))
            current = []
    if current:
        out.append(" ".join(current))
    return out
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _ocr_page(page: Any) -> str:  # pragma: no cover - needs tesseract + a raster
    """Best-effort OCR of one PDF page.

    Returns the recognised text, or "" when pytesseract or Pillow is not
    importable or when rasterising / recognising the page fails for any
    reason. OCR is optional and must never abort ingestion.
    """
    try:
        import pytesseract  # type: ignore
        from PIL import Image  # type: ignore # noqa: F401
    except ImportError:
        return ""
    try:
        raster = page.to_image(resolution=200).original
        recognised = pytesseract.image_to_string(raster)
    except Exception:
        # Deliberate catch-all: any OCR failure degrades to "no text".
        return ""
    return recognised
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# ---------------------------------------------------------------- Markdown / Text
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def ingest_text(path: str | Path) -> dict[str, Any]:
    """Read a plain-text or Markdown file and split into paragraph utterances.

    Both `.txt` (Otter / Zoom exports) and `.md` land here. Paragraphs are
    runs of non-blank lines, stripped and joined with single spaces.
    Markdown ATX heading lines (`# ` through `###### `) are not emitted as
    utterances; the heading text is recorded as a `[section: …]` annotation
    on the utterances that follow, mirroring the docx behavior. Setext
    headings (==== / ---- underlines) are not supported.

    A heading line ends the paragraph before it and never absorbs the lines
    after it. Body text written directly under a heading, with no blank line
    in between, is therefore still emitted as an utterance.

    Args:
        path: UTF-8 text file, with or without a byte-order mark.

    Returns:
        The transcript-shaped document.
    """
    path = str(path)
    doc = _base(_session_id(path), path)
    # utf-8-sig drops a leading BOM (common in Windows exports) that would
    # otherwise hide a heading on the first line. BOM-less files are
    # unaffected.
    with open(path, encoding="utf-8-sig") as f:
        body = f.read()

    atx_prefixes = ("# ", "## ", "### ", "#### ", "##### ", "###### ")
    current_heading: str | None = None
    pending: list[str] = []
    idx = 1
    # The trailing "" acts as a final blank line so the last paragraph is
    # flushed by the same code path as every other one.
    for raw in [*body.splitlines(), ""]:
        line = raw.strip()
        is_heading = line.startswith(atx_prefixes)
        if line and not is_heading:
            pending.append(line)
            continue
        # A blank line or a heading closes the paragraph collected so far.
        if pending:
            ann = f"[section: {current_heading}]" if current_heading else None
            doc["utterances"].append(_utt(idx, " ".join(pending), annotation=ann))
            idx += 1
            pending = []
        if is_heading:
            # Strip only the marker hashes; a title that itself starts with
            # "#" (e.g. "# #1 priority") keeps its own characters.
            current_heading = line.lstrip("#").strip()
    return doc
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# ---------------------------------------------------------------- XLSX
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def ingest_xlsx(
    path: str | Path,
    text_col: str | None = None,
    speaker_col: str | None = None,
    sheet: str | None = None,
) -> dict[str, Any]:
    """One utterance per row of a spreadsheet.

    `text_col=None` triggers the same auto-detect as `ingest_csv`. The
    resolved column lands on `provenance.text_col_resolved`. Use `sheet`
    to pick a non-default tab.

    Args:
        path: XLSX workbook to read.
        text_col: Header name of the text column, or None to auto-detect.
        speaker_col: Optional header name whose cell value is recorded as a
            `[speaker: …]` annotation; ignored when not in the header.
        sheet: Worksheet name, or None for the active sheet.

    Returns:
        The transcript-shaped document. An empty sheet yields a document
        with no utterances. When rows were skipped because the text cell was
        empty while other cells had data,
        `provenance.xlsx_rows_skipped_empty_text` holds their count.

    Raises:
        RuntimeError: if openpyxl is not installed.
        ValueError: if the workbook has no worksheets, the requested sheet
            does not exist, or the text column is missing.
    """
    try:
        from openpyxl import load_workbook  # type: ignore
    except ImportError as e:
        raise RuntimeError("openpyxl not installed (pip install -e '.[legacy]')") from e

    path = str(path)
    doc = _base(_session_id(path), path)
    wb = load_workbook(path, read_only=True, data_only=True)
    # A read-only workbook keeps the file open until close() is called, so
    # every exit path (including errors) goes through the finally below.
    try:
        if sheet is None:
            ws = wb.active
        elif sheet in wb.sheetnames:
            ws = wb[sheet]
        else:
            # Report a missing tab the same way as a missing column, rather
            # than letting the workbook lookup raise KeyError.
            raise ValueError(f"XLSX has no sheet '{sheet}' (sheets: {wb.sheetnames})")
        if ws is None:
            raise ValueError(f"XLSX has no worksheets: {path}")

        rows = ws.iter_rows(values_only=True)
        header_row = next(rows, None)
        if header_row is None:
            return doc  # empty sheet
        header = [str(c) if c is not None else "" for c in header_row]
        resolved = text_col if text_col is not None else _auto_text_col(header)
        if resolved not in header:
            raise ValueError(f"XLSX has no column '{resolved}' (columns: {header})")
        doc["provenance"]["text_col_resolved"] = resolved
        text_idx = header.index(resolved)
        speaker_idx = header.index(speaker_col) if speaker_col in header else -1

        utt_idx = 1
        # Track rows where the text column is empty but the row has other
        # data — a strong proxy for "Excel never evaluated this formula so
        # openpyxl returned None". Researchers seeing this should open the
        # file in Excel once or pre-export to CSV.
        rows_with_data_but_empty_text = 0
        for row in rows:
            if row is None:
                continue
            cell = row[text_idx] if text_idx < len(row) else None
            text = str(cell).strip() if cell is not None else ""
            if not text:
                # Row has data elsewhere → likely an un-evaluated formula in
                # the text column.
                if any(c is not None and str(c).strip() for c in row):
                    rows_with_data_but_empty_text += 1
                continue
            ann = None
            if 0 <= speaker_idx < len(row) and row[speaker_idx] is not None:
                ann = f"[speaker: {row[speaker_idx]}]"
            doc["utterances"].append(_utt(utt_idx, text, source_page=utt_idx, annotation=ann))
            utt_idx += 1
        if rows_with_data_but_empty_text > 0:
            doc["provenance"]["xlsx_rows_skipped_empty_text"] = rows_with_data_but_empty_text
        return doc
    finally:
        wb.close()
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def ingest(path: str | Path, **kwargs: Any) -> dict[str, Any]:
    """Route a file to the ingestor that matches its extension.

    The extension is compared case-insensitively. Recognised keyword
    arguments are `text_col` and `speaker_col` (CSV/XLSX), `sheet` (XLSX)
    and `thumbnails_dir` (PPTX); anything else is ignored. Leaving
    `text_col` unset triggers column auto-detect on CSV/XLSX inputs.

    Raises:
        ValueError: if the extension is not supported.
    """
    suffix = Path(path).suffix.lower()
    text_col = kwargs.get("text_col")
    speaker_col = kwargs.get("speaker_col")

    if suffix in {".txt", ".md", ".markdown"}:
        return ingest_text(path)
    if suffix == ".pdf":
        return ingest_pdf(path)
    if suffix == ".docx":
        return ingest_docx(path)
    if suffix == ".pptx":
        return ingest_pptx(path, thumbnails_dir=kwargs.get("thumbnails_dir"))
    if suffix == ".csv":
        return ingest_csv(path, text_col=text_col, speaker_col=speaker_col)
    if suffix == ".xlsx":
        return ingest_xlsx(
            path,
            text_col=text_col,
            speaker_col=speaker_col,
            sheet=kwargs.get("sheet"),
        )
    raise ValueError(f"Unsupported legacy asset: {suffix}")
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Native (host) legacy-ingest entrypoint (#184).
|
|
2
|
+
|
|
3
|
+
Mirrors `app.transcribe_cli` for documents: runs the pure ingestors in
|
|
4
|
+
`app.legacy.ingest` in a host Python venv so PDF/DOCX/PPTX/CSV/XLSX/TXT ingest
|
|
5
|
+
works WITHOUT the Docker transcriber (demoted to a fallback). Shares the exact
|
|
6
|
+
write + response shape as the `/legacy-ingest` route so the Node legacy-worker
|
|
7
|
+
treats native and Docker results identically.
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python -m app.legacy_cli --seed-path <seed> --source-path <file> \
|
|
11
|
+
[--text-col COL] [--speaker-col COL] [--sheet NAME]
|
|
12
|
+
Prints exactly one JSON line; exit 0 on ok/empty, 1 on failure.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from .legacy import ingest as ingest_legacy
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _fail(kind: str, error: str) -> int:
    """Print the single failure JSON line and return exit code 1."""
    print(json.dumps({"status": "failed", "kind": kind, "error": error}))
    return 1


def main(argv: list[str] | None = None) -> int:
    """Ingest one legacy document on the host and report the result as JSON.

    Parses the command line, runs the ingestor for `--source-path`, writes
    the normalized document to `<seed>/legacy/<source stem>.json`, and
    prints exactly one JSON line describing the outcome.

    Failure kinds reported in the JSON line:
        invalid_input: source or seed path missing, unsupported extension,
            or a missing column (ValueError from the ingestor).
        dep_missing: RuntimeError from the ingestor (optional parser not
            installed).
        ingest_error: any other failure while parsing the source or writing
            the output. These used to escape as a traceback with no JSON
            line on stdout.

    Args:
        argv: Argument list; None means use the process arguments.

    Returns:
        0 when the document was written ("ok" or "empty"), 1 on failure.
    """
    p = argparse.ArgumentParser(prog="compost-legacy-native")
    p.add_argument("--seed-path", required=True)
    p.add_argument("--source-path", required=True)
    p.add_argument("--text-col", default=None)
    p.add_argument("--speaker-col", default=None)
    p.add_argument("--sheet", default=None)
    args = p.parse_args(argv)

    src = Path(args.source_path)
    seed = Path(args.seed_path)
    if not src.exists():
        return _fail("invalid_input", f"source not found: {src}")
    if not seed.exists():
        return _fail("invalid_input", f"seed not found: {seed}")

    # Forward only the options the user actually set, so the ingestors'
    # own defaults (e.g. column auto-detect) stay in effect.
    kwargs: dict[str, str] = {}
    if args.text_col is not None:
        kwargs["text_col"] = args.text_col
    if args.speaker_col is not None:
        kwargs["speaker_col"] = args.speaker_col
    if args.sheet is not None:
        kwargs["sheet"] = args.sheet

    try:
        doc = ingest_legacy(src, **kwargs)
    except ValueError as e:  # unsupported ext / missing column
        return _fail("invalid_input", str(e))
    except RuntimeError as e:  # missing optional dep (python-docx, openpyxl, …)
        return _fail("dep_missing", str(e))
    except Exception as e:  # corrupt or unreadable file, parser failure, …
        # Top-level boundary: the caller parses stdout, so every failure
        # must still produce the JSON line instead of a bare traceback.
        return _fail("ingest_error", f"{type(e).__name__}: {e}")

    legacy_dir = seed / "legacy"
    out_path = legacy_dir / f"{src.stem}.json"
    try:
        legacy_dir.mkdir(parents=True, exist_ok=True)
        out_path.write_text(json.dumps(doc, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    except OSError as e:  # seed not writable, disk full, seed is not a directory, …
        return _fail("ingest_error", f"cannot write {out_path}: {e}")

    utt_count = len(doc.get("utterances", []))
    prov = doc.get("provenance", {})
    warnings: list[str] = []
    skipped = prov.get("xlsx_rows_skipped_empty_text", 0)
    if skipped > 0:
        warnings.append(
            f"{skipped} XLSX row(s) had data in other columns but an empty text cell — "
            "likely an un-evaluated formula. Open the file in Excel once, or export to CSV."
        )

    print(
        json.dumps(
            {
                "status": "ok" if utt_count > 0 else "empty",
                "source_path": str(src),
                "normalized_path": str(out_path),
                "utterance_count": utt_count,
                "text_col_resolved": prov.get("text_col_resolved"),
                "warnings": warnings,
            }
        )
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""FastAPI entrypoint for the compost transcriber.
|
|
2
|
+
|
|
3
|
+
Mounts the routers each subsystem (transcription, legacy ingest, frames)
|
|
4
|
+
ships in its own issue. /health, /transcribe (v0.1-01), and /legacy-ingest
|
|
5
|
+
(v0.1-02) are live; frame extraction routes land under v0.2-12.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from fastapi import FastAPI
|
|
11
|
+
|
|
12
|
+
from . import __version__
|
|
13
|
+
from .health import router as health_router
|
|
14
|
+
from .routes.legacy import router as legacy_router
|
|
15
|
+
from .routes.transcribe import router as transcribe_router
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def create_app() -> FastAPI:
    """Build the FastAPI application and mount every subsystem router.

    Routers are mounted in a fixed order: health, transcribe, legacy ingest.
    """
    api = FastAPI(
        title="compost-transcriber",
        version=__version__,
        description="Descriptive audio transcription + frame extraction + legacy ingest.",
    )
    for subsystem_router in (health_router, transcribe_router, legacy_router):
        api.include_router(subsystem_router)
    return api


# Module-level application instance, built once at import time.
app = create_app()
|