@they-juanreina/compost-cli 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/blame.d.ts.map +1 -1
- package/dist/lib/blame.js +3 -2
- package/dist/lib/blame.js.map +1 -1
- package/dist/lib/ingest.d.ts +1 -0
- package/dist/lib/ingest.d.ts.map +1 -1
- package/dist/lib/ingest.js +46 -15
- package/dist/lib/ingest.js.map +1 -1
- package/dist/lib/journal.d.ts.map +1 -1
- package/dist/lib/journal.js +9 -0
- package/dist/lib/journal.js.map +1 -1
- package/dist/lib/migrate.d.ts.map +1 -1
- package/dist/lib/migrate.js +1 -0
- package/dist/lib/migrate.js.map +1 -1
- package/dist/lib/nativeRuntime.d.ts +18 -3
- package/dist/lib/nativeRuntime.d.ts.map +1 -1
- package/dist/lib/nativeRuntime.js +54 -3
- package/dist/lib/nativeRuntime.js.map +1 -1
- package/dist/lib/retrieve.d.ts.map +1 -1
- package/dist/lib/retrieve.js +0 -8
- package/dist/lib/retrieve.js.map +1 -1
- package/dist/lib/seedResolve.d.ts +5 -0
- package/dist/lib/seedResolve.d.ts.map +1 -1
- package/dist/lib/seedResolve.js +44 -4
- package/dist/lib/seedResolve.js.map +1 -1
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +27 -6
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/snap.d.ts.map +1 -1
- package/dist/lib/snap.js +2 -5
- package/dist/lib/snap.js.map +1 -1
- package/dist/loops/supervisor.d.ts.map +1 -1
- package/dist/loops/supervisor.js +1 -0
- package/dist/loops/supervisor.js.map +1 -1
- package/dist/loops/transcribe_worker.d.ts.map +1 -1
- package/dist/loops/transcribe_worker.js +0 -1
- package/dist/loops/transcribe_worker.js.map +1 -1
- package/dist/router.js +1 -1
- package/package.json +10 -4
- package/transcriber/app/__init__.py +3 -0
- package/transcriber/app/asr.py +198 -0
- package/transcriber/app/asr_parakeet.py +174 -0
- package/transcriber/app/cue_parser.py +110 -0
- package/transcriber/app/diarization.py +300 -0
- package/transcriber/app/frame_annotation.py +77 -0
- package/transcriber/app/frames.py +130 -0
- package/transcriber/app/health.py +70 -0
- package/transcriber/app/legacy.py +355 -0
- package/transcriber/app/main.py +30 -0
- package/transcriber/app/pipeline.py +204 -0
- package/transcriber/app/pptx_export.py +42 -0
- package/transcriber/app/prosody.py +123 -0
- package/transcriber/app/routes/__init__.py +1 -0
- package/transcriber/app/routes/legacy.py +117 -0
- package/transcriber/app/routes/transcribe.py +133 -0
- package/transcriber/app/shot_change.py +74 -0
- package/transcriber/app/silence_typer.py +144 -0
- package/transcriber/app/transcribe_cli.py +82 -0
- package/transcriber/app/vad.py +145 -0
- package/transcriber/pyproject.toml +56 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Prosody hint extractor (#13).
|
|
2
|
+
|
|
3
|
+
Deterministic per-utterance hints derived from word timings, optional VAD
|
|
4
|
+
energy, and speech rate. No ML model — cheap, reproducible context.
|
|
5
|
+
|
|
6
|
+
Output shape (matches transcript.schema.json #/$defs/prosody):
|
|
7
|
+
{"volume": "low|normal|high", "pace": "slow|normal|fast", "hesitations": int}
|
|
8
|
+
|
|
9
|
+
Thresholds are module constants, documented here for reproducibility:
|
|
10
|
+
|
|
11
|
+
pace (words per second over the utterance span):
|
|
12
|
+
< 2.0 → slow
|
|
13
|
+
> 3.3 → fast
|
|
14
|
+
else → normal
|
|
15
|
+
|
|
16
|
+
volume (mean VAD RMS energy, normalized 0..1; requires the energy signal
|
|
17
|
+
from Silero VAD, issue #9). When energy is unavailable we report "normal"
|
|
18
|
+
rather than guess:
|
|
19
|
+
< 0.33 → low
|
|
20
|
+
> 0.66 → high
|
|
21
|
+
else → normal
|
|
22
|
+
|
|
23
|
+
hesitations = filler tokens + immediate word repetitions + long
|
|
24
|
+
intra-utterance gaps (> 400 ms between consecutive words).
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import re
|
|
30
|
+
from typing import Any
|
|
31
|
+
|
|
32
|
+
# Pace thresholds in words per second: below PACE_SLOW_WPS is "slow",
# above PACE_FAST_WPS is "fast", anything in between is "normal".
PACE_SLOW_WPS = 2.0
PACE_FAST_WPS = 3.3
# Volume thresholds applied to the energy value handed to _volume():
# below VOLUME_LOW is "low", above VOLUME_HIGH is "high".
VOLUME_LOW = 0.33
VOLUME_HIGH = 0.66
# A gap between consecutive words larger than this counts as one hesitation.
HESITATION_GAP_MS = 400

# Multilingual (es-CO + en) filler set.
# Single-word fillers, all lower-case; tokens are lower-cased before lookup.
_FILLERS = {
    "uh",
    "um",
    "eh",
    "em",
    "este",
    "esto",
    "mmm",
    "hmm",
    "like",
    "pues",
}
# Multi-word fillers, lower-case, words separated by a single space.
_FILLER_PHRASES = ("o sea", "you know", "es decir")

# A "word" is a maximal run of Unicode letters: digits and underscores are excluded.
_WORD_RE = re.compile(r"[^\W\d_]+", re.UNICODE)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _pace(text: str, start_ms: int, end_ms: int) -> str:
    """Classify the speech rate of an utterance as "slow", "normal" or "fast".

    The rate is the number of letter-only tokens in ``text`` divided by the
    utterance span in seconds. The span is floored at one microsecond so a
    zero-length or inverted span cannot cause a division by zero.
    """
    span_s = (end_ms - start_ms) / 1000.0
    if span_s < 1e-6:
        span_s = 1e-6
    words_per_second = len(_WORD_RE.findall(text)) / span_s
    # The two thresholds are exclusive and cannot both hold, so the order of
    # the checks does not matter.
    if words_per_second > PACE_FAST_WPS:
        return "fast"
    if words_per_second < PACE_SLOW_WPS:
        return "slow"
    return "normal"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _volume(energy: float | None) -> str:
    """Map an energy value to "low", "normal" or "high".

    ``None`` (no energy signal available) is reported as "normal" rather than
    guessed. Values exactly on a threshold are "normal".
    """
    if energy is not None:
        if energy < VOLUME_LOW:
            return "low"
        if energy > VOLUME_HIGH:
            return "high"
    return "normal"
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _count_hesitations(text: str, words: list[dict[str, Any]] | None) -> int:
    """Count hesitation markers in one utterance.

    The total is the sum of four signals:

    * single-token fillers found in ``_FILLERS``;
    * multi-word fillers from ``_FILLER_PHRASES``, matched on whole tokens;
    * immediate repetitions of a token longer than one character
      ("yo yo", "the the");
    * gaps larger than ``HESITATION_GAP_MS`` between consecutive entries of
      ``words``.

    Args:
        text: The utterance text.
        words: Optional per-word timing dicts. Each entry is read through the
            keys ``"s"`` and ``"e"`` (presumably start/end in milliseconds --
            confirm against the ASR output); a missing key is treated as 0.

    Returns:
        The number of hesitation markers found.
    """
    tokens = [t.lower() for t in _WORD_RE.findall(text)]

    # Single-token fillers.
    count = sum(1 for t in tokens if t in _FILLERS)

    # Multi-word fillers are compared token by token. A raw substring search
    # over the text also fires inside unrelated words: "es decir" is a
    # substring of "quieres decir", and "o sea" of "como sea".
    for phrase in _FILLER_PHRASES:
        parts = phrase.split()
        width = len(parts)
        if width == 0:
            continue
        count += sum(
            1
            for i in range(len(tokens) - width + 1)
            if tokens[i : i + width] == parts
        )

    # Immediate repetitions; one-letter tokens are ignored.
    count += sum(1 for a, b in zip(tokens, tokens[1:]) if a == b and len(a) > 1)

    # Long gaps between consecutive words.
    if words:
        count += sum(
            1
            for prev, nxt in zip(words, words[1:])
            if nxt.get("s", 0) - prev.get("e", 0) > HESITATION_GAP_MS
        )

    return count
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def extract_prosody(utterance: dict[str, Any], energy: float | None = None) -> dict[str, Any]:
    """Build the ``{volume, pace, hesitations}`` hints for one utterance.

    ``utterance`` must carry ``start_ms`` and ``end_ms``; ``text`` defaults to
    an empty string and ``words`` is optional. ``energy`` is passed to the
    volume classifier unchanged.
    """
    text = utterance.get("text", "")
    volume = _volume(energy)
    pace = _pace(text, utterance["start_ms"], utterance["end_ms"])
    hesitations = _count_hesitations(text, utterance.get("words"))
    return {"volume": volume, "pace": pace, "hesitations": hesitations}
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def annotate_prosody(
    transcript: dict[str, Any],
    energies: dict[str, float] | None = None,
) -> dict[str, Any]:
    """Set a ``prosody`` entry on each utterance of ``transcript``.

    ``energies`` is looked up by the utterance's ``id``; utterances without an
    entry (or when ``energies`` is omitted or empty) get no energy value. The
    transcript is modified in place and also returned.
    """
    energy_by_id = energies if energies else {}
    for utterance in transcript.get("utterances", []):
        utterance_energy = energy_by_id.get(utterance.get("id"))
        utterance["prosody"] = extract_prosody(utterance, utterance_energy)
    return transcript
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""HTTP route modules. Each subsystem (#9-#15) mounts its own router."""
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""POST /legacy-ingest — normalize a legacy document into a transcript.json.
|
|
2
|
+
|
|
3
|
+
The Node-side legacy-worker (cli/src/loops/legacy_worker.ts) pulls
|
|
4
|
+
`legacy-ingest` jobs from the queue and POSTs each here. The route dispatches
|
|
5
|
+
by file extension to the pure ingestors in `app/legacy.py`, then writes the
|
|
6
|
+
normalized JSON to `<seed>/legacy/<basename>.json`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from fastapi import APIRouter, HTTPException, status
|
|
16
|
+
from pydantic import BaseModel, Field
|
|
17
|
+
|
|
18
|
+
from ..legacy import ingest as ingest_legacy
|
|
19
|
+
|
|
20
|
+
router = APIRouter()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LegacyIngestRequest(BaseModel):
    # Request body for POST /legacy-ingest. The two paths are required; the
    # three column/sheet fields are optional and default to None.
    seed_path: str = Field(..., description="Absolute path to the seed root.")
    source_path: str = Field(..., description="Absolute path to the asset to ingest.")
    # CSV/XLSX column mapping — if text_col is None, the ingestor auto-detects
    # from the header (text → transcript → content → utterance → quote →
    # message → body, then first-column fallback). Node-side workers may also
    # consult a `<source_path>.compost.json` sidecar that takes precedence.
    text_col: str | None = Field(
        None,
        description="Column holding the utterance text (CSV/XLSX). Auto-detected if None.",
    )
    speaker_col: str | None = Field(None, description="Optional column for speaker label.")
    sheet: str | None = Field(None, description="Optional XLSX sheet name (defaults to active).")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LegacyIngestResponse(BaseModel):
    # Response body for POST /legacy-ingest.
    source_path: str  # the source path exactly as it was received in the request
    normalized_path: str  # where the normalized JSON document was written
    utterance_count: int  # number of entries under "utterances" in that document
    status: str  # ok | empty | failed
    text_col_resolved: str | None = None  # which column was actually used (CSV/XLSX)
    warnings: list[str] = []  # surfaced UX hints (e.g. xlsx un-evaluated formulas)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@router.post(
    "/legacy-ingest",
    response_model=LegacyIngestResponse,
    status_code=status.HTTP_200_OK,
    summary="Normalize a PDF/DOCX/PPTX/CSV/XLSX/TXT/MD into a transcript-shaped JSON.",
)
def post_legacy_ingest(req: LegacyIngestRequest) -> LegacyIngestResponse:
    # Flow: check both paths exist, run the ingestor, write
    # <seed>/legacy/<stem>.json, then summarize the result.
    # Documented with comments on purpose: a handler docstring would be
    # published as the operation description in the generated API schema.
    src = Path(req.source_path)
    seed = Path(req.seed_path)
    if not src.exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"source not found: {req.source_path}",
        )
    if not seed.exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"seed not found: {req.seed_path}",
        )

    # Forward only the options the caller actually set, so the ingestor's own
    # defaults (including header auto-detection) apply to the rest.
    requested: dict[str, Any] = {
        "text_col": req.text_col,
        "speaker_col": req.speaker_col,
        "sheet": req.sheet,
    }
    ingest_options: dict[str, Any] = {
        name: value for name, value in requested.items() if value is not None
    }

    try:
        doc = ingest_legacy(src, **ingest_options)
    except ValueError as e:
        # Unsupported extension or missing column: 422, so the worker can mark
        # the job failed and the CLI can tell the researcher what is wrong.
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"invalid_input: {e}",
        ) from e
    except RuntimeError as e:
        # Missing optional dependency (python-docx, openpyxl, ...): 503, so the
        # CLI can route to `compost setup --fix`.
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"dep_missing: {e}",
        ) from e

    # Persist the normalized document under <seed>/legacy/<basename>.json.
    target_dir = seed / "legacy"
    target_dir.mkdir(parents=True, exist_ok=True)
    out_path = target_dir / f"{src.stem}.json"
    serialized = json.dumps(doc, indent=2, ensure_ascii=False) + "\n"
    out_path.write_text(serialized, encoding="utf-8")

    utt_count = len(doc.get("utterances", []))
    provenance = doc.get("provenance", {})
    skipped = provenance.get("xlsx_rows_skipped_empty_text", 0)
    warnings: list[str] = (
        [
            f"{skipped} XLSX row(s) had data in other columns but an empty text cell — "
            "likely an un-evaluated formula. Open the file in Excel once, or export to CSV."
        ]
        if skipped > 0
        else []
    )
    return LegacyIngestResponse(
        source_path=req.source_path,
        normalized_path=str(out_path),
        utterance_count=utt_count,
        status="ok" if utt_count > 0 else "empty",
        text_col_resolved=provenance.get("text_col_resolved"),
        warnings=warnings,
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
__all__ = ["router", "LegacyIngestRequest", "LegacyIngestResponse"]
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""POST /transcribe — orchestrate the full descriptive pipeline (v0.1-01).
|
|
2
|
+
|
|
3
|
+
Body shape mirrors the CLI's `TranscriberClient.transcribe()` contract: the
|
|
4
|
+
client passes the seed root, the session id, and the absolute source path
|
|
5
|
+
(already moved into `sessions/<sid>/source.<ext>` by the inbox watcher).
|
|
6
|
+
|
|
7
|
+
The route returns the transcript path and a status code the worker uses to
|
|
8
|
+
either commit the job, requeue for retry, or surface needs_speaker_labels to
|
|
9
|
+
the researcher.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Annotated
|
|
17
|
+
|
|
18
|
+
from fastapi import APIRouter, Depends, HTTPException, status
|
|
19
|
+
from pydantic import BaseModel, Field
|
|
20
|
+
|
|
21
|
+
from ..asr import ASRConfig
|
|
22
|
+
from ..pipeline import (
|
|
23
|
+
PipelineBackends,
|
|
24
|
+
PipelineConfig,
|
|
25
|
+
run_pipeline,
|
|
26
|
+
write_transcript,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
router = APIRouter()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TranscribeRequest(BaseModel):
    """JSON body for POST /transcribe."""

    # Required fields: seed_path, session_id, source_path.
    seed_path: str = Field(..., description="Absolute path to the seed root (Seeds/<name>/).")
    # Restricted to letters, digits, "_" and "-" by the pattern below.
    session_id: str = Field(..., pattern=r"^[A-Za-z0-9_-]+$")
    source_path: str = Field(..., description="Absolute path to the audio/video file.")
    # Optional ASR settings; the defaults below apply when a field is omitted.
    language: str | None = Field(None, description="Optional language hint (e.g. 'es-CO').")
    model_name: str = Field("large-v3-turbo", description="Whisper model id.")
    device: str = Field("auto", description="Device: auto | cpu | cuda | mps.")
    compute_type: str = Field("int8", description="Compute precision (int8|float16|float32).")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class TranscribeResponse(BaseModel):
    """Response shape mirroring `TranscriberClient.TranscribeResponse`."""

    session_id: str  # the session id from the request
    transcript_path: str  # value returned by write_transcript()
    status: str  # ok | needs_speaker_labels | failed_transcription
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _build_backends() -> PipelineBackends:
    """Return the backend selection injected into the /transcribe handler.

    Every slot is left as None, which tells each pipeline stage to lazy-load
    its real implementation (WhisperX / pyannote / Silero). Tests substitute
    fakes through FastAPI's `app.dependency_overrides`.
    """
    unset_slots = {"vad": None, "asr": None, "diarization": None}
    return PipelineBackends(**unset_slots)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _build_pipeline_config(req: TranscribeRequest) -> PipelineConfig:
    """Turn the request's ASR fields into a PipelineConfig.

    Model, device, compute type and language are copied from the request;
    event tags are always enabled.
    """
    return PipelineConfig(
        asr=ASRConfig(
            model_name=req.model_name,
            device=req.device,
            compute_type=req.compute_type,
            language=req.language,
            event_tags=True,
        )
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@router.post(
    "/transcribe",
    response_model=TranscribeResponse,
    status_code=status.HTTP_200_OK,
    summary="Run the descriptive transcription pipeline on a session's source media.",
)
def post_transcribe(
    req: TranscribeRequest,
    backends: Annotated[PipelineBackends, Depends(_build_backends)],
) -> TranscribeResponse:
    # Flow: check both paths exist, run the pipeline, write the transcript,
    # and report its path and status.
    # Documented with comments on purpose: a handler docstring would be
    # published as the operation description in the generated API schema.
    source_exists = Path(req.source_path).exists()
    if not source_exists:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"source not found: {req.source_path}",
        )
    seed_exists = Path(req.seed_path).exists()
    if not seed_exists:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"seed not found: {req.seed_path}",
        )

    pipeline_config = _build_pipeline_config(req)

    try:
        transcript = run_pipeline(
            seed_path=req.seed_path,
            session_id=req.session_id,
            source_path=req.source_path,
            config=pipeline_config,
            backends=backends,
        )
    except RuntimeError as e:
        # A backend with missing weights is reported separately from a generic
        # failure so the CLI can suggest `compost setup --fix`.
        message = str(e)
        weights_missing = "asr extra" in message.lower() or "HUGGINGFACE_TOKEN" in message
        if not weights_missing:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"failed_transcription: {e}",
            ) from e
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"model_missing: {e}",
        ) from e

    written_path = write_transcript(req.seed_path, req.session_id, transcript)

    return TranscribeResponse(
        session_id=req.session_id,
        transcript_path=written_path,
        status=transcript.get("status", "ok"),
    )
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def hf_token_present() -> bool:
    """Report whether a HuggingFace token is set in the environment.

    Exposed for the /compost-setup doctor. Checks ``HUGGINGFACE_TOKEN`` and
    ``HF_TOKEN``; an empty value counts as absent. The token is not validated
    against pyannote or any other service.
    """
    for variable in ("HUGGINGFACE_TOKEN", "HF_TOKEN"):
        if os.environ.get(variable):
            return True
    return False
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
__all__ = ["router", "TranscribeRequest", "TranscribeResponse", "hf_token_present"]
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Perceptual-hash shot-change detector (#15).
|
|
2
|
+
|
|
3
|
+
Samples the video at a fixed cadence, computes a perceptual hash per sampled
|
|
4
|
+
frame, and reports the timestamps where the hash distance to the previous
|
|
5
|
+
sample crosses a threshold — i.e. a scene cut, slide change, or camera move.
|
|
6
|
+
|
|
7
|
+
Output is a list of at_ms values consumed by the frame extractor (#14) as
|
|
8
|
+
`shot_change` triggers. No classification beyond "something changed".
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import subprocess
|
|
14
|
+
import tempfile
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import imagehash
|
|
18
|
+
from PIL import Image
|
|
19
|
+
|
|
20
|
+
# Default Hamming-distance threshold between consecutive perceptual hashes.
# Tunable via config ([frames].shot_change_phash_distance).
DEFAULT_PHASH_DISTANCE = 12
# Default spacing, in milliseconds, between sampled frames.
DEFAULT_SAMPLE_INTERVAL_MS = 1000
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _sample_frame(video_path: Path, at_ms: int, out_path: Path) -> bool:
    """Extract one 160x90 frame from ``video_path`` at ``at_ms`` with ffmpeg.

    Args:
        video_path: Video file to read.
        at_ms: Seek position in milliseconds.
        out_path: Image file to write; an existing file is overwritten (``-y``).

    Returns:
        True when ffmpeg exits with status 0 and ``out_path`` exists afterwards.

    Raises:
        OSError: If the ``ffmpeg`` executable cannot be started (for example
            ``FileNotFoundError`` when it is not on PATH).
    """
    seek_s = at_ms / 1000.0
    cmd = [
        "ffmpeg",
        "-y",
        "-ss",
        f"{seek_s:.3f}",
        "-i",
        str(video_path),
        "-frames:v",
        "1",
        "-vf",
        "scale=160:90",
        str(out_path),
    ]
    # ffmpeg's output is never inspected, so discard it instead of capturing
    # and decoding it: text-mode decoding can raise UnicodeDecodeError on
    # undecodable bytes (e.g. media metadata echoed to stderr). stdin is
    # detached so ffmpeg cannot consume input meant for the parent process.
    proc = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return proc.returncode == 0 and out_path.exists()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def detect_shot_changes(
    video_path: str | Path,
    duration_ms: int,
    threshold: int = DEFAULT_PHASH_DISTANCE,
    sample_interval_ms: int = DEFAULT_SAMPLE_INTERVAL_MS,
) -> list[int]:
    """Return at_ms timestamps where a shot change is detected.

    Frames are sampled every ``sample_interval_ms`` from 0 up to (not
    including) ``duration_ms``. Each sampled frame is perceptually hashed and
    compared with the previous successfully sampled frame.

    The first sampled frame is never a "change" (no predecessor). Distances at
    or above `threshold` mark a change. A frame that cannot be sampled is
    skipped and the comparison baseline is left unchanged.

    Args:
        video_path: Video file to scan.
        duration_ms: Length of the span to scan, in milliseconds.
        threshold: Minimum hash distance that counts as a change.
        sample_interval_ms: Spacing between samples; must be positive.

    Returns:
        Timestamps in milliseconds, in ascending order.

    Raises:
        ValueError: If ``sample_interval_ms`` is not positive.
    """
    # A zero or negative step would never advance the sampling position and
    # the loop below would not terminate.
    if sample_interval_ms <= 0:
        raise ValueError(f"sample_interval_ms must be positive, got {sample_interval_ms}")

    video_path = Path(video_path)
    changes: list[int] = []
    prev_hash: imagehash.ImageHash | None = None

    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        at = 0
        idx = 0
        while at < duration_ms:
            frame_path = tmp_dir / f"s{idx}.png"
            if _sample_frame(video_path, at, frame_path):
                with Image.open(frame_path) as img:
                    h = imagehash.phash(img)
                # Subtracting two hashes yields their distance.
                if prev_hash is not None and (h - prev_hash) >= threshold:
                    changes.append(at)
                prev_hash = h
            at += sample_interval_ms
            idx += 1
    return changes
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Silence typer — heuristic post-processor that assigns a semantic type to each
|
|
2
|
+
first-class silence (> threshold) from the surrounding utterance context.
|
|
3
|
+
|
|
4
|
+
Types (ROADMAP § Descriptive transcription A):
|
|
5
|
+
- after_question : the silence follows a moderator question
|
|
6
|
+
- mid_utterance : the silence sits inside one speaker's turn
|
|
7
|
+
- thinking : a pre-response pause that isn't clearly after a question
|
|
8
|
+
- interruption : the silence coincides with an overlap/turn-steal
|
|
9
|
+
|
|
10
|
+
Rules are versioned. Researchers can override any assignment downstream; an
|
|
11
|
+
override is recorded as a `researcher`-authored event in the provenance log
|
|
12
|
+
(see issue #12 / provenance writer #27).
|
|
13
|
+
|
|
14
|
+
CHANGELOG
|
|
15
|
+
v1 (2026-06-03): initial rule set.
|
|
16
|
+
- after_question: previous utterance is a moderator AND ends with '?'
|
|
17
|
+
(or a leading inverted '¿' question), and abuts the silence start.
|
|
18
|
+
- interruption: an overlap/interruption cue overlaps the silence window,
|
|
19
|
+
OR previous and next utterances are different speakers and the previous
|
|
20
|
+
did not end on sentence-final punctuation (cut off).
|
|
21
|
+
- mid_utterance: previous and next utterances are the same speaker.
|
|
22
|
+
- thinking: default.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
# Version label of the rule set implemented in this module.
RULES_VERSION = "1"

# The four labels a silence can receive.
_SILENCE_TYPES = ("after_question", "mid_utterance", "thinking", "interruption")

# How close (ms) the previous utterance's end must be to the silence start for
# the silence to be considered "abutting" that utterance.
_ABUT_TOLERANCE_MS = 250

# Trailing characters that mark an utterance as finished rather than cut off.
_SENTENCE_FINAL = (".", "!", "?", "…")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ends_question(text: str) -> bool:
    """Return True when ``text`` reads as a question.

    Two cases qualify once trailing whitespace is ignored: the text ends with
    "?", or it contains both an inverted "¿" and a "?" anywhere (for example a
    Spanish question followed by more words).

    NOTE(review): an opening "¿" with no "?" at all is NOT treated as a
    question here, although the previous inline comment suggested it should
    be -- confirm the intended rule.
    """
    trimmed = text.rstrip()
    has_spanish_pair = "¿" in trimmed and "?" in trimmed
    return trimmed.endswith("?") or has_spanish_pair
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _ends_sentence_final(text: str) -> bool:
    """Return True when ``text``, ignoring trailing whitespace, ends with one
    of the sentence-final marks in ``_SENTENCE_FINAL``."""
    trimmed = text.rstrip()
    return any(trimmed.endswith(mark) for mark in _SENTENCE_FINAL)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _speaker_type(speakers: list[dict[str, Any]], speaker_id: str | None) -> str | None:
    """Look up the ``type`` of the first speaker whose ``id`` equals ``speaker_id``.

    Returns None when ``speaker_id`` is None, when no speaker matches, or when
    the matching speaker has no ``type`` entry.
    """
    if speaker_id is None:
        return None
    found = next((entry for entry in speakers if entry.get("id") == speaker_id), None)
    return None if found is None else found.get("type")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _cue_overlaps(silence: dict[str, Any], cues: list[dict[str, Any]]) -> bool:
    """Return True when an overlap/interruption cue intersects the silence.

    Only cues whose ``kind`` is "overlap" or "interruption" are considered.
    The intervals are compared inclusively, so a cue that merely touches the
    silence boundary counts as overlapping.
    """
    window_start = silence["start_ms"]
    window_end = silence["end_ms"]
    return any(
        cue["start_ms"] <= window_end and cue["end_ms"] >= window_start
        for cue in cues
        if cue.get("kind") in ("overlap", "interruption")
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def type_silence(
    silence: dict[str, Any],
    prev_utt: dict[str, Any] | None,
    next_utt: dict[str, Any] | None,
    speakers: list[dict[str, Any]],
    cues: list[dict[str, Any]] | None = None,
) -> str:
    """Classify one silence as one of the four silence types.

    Rules are applied in this order; the first match wins:

    1. "interruption" -- an overlap/interruption cue intersects the silence.
    2. "after_question" -- the previous utterance abuts the silence, was
       spoken by a moderator, and reads as a question.
    3. "mid_utterance" -- previous and next utterances share a speaker id.
    4. "interruption" -- the speakers differ and the previous utterance does
       not end on sentence-final punctuation.
    5. "thinking" -- everything else, including a missing neighbour.
    """
    if _cue_overlaps(silence, cues or []):
        return "interruption"

    # Without a preceding utterance none of the remaining rules can apply.
    if prev_utt is None:
        return "thinking"

    prev_speaker = prev_utt.get("speaker_id")
    distance_ms = abs(silence["start_ms"] - prev_utt["end_ms"])
    prev_role = _speaker_type(speakers, prev_speaker)
    if (
        distance_ms <= _ABUT_TOLERANCE_MS
        and prev_role == "moderator"
        and _ends_question(prev_utt.get("text", ""))
    ):
        return "after_question"

    if next_utt is None:
        return "thinking"

    if prev_speaker == next_utt.get("speaker_id"):
        return "mid_utterance"

    # Different speakers: a previous turn with no closing punctuation is
    # treated as cut off.
    if not _ends_sentence_final(prev_utt.get("text", "")):
        return "interruption"

    return "thinking"
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _utterance_before(utterances: list[dict[str, Any]], at_ms: int) -> dict[str, Any] | None:
    """Return the utterance that ends latest among those ending no later than
    ``at_ms`` plus the abut tolerance, or None if there is none.

    When several utterances share the latest end time, the first one in list
    order is returned.
    """
    latest_allowed_end = at_ms + _ABUT_TOLERANCE_MS
    eligible = (u for u in utterances if u["end_ms"] <= latest_allowed_end)
    return max(eligible, key=lambda u: u["end_ms"], default=None)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _utterance_after(utterances: list[dict[str, Any]], at_ms: int) -> dict[str, Any] | None:
    """Return the utterance that starts earliest among those starting no
    earlier than ``at_ms`` minus the abut tolerance, or None if there is none.

    When several utterances share the earliest start time, the first one in
    list order is returned.
    """
    earliest_allowed_start = at_ms - _ABUT_TOLERANCE_MS
    eligible = (u for u in utterances if u["start_ms"] >= earliest_allowed_start)
    return min(eligible, key=lambda u: u["start_ms"], default=None)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def type_all_silences(transcript: dict[str, Any]) -> dict[str, Any]:
    """Write a ``context`` type onto every silence of ``transcript``.

    For each silence, the neighbouring utterances are located by a linear scan
    and the silence is classified from them, the speaker list and the cues.
    The transcript is modified in place and also returned. Running it again
    recomputes the same labels, since only ``context`` is written.
    """
    all_utterances = transcript.get("utterances", [])
    all_cues = transcript.get("cues", [])
    all_speakers = transcript.get("speakers", [])
    for gap in transcript.get("silences", []):
        before = _utterance_before(all_utterances, gap["start_ms"])
        after = _utterance_after(all_utterances, gap["end_ms"])
        gap["context"] = type_silence(gap, before, after, all_speakers, all_cues)
    return transcript
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Native (host) transcription entrypoint (#176).
|
|
2
|
+
|
|
3
|
+
Runs the full pipeline ON THE HOST (no Docker) so Apple-Silicon ASR backends
|
|
4
|
+
(`parakeet-mlx` / Metal) and pyannote use the GPU/CPU directly — the Docker
|
|
5
|
+
container is CPU-only on macOS, which is the bottleneck this path removes. The
|
|
6
|
+
Node CLI shells out to this when `transcriber.runtime = native`; the Docker
|
|
7
|
+
`/transcribe` route stays the cross-platform fallback and shares the exact same
|
|
8
|
+
`run_pipeline` orchestration.
|
|
9
|
+
|
|
10
|
+
python -m app.transcribe_cli \
|
|
11
|
+
--seed-path <seed> --session-id S001 \
|
|
12
|
+
--source-path <seed>/sessions/S001/source.mp3 \
|
|
13
|
+
--engine parakeet --language en
|
|
14
|
+
|
|
15
|
+
Prints one JSON line mirroring the /transcribe response shape so the Node
|
|
16
|
+
caller parses both paths identically.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import json
|
|
23
|
+
|
|
24
|
+
from .asr import ASRConfig
|
|
25
|
+
from .pipeline import PipelineBackends, PipelineConfig, run_pipeline, write_transcript
|
|
26
|
+
|
|
27
|
+
# Model id used for each --engine choice when --model is not given.
_DEFAULT_MODEL = {
    "parakeet": "mlx-community/parakeet-tdt-0.6b-v3",
    "whisper": "large-v3-turbo",
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the transcription pipeline from the command line.

    Parses the arguments, runs the pipeline, writes the transcript and prints
    exactly one JSON line on stdout describing the outcome.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process's command line.

    Returns:
        0 on success. 1 when the pipeline or the transcript write raised; in
        that case the JSON line is ``{"status": "failed", "error": ...}``.
    """
    p = argparse.ArgumentParser(prog="compost-transcribe-native")
    p.add_argument("--seed-path", required=True)
    p.add_argument("--session-id", required=True)
    p.add_argument("--source-path", required=True)
    p.add_argument("--engine", default="parakeet", choices=["parakeet", "whisper"])
    p.add_argument("--model", default=None, help="ASR model id (engine default if omitted)")
    p.add_argument("--language", default=None)
    p.add_argument("--device", default="auto")
    p.add_argument("--compute-type", default="int8")
    args = p.parse_args(argv)

    asr = ASRConfig(
        # An explicit --model wins; otherwise use the engine's default model.
        model_name=args.model or _DEFAULT_MODEL[args.engine],
        device=args.device,
        compute_type=args.compute_type,
        language=args.language,
        engine=args.engine,
    )
    config = PipelineConfig(asr=asr, asr_model_tag=f"{asr.model_name} ({args.engine})")

    try:
        transcript = run_pipeline(
            seed_path=args.seed_path,
            session_id=args.session_id,
            source_path=args.source_path,
            config=config,
            backends=PipelineBackends(),  # all None → real lazy backends (Silero / engine ASR / pyannote)
        )
        # Writing happens inside the guarded region so a failed write is also
        # reported as a JSON line rather than an uncaught traceback.
        path = write_transcript(args.seed_path, args.session_id, transcript)
    except Exception as e:  # surface as JSON so the Node caller can report it
        print(json.dumps({"status": "failed", "error": str(e)}))
        return 1

    print(
        json.dumps(
            {
                "session_id": args.session_id,
                "transcript_path": path,
                "status": transcript.get("status", "ok"),
                "engine": args.engine,
                "model": asr.model_name,
            }
        )
    )
    return 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
raise SystemExit(main())
|