@they-juanreina/compost-cli 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/dist/lib/blame.d.ts.map +1 -1
  2. package/dist/lib/blame.js +3 -2
  3. package/dist/lib/blame.js.map +1 -1
  4. package/dist/lib/ingest.d.ts +1 -0
  5. package/dist/lib/ingest.d.ts.map +1 -1
  6. package/dist/lib/ingest.js +46 -15
  7. package/dist/lib/ingest.js.map +1 -1
  8. package/dist/lib/journal.d.ts.map +1 -1
  9. package/dist/lib/journal.js +9 -0
  10. package/dist/lib/journal.js.map +1 -1
  11. package/dist/lib/migrate.d.ts.map +1 -1
  12. package/dist/lib/migrate.js +1 -0
  13. package/dist/lib/migrate.js.map +1 -1
  14. package/dist/lib/nativeRuntime.d.ts +18 -3
  15. package/dist/lib/nativeRuntime.d.ts.map +1 -1
  16. package/dist/lib/nativeRuntime.js +54 -3
  17. package/dist/lib/nativeRuntime.js.map +1 -1
  18. package/dist/lib/retrieve.d.ts.map +1 -1
  19. package/dist/lib/retrieve.js +0 -8
  20. package/dist/lib/retrieve.js.map +1 -1
  21. package/dist/lib/seedResolve.d.ts +5 -0
  22. package/dist/lib/seedResolve.d.ts.map +1 -1
  23. package/dist/lib/seedResolve.js +44 -4
  24. package/dist/lib/seedResolve.js.map +1 -1
  25. package/dist/lib/setup.d.ts.map +1 -1
  26. package/dist/lib/setup.js +27 -6
  27. package/dist/lib/setup.js.map +1 -1
  28. package/dist/lib/snap.d.ts.map +1 -1
  29. package/dist/lib/snap.js +2 -5
  30. package/dist/lib/snap.js.map +1 -1
  31. package/dist/loops/supervisor.d.ts.map +1 -1
  32. package/dist/loops/supervisor.js +1 -0
  33. package/dist/loops/supervisor.js.map +1 -1
  34. package/dist/loops/transcribe_worker.d.ts.map +1 -1
  35. package/dist/loops/transcribe_worker.js +0 -1
  36. package/dist/loops/transcribe_worker.js.map +1 -1
  37. package/dist/router.js +1 -1
  38. package/package.json +10 -4
  39. package/transcriber/app/__init__.py +3 -0
  40. package/transcriber/app/asr.py +198 -0
  41. package/transcriber/app/asr_parakeet.py +174 -0
  42. package/transcriber/app/cue_parser.py +110 -0
  43. package/transcriber/app/diarization.py +300 -0
  44. package/transcriber/app/frame_annotation.py +77 -0
  45. package/transcriber/app/frames.py +130 -0
  46. package/transcriber/app/health.py +70 -0
  47. package/transcriber/app/legacy.py +355 -0
  48. package/transcriber/app/main.py +30 -0
  49. package/transcriber/app/pipeline.py +204 -0
  50. package/transcriber/app/pptx_export.py +42 -0
  51. package/transcriber/app/prosody.py +123 -0
  52. package/transcriber/app/routes/__init__.py +1 -0
  53. package/transcriber/app/routes/legacy.py +117 -0
  54. package/transcriber/app/routes/transcribe.py +133 -0
  55. package/transcriber/app/shot_change.py +74 -0
  56. package/transcriber/app/silence_typer.py +144 -0
  57. package/transcriber/app/transcribe_cli.py +82 -0
  58. package/transcriber/app/vad.py +145 -0
  59. package/transcriber/pyproject.toml +56 -0
@@ -0,0 +1,123 @@
1
+ """Prosody hint extractor (#13).
2
+
3
+ Deterministic per-utterance hints derived from word timings, optional VAD
4
+ energy, and speech rate. No ML model — cheap, reproducible context.
5
+
6
+ Output shape (matches transcript.schema.json #/$defs/prosody):
7
+ {"volume": "low|normal|high", "pace": "slow|normal|fast", "hesitations": int}
8
+
9
+ Thresholds are module constants, documented here for reproducibility:
10
+
11
+ pace (words per second over the utterance span):
12
+ < 2.0 → slow
13
+ > 3.3 → fast
14
+ else → normal
15
+
16
+ volume (mean VAD RMS energy, normalized 0..1; requires the energy signal
17
+ from Silero VAD, issue #9). When energy is unavailable we report "normal"
18
+ rather than guess:
19
+ < 0.33 → low
20
+ > 0.66 → high
21
+ else → normal
22
+
23
+ hesitations = filler tokens + immediate word repetitions + long
24
+ intra-utterance gaps (> 400 ms between consecutive words).
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import re
30
+ from typing import Any
31
+
32
+ PACE_SLOW_WPS = 2.0
33
+ PACE_FAST_WPS = 3.3
34
+ VOLUME_LOW = 0.33
35
+ VOLUME_HIGH = 0.66
36
+ HESITATION_GAP_MS = 400
37
+
38
+ # Multilingual (es-CO + en) filler set.
39
+ _FILLERS = {
40
+ "uh",
41
+ "um",
42
+ "eh",
43
+ "em",
44
+ "este",
45
+ "esto",
46
+ "mmm",
47
+ "hmm",
48
+ "like",
49
+ "pues",
50
+ }
51
+ _FILLER_PHRASES = ("o sea", "you know", "es decir")
52
+
53
+ _WORD_RE = re.compile(r"[^\W\d_]+", re.UNICODE)
54
+
55
+
56
def _pace(text: str, start_ms: int, end_ms: int) -> str:
    """Classify the speech rate of one utterance as ``slow``, ``normal`` or ``fast``.

    The rate is the number of word tokens matched by ``_WORD_RE`` in ``text``
    divided by the utterance span in seconds. It is compared against
    ``PACE_SLOW_WPS`` and ``PACE_FAST_WPS``.

    Args:
        text: Utterance text.
        start_ms: Utterance start, in milliseconds.
        end_ms: Utterance end, in milliseconds.

    Returns:
        ``"slow"``, ``"fast"`` or ``"normal"``. When no rate can be measured
        (empty or negative span, or no word tokens in ``text``) the neutral
        ``"normal"`` is returned instead of guessing.
    """
    span_ms = end_ms - start_ms
    n_words = len(_WORD_RE.findall(text))

    # A degenerate span used to be clamped to 1e-6 s, which turned any text
    # into "fast"; wordless text always came out "slow". Neither is a
    # measurement, so report the neutral bucket.
    if span_ms <= 0 or n_words == 0:
        return "normal"

    duration_s = max(span_ms / 1000.0, 1e-6)
    wps = n_words / duration_s
    if wps < PACE_SLOW_WPS:
        return "slow"
    if wps > PACE_FAST_WPS:
        return "fast"
    return "normal"
65
+
66
+
67
def _volume(energy: float | None) -> str:
    """Bucket a mean energy reading into ``low`` / ``normal`` / ``high``.

    ``None`` means no energy signal was supplied; it maps to ``"normal"``.
    Values strictly below ``VOLUME_LOW`` are ``"low"``, values strictly above
    ``VOLUME_HIGH`` are ``"high"``, everything else is ``"normal"``.
    """
    if energy is not None:
        if energy < VOLUME_LOW:
            return "low"
        if energy > VOLUME_HIGH:
            return "high"
    return "normal"
75
+
76
+
77
def _count_hesitations(text: str, words: list[dict[str, Any]] | None) -> int:
    """Count hesitation markers in a single utterance.

    Four signals are summed:

    * single-token fillers listed in ``_FILLERS``;
    * multi-word fillers listed in ``_FILLER_PHRASES``, matched on whole
      tokens so that e.g. "no sea" is not mistaken for "o sea";
    * immediate repetitions of a token longer than one character;
    * gaps longer than ``HESITATION_GAP_MS`` between consecutive timed words.

    Args:
        text: Utterance text.
        words: Optional per-word timing dicts. The end of a word is read from
            key ``"e"`` and the start from key ``"s"``.

    Returns:
        The total number of hesitation markers found.
    """
    tokens = [t.lower() for t in _WORD_RE.findall(text)]

    # Single-token fillers.
    count = sum(1 for t in tokens if t in _FILLERS)

    # Multi-word fillers: compare token windows rather than raw substrings, so
    # a phrase cannot match across a word boundary ("puedes decir" is not
    # "es decir") and irregular whitespace does not hide a real match.
    for phrase in _FILLER_PHRASES:
        parts = phrase.lower().split()
        width = len(parts)
        if width == 0:
            continue
        count += sum(
            1
            for i in range(len(tokens) - width + 1)
            if tokens[i : i + width] == parts
        )

    # Immediate repetitions ("yo yo", "the the"). Single letters are skipped.
    for a, b in zip(tokens, tokens[1:], strict=False):
        if a == b and len(a) > 1:
            count += 1

    # Long gaps between consecutive words. A word without timing is skipped:
    # defaulting its end to 0 would register a spurious long gap.
    if words:
        for prev, nxt in zip(words, words[1:], strict=False):
            prev_end = prev.get("e")
            next_start = nxt.get("s")
            if prev_end is None or next_start is None:
                continue
            if next_start - prev_end > HESITATION_GAP_MS:
                count += 1

    return count
101
+
102
+
103
def extract_prosody(utterance: dict[str, Any], energy: float | None = None) -> dict[str, Any]:
    """Build the ``{volume, pace, hesitations}`` hint dict for one utterance.

    Reads ``text`` (defaulting to an empty string), ``start_ms``, ``end_ms``
    and the optional ``words`` list from ``utterance``. ``energy`` is passed
    straight to the volume classifier.
    """
    text = utterance.get("text", "")
    volume = _volume(energy)
    pace = _pace(text, utterance["start_ms"], utterance["end_ms"])
    hesitations = _count_hesitations(text, utterance.get("words"))
    return {"volume": volume, "pace": pace, "hesitations": hesitations}
111
+
112
+
113
def annotate_prosody(
    transcript: dict[str, Any],
    energies: dict[str, float] | None = None,
) -> dict[str, Any]:
    """Set a ``prosody`` entry on each utterance of ``transcript``.

    ``energies`` maps an utterance id to its mean VAD RMS energy (0..1).
    Utterances with no entry are processed without an energy value. The
    transcript is modified in place and also returned.
    """
    energy_by_id = energies if energies else {}
    for utterance in transcript.get("utterances", []):
        utterance_energy = energy_by_id.get(utterance.get("id"))
        utterance["prosody"] = extract_prosody(utterance, utterance_energy)
    return transcript
@@ -0,0 +1 @@
1
+ """HTTP route modules. Each subsystem (#9-#15) mounts its own router."""
@@ -0,0 +1,117 @@
1
+ """POST /legacy-ingest — normalize a legacy document into a transcript.json.
2
+
3
+ The Node-side legacy-worker (cli/src/loops/legacy_worker.ts) pulls
4
+ `legacy-ingest` jobs from the queue and POSTs each here. The route dispatches
5
+ by file extension to the pure ingestors in `app/legacy.py`, then writes the
6
+ normalized JSON to `<seed>/legacy/<basename>.json`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ from fastapi import APIRouter, HTTPException, status
16
+ from pydantic import BaseModel, Field
17
+
18
+ from ..legacy import ingest as ingest_legacy
19
+
20
+ router = APIRouter()
21
+
22
+
23
class LegacyIngestRequest(BaseModel):
    """JSON body for POST /legacy-ingest."""

    seed_path: str = Field(..., description="Absolute path to the seed root.")
    source_path: str = Field(..., description="Absolute path to the asset to ingest.")

    # Column mapping for tabular sources (CSV/XLSX). Leaving text_col unset
    # lets the ingestor pick the column from the header, trying in order:
    # text, transcript, content, utterance, quote, message, body, and finally
    # the first column. Node-side workers may additionally read a
    # `<source_path>.compost.json` sidecar, which takes precedence.
    text_col: str | None = Field(
        default=None,
        description="Column holding the utterance text (CSV/XLSX). Auto-detected if None.",
    )
    speaker_col: str | None = Field(
        default=None,
        description="Optional column for speaker label.",
    )
    sheet: str | None = Field(
        default=None,
        description="Optional XLSX sheet name (defaults to active).",
    )
36
+
37
+
38
class LegacyIngestResponse(BaseModel):
    """Response body for POST /legacy-ingest."""

    source_path: str
    normalized_path: str
    utterance_count: int
    # One of: ok | empty | failed
    status: str
    # Column that was actually used for utterance text (CSV/XLSX only).
    text_col_resolved: str | None = None
    # Hints surfaced to the user, e.g. XLSX rows with un-evaluated formulas.
    warnings: list[str] = []
45
+
46
+
47
@router.post(
    "/legacy-ingest",
    response_model=LegacyIngestResponse,
    status_code=status.HTTP_200_OK,
    summary="Normalize a PDF/DOCX/PPTX/CSV/XLSX/TXT/MD into a transcript-shaped JSON.",
)
def post_legacy_ingest(req: LegacyIngestRequest) -> LegacyIngestResponse:
    """Ingest one legacy document and write its normalized JSON under the seed.

    The result is written to ``<seed>/legacy/<source stem>.json``.

    Raises:
        HTTPException: 404 when the source or the seed path does not exist;
            422 when the ingestor raises ``ValueError``; 503 when it raises
            ``RuntimeError``.
    """
    src = Path(req.source_path)
    seed = Path(req.seed_path)

    if not src.exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"source not found: {req.source_path}",
        )
    if not seed.exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"seed not found: {req.seed_path}",
        )

    # Forward only the options the caller actually set, so the ingestor's own
    # defaults and auto-detection apply to the rest.
    requested = {
        "text_col": req.text_col,
        "speaker_col": req.speaker_col,
        "sheet": req.sheet,
    }
    kwargs: dict[str, Any] = {
        name: value for name, value in requested.items() if value is not None
    }

    try:
        doc = ingest_legacy(src, **kwargs)
    except ValueError as e:
        # Unsupported extension or missing column: 422 lets the worker mark the
        # job failed and the CLI show the researcher what is wrong.
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"invalid_input: {e}",
        ) from e
    except RuntimeError as e:
        # Missing optional dependency (python-docx, openpyxl, ...): 503 lets
        # the CLI route to `compost setup --fix`.
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"dep_missing: {e}",
        ) from e

    # Persist the normalized document as <seed>/legacy/<basename>.json.
    out_path = seed / "legacy" / f"{src.stem}.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(doc, indent=2, ensure_ascii=False) + "\n"
    out_path.write_text(serialized, encoding="utf-8")

    utt_count = len(doc.get("utterances", []))
    prov = doc.get("provenance", {})
    skipped = prov.get("xlsx_rows_skipped_empty_text", 0)
    warnings: list[str] = (
        [
            f"{skipped} XLSX row(s) had data in other columns but an empty text cell — "
            "likely an un-evaluated formula. Open the file in Excel once, or export to CSV."
        ]
        if skipped > 0
        else []
    )

    return LegacyIngestResponse(
        source_path=req.source_path,
        normalized_path=str(out_path),
        utterance_count=utt_count,
        status="ok" if utt_count > 0 else "empty",
        text_col_resolved=prov.get("text_col_resolved"),
        warnings=warnings,
    )
115
+
116
+
117
+ __all__ = ["router", "LegacyIngestRequest", "LegacyIngestResponse"]
@@ -0,0 +1,133 @@
1
+ """POST /transcribe — orchestrate the full descriptive pipeline (v0.1-01).
2
+
3
+ Body shape mirrors the CLI's `TranscriberClient.transcribe()` contract: the
4
+ client passes the seed root, the session id, and the absolute source path
5
+ (already moved into `sessions/<sid>/source.<ext>` by the inbox watcher).
6
+
7
+ The route returns the transcript path and a status code the worker uses to
8
+ either commit the job, requeue for retry, or surface needs_speaker_labels to
9
+ the researcher.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from pathlib import Path
16
+ from typing import Annotated
17
+
18
+ from fastapi import APIRouter, Depends, HTTPException, status
19
+ from pydantic import BaseModel, Field
20
+
21
+ from ..asr import ASRConfig
22
+ from ..pipeline import (
23
+ PipelineBackends,
24
+ PipelineConfig,
25
+ run_pipeline,
26
+ write_transcript,
27
+ )
28
+
29
+ router = APIRouter()
30
+
31
+
32
class TranscribeRequest(BaseModel):
    """Request body accepted by POST /transcribe."""

    seed_path: str = Field(..., description="Absolute path to the seed root (Seeds/<name>/).")
    # Restricted to letters, digits, underscore and hyphen.
    session_id: str = Field(..., pattern=r"^[A-Za-z0-9_-]+$")
    source_path: str = Field(..., description="Absolute path to the audio/video file.")
    language: str | None = Field(
        default=None,
        description="Optional language hint (e.g. 'es-CO').",
    )
    model_name: str = Field(default="large-v3-turbo", description="Whisper model id.")
    device: str = Field(default="auto", description="Device: auto | cpu | cuda | mps.")
    compute_type: str = Field(
        default="int8",
        description="Compute precision (int8|float16|float32).",
    )
42
+
43
+
44
class TranscribeResponse(BaseModel):
    """Response body of POST /transcribe, mirroring `TranscriberClient.TranscribeResponse`."""

    session_id: str
    transcript_path: str
    # One of: ok | needs_speaker_labels | failed_transcription
    status: str
50
+
51
+
52
def _build_backends() -> PipelineBackends:
    """Choose which backends to inject into the pipeline.

    This dependency only decides *which* backend goes in each slot; loading
    is left to each backend's own module. In production every slot is None,
    which makes each module fall back to its real implementation
    (WhisperX / pyannote / Silero). Tests replace this function through
    FastAPI's `app.dependency_overrides`.
    """
    return PipelineBackends(
        vad=None,
        asr=None,
        diarization=None,
    )
61
+
62
+
63
def _build_pipeline_config(req: TranscribeRequest) -> PipelineConfig:
    """Translate the request's ASR options into a ``PipelineConfig``.

    Model, device, compute type and language come from the request;
    ``event_tags`` is always enabled.
    """
    return PipelineConfig(
        asr=ASRConfig(
            model_name=req.model_name,
            device=req.device,
            compute_type=req.compute_type,
            language=req.language,
            event_tags=True,
        )
    )
72
+
73
+
74
def _is_model_missing(message: str) -> bool:
    """Return True when a backend error message points at a missing model extra or token.

    Matching is case-insensitive. Both spellings of the HuggingFace token
    variable are recognized, since either may be set in the environment.
    """
    lowered = message.lower()
    return any(
        marker in lowered
        for marker in ("asr extra", "huggingface_token", "hf_token")
    )


@router.post(
    "/transcribe",
    response_model=TranscribeResponse,
    status_code=status.HTTP_200_OK,
    summary="Run the descriptive transcription pipeline on a session's source media.",
)
def post_transcribe(
    req: TranscribeRequest,
    backends: Annotated[PipelineBackends, Depends(_build_backends)],
) -> TranscribeResponse:
    """Run the pipeline for one session and write its transcript.

    Args:
        req: Seed root, session id, source path and ASR options.
        backends: Injected backends (overridable in tests).

    Returns:
        The session id, the path returned by ``write_transcript``, and the
        transcript's ``status`` (``"ok"`` when the transcript has none).

    Raises:
        HTTPException: 404 when the source or seed path does not exist; 503
            (``model_missing``) when the pipeline raises a ``RuntimeError``
            about a missing ASR extra or HuggingFace token; 500
            (``failed_transcription``) for any other ``RuntimeError``.
    """
    if not Path(req.source_path).exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"source not found: {req.source_path}",
        )
    if not Path(req.seed_path).exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"seed not found: {req.seed_path}",
        )

    config = _build_pipeline_config(req)

    try:
        transcript = run_pipeline(
            seed_path=req.seed_path,
            session_id=req.session_id,
            source_path=req.source_path,
            config=config,
            backends=backends,
        )
    except RuntimeError as e:
        # Missing weights or credentials are kept distinguishable from a
        # generic failure so the CLI can suggest `compost setup --fix`.
        if _is_model_missing(str(e)):
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail=f"model_missing: {e}",
            ) from e
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"failed_transcription: {e}",
        ) from e

    transcript_path = write_transcript(req.seed_path, req.session_id, transcript)

    return TranscribeResponse(
        session_id=req.session_id,
        transcript_path=transcript_path,
        status=transcript.get("status", "ok"),
    )
125
+
126
+
127
def hf_token_present() -> bool:
    """Report whether a HuggingFace token is set in the environment.

    Exposed for the /compost-setup doctor. Checks ``HUGGINGFACE_TOKEN`` and
    ``HF_TOKEN`` for a non-empty value; it does NOT validate that the token
    works against pyannote.
    """
    return any(os.environ.get(name) for name in ("HUGGINGFACE_TOKEN", "HF_TOKEN"))
131
+
132
+
133
+ __all__ = ["router", "TranscribeRequest", "TranscribeResponse", "hf_token_present"]
@@ -0,0 +1,74 @@
1
+ """Perceptual-hash shot-change detector (#15).
2
+
3
+ Samples the video at a fixed cadence, computes a perceptual hash per sampled
4
+ frame, and reports the timestamps where the hash distance to the previous
5
+ sample crosses a threshold — i.e. a scene cut, slide change, or camera move.
6
+
7
+ Output is a list of at_ms values consumed by the frame extractor (#14) as
8
+ `shot_change` triggers. No classification beyond "something changed".
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import subprocess
14
+ import tempfile
15
+ from pathlib import Path
16
+
17
+ import imagehash
18
+ from PIL import Image
19
+
20
+ # Default Hamming-distance threshold between consecutive perceptual hashes.
21
+ # Tunable via config ([frames].shot_change_phash_distance).
22
+ DEFAULT_PHASH_DISTANCE = 12
23
+ DEFAULT_SAMPLE_INTERVAL_MS = 1000
24
+
25
+
26
def _sample_frame(video_path: Path, at_ms: int, out_path: Path) -> bool:
    """Ask ffmpeg to write one 160x90 frame taken at ``at_ms`` to ``out_path``.

    Returns True only when ffmpeg exits with status 0 and ``out_path`` exists
    afterwards. ffmpeg's output is captured and discarded.
    """
    seek_seconds = f"{at_ms / 1000.0:.3f}"
    result = subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-ss",
            seek_seconds,
            "-i",
            str(video_path),
            "-frames:v",
            "1",
            "-vf",
            "scale=160:90",
            str(out_path),
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        return False
    return out_path.exists()
43
+
44
+
45
def detect_shot_changes(
    video_path: str | Path,
    duration_ms: int,
    threshold: int = DEFAULT_PHASH_DISTANCE,
    sample_interval_ms: int = DEFAULT_SAMPLE_INTERVAL_MS,
) -> list[int]:
    """Return at_ms timestamps where a shot change is detected.

    The video is sampled every ``sample_interval_ms`` from 0 up to (but not
    including) ``duration_ms``. Each sampled frame is perceptually hashed and
    compared with the previous successfully sampled frame. Distances at or
    above ``threshold`` mark a change. The first sampled frame is never a
    "change" (no predecessor). Samples that ffmpeg fails to produce are
    skipped.

    Args:
        video_path: Path to the video file.
        duration_ms: Length of the span to scan, in milliseconds.
        threshold: Minimum hash distance that counts as a change.
        sample_interval_ms: Sampling cadence in milliseconds; must be positive.

    Raises:
        ValueError: If ``sample_interval_ms`` is not positive.
    """
    # A zero or negative step would never advance `at` and loop forever.
    if sample_interval_ms <= 0:
        raise ValueError(
            f"sample_interval_ms must be positive, got {sample_interval_ms}"
        )

    video_path = Path(video_path)
    changes: list[int] = []
    prev_hash: imagehash.ImageHash | None = None

    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        at = 0
        idx = 0
        while at < duration_ms:
            frame_path = tmp_dir / f"s{idx}.png"
            if _sample_frame(video_path, at, frame_path):
                try:
                    with Image.open(frame_path) as img:
                        h = imagehash.phash(img)
                finally:
                    # Only the hash is needed; dropping the frame right away
                    # keeps temp usage constant instead of growing with the
                    # video length.
                    frame_path.unlink(missing_ok=True)
                if prev_hash is not None and (h - prev_hash) >= threshold:
                    changes.append(at)
                prev_hash = h
            at += sample_interval_ms
            idx += 1
    return changes
@@ -0,0 +1,144 @@
1
+ """Silence typer — heuristic post-processor that assigns a semantic type to each
2
+ first-class silence (> threshold) from the surrounding utterance context.
3
+
4
+ Types (ROADMAP § Descriptive transcription A):
5
+ - after_question : the silence follows a moderator question
6
+ - mid_utterance : the silence sits inside one speaker's turn
7
+ - thinking : a pre-response pause that isn't clearly after a question
8
+ - interruption : the silence coincides with an overlap/turn-steal
9
+
10
+ Rules are versioned. Researchers can override any assignment downstream; an
11
+ override is recorded as a `researcher`-authored event in the provenance log
12
+ (see issue #12 / provenance writer #27).
13
+
14
+ CHANGELOG
15
+ v1 (2026-06-03): initial rule set.
16
+ - after_question: previous utterance is a moderator AND ends with '?'
17
+ (or a leading inverted '¿' question), and abuts the silence start.
18
+ - interruption: an overlap/interruption cue overlaps the silence window,
19
+ OR previous and next utterances are different speakers and the previous
20
+ did not end on sentence-final punctuation (cut off).
21
+ - mid_utterance: previous and next utterances are the same speaker.
22
+ - thinking: default.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import Any
28
+
29
+ RULES_VERSION = "1"
30
+
31
+ _SILENCE_TYPES = ("after_question", "mid_utterance", "thinking", "interruption")
32
+
33
+ # How close (ms) the previous utterance's end must be to the silence start for
34
+ # the silence to be considered "abutting" that utterance.
35
+ _ABUT_TOLERANCE_MS = 250
36
+
37
+ _SENTENCE_FINAL = (".", "!", "?", "…")
38
+
39
+
40
def _ends_question(text: str) -> bool:
    """Return True when ``text`` reads as a question.

    A text qualifies when, ignoring trailing whitespace, it ends with ``?``,
    or when it contains a Spanish opening ``¿`` anywhere. The second rule
    covers a question whose closing mark is missing, and a question followed
    by a short trailing remark.
    """
    stripped = text.rstrip()
    if stripped.endswith("?"):
        return True
    # An opening '¿' marks a question even when no closing '?' was written.
    # The previous check also demanded a '?', which defeated this fallback.
    return "¿" in stripped
46
+
47
+
48
def _ends_sentence_final(text: str) -> bool:
    """Return True when ``text``, minus trailing whitespace, ends with one of ``_SENTENCE_FINAL``."""
    return text.rstrip().endswith(_SENTENCE_FINAL)
51
+
52
+
53
+ def _speaker_type(speakers: list[dict[str, Any]], speaker_id: str | None) -> str | None:
54
+ if speaker_id is None:
55
+ return None
56
+ for s in speakers:
57
+ if s.get("id") == speaker_id:
58
+ return s.get("type")
59
+ return None
60
+
61
+
62
def _cue_overlaps(silence: dict[str, Any], cues: list[dict[str, Any]]) -> bool:
    """Return True when an ``overlap`` or ``interruption`` cue intersects the silence window.

    Both ends are inclusive: a cue that merely touches the window counts.
    Cues of any other kind are ignored without reading their timestamps.
    """
    window_start = silence["start_ms"]
    window_end = silence["end_ms"]
    return any(
        cue.get("kind") in ("overlap", "interruption")
        and cue["start_ms"] <= window_end
        and cue["end_ms"] >= window_start
        for cue in cues
    )
72
+
73
+
74
def type_silence(
    silence: dict[str, Any],
    prev_utt: dict[str, Any] | None,
    next_utt: dict[str, Any] | None,
    speakers: list[dict[str, Any]],
    cues: list[dict[str, Any]] | None = None,
) -> str:
    """Classify one silence as one of the four silence types.

    Rules are tried in order and the first match wins:

    1. an overlap/interruption cue intersects the silence -> ``interruption``;
    2. the previous utterance abuts the silence (within
       ``_ABUT_TOLERANCE_MS``), belongs to a ``moderator`` and reads as a
       question -> ``after_question``;
    3. previous and next utterances share a ``speaker_id`` -> ``mid_utterance``;
    4. they differ and the previous text does not end on sentence-final
       punctuation -> ``interruption``;
    5. otherwise -> ``thinking``.
    """
    if _cue_overlaps(silence, cues or []):
        return "interruption"

    # Every remaining rule needs a preceding utterance.
    if prev_utt is None:
        return "thinking"

    distance = abs(silence["start_ms"] - prev_utt["end_ms"])
    prev_role = _speaker_type(speakers, prev_utt.get("speaker_id"))
    if (
        distance <= _ABUT_TOLERANCE_MS
        and prev_role == "moderator"
        and _ends_question(prev_utt.get("text", ""))
    ):
        return "after_question"

    # The speaker-comparison rules also need a following utterance.
    if next_utt is None:
        return "thinking"

    if prev_utt.get("speaker_id") == next_utt.get("speaker_id"):
        return "mid_utterance"

    # Speakers differ from here on: a turn that finished cleanly is a pause,
    # a turn that was cut off is an interruption.
    if _ends_sentence_final(prev_utt.get("text", "")):
        return "thinking"
    return "interruption"
110
+
111
+
112
def _utterance_before(utterances: list[dict[str, Any]], at_ms: int) -> dict[str, Any] | None:
    """Pick the utterance that ends latest without ending after ``at_ms`` plus the abut tolerance.

    Returns None when no utterance qualifies. On a tie the earliest one in the
    list wins.
    """
    eligible = (
        utt for utt in utterances if utt["end_ms"] <= at_ms + _ABUT_TOLERANCE_MS
    )
    return max(eligible, key=lambda utt: utt["end_ms"], default=None)
120
+
121
+
122
def _utterance_after(utterances: list[dict[str, Any]], at_ms: int) -> dict[str, Any] | None:
    """Pick the utterance that starts earliest without starting before ``at_ms`` minus the abut tolerance.

    Returns None when no utterance qualifies. On a tie the earliest one in the
    list wins.
    """
    eligible = (
        utt for utt in utterances if utt["start_ms"] >= at_ms - _ABUT_TOLERANCE_MS
    )
    return min(eligible, key=lambda utt: utt["start_ms"], default=None)
130
+
131
+
132
def type_all_silences(transcript: dict[str, Any]) -> dict[str, Any]:
    """Write a ``context`` type onto every silence of ``transcript``.

    For each silence, the neighbouring utterances are looked up by scanning
    the full utterance list, so the cost is O(silences × utterances). The
    transcript is modified in place and also returned. Any existing
    ``context`` value is overwritten.
    """
    all_utterances = transcript.get("utterances", [])
    all_cues = transcript.get("cues", [])
    all_speakers = transcript.get("speakers", [])

    for gap in transcript.get("silences", []):
        before = _utterance_before(all_utterances, gap["start_ms"])
        after = _utterance_after(all_utterances, gap["end_ms"])
        gap["context"] = type_silence(gap, before, after, all_speakers, all_cues)

    return transcript
@@ -0,0 +1,82 @@
1
+ """Native (host) transcription entrypoint (#176).
2
+
3
+ Runs the full pipeline ON THE HOST (no Docker) so Apple-Silicon ASR backends
4
+ (`parakeet-mlx` / Metal) and pyannote use the GPU/CPU directly — the Docker
5
+ container is CPU-only on macOS, which is the bottleneck this path removes. The
6
+ Node CLI shells out to this when `transcriber.runtime = native`; the Docker
7
+ `/transcribe` route stays the cross-platform fallback and shares the exact same
8
+ `run_pipeline` orchestration.
9
+
10
+ python -m app.transcribe_cli \
11
+ --seed-path <seed> --session-id S001 \
12
+ --source-path <seed>/sessions/S001/source.mp3 \
13
+ --engine parakeet --language en
14
+
15
+ Prints one JSON line mirroring the /transcribe response shape so the Node
16
+ caller parses both paths identically.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import json
23
+
24
+ from .asr import ASRConfig
25
+ from .pipeline import PipelineBackends, PipelineConfig, run_pipeline, write_transcript
26
+
27
+ _DEFAULT_MODEL = {
28
+ "parakeet": "mlx-community/parakeet-tdt-0.6b-v3",
29
+ "whisper": "large-v3-turbo",
30
+ }
31
+
32
+
33
def main(argv: list[str] | None = None) -> int:
    """Run the pipeline on the host and print one JSON line describing the outcome.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success. 1 when the pipeline or the transcript write fails; in
        that case the JSON line is ``{"status": "failed", "error": ...}``.
    """
    p = argparse.ArgumentParser(prog="compost-transcribe-native")
    p.add_argument("--seed-path", required=True)
    p.add_argument("--session-id", required=True)
    p.add_argument("--source-path", required=True)
    p.add_argument("--engine", default="parakeet", choices=["parakeet", "whisper"])
    p.add_argument("--model", default=None, help="ASR model id (engine default if omitted)")
    p.add_argument("--language", default=None)
    p.add_argument("--device", default="auto")
    p.add_argument("--compute-type", default="int8")
    args = p.parse_args(argv)

    asr = ASRConfig(
        # Fall back to the engine's default model when --model is omitted.
        model_name=args.model or _DEFAULT_MODEL[args.engine],
        device=args.device,
        compute_type=args.compute_type,
        language=args.language,
        engine=args.engine,
    )
    config = PipelineConfig(asr=asr, asr_model_tag=f"{asr.model_name} ({args.engine})")

    try:
        transcript = run_pipeline(
            seed_path=args.seed_path,
            session_id=args.session_id,
            source_path=args.source_path,
            config=config,
            backends=PipelineBackends(),  # all None → real lazy backends (Silero / engine ASR / pyannote)
        )
        # The write is guarded as well: a failure here must still reach the
        # Node caller as the single JSON line it parses, not as a traceback.
        path = write_transcript(args.seed_path, args.session_id, transcript)
    except Exception as e:  # surface as JSON so the Node caller can report it
        print(json.dumps({"status": "failed", "error": str(e)}))
        return 1

    print(
        json.dumps(
            {
                "session_id": args.session_id,
                "transcript_path": path,
                "status": transcript.get("status", "ok"),
                "engine": args.engine,
                "model": asr.model_name,
            }
        )
    )
    return 0
79
+
80
+
81
+ if __name__ == "__main__":
82
+ raise SystemExit(main())