@they-juanreina/compost-cli 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/dist/lib/blame.d.ts.map +1 -1
  2. package/dist/lib/blame.js +3 -2
  3. package/dist/lib/blame.js.map +1 -1
  4. package/dist/lib/journal.d.ts.map +1 -1
  5. package/dist/lib/journal.js +9 -0
  6. package/dist/lib/journal.js.map +1 -1
  7. package/dist/lib/migrate.d.ts.map +1 -1
  8. package/dist/lib/migrate.js +1 -0
  9. package/dist/lib/migrate.js.map +1 -1
  10. package/dist/lib/nativeRuntime.d.ts +6 -3
  11. package/dist/lib/nativeRuntime.d.ts.map +1 -1
  12. package/dist/lib/nativeRuntime.js +6 -3
  13. package/dist/lib/nativeRuntime.js.map +1 -1
  14. package/dist/lib/retrieve.d.ts.map +1 -1
  15. package/dist/lib/retrieve.js +0 -8
  16. package/dist/lib/retrieve.js.map +1 -1
  17. package/dist/lib/seedResolve.d.ts.map +1 -1
  18. package/dist/lib/seedResolve.js +1 -0
  19. package/dist/lib/seedResolve.js.map +1 -1
  20. package/dist/lib/setup.d.ts.map +1 -1
  21. package/dist/lib/setup.js +9 -8
  22. package/dist/lib/setup.js.map +1 -1
  23. package/dist/lib/snap.d.ts.map +1 -1
  24. package/dist/lib/snap.js +2 -5
  25. package/dist/lib/snap.js.map +1 -1
  26. package/dist/loops/supervisor.d.ts.map +1 -1
  27. package/dist/loops/supervisor.js +1 -0
  28. package/dist/loops/supervisor.js.map +1 -1
  29. package/dist/loops/transcribe_worker.d.ts.map +1 -1
  30. package/dist/loops/transcribe_worker.js +0 -1
  31. package/dist/loops/transcribe_worker.js.map +1 -1
  32. package/dist/router.js +1 -1
  33. package/package.json +10 -4
  34. package/transcriber/app/__init__.py +3 -0
  35. package/transcriber/app/asr.py +198 -0
  36. package/transcriber/app/asr_parakeet.py +174 -0
  37. package/transcriber/app/cue_parser.py +110 -0
  38. package/transcriber/app/diarization.py +300 -0
  39. package/transcriber/app/frame_annotation.py +77 -0
  40. package/transcriber/app/frames.py +130 -0
  41. package/transcriber/app/health.py +70 -0
  42. package/transcriber/app/legacy.py +355 -0
  43. package/transcriber/app/main.py +30 -0
  44. package/transcriber/app/pipeline.py +204 -0
  45. package/transcriber/app/pptx_export.py +42 -0
  46. package/transcriber/app/prosody.py +123 -0
  47. package/transcriber/app/routes/__init__.py +1 -0
  48. package/transcriber/app/routes/legacy.py +117 -0
  49. package/transcriber/app/routes/transcribe.py +133 -0
  50. package/transcriber/app/shot_change.py +74 -0
  51. package/transcriber/app/silence_typer.py +144 -0
  52. package/transcriber/app/transcribe_cli.py +82 -0
  53. package/transcriber/app/vad.py +145 -0
  54. package/transcriber/pyproject.toml +56 -0
@@ -0,0 +1,145 @@
1
+ """Silero VAD integration + silence segmentation (#9).
2
+
3
+ Two outputs (ROADMAP § Descriptive transcription A):
4
+ (a) speech-segment boundaries → fed to ASR
5
+ (b) silence boundaries → fed to the silence typer (#12)
6
+
7
+ Silero v5 is loaded once per process (cold-start cached). The model call is
8
+ lazily imported so this module — and the silence-segmentation maths, which is
9
+ pure — works without torch installed. Install the `asr` extra for real VAD.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass
15
+ from functools import lru_cache
16
+ from typing import Any, Protocol
17
+
18
+ # Minimum duration (ms) for a silence to be first-class; shorter silences are not emitted and remain plain gaps.
19
+ MIN_FIRST_CLASS_SILENCE_MS = 1500
20
+
21
+
22
@dataclass(frozen=True)
class Segment:
    """Immutable time span expressed as integer millisecond offsets."""

    # Offset at which the span begins, in milliseconds.
    start_ms: int
    # Offset at which the span ends, in milliseconds.
    end_ms: int

    @property
    def duration_ms(self) -> int:
        """Length of the span in milliseconds (``end_ms`` minus ``start_ms``)."""
        span_start, span_end = self.start_ms, self.end_ms
        return span_end - span_start
30
+
31
+
32
class VADBackend(Protocol):
    """Structural interface for an object that locates speech in an audio file."""

    def speech_timestamps(self, audio_path: str) -> list[dict[str, int]]:
        """Return the speech spans detected in ``audio_path``.

        Each entry is a mapping read by ``VAD.segment`` through its
        ``"start_ms"`` and ``"end_ms"`` keys.
        """
        ...
34
+
35
+
36
# Sampling rate (Hz) passed to both `read_audio` and `get_speech_timestamps`.
SILERO_SAMPLE_RATE = 16000


class SileroBackend:  # pragma: no cover - needs torch + weights
    """VADBackend implementation backed by the ``silero_vad`` package.

    The model is loaded in ``__init__`` and kept on the instance. Audio is
    read at ``SILERO_SAMPLE_RATE`` through the package's ``read_audio``
    helper, and detected speech boundaries are reported in milliseconds.
    """

    def __init__(self) -> None:
        """Import silero-vad lazily and load its model.

        Raises:
            RuntimeError: if the ``silero_vad`` package cannot be imported.
        """
        try:
            from silero_vad import (  # type: ignore
                get_speech_timestamps,
                load_silero_vad,
                read_audio,
            )
        except ImportError as exc:
            raise RuntimeError(
                "silero-vad is not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from exc

        self._model = load_silero_vad()
        self._read_audio = read_audio
        self._get_speech_timestamps = get_speech_timestamps

    def speech_timestamps(self, audio_path: str) -> list[dict[str, int]]:
        """Run VAD on ``audio_path`` and return ``start_ms``/``end_ms`` mappings."""
        waveform = self._read_audio(audio_path, sampling_rate=SILERO_SAMPLE_RATE)
        sample_spans = self._get_speech_timestamps(
            waveform,
            self._model,
            sampling_rate=SILERO_SAMPLE_RATE,
            return_seconds=False,
        )

        # Boundaries come back as sample indices (return_seconds=False);
        # scale them to milliseconds and truncate to int.
        ms_per_sample = 1000 / SILERO_SAMPLE_RATE
        converted: list[dict[str, int]] = []
        for span in sample_spans:
            begin_ms = int(span["start"] * ms_per_sample)
            finish_ms = int(span["end"] * ms_per_sample)
            converted.append({"start_ms": begin_ms, "end_ms": finish_ms})
        return converted
80
+
81
+
82
@lru_cache(maxsize=1)
def _load_silero() -> VADBackend:  # pragma: no cover - needs torch + weights
    """Return the shared Silero backend, constructing it on the first call.

    ``lru_cache(maxsize=1)`` memoizes the successfully built backend, so later
    calls reuse the same instance.

    Raises:
        RuntimeError: if ``torch`` cannot be imported.
    """
    try:
        # Imported only to check availability; the name itself is unused.
        import torch  # type: ignore  # noqa: F401
    except ImportError as exc:
        raise RuntimeError(
            "torch/silero not installed. Install the asr extra: pip install -e '.[asr]'"
        ) from exc
    else:
        return SileroBackend()
91
+
92
+
93
def speech_to_silences(
    speech: list[Segment],
    total_duration_ms: int,
    min_silence_ms: int = MIN_FIRST_CLASS_SILENCE_MS,
) -> list[Segment]:
    """Compute the first-class silences implied by a set of speech segments.

    Pure function. The speech segments are sorted by start time first, so the
    input may be unordered and may contain overlapping segments. Three kinds
    of gap are examined: from 0 to the first speech segment, between
    consecutive speech segments, and from the end of the last speech to
    ``total_duration_ms``. A gap is kept only when it lasts at least
    ``min_silence_ms``.

    Returns:
        The retained gaps as ``Segment`` objects, in ascending start order.
    """
    gaps: list[Segment] = []
    covered_until = 0  # latest end of speech seen so far
    for span in sorted(speech, key=lambda s: s.start_ms):
        gap_ms = span.start_ms - covered_until
        if gap_ms >= min_silence_ms:
            gaps.append(Segment(covered_until, span.start_ms))
        # Never move backwards: a segment nested inside an earlier, longer
        # one must not shrink the covered range.
        if span.end_ms > covered_until:
            covered_until = span.end_ms

    trailing_ms = total_duration_ms - covered_until
    if trailing_ms >= min_silence_ms:
        gaps.append(Segment(covered_until, total_duration_ms))
    return gaps
114
+
115
+
116
def silences_to_schema(silences: list[Segment]) -> list[dict[str, Any]]:
    """Convert silence segments into transcript.json ``silences[]`` entries.

    Entries are numbered from 1 in input order (``SIL-001``, ``SIL-002``, ...).
    They are untyped at this stage: ``context`` holds a fixed placeholder that
    the silence typer (#12) is expected to overwrite.
    """
    return [
        {
            "id": f"SIL-{ordinal:03d}",
            "start_ms": gap.start_ms,
            "end_ms": gap.end_ms,
            "duration_ms": gap.duration_ms,
            "context": "thinking",  # placeholder until the typer runs
        }
        for ordinal, gap in enumerate(silences, start=1)
    ]
131
+
132
+
133
class VAD:
    """Front end that turns an audio file into speech and silence segments.

    A backend may be injected (for example in tests); when none is given, the
    shared Silero backend is resolved lazily on first use.
    """

    def __init__(self, backend: VADBackend | None = None):
        """Store the optional backend; nothing is loaded at construction time."""
        self._backend = backend

    def _get_backend(self) -> VADBackend:
        """Return the injected backend, or fall back to the cached Silero one."""
        if self._backend is None:
            return _load_silero()
        return self._backend

    def segment(self, audio_path: str, total_duration_ms: int) -> tuple[list[Segment], list[Segment]]:
        """Return (speech_segments, first_class_silences)."""
        backend = self._get_backend()
        speech: list[Segment] = []
        for stamp in backend.speech_timestamps(audio_path):
            speech.append(Segment(int(stamp["start_ms"]), int(stamp["end_ms"])))
        return speech, speech_to_silences(speech, total_duration_ms)
@@ -0,0 +1,56 @@
1
+ [project]
2
+ name = "compost-transcriber"
3
+ version = "0.1.2"
4
+ description = "Compost descriptive transcriber: WhisperX + pyannote + Silero VAD + Whisper-event-tags, plus frame extraction and legacy ingest."
5
+ requires-python = ">=3.11,<3.13"
6
+ license = { text = "MIT" }
7
+ authors = [{ name = "Juan Reina" }]
8
+
9
+ dependencies = [
10
+ "fastapi>=0.115.0",
11
+ "uvicorn[standard]>=0.30.0",
12
+ "pydantic>=2.9.0",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ # Pinned in their own M1 issues (#9-#15) — kept out of the base install so
17
+ # the skeleton boots quickly without pulling multi-GB ML wheels.
18
+ asr = [
19
+ "whisperx",
20
+ "pyannote.audio>=3.3",
21
+ "silero-vad",
22
+ # WhisperX brings these in transitively, but declaring explicit lower bounds lets the lock file note
23
+ # the M1-Mac-friendly compute path.
24
+ "torch>=2.3",
25
+ "torchaudio>=2.3",
26
+ "ffmpeg-python",
27
+ ]
28
+ # Native Apple-Silicon (Metal) path (#176/#183): Parakeet ASR + pyannote-on-MPS.
29
+ # No whisperx/ctranslate2 — the default native engine is Parakeet (parakeet-mlx).
30
+ # Provisioned by `compost setup --provision-native`.
31
+ native = [
32
+ "parakeet-mlx",
33
+ "pyannote.audio>=3.3",
34
+ "silero-vad",
35
+ "torchaudio>=2.3",
36
+ "ffmpeg-python",
37
+ ]
38
+ frames = [
39
+ "imagehash",
40
+ "Pillow",
41
+ ]
42
+ legacy = [
43
+ "pdfminer.six",
44
+ "pdfplumber",
45
+ "python-docx",
46
+ "python-pptx",
47
+ "openpyxl",
48
+ ]
49
+
50
+ [tool.ruff]
51
+ target-version = "py311"
52
+ line-length = 100
53
+
54
+ [tool.ruff.lint]
55
+ select = ["E", "F", "I", "W", "UP", "B", "SIM"]
56
+ ignore = ["E501"]