@they-juanreina/compost-cli 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/blame.d.ts.map +1 -1
- package/dist/lib/blame.js +3 -2
- package/dist/lib/blame.js.map +1 -1
- package/dist/lib/journal.d.ts.map +1 -1
- package/dist/lib/journal.js +9 -0
- package/dist/lib/journal.js.map +1 -1
- package/dist/lib/migrate.d.ts.map +1 -1
- package/dist/lib/migrate.js +1 -0
- package/dist/lib/migrate.js.map +1 -1
- package/dist/lib/nativeRuntime.d.ts +6 -3
- package/dist/lib/nativeRuntime.d.ts.map +1 -1
- package/dist/lib/nativeRuntime.js +6 -3
- package/dist/lib/nativeRuntime.js.map +1 -1
- package/dist/lib/retrieve.d.ts.map +1 -1
- package/dist/lib/retrieve.js +0 -8
- package/dist/lib/retrieve.js.map +1 -1
- package/dist/lib/seedResolve.d.ts.map +1 -1
- package/dist/lib/seedResolve.js +1 -0
- package/dist/lib/seedResolve.js.map +1 -1
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +9 -8
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/snap.d.ts.map +1 -1
- package/dist/lib/snap.js +2 -5
- package/dist/lib/snap.js.map +1 -1
- package/dist/loops/supervisor.d.ts.map +1 -1
- package/dist/loops/supervisor.js +1 -0
- package/dist/loops/supervisor.js.map +1 -1
- package/dist/loops/transcribe_worker.d.ts.map +1 -1
- package/dist/loops/transcribe_worker.js +0 -1
- package/dist/loops/transcribe_worker.js.map +1 -1
- package/dist/router.js +1 -1
- package/package.json +10 -4
- package/transcriber/app/__init__.py +3 -0
- package/transcriber/app/asr.py +198 -0
- package/transcriber/app/asr_parakeet.py +174 -0
- package/transcriber/app/cue_parser.py +110 -0
- package/transcriber/app/diarization.py +300 -0
- package/transcriber/app/frame_annotation.py +77 -0
- package/transcriber/app/frames.py +130 -0
- package/transcriber/app/health.py +70 -0
- package/transcriber/app/legacy.py +355 -0
- package/transcriber/app/main.py +30 -0
- package/transcriber/app/pipeline.py +204 -0
- package/transcriber/app/pptx_export.py +42 -0
- package/transcriber/app/prosody.py +123 -0
- package/transcriber/app/routes/__init__.py +1 -0
- package/transcriber/app/routes/legacy.py +117 -0
- package/transcriber/app/routes/transcribe.py +133 -0
- package/transcriber/app/shot_change.py +74 -0
- package/transcriber/app/silence_typer.py +144 -0
- package/transcriber/app/transcribe_cli.py +82 -0
- package/transcriber/app/vad.py +145 -0
- package/transcriber/pyproject.toml +56 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Silero VAD integration + silence segmentation (#9).
|
|
2
|
+
|
|
3
|
+
Two outputs (ROADMAP § Descriptive transcription A):
|
|
4
|
+
(a) speech-segment boundaries → fed to ASR
|
|
5
|
+
(b) silence boundaries → fed to the silence typer (#12)
|
|
6
|
+
|
|
7
|
+
Silero v5 is loaded once per process (cold-start cached). The model package is
|
|
8
|
+
lazily imported so this module — and the silence-segmentation maths, which is
|
|
9
|
+
pure — works without torch installed. Install the `asr` extra for real VAD.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from functools import lru_cache
|
|
16
|
+
from typing import Any, Protocol
|
|
17
|
+
|
|
18
|
+
# Threshold (in milliseconds) for a silence to be treated as first-class.
# Anything shorter is not promoted and stays a plain gap between segments.
MIN_FIRST_CLASS_SILENCE_MS = 1_500
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class Segment:
    """Immutable time span whose bounds are integer milliseconds."""

    start_ms: int  # where the span begins
    end_ms: int  # where the span ends

    @property
    def duration_ms(self) -> int:
        """Length of the span in milliseconds, i.e. ``end_ms - start_ms``."""
        begin, finish = self.start_ms, self.end_ms
        return finish - begin
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class VADBackend(Protocol):
    """Structural interface for anything that can locate speech in an audio file."""

    def speech_timestamps(self, audio_path: str) -> list[dict[str, int]]:
        """Return one mapping per detected speech span in ``audio_path``.

        ``VAD.segment`` reads the ``"start_ms"`` and ``"end_ms"`` keys of each
        mapping, so implementations must provide both.
        """
        ...
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Sample rate (Hz) requested when decoding audio and handed to the detector.
SILERO_SAMPLE_RATE = 16000


class SileroBackend:  # pragma: no cover - needs torch + weights
    """VADBackend implementation backed by the silero-vad package.

    Creating an instance imports silero-vad and loads the Silero v5 model.
    Each `speech_timestamps` call decodes the file into a 16 kHz mono waveform
    with the package's `read_audio` helper and reports the detected speech
    boundaries in milliseconds.
    """

    def __init__(self) -> None:
        """Import silero-vad on demand and load its model.

        Raises:
            RuntimeError: if the silero-vad package cannot be imported.
        """
        try:
            from silero_vad import (  # type: ignore
                get_speech_timestamps as detect_speech,
                load_silero_vad as load_model,
                read_audio as decode_audio,
            )
        except ImportError as e:
            raise RuntimeError(
                "silero-vad is not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from e

        # Keep the model and the two helpers on the instance so the package
        # never has to be imported at module import time.
        self._model = load_model()
        self._read_audio = decode_audio
        self._get_speech_timestamps = detect_speech

    def speech_timestamps(self, audio_path: str) -> list[dict[str, int]]:
        """Detect speech in ``audio_path`` and return its boundaries in ms.

        Returns:
            One ``{"start_ms": ..., "end_ms": ...}`` mapping per detected span,
            in the order the detector reported them.
        """
        waveform = self._read_audio(audio_path, sampling_rate=SILERO_SAMPLE_RATE)
        # return_seconds=False: the detector answers in sample offsets.
        sample_spans = self._get_speech_timestamps(
            waveform,
            self._model,
            sampling_rate=SILERO_SAMPLE_RATE,
            return_seconds=False,
        )

        ms_per_sample = 1000 / SILERO_SAMPLE_RATE

        def to_ms(sample_offset: int) -> int:
            # int() truncates any fractional millisecond.
            return int(sample_offset * ms_per_sample)

        spans_ms: list[dict[str, int]] = []
        for span in sample_spans:
            spans_ms.append({"start_ms": to_ms(span["start"]), "end_ms": to_ms(span["end"])})
        return spans_ms
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@lru_cache(maxsize=1)
def _load_silero() -> VADBackend:  # pragma: no cover - needs torch + weights
    """Build the Silero backend, failing early when torch is missing.

    ``lru_cache(maxsize=1)`` memoises the successful result, so later calls
    hand back the same ``SileroBackend`` instead of constructing a new one.

    Raises:
        RuntimeError: if ``torch`` cannot be imported.
    """
    try:
        # Imported only to verify availability; the name itself is unused.
        import torch  # type: ignore # noqa: F401
    except ImportError as missing:
        message = "torch/silero not installed. Install the asr extra: pip install -e '.[asr]'"
        raise RuntimeError(message) from missing
    else:
        return SileroBackend()
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def speech_to_silences(
    speech: list[Segment],
    total_duration_ms: int,
    min_silence_ms: int = MIN_FIRST_CLASS_SILENCE_MS,
) -> list[Segment]:
    """Derive first-class silence segments from speech segments.

    Pure. Considers the gap before the first speech segment, the gaps between
    speech segments, and the gap after the last one up to
    ``total_duration_ms``. ``speech`` may be unsorted and may contain
    overlapping segments: it is sorted by start here, and a running cursor
    tracks the furthest speech end seen so far.

    Args:
        speech: Speech segments, in any order.
        total_duration_ms: Length of the audio; the trailing gap ends here.
        min_silence_ms: Minimum gap length for a silence to be returned.

    Returns:
        Silences in ascending start order. Every returned segment has a
        strictly positive duration, even when ``min_silence_ms`` is zero or
        negative. If speech runs past ``total_duration_ms`` no trailing
        silence is produced.
    """

    def is_first_class(gap_ms: int) -> bool:
        # The gap must be non-empty as well as long enough. Without the
        # ``> 0`` check a threshold <= 0 would emit zero-length silences for
        # abutting speech and inverted ones for overlapping speech.
        return gap_ms > 0 and gap_ms >= min_silence_ms

    silences: list[Segment] = []
    cursor = 0  # furthest point on the timeline already covered by speech
    for seg in sorted(speech, key=lambda s: s.start_ms):
        if is_first_class(seg.start_ms - cursor):
            silences.append(Segment(cursor, seg.start_ms))
        # max() stops the cursor moving backwards when a segment ends before
        # an earlier, longer one does.
        cursor = max(cursor, seg.end_ms)
    if is_first_class(total_duration_ms - cursor):
        silences.append(Segment(cursor, total_duration_ms))
    return silences
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def silences_to_schema(silences: list[Segment]) -> list[dict[str, Any]]:
    """Convert silence segments into transcript.json ``silences[]`` entries.

    Ids are assigned in input order as ``SIL-001``, ``SIL-002``, and so on.
    Entries are untyped at this stage: each carries a placeholder ``context``
    that the silence typer (#12) fills in.
    """
    return [
        {
            "id": f"SIL-{ordinal:03d}",
            "start_ms": silence.start_ms,
            "end_ms": silence.end_ms,
            "duration_ms": silence.duration_ms,
            "context": "thinking",  # placeholder until the typer runs
        }
        for ordinal, silence in enumerate(silences, start=1)
    ]
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class VAD:
    """Front end that turns an audio file into speech segments and silences.

    A backend can be injected (for example a stub). When none is given, the
    Silero backend is obtained from ``_load_silero()`` each time one is needed.
    """

    def __init__(self, backend: VADBackend | None = None):
        # None means "use the default Silero backend", resolved on demand.
        self._backend = backend

    def _get_backend(self) -> VADBackend:
        """Return the injected backend, or the Silero backend if none was given."""
        if self._backend is None:
            return _load_silero()
        return self._backend

    def segment(self, audio_path: str, total_duration_ms: int) -> tuple[list[Segment], list[Segment]]:
        """Return (speech_segments, first_class_silences)."""
        speech: list[Segment] = []
        for stamp in self._get_backend().speech_timestamps(audio_path):
            speech.append(Segment(int(stamp["start_ms"]), int(stamp["end_ms"])))
        # Speech is returned in backend order; silences are derived from it.
        return speech, speech_to_silences(speech, total_duration_ms)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "compost-transcriber"
|
|
3
|
+
version = "0.1.2"
|
|
4
|
+
description = "Compost descriptive transcriber: WhisperX + pyannote + Silero VAD + Whisper-event-tags, plus frame extraction and legacy ingest."
|
|
5
|
+
requires-python = ">=3.11,<3.13"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [{ name = "Juan Reina" }]
|
|
8
|
+
|
|
9
|
+
dependencies = [
|
|
10
|
+
"fastapi>=0.115.0",
|
|
11
|
+
"uvicorn[standard]>=0.30.0",
|
|
12
|
+
"pydantic>=2.9.0",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.optional-dependencies]
|
|
16
|
+
# Pinned in their own M1 issues (#9-#15) — kept out of the base install so
|
|
17
|
+
# the skeleton boots quickly without pulling multi-GB ML wheels.
|
|
18
|
+
asr = [
|
|
19
|
+
"whisperx",
|
|
20
|
+
"pyannote.audio>=3.3",
|
|
21
|
+
"silero-vad",
|
|
22
|
+
# WhisperX brings these transitively, but listing them explicitly lets the lock file note
|
|
23
|
+
# the M1-Mac-friendly compute path.
|
|
24
|
+
"torch>=2.3",
|
|
25
|
+
"torchaudio>=2.3",
|
|
26
|
+
"ffmpeg-python",
|
|
27
|
+
]
|
|
28
|
+
# Native Apple-Silicon (Metal) path (#176/#183): Parakeet ASR + pyannote-on-MPS.
|
|
29
|
+
# No whisperx/ctranslate2 — the default native engine is Parakeet (parakeet-mlx).
|
|
30
|
+
# Provisioned by `compost setup --provision-native`.
|
|
31
|
+
native = [
|
|
32
|
+
"parakeet-mlx",
|
|
33
|
+
"pyannote.audio>=3.3",
|
|
34
|
+
"silero-vad",
|
|
35
|
+
"torchaudio>=2.3",
|
|
36
|
+
"ffmpeg-python",
|
|
37
|
+
]
|
|
38
|
+
frames = [
|
|
39
|
+
"imagehash",
|
|
40
|
+
"Pillow",
|
|
41
|
+
]
|
|
42
|
+
legacy = [
|
|
43
|
+
"pdfminer.six",
|
|
44
|
+
"pdfplumber",
|
|
45
|
+
"python-docx",
|
|
46
|
+
"python-pptx",
|
|
47
|
+
"openpyxl",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
target-version = "py311"
|
|
52
|
+
line-length = 100
|
|
53
|
+
|
|
54
|
+
[tool.ruff.lint]
|
|
55
|
+
select = ["E", "F", "I", "W", "UP", "B", "SIM"]
|
|
56
|
+
ignore = ["E501"]
|