@they-juanreina/compost-cli 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/commands/agreement.d.ts +3 -0
- package/dist/commands/agreement.d.ts.map +1 -0
- package/dist/commands/agreement.js +35 -0
- package/dist/commands/agreement.js.map +1 -0
- package/dist/commands/create.d.ts +1 -0
- package/dist/commands/create.d.ts.map +1 -1
- package/dist/commands/create.js +39 -1
- package/dist/commands/create.js.map +1 -1
- package/dist/commands/export.d.ts.map +1 -1
- package/dist/commands/export.js +47 -4
- package/dist/commands/export.js.map +1 -1
- package/dist/commands/import.d.ts +3 -0
- package/dist/commands/import.d.ts.map +1 -0
- package/dist/commands/import.js +59 -0
- package/dist/commands/import.js.map +1 -0
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +1 -0
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/jobs.d.ts +3 -0
- package/dist/commands/jobs.d.ts.map +1 -0
- package/dist/commands/jobs.js +105 -0
- package/dist/commands/jobs.js.map +1 -0
- package/dist/commands/label.d.ts +3 -0
- package/dist/commands/label.d.ts.map +1 -0
- package/dist/commands/label.js +67 -0
- package/dist/commands/label.js.map +1 -0
- package/dist/commands/models.d.ts.map +1 -1
- package/dist/commands/models.js +2 -1
- package/dist/commands/models.js.map +1 -1
- package/dist/commands/recode.d.ts +3 -0
- package/dist/commands/recode.d.ts.map +1 -0
- package/dist/commands/recode.js +60 -0
- package/dist/commands/recode.js.map +1 -0
- package/dist/commands/reindex.d.ts.map +1 -1
- package/dist/commands/reindex.js +6 -4
- package/dist/commands/reindex.js.map +1 -1
- package/dist/commands/rerun.d.ts +3 -0
- package/dist/commands/rerun.d.ts.map +1 -0
- package/dist/commands/rerun.js +91 -0
- package/dist/commands/rerun.js.map +1 -0
- package/dist/commands/search.d.ts.map +1 -1
- package/dist/commands/search.js +2 -1
- package/dist/commands/search.js.map +1 -1
- package/dist/commands/secrets.d.ts +3 -0
- package/dist/commands/secrets.d.ts.map +1 -0
- package/dist/commands/secrets.js +143 -0
- package/dist/commands/secrets.js.map +1 -0
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +90 -1
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/status.d.ts.map +1 -1
- package/dist/commands/status.js +2 -1
- package/dist/commands/status.js.map +1 -1
- package/dist/commands/transcribe.d.ts.map +1 -1
- package/dist/commands/transcribe.js +13 -2
- package/dist/commands/transcribe.js.map +1 -1
- package/dist/commands/validate.d.ts.map +1 -1
- package/dist/commands/validate.js +29 -1
- package/dist/commands/validate.js.map +1 -1
- package/dist/engine.d.ts +23 -0
- package/dist/engine.d.ts.map +1 -0
- package/dist/engine.js +32 -0
- package/dist/engine.js.map +1 -0
- package/dist/exporters/prov.d.ts +11 -0
- package/dist/exporters/prov.d.ts.map +1 -0
- package/dist/exporters/prov.js +151 -0
- package/dist/exporters/prov.js.map +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -1
- package/dist/lib/agreement.d.ts +77 -0
- package/dist/lib/agreement.d.ts.map +1 -0
- package/dist/lib/agreement.js +261 -0
- package/dist/lib/agreement.js.map +1 -0
- package/dist/lib/artifacts.d.ts +32 -1
- package/dist/lib/artifacts.d.ts.map +1 -1
- package/dist/lib/artifacts.js +156 -22
- package/dist/lib/artifacts.js.map +1 -1
- package/dist/lib/blame.d.ts.map +1 -1
- package/dist/lib/blame.js +3 -2
- package/dist/lib/blame.js.map +1 -1
- package/dist/lib/config.d.ts +3 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/doctor.d.ts +3 -0
- package/dist/lib/doctor.d.ts.map +1 -1
- package/dist/lib/doctor.js +24 -1
- package/dist/lib/doctor.js.map +1 -1
- package/dist/lib/events.d.ts +34 -1
- package/dist/lib/events.d.ts.map +1 -1
- package/dist/lib/events.js +35 -1
- package/dist/lib/events.js.map +1 -1
- package/dist/lib/importTranscript.d.ts +16 -0
- package/dist/lib/importTranscript.d.ts.map +1 -0
- package/dist/lib/importTranscript.js +94 -0
- package/dist/lib/importTranscript.js.map +1 -0
- package/dist/lib/ingest.d.ts.map +1 -1
- package/dist/lib/ingest.js +12 -6
- package/dist/lib/ingest.js.map +1 -1
- package/dist/lib/journal.d.ts +13 -0
- package/dist/lib/journal.d.ts.map +1 -1
- package/dist/lib/journal.js +58 -2
- package/dist/lib/journal.js.map +1 -1
- package/dist/lib/legacyNative.d.ts +24 -0
- package/dist/lib/legacyNative.d.ts.map +1 -0
- package/dist/lib/legacyNative.js +51 -0
- package/dist/lib/legacyNative.js.map +1 -0
- package/dist/lib/migrate.d.ts.map +1 -1
- package/dist/lib/migrate.js +1 -0
- package/dist/lib/migrate.js.map +1 -1
- package/dist/lib/nativeRuntime.d.ts +6 -3
- package/dist/lib/nativeRuntime.d.ts.map +1 -1
- package/dist/lib/nativeRuntime.js +6 -3
- package/dist/lib/nativeRuntime.js.map +1 -1
- package/dist/lib/provisionNative.js +1 -1
- package/dist/lib/provisionNative.js.map +1 -1
- package/dist/lib/queue.d.ts +25 -0
- package/dist/lib/queue.d.ts.map +1 -1
- package/dist/lib/queue.js +70 -3
- package/dist/lib/queue.js.map +1 -1
- package/dist/lib/reads.d.ts +24 -0
- package/dist/lib/reads.d.ts.map +1 -0
- package/dist/lib/reads.js +115 -0
- package/dist/lib/reads.js.map +1 -0
- package/dist/lib/recode.d.ts +19 -0
- package/dist/lib/recode.d.ts.map +1 -0
- package/dist/lib/recode.js +43 -0
- package/dist/lib/recode.js.map +1 -0
- package/dist/lib/rerun.d.ts +51 -0
- package/dist/lib/rerun.d.ts.map +1 -0
- package/dist/lib/rerun.js +166 -0
- package/dist/lib/rerun.js.map +1 -0
- package/dist/lib/retrieve.d.ts +8 -4
- package/dist/lib/retrieve.d.ts.map +1 -1
- package/dist/lib/retrieve.js +12 -10
- package/dist/lib/retrieve.js.map +1 -1
- package/dist/lib/schemas.generated.d.ts.map +1 -1
- package/dist/lib/schemas.generated.js +28 -0
- package/dist/lib/schemas.generated.js.map +1 -1
- package/dist/lib/secrets.d.ts +158 -0
- package/dist/lib/secrets.d.ts.map +1 -0
- package/dist/lib/secrets.js +507 -0
- package/dist/lib/secrets.js.map +1 -0
- package/dist/lib/seed.d.ts +5 -0
- package/dist/lib/seed.d.ts.map +1 -1
- package/dist/lib/seed.js +15 -2
- package/dist/lib/seed.js.map +1 -1
- package/dist/lib/seedResolve.d.ts.map +1 -1
- package/dist/lib/seedResolve.js +1 -0
- package/dist/lib/seedResolve.js.map +1 -1
- package/dist/lib/session.d.ts +14 -0
- package/dist/lib/session.d.ts.map +1 -1
- package/dist/lib/session.js +47 -0
- package/dist/lib/session.js.map +1 -1
- package/dist/lib/setup.d.ts +5 -0
- package/dist/lib/setup.d.ts.map +1 -1
- package/dist/lib/setup.js +78 -14
- package/dist/lib/setup.js.map +1 -1
- package/dist/lib/setupWizard.d.ts +51 -0
- package/dist/lib/setupWizard.d.ts.map +1 -0
- package/dist/lib/setupWizard.js +223 -0
- package/dist/lib/setupWizard.js.map +1 -0
- package/dist/lib/snap.d.ts.map +1 -1
- package/dist/lib/snap.js +2 -5
- package/dist/lib/snap.js.map +1 -1
- package/dist/lib/speakers.d.ts +41 -0
- package/dist/lib/speakers.d.ts.map +1 -0
- package/dist/lib/speakers.js +78 -0
- package/dist/lib/speakers.js.map +1 -0
- package/dist/lib/status.d.ts.map +1 -1
- package/dist/lib/status.js +21 -0
- package/dist/lib/status.js.map +1 -1
- package/dist/lib/userConfig.d.ts +22 -0
- package/dist/lib/userConfig.d.ts.map +1 -0
- package/dist/lib/userConfig.js +67 -0
- package/dist/lib/userConfig.js.map +1 -0
- package/dist/lib/validate.d.ts +18 -0
- package/dist/lib/validate.d.ts.map +1 -1
- package/dist/lib/validate.js +70 -1
- package/dist/lib/validate.js.map +1 -1
- package/dist/lib/version.d.ts +30 -0
- package/dist/lib/version.d.ts.map +1 -0
- package/dist/lib/version.js +73 -0
- package/dist/lib/version.js.map +1 -0
- package/dist/llm/adapter.d.ts.map +1 -1
- package/dist/llm/adapter.js +2 -0
- package/dist/llm/adapter.js.map +1 -1
- package/dist/llm/providers/ollama.d.ts.map +1 -1
- package/dist/llm/providers/ollama.js +6 -0
- package/dist/llm/providers/ollama.js.map +1 -1
- package/dist/loops/ingest_watcher.d.ts.map +1 -1
- package/dist/loops/ingest_watcher.js +6 -3
- package/dist/loops/ingest_watcher.js.map +1 -1
- package/dist/loops/legacy_worker.d.ts +28 -1
- package/dist/loops/legacy_worker.d.ts.map +1 -1
- package/dist/loops/legacy_worker.js +81 -9
- package/dist/loops/legacy_worker.js.map +1 -1
- package/dist/loops/supervisor.d.ts +3 -0
- package/dist/loops/supervisor.d.ts.map +1 -1
- package/dist/loops/supervisor.js +12 -0
- package/dist/loops/supervisor.js.map +1 -1
- package/dist/loops/synthesis.d.ts.map +1 -1
- package/dist/loops/synthesis.js +15 -0
- package/dist/loops/synthesis.js.map +1 -1
- package/dist/loops/transcribe_worker.d.ts.map +1 -1
- package/dist/loops/transcribe_worker.js +2 -4
- package/dist/loops/transcribe_worker.js.map +1 -1
- package/dist/output.d.ts +13 -1
- package/dist/output.d.ts.map +1 -1
- package/dist/output.js +22 -2
- package/dist/output.js.map +1 -1
- package/dist/render/human.d.ts +20 -0
- package/dist/render/human.d.ts.map +1 -0
- package/dist/render/human.js +54 -0
- package/dist/render/human.js.map +1 -0
- package/dist/router.d.ts.map +1 -1
- package/dist/router.js +17 -2
- package/dist/router.js.map +1 -1
- package/package.json +18 -5
- package/templates/config.toml +6 -1
- package/transcriber/app/__init__.py +3 -0
- package/transcriber/app/asr.py +198 -0
- package/transcriber/app/asr_parakeet.py +174 -0
- package/transcriber/app/cue_parser.py +110 -0
- package/transcriber/app/diarization.py +330 -0
- package/transcriber/app/frame_annotation.py +77 -0
- package/transcriber/app/frames.py +130 -0
- package/transcriber/app/health.py +70 -0
- package/transcriber/app/legacy.py +355 -0
- package/transcriber/app/legacy_cli.py +90 -0
- package/transcriber/app/main.py +30 -0
- package/transcriber/app/pipeline.py +210 -0
- package/transcriber/app/pptx_export.py +42 -0
- package/transcriber/app/prosody.py +128 -0
- package/transcriber/app/routes/__init__.py +1 -0
- package/transcriber/app/routes/legacy.py +117 -0
- package/transcriber/app/routes/transcribe.py +133 -0
- package/transcriber/app/shot_change.py +74 -0
- package/transcriber/app/silence_typer.py +144 -0
- package/transcriber/app/transcribe_cli.py +82 -0
- package/transcriber/app/vad.py +216 -0
- package/transcriber/pyproject.toml +56 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Perceptual-hash shot-change detector (#15).
|
|
2
|
+
|
|
3
|
+
Samples the video at a fixed cadence, computes a perceptual hash per sampled
|
|
4
|
+
frame, and reports the timestamps where the hash distance to the previous
|
|
5
|
+
sample crosses a threshold — i.e. a scene cut, slide change, or camera move.
|
|
6
|
+
|
|
7
|
+
Output is a list of at_ms values consumed by the frame extractor (#14) as
|
|
8
|
+
`shot_change` triggers. No classification beyond "something changed".
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import subprocess
|
|
14
|
+
import tempfile
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import imagehash
|
|
18
|
+
from PIL import Image
|
|
19
|
+
|
|
20
|
+
# Default Hamming-distance threshold between consecutive perceptual hashes.
|
|
21
|
+
# Tunable via config ([frames].shot_change_phash_distance).
|
|
22
|
+
DEFAULT_PHASH_DISTANCE = 12
|
|
23
|
+
DEFAULT_SAMPLE_INTERVAL_MS = 1000
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _sample_frame(video_path: Path, at_ms: int, out_path: Path) -> bool:
|
|
27
|
+
ts = at_ms / 1000.0
|
|
28
|
+
cmd = [
|
|
29
|
+
"ffmpeg",
|
|
30
|
+
"-y",
|
|
31
|
+
"-ss",
|
|
32
|
+
f"{ts:.3f}",
|
|
33
|
+
"-i",
|
|
34
|
+
str(video_path),
|
|
35
|
+
"-frames:v",
|
|
36
|
+
"1",
|
|
37
|
+
"-vf",
|
|
38
|
+
"scale=160:90",
|
|
39
|
+
str(out_path),
|
|
40
|
+
]
|
|
41
|
+
proc = subprocess.run(cmd, capture_output=True, text=True)
|
|
42
|
+
return proc.returncode == 0 and out_path.exists()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def detect_shot_changes(
    video_path: str | Path,
    duration_ms: int,
    threshold: int = DEFAULT_PHASH_DISTANCE,
    sample_interval_ms: int = DEFAULT_SAMPLE_INTERVAL_MS,
) -> list[int]:
    """Return at_ms timestamps where a shot change is detected.

    The video is sampled every ``sample_interval_ms`` from 0 up to (but not
    including) ``duration_ms``. Each sampled frame is perceptually hashed and
    compared with the most recent successfully hashed frame.

    The first sampled frame is never a "change" (no predecessor). Distances at
    or above `threshold` mark a change. Samples that cannot be extracted are
    skipped; the next good frame is compared against the last good one.

    Args:
        video_path: Video file to analyse.
        duration_ms: Length of the video in milliseconds.
        threshold: Minimum hash distance that counts as a change.
        sample_interval_ms: Spacing between sampled frames, in milliseconds.

    Returns:
        Sample timestamps (ms), in ascending order, at which a change was seen.

    Raises:
        ValueError: If ``sample_interval_ms`` is not positive. A zero or
            negative step would never advance past ``duration_ms``.
    """
    if sample_interval_ms <= 0:
        raise ValueError(f"sample_interval_ms must be positive, got {sample_interval_ms!r}")

    video_path = Path(video_path)
    changes: list[int] = []
    prev_hash: imagehash.ImageHash | None = None

    # Frames are written to a scratch directory that is removed on exit.
    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        at = 0
        idx = 0
        while at < duration_ms:
            frame_path = tmp_dir / f"s{idx}.png"
            if _sample_frame(video_path, at, frame_path):
                with Image.open(frame_path) as img:
                    h = imagehash.phash(img)
                # Subtracting two hashes yields their distance.
                if prev_hash is not None and (h - prev_hash) >= threshold:
                    changes.append(at)
                prev_hash = h
            at += sample_interval_ms
            idx += 1
    return changes
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Silence typer — heuristic post-processor that assigns a semantic type to each
|
|
2
|
+
first-class silence (> threshold) from the surrounding utterance context.
|
|
3
|
+
|
|
4
|
+
Types (ROADMAP § Descriptive transcription A):
|
|
5
|
+
- after_question : the silence follows a moderator question
|
|
6
|
+
- mid_utterance : the silence sits inside one speaker's turn
|
|
7
|
+
- thinking : a pre-response pause that isn't clearly after a question
|
|
8
|
+
- interruption : the silence coincides with an overlap/turn-steal
|
|
9
|
+
|
|
10
|
+
Rules are versioned. Researchers can override any assignment downstream; an
|
|
11
|
+
override is recorded as a `researcher`-authored event in the provenance log
|
|
12
|
+
(see issue #12 / provenance writer #27).
|
|
13
|
+
|
|
14
|
+
CHANGELOG
|
|
15
|
+
v1 (2026-06-03): initial rule set.
|
|
16
|
+
- after_question: previous utterance is a moderator AND ends with '?'
|
|
17
|
+
(or a leading inverted '¿' question), and abuts the silence start.
|
|
18
|
+
- interruption: an overlap/interruption cue overlaps the silence window,
|
|
19
|
+
OR previous and next utterances are different speakers and the previous
|
|
20
|
+
did not end on sentence-final punctuation (cut off).
|
|
21
|
+
- mid_utterance: previous and next utterances are the same speaker.
|
|
22
|
+
- thinking: default.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
RULES_VERSION = "1"
|
|
30
|
+
|
|
31
|
+
_SILENCE_TYPES = ("after_question", "mid_utterance", "thinking", "interruption")
|
|
32
|
+
|
|
33
|
+
# How close (ms) the previous utterance's end must be to the silence start for
|
|
34
|
+
# the silence to be considered "abutting" that utterance.
|
|
35
|
+
_ABUT_TOLERANCE_MS = 250
|
|
36
|
+
|
|
37
|
+
_SENTENCE_FINAL = (".", "!", "?", "…")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ends_question(text: str) -> bool:
|
|
41
|
+
stripped = text.rstrip()
|
|
42
|
+
if stripped.endswith("?"):
|
|
43
|
+
return True
|
|
44
|
+
# Spanish inverted question mark opening with no closing yet still reads as a question.
|
|
45
|
+
return "¿" in stripped and "?" in stripped
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _ends_sentence_final(text: str) -> bool:
    """Return True when ``text``, minus trailing whitespace, ends with one of
    the ``_SENTENCE_FINAL`` punctuation marks."""
    return text.rstrip().endswith(_SENTENCE_FINAL)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _speaker_type(speakers: list[dict[str, Any]], speaker_id: str | None) -> str | None:
|
|
54
|
+
if speaker_id is None:
|
|
55
|
+
return None
|
|
56
|
+
for s in speakers:
|
|
57
|
+
if s.get("id") == speaker_id:
|
|
58
|
+
return s.get("type")
|
|
59
|
+
return None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _cue_overlaps(silence: dict[str, Any], cues: list[dict[str, Any]]) -> bool:
|
|
63
|
+
s_start = silence["start_ms"]
|
|
64
|
+
s_end = silence["end_ms"]
|
|
65
|
+
for cue in cues:
|
|
66
|
+
if cue.get("kind") not in ("overlap", "interruption"):
|
|
67
|
+
continue
|
|
68
|
+
# any temporal overlap between the cue and the silence window
|
|
69
|
+
if cue["start_ms"] <= s_end and cue["end_ms"] >= s_start:
|
|
70
|
+
return True
|
|
71
|
+
return False
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def type_silence(
    silence: dict[str, Any],
    prev_utt: dict[str, Any] | None,
    next_utt: dict[str, Any] | None,
    speakers: list[dict[str, Any]],
    cues: list[dict[str, Any]] | None = None,
) -> str:
    """Classify one silence as "interruption", "after_question",
    "mid_utterance" or "thinking".

    Rules are tried in order and the first match wins:
      1. an overlap/interruption cue intersects the silence -> "interruption"
      2. the previous utterance abuts the silence, was spoken by a moderator
         and reads as a question -> "after_question"
      3. previous and next utterances share a speaker_id -> "mid_utterance"
      4. speakers differ and the previous text lacks sentence-final
         punctuation -> "interruption"
      5. anything else -> "thinking"
    """
    if _cue_overlaps(silence, cues or []):
        return "interruption"

    # Every remaining rule needs the utterance that precedes the silence.
    if prev_utt is None:
        return "thinking"

    gap_ms = abs(silence["start_ms"] - prev_utt["end_ms"])
    prev_role = _speaker_type(speakers, prev_utt.get("speaker_id"))
    if (
        gap_ms <= _ABUT_TOLERANCE_MS
        and prev_role == "moderator"
        and _ends_question(prev_utt.get("text", ""))
    ):
        return "after_question"

    # The speaker-based rules compare both neighbours.
    if next_utt is None:
        return "thinking"

    if prev_utt.get("speaker_id") == next_utt.get("speaker_id"):
        return "mid_utterance"

    # Different speakers: an unfinished previous sentence reads as a cut-off.
    if not _ends_sentence_final(prev_utt.get("text", "")):
        return "interruption"

    return "thinking"
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _utterance_before(utterances: list[dict[str, Any]], at_ms: int) -> dict[str, Any] | None:
    """Pick the utterance that ends latest without ending after
    ``at_ms + _ABUT_TOLERANCE_MS``.

    Returns None when no utterance qualifies. On ties the earliest entry in
    ``utterances`` is returned.
    """
    cutoff = at_ms + _ABUT_TOLERANCE_MS
    eligible = (utt for utt in utterances if utt["end_ms"] <= cutoff)
    # max() keeps the first of several equal maxima.
    return max(eligible, key=lambda utt: utt["end_ms"], default=None)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _utterance_after(utterances: list[dict[str, Any]], at_ms: int) -> dict[str, Any] | None:
    """Pick the utterance that starts earliest without starting before
    ``at_ms - _ABUT_TOLERANCE_MS``.

    Returns None when no utterance qualifies. On ties the earliest entry in
    ``utterances`` is returned.
    """
    floor = at_ms - _ABUT_TOLERANCE_MS
    eligible = (utt for utt in utterances if utt["start_ms"] >= floor)
    # min() keeps the first of several equal minima.
    return min(eligible, key=lambda utt: utt["start_ms"], default=None)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def type_all_silences(transcript: dict[str, Any]) -> dict[str, Any]:
    """Set ``context`` on every entry of ``transcript["silences"]``.

    For each silence, the neighbouring utterances are looked up and passed to
    ``type_silence`` along with the transcript's speakers and cues. Missing
    top-level keys are treated as empty lists. The transcript is modified in
    place and also returned. Cost is O(silences x utterances).
    """
    utterances = transcript.get("utterances", [])
    cues = transcript.get("cues", [])
    speakers = transcript.get("speakers", [])
    for silence in transcript.get("silences", []):
        silence["context"] = type_silence(
            silence,
            _utterance_before(utterances, silence["start_ms"]),
            _utterance_after(utterances, silence["end_ms"]),
            speakers,
            cues,
        )
    return transcript
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Native (host) transcription entrypoint (#176).
|
|
2
|
+
|
|
3
|
+
Runs the full pipeline ON THE HOST (no Docker) so Apple-Silicon ASR backends
|
|
4
|
+
(`parakeet-mlx` / Metal) and pyannote use the GPU/CPU directly — the Docker
|
|
5
|
+
container is CPU-only on macOS, which is the bottleneck this path removes. The
|
|
6
|
+
Node CLI shells out to this when `transcriber.runtime = native`; the Docker
|
|
7
|
+
`/transcribe` route stays the cross-platform fallback and shares the exact same
|
|
8
|
+
`run_pipeline` orchestration.
|
|
9
|
+
|
|
10
|
+
python -m app.transcribe_cli \
|
|
11
|
+
--seed-path <seed> --session-id S001 \
|
|
12
|
+
--source-path <seed>/sessions/S001/source.mp3 \
|
|
13
|
+
--engine parakeet --language en
|
|
14
|
+
|
|
15
|
+
Prints one JSON line mirroring the /transcribe response shape so the Node
|
|
16
|
+
caller parses both paths identically.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import json
|
|
23
|
+
|
|
24
|
+
from .asr import ASRConfig
|
|
25
|
+
from .pipeline import PipelineBackends, PipelineConfig, run_pipeline, write_transcript
|
|
26
|
+
|
|
27
|
+
_DEFAULT_MODEL = {
|
|
28
|
+
"parakeet": "mlx-community/parakeet-tdt-0.6b-v3",
|
|
29
|
+
"whisper": "large-v3-turbo",
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the transcription pipeline on the host and print one JSON line.

    Parses the command line, builds the ASR and pipeline configuration, runs
    ``run_pipeline`` and persists the result with ``write_transcript``.

    Args:
        argv: Argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        0 after printing the success summary. 1 after printing
        ``{"status": "failed", "error": ...}`` when the pipeline or the
        transcript write raised.
    """
    p = argparse.ArgumentParser(prog="compost-transcribe-native")
    p.add_argument("--seed-path", required=True)
    p.add_argument("--session-id", required=True)
    p.add_argument("--source-path", required=True)
    p.add_argument("--engine", default="parakeet", choices=["parakeet", "whisper"])
    p.add_argument("--model", default=None, help="ASR model id (engine default if omitted)")
    p.add_argument("--language", default=None)
    p.add_argument("--device", default="auto")
    p.add_argument("--compute-type", default="int8")
    args = p.parse_args(argv)

    asr = ASRConfig(
        # An explicit --model wins; otherwise use the engine's default model id.
        model_name=args.model or _DEFAULT_MODEL[args.engine],
        device=args.device,
        compute_type=args.compute_type,
        language=args.language,
        engine=args.engine,
    )
    config = PipelineConfig(asr=asr, asr_model_tag=f"{asr.model_name} ({args.engine})")

    try:
        transcript = run_pipeline(
            seed_path=args.seed_path,
            session_id=args.session_id,
            source_path=args.source_path,
            config=config,
            backends=PipelineBackends(),  # all None → real lazy backends (Silero / engine ASR / pyannote)
        )
        # Writing the transcript is inside the guard too, so a failed write is
        # reported as JSON instead of escaping as a traceback.
        path = write_transcript(args.seed_path, args.session_id, transcript)
    except Exception as e:  # surface as JSON so the Node caller can report it
        print(json.dumps({"status": "failed", "error": str(e)}))
        return 1

    print(
        json.dumps(
            {
                "session_id": args.session_id,
                "transcript_path": path,
                "status": transcript.get("status", "ok"),
                "engine": args.engine,
                "model": asr.model_name,
            }
        )
    )
    return 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""Silero VAD integration + silence segmentation (#9).
|
|
2
|
+
|
|
3
|
+
Two outputs (ROADMAP § Descriptive transcription A):
|
|
4
|
+
(a) speech-segment boundaries → fed to ASR
|
|
5
|
+
(b) silence boundaries → fed to the silence typer (#12)
|
|
6
|
+
|
|
7
|
+
Silero v5 is loaded once per process (cold-start cached). The model call is
|
|
8
|
+
lazily imported so this module — and the silence-segmentation maths, which is
|
|
9
|
+
pure — works without torch installed. Install the `asr` extra for real VAD.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from functools import lru_cache
|
|
16
|
+
from typing import Any, Protocol
|
|
17
|
+
|
|
18
|
+
# Silences shorter than this are NOT first-class; they remain gaps only.
|
|
19
|
+
MIN_FIRST_CLASS_SILENCE_MS = 1500
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class Segment:
    """Immutable time span in milliseconds with an optional energy reading."""

    start_ms: int
    end_ms: int
    # Mean RMS energy of the waveform inside the span (raw amplitude, roughly
    # 0..1, with speech sitting well under 1.0). Left as None when the backend
    # reports no energy, so downstream stages can tell "unknown" from "quiet".
    energy: float | None = None

    @property
    def duration_ms(self) -> int:
        """Length of the span: ``end_ms`` minus ``start_ms``."""
        length = self.end_ms - self.start_ms
        return length
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class VADBackend(Protocol):
    """Structural interface for anything that can locate speech in an audio file."""

    def speech_timestamps(self, audio_path: str) -> list[dict[str, Any]]:
        """Return the speech segments found in ``audio_path``.

        Each segment is a dict with ``"start_ms"`` and ``"end_ms"`` and may also
        carry a float ``"energy"`` (mean RMS over the segment).
        """
        ...
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
SILERO_SAMPLE_RATE = 16000
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SileroBackend:  # pragma: no cover - needs torch + weights
    """VADBackend implementation backed by the ``silero_vad`` package.

    The model is loaded when the instance is constructed. Audio is read at
    ``SILERO_SAMPLE_RATE`` through the package's ``read_audio`` helper, and
    speech boundaries are reported in milliseconds with a per-segment RMS energy.
    """

    def __init__(self) -> None:
        """Import silero-vad lazily and load its model.

        Raises:
            RuntimeError: If the ``silero_vad`` import fails.
        """
        try:
            from silero_vad import (  # type: ignore
                get_speech_timestamps,
                load_silero_vad,
                read_audio,
            )
        except ImportError as e:
            raise RuntimeError(
                "silero-vad is not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from e

        self._model = load_silero_vad()
        self._read_audio = read_audio
        self._get_speech_timestamps = get_speech_timestamps

    def speech_timestamps(self, audio_path: str) -> list[dict[str, Any]]:
        """Detect speech in ``audio_path``.

        Returns one dict per detected segment with ``start_ms``, ``end_ms`` and
        ``energy`` (RMS of the segment's samples, 0.0 for an empty window).
        """
        waveform = self._read_audio(audio_path, sampling_rate=SILERO_SAMPLE_RATE)
        # With return_seconds=False the boundaries come back as sample indices.
        sample_spans = self._get_speech_timestamps(
            waveform,
            self._model,
            sampling_rate=SILERO_SAMPLE_RATE,
            return_seconds=False,
        )
        ms_per_sample = 1000 / SILERO_SAMPLE_RATE

        def _describe(span: dict[str, Any]) -> dict[str, Any]:
            # RMS = sqrt(mean(x^2)) over the samples inside the span.
            chunk = waveform[int(span["start"]) : int(span["end"])]
            energy = float(chunk.pow(2).mean().sqrt()) if chunk.numel() else 0.0
            return {
                "start_ms": int(span["start"] * ms_per_sample),
                "end_ms": int(span["end"] * ms_per_sample),
                "energy": energy,
            }

        return [_describe(span) for span in sample_spans]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@lru_cache(maxsize=1)
def _load_silero() -> VADBackend:  # pragma: no cover - needs torch + weights
    """Create the Silero backend; the result of the first successful call is
    cached and reused by later calls.

    Raises:
        RuntimeError: If ``torch`` cannot be imported.
    """
    try:
        import torch  # type: ignore  # noqa: F401
    except ImportError as e:
        raise RuntimeError(
            "torch/silero not installed. Install the asr extra: pip install -e '.[asr]'"
        ) from e
    backend = SileroBackend()
    return backend
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def speech_to_silences(
    speech: list[Segment],
    total_duration_ms: int,
    min_silence_ms: int = MIN_FIRST_CLASS_SILENCE_MS,
) -> list[Segment]:
    """Compute the first-class silences implied by a set of speech segments.

    Pure. The gap before the first segment, the gaps between segments and the
    gap after the last one are examined; a gap is kept only when it is at least
    ``min_silence_ms`` long. The input may be unsorted or overlapping: it is
    sorted by start here, and overlapping speech never yields a gap.
    """
    gaps: list[Segment] = []
    covered_until = 0  # end of the speech seen so far
    for seg in sorted(speech, key=lambda s: s.start_ms):
        if seg.start_ms - covered_until >= min_silence_ms:
            gaps.append(Segment(covered_until, seg.start_ms))
        # A segment nested inside earlier speech must not pull the cursor back.
        if seg.end_ms > covered_until:
            covered_until = seg.end_ms
    if total_duration_ms - covered_until >= min_silence_ms:
        gaps.append(Segment(covered_until, total_duration_ms))
    return gaps
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def silences_to_schema(silences: list[Segment]) -> list[dict[str, Any]]:
    """Convert silence segments into transcript.json ``silences[]`` entries.

    Ids are numbered from 1 as ``SIL-001``, ``SIL-002``, ... Every entry gets
    the placeholder context "thinking", which the silence typer later replaces.
    """
    return [
        {
            "id": f"SIL-{number:03d}",
            "start_ms": gap.start_ms,
            "end_ms": gap.end_ms,
            "duration_ms": gap.duration_ms,
            "context": "thinking",  # placeholder until the typer runs
        }
        for number, gap in enumerate(silences, start=1)
    ]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def utterance_energies(
    speech: list[Segment],
    utterances: list[dict[str, Any]],
) -> dict[str, float]:
    """Map utterance id to its mean speech energy, scaled so the loudest
    utterance in the session is 1.0.

    Pure. An utterance's raw energy is the mean of the energies of the speech
    segments it overlaps, weighted by overlap duration. Segments whose energy is
    None are ignored. Utterances without an ``id``, or without any overlapping
    energy-bearing segment, are left out of the result. Raw values are divided
    by the largest raw value; if that value is not positive (or nothing was
    measured) an empty dict is returned.

    Normalization is per session, not per speaker, so a quiet speaker's
    loudest utterance can still score below a loud speaker's typical one.
    """
    means: dict[str, float] = {}
    for utt in utterances:
        utt_id = utt.get("id")
        if utt_id is None:
            continue
        lo = utt.get("start_ms", 0)
        hi = utt.get("end_ms", 0)
        numerator = 0.0
        denominator = 0
        for seg in speech:
            if seg.energy is None:
                continue
            # Length of the intersection; non-positive means no overlap.
            shared = min(hi, seg.end_ms) - max(lo, seg.start_ms)
            if shared > 0:
                numerator += seg.energy * shared
                denominator += shared
        if denominator > 0:
            means[utt_id] = numerator / denominator

    loudest = max(means.values(), default=0.0)
    if loudest <= 0:
        return {}
    return {utt_id: mean / loudest for utt_id, mean in means.items()}
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class VAD:
    """Front end that turns a backend's raw speech timestamps into Segments."""

    def __init__(self, backend: VADBackend | None = None):
        """Remember an explicit backend; None defers to the Silero default."""
        self._backend = backend

    def _get_backend(self) -> VADBackend:
        """Return the injected backend, or load the Silero one on demand."""
        if self._backend is not None:
            return self._backend
        return _load_silero()

    def segment(self, audio_path: str, total_duration_ms: int) -> tuple[list[Segment], list[Segment]]:
        """Return (speech_segments, first_class_silences)."""
        speech: list[Segment] = []
        for stamp in self._get_backend().speech_timestamps(audio_path):
            reading = stamp.get("energy")
            speech.append(
                Segment(
                    int(stamp["start_ms"]),
                    int(stamp["end_ms"]),
                    # Keep a missing energy as None instead of coercing it.
                    float(stamp["energy"]) if reading is not None else None,
                )
            )
        return speech, speech_to_silences(speech, total_duration_ms)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "compost-transcriber"
|
|
3
|
+
version = "0.1.2"
|
|
4
|
+
description = "Compost descriptive transcriber: WhisperX + pyannote + Silero VAD + Whisper-event-tags, plus frame extraction and legacy ingest."
|
|
5
|
+
requires-python = ">=3.11,<3.13"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [{ name = "Juan Reina" }]
|
|
8
|
+
|
|
9
|
+
dependencies = [
|
|
10
|
+
"fastapi>=0.115.0",
|
|
11
|
+
"uvicorn[standard]>=0.30.0",
|
|
12
|
+
"pydantic>=2.9.0",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.optional-dependencies]
|
|
16
|
+
# Pinned in their own M1 issues (#9-#15) — kept out of the base install so
|
|
17
|
+
# the skeleton boots quickly without pulling multi-GB ML wheels.
|
|
18
|
+
asr = [
|
|
19
|
+
"whisperx",
|
|
20
|
+
"pyannote.audio>=3.3",
|
|
21
|
+
"silero-vad",
|
|
22
|
+
# WhisperX brings these transitively, but declaring a minimum version lets the lock file note
|
|
23
|
+
# the M1-Mac-friendly compute path.
|
|
24
|
+
"torch>=2.3",
|
|
25
|
+
"torchaudio>=2.3",
|
|
26
|
+
"ffmpeg-python",
|
|
27
|
+
]
|
|
28
|
+
# Native Apple-Silicon (Metal) path (#176/#183): Parakeet ASR + pyannote-on-MPS.
|
|
29
|
+
# No whisperx/ctranslate2 — the default native engine is Parakeet (parakeet-mlx).
|
|
30
|
+
# Provisioned by `compost setup --provision-native`.
|
|
31
|
+
native = [
|
|
32
|
+
"parakeet-mlx",
|
|
33
|
+
"pyannote.audio>=3.3",
|
|
34
|
+
"silero-vad",
|
|
35
|
+
"torchaudio>=2.3",
|
|
36
|
+
"ffmpeg-python",
|
|
37
|
+
]
|
|
38
|
+
frames = [
|
|
39
|
+
"imagehash",
|
|
40
|
+
"Pillow",
|
|
41
|
+
]
|
|
42
|
+
legacy = [
|
|
43
|
+
"pdfminer.six",
|
|
44
|
+
"pdfplumber",
|
|
45
|
+
"python-docx",
|
|
46
|
+
"python-pptx",
|
|
47
|
+
"openpyxl",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
target-version = "py311"
|
|
52
|
+
line-length = 100
|
|
53
|
+
|
|
54
|
+
[tool.ruff.lint]
|
|
55
|
+
select = ["E", "F", "I", "W", "UP", "B", "SIM"]
|
|
56
|
+
ignore = ["E501"]
|