@they-juanreina/compost-cli 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. package/dist/commands/agreement.d.ts +3 -0
  2. package/dist/commands/agreement.d.ts.map +1 -0
  3. package/dist/commands/agreement.js +35 -0
  4. package/dist/commands/agreement.js.map +1 -0
  5. package/dist/commands/create.d.ts +1 -0
  6. package/dist/commands/create.d.ts.map +1 -1
  7. package/dist/commands/create.js +39 -1
  8. package/dist/commands/create.js.map +1 -1
  9. package/dist/commands/export.d.ts.map +1 -1
  10. package/dist/commands/export.js +47 -4
  11. package/dist/commands/export.js.map +1 -1
  12. package/dist/commands/import.d.ts +3 -0
  13. package/dist/commands/import.d.ts.map +1 -0
  14. package/dist/commands/import.js +59 -0
  15. package/dist/commands/import.js.map +1 -0
  16. package/dist/commands/init.d.ts.map +1 -1
  17. package/dist/commands/init.js +1 -0
  18. package/dist/commands/init.js.map +1 -1
  19. package/dist/commands/jobs.d.ts +3 -0
  20. package/dist/commands/jobs.d.ts.map +1 -0
  21. package/dist/commands/jobs.js +105 -0
  22. package/dist/commands/jobs.js.map +1 -0
  23. package/dist/commands/label.d.ts +3 -0
  24. package/dist/commands/label.d.ts.map +1 -0
  25. package/dist/commands/label.js +67 -0
  26. package/dist/commands/label.js.map +1 -0
  27. package/dist/commands/models.d.ts.map +1 -1
  28. package/dist/commands/models.js +2 -1
  29. package/dist/commands/models.js.map +1 -1
  30. package/dist/commands/recode.d.ts +3 -0
  31. package/dist/commands/recode.d.ts.map +1 -0
  32. package/dist/commands/recode.js +60 -0
  33. package/dist/commands/recode.js.map +1 -0
  34. package/dist/commands/reindex.d.ts.map +1 -1
  35. package/dist/commands/reindex.js +6 -4
  36. package/dist/commands/reindex.js.map +1 -1
  37. package/dist/commands/rerun.d.ts +3 -0
  38. package/dist/commands/rerun.d.ts.map +1 -0
  39. package/dist/commands/rerun.js +91 -0
  40. package/dist/commands/rerun.js.map +1 -0
  41. package/dist/commands/search.d.ts.map +1 -1
  42. package/dist/commands/search.js +2 -1
  43. package/dist/commands/search.js.map +1 -1
  44. package/dist/commands/secrets.d.ts +3 -0
  45. package/dist/commands/secrets.d.ts.map +1 -0
  46. package/dist/commands/secrets.js +143 -0
  47. package/dist/commands/secrets.js.map +1 -0
  48. package/dist/commands/setup.d.ts.map +1 -1
  49. package/dist/commands/setup.js +90 -1
  50. package/dist/commands/setup.js.map +1 -1
  51. package/dist/commands/status.d.ts.map +1 -1
  52. package/dist/commands/status.js +2 -1
  53. package/dist/commands/status.js.map +1 -1
  54. package/dist/commands/transcribe.d.ts.map +1 -1
  55. package/dist/commands/transcribe.js +13 -2
  56. package/dist/commands/transcribe.js.map +1 -1
  57. package/dist/commands/validate.d.ts.map +1 -1
  58. package/dist/commands/validate.js +29 -1
  59. package/dist/commands/validate.js.map +1 -1
  60. package/dist/engine.d.ts +23 -0
  61. package/dist/engine.d.ts.map +1 -0
  62. package/dist/engine.js +32 -0
  63. package/dist/engine.js.map +1 -0
  64. package/dist/exporters/prov.d.ts +11 -0
  65. package/dist/exporters/prov.d.ts.map +1 -0
  66. package/dist/exporters/prov.js +151 -0
  67. package/dist/exporters/prov.js.map +1 -0
  68. package/dist/index.d.ts.map +1 -1
  69. package/dist/index.js +6 -0
  70. package/dist/index.js.map +1 -1
  71. package/dist/lib/agreement.d.ts +77 -0
  72. package/dist/lib/agreement.d.ts.map +1 -0
  73. package/dist/lib/agreement.js +261 -0
  74. package/dist/lib/agreement.js.map +1 -0
  75. package/dist/lib/artifacts.d.ts +32 -1
  76. package/dist/lib/artifacts.d.ts.map +1 -1
  77. package/dist/lib/artifacts.js +156 -22
  78. package/dist/lib/artifacts.js.map +1 -1
  79. package/dist/lib/blame.d.ts.map +1 -1
  80. package/dist/lib/blame.js +3 -2
  81. package/dist/lib/blame.js.map +1 -1
  82. package/dist/lib/config.d.ts +3 -0
  83. package/dist/lib/config.d.ts.map +1 -1
  84. package/dist/lib/config.js.map +1 -1
  85. package/dist/lib/doctor.d.ts +3 -0
  86. package/dist/lib/doctor.d.ts.map +1 -1
  87. package/dist/lib/doctor.js +24 -1
  88. package/dist/lib/doctor.js.map +1 -1
  89. package/dist/lib/events.d.ts +34 -1
  90. package/dist/lib/events.d.ts.map +1 -1
  91. package/dist/lib/events.js +35 -1
  92. package/dist/lib/events.js.map +1 -1
  93. package/dist/lib/importTranscript.d.ts +16 -0
  94. package/dist/lib/importTranscript.d.ts.map +1 -0
  95. package/dist/lib/importTranscript.js +94 -0
  96. package/dist/lib/importTranscript.js.map +1 -0
  97. package/dist/lib/ingest.d.ts.map +1 -1
  98. package/dist/lib/ingest.js +12 -6
  99. package/dist/lib/ingest.js.map +1 -1
  100. package/dist/lib/journal.d.ts +13 -0
  101. package/dist/lib/journal.d.ts.map +1 -1
  102. package/dist/lib/journal.js +58 -2
  103. package/dist/lib/journal.js.map +1 -1
  104. package/dist/lib/legacyNative.d.ts +24 -0
  105. package/dist/lib/legacyNative.d.ts.map +1 -0
  106. package/dist/lib/legacyNative.js +51 -0
  107. package/dist/lib/legacyNative.js.map +1 -0
  108. package/dist/lib/migrate.d.ts.map +1 -1
  109. package/dist/lib/migrate.js +1 -0
  110. package/dist/lib/migrate.js.map +1 -1
  111. package/dist/lib/nativeRuntime.d.ts +6 -3
  112. package/dist/lib/nativeRuntime.d.ts.map +1 -1
  113. package/dist/lib/nativeRuntime.js +6 -3
  114. package/dist/lib/nativeRuntime.js.map +1 -1
  115. package/dist/lib/provisionNative.js +1 -1
  116. package/dist/lib/provisionNative.js.map +1 -1
  117. package/dist/lib/queue.d.ts +25 -0
  118. package/dist/lib/queue.d.ts.map +1 -1
  119. package/dist/lib/queue.js +70 -3
  120. package/dist/lib/queue.js.map +1 -1
  121. package/dist/lib/reads.d.ts +24 -0
  122. package/dist/lib/reads.d.ts.map +1 -0
  123. package/dist/lib/reads.js +115 -0
  124. package/dist/lib/reads.js.map +1 -0
  125. package/dist/lib/recode.d.ts +19 -0
  126. package/dist/lib/recode.d.ts.map +1 -0
  127. package/dist/lib/recode.js +43 -0
  128. package/dist/lib/recode.js.map +1 -0
  129. package/dist/lib/rerun.d.ts +51 -0
  130. package/dist/lib/rerun.d.ts.map +1 -0
  131. package/dist/lib/rerun.js +166 -0
  132. package/dist/lib/rerun.js.map +1 -0
  133. package/dist/lib/retrieve.d.ts +8 -4
  134. package/dist/lib/retrieve.d.ts.map +1 -1
  135. package/dist/lib/retrieve.js +12 -10
  136. package/dist/lib/retrieve.js.map +1 -1
  137. package/dist/lib/schemas.generated.d.ts.map +1 -1
  138. package/dist/lib/schemas.generated.js +28 -0
  139. package/dist/lib/schemas.generated.js.map +1 -1
  140. package/dist/lib/secrets.d.ts +158 -0
  141. package/dist/lib/secrets.d.ts.map +1 -0
  142. package/dist/lib/secrets.js +507 -0
  143. package/dist/lib/secrets.js.map +1 -0
  144. package/dist/lib/seed.d.ts +5 -0
  145. package/dist/lib/seed.d.ts.map +1 -1
  146. package/dist/lib/seed.js +15 -2
  147. package/dist/lib/seed.js.map +1 -1
  148. package/dist/lib/seedResolve.d.ts.map +1 -1
  149. package/dist/lib/seedResolve.js +1 -0
  150. package/dist/lib/seedResolve.js.map +1 -1
  151. package/dist/lib/session.d.ts +14 -0
  152. package/dist/lib/session.d.ts.map +1 -1
  153. package/dist/lib/session.js +47 -0
  154. package/dist/lib/session.js.map +1 -1
  155. package/dist/lib/setup.d.ts +5 -0
  156. package/dist/lib/setup.d.ts.map +1 -1
  157. package/dist/lib/setup.js +78 -14
  158. package/dist/lib/setup.js.map +1 -1
  159. package/dist/lib/setupWizard.d.ts +51 -0
  160. package/dist/lib/setupWizard.d.ts.map +1 -0
  161. package/dist/lib/setupWizard.js +223 -0
  162. package/dist/lib/setupWizard.js.map +1 -0
  163. package/dist/lib/snap.d.ts.map +1 -1
  164. package/dist/lib/snap.js +2 -5
  165. package/dist/lib/snap.js.map +1 -1
  166. package/dist/lib/speakers.d.ts +41 -0
  167. package/dist/lib/speakers.d.ts.map +1 -0
  168. package/dist/lib/speakers.js +78 -0
  169. package/dist/lib/speakers.js.map +1 -0
  170. package/dist/lib/status.d.ts.map +1 -1
  171. package/dist/lib/status.js +21 -0
  172. package/dist/lib/status.js.map +1 -1
  173. package/dist/lib/userConfig.d.ts +22 -0
  174. package/dist/lib/userConfig.d.ts.map +1 -0
  175. package/dist/lib/userConfig.js +67 -0
  176. package/dist/lib/userConfig.js.map +1 -0
  177. package/dist/lib/validate.d.ts +18 -0
  178. package/dist/lib/validate.d.ts.map +1 -1
  179. package/dist/lib/validate.js +70 -1
  180. package/dist/lib/validate.js.map +1 -1
  181. package/dist/lib/version.d.ts +30 -0
  182. package/dist/lib/version.d.ts.map +1 -0
  183. package/dist/lib/version.js +73 -0
  184. package/dist/lib/version.js.map +1 -0
  185. package/dist/llm/adapter.d.ts.map +1 -1
  186. package/dist/llm/adapter.js +2 -0
  187. package/dist/llm/adapter.js.map +1 -1
  188. package/dist/llm/providers/ollama.d.ts.map +1 -1
  189. package/dist/llm/providers/ollama.js +6 -0
  190. package/dist/llm/providers/ollama.js.map +1 -1
  191. package/dist/loops/ingest_watcher.d.ts.map +1 -1
  192. package/dist/loops/ingest_watcher.js +6 -3
  193. package/dist/loops/ingest_watcher.js.map +1 -1
  194. package/dist/loops/legacy_worker.d.ts +28 -1
  195. package/dist/loops/legacy_worker.d.ts.map +1 -1
  196. package/dist/loops/legacy_worker.js +81 -9
  197. package/dist/loops/legacy_worker.js.map +1 -1
  198. package/dist/loops/supervisor.d.ts +3 -0
  199. package/dist/loops/supervisor.d.ts.map +1 -1
  200. package/dist/loops/supervisor.js +12 -0
  201. package/dist/loops/supervisor.js.map +1 -1
  202. package/dist/loops/synthesis.d.ts.map +1 -1
  203. package/dist/loops/synthesis.js +15 -0
  204. package/dist/loops/synthesis.js.map +1 -1
  205. package/dist/loops/transcribe_worker.d.ts.map +1 -1
  206. package/dist/loops/transcribe_worker.js +2 -4
  207. package/dist/loops/transcribe_worker.js.map +1 -1
  208. package/dist/output.d.ts +13 -1
  209. package/dist/output.d.ts.map +1 -1
  210. package/dist/output.js +22 -2
  211. package/dist/output.js.map +1 -1
  212. package/dist/render/human.d.ts +20 -0
  213. package/dist/render/human.d.ts.map +1 -0
  214. package/dist/render/human.js +54 -0
  215. package/dist/render/human.js.map +1 -0
  216. package/dist/router.d.ts.map +1 -1
  217. package/dist/router.js +17 -2
  218. package/dist/router.js.map +1 -1
  219. package/package.json +18 -5
  220. package/templates/config.toml +6 -1
  221. package/transcriber/app/__init__.py +3 -0
  222. package/transcriber/app/asr.py +198 -0
  223. package/transcriber/app/asr_parakeet.py +174 -0
  224. package/transcriber/app/cue_parser.py +110 -0
  225. package/transcriber/app/diarization.py +330 -0
  226. package/transcriber/app/frame_annotation.py +77 -0
  227. package/transcriber/app/frames.py +130 -0
  228. package/transcriber/app/health.py +70 -0
  229. package/transcriber/app/legacy.py +355 -0
  230. package/transcriber/app/legacy_cli.py +90 -0
  231. package/transcriber/app/main.py +30 -0
  232. package/transcriber/app/pipeline.py +210 -0
  233. package/transcriber/app/pptx_export.py +42 -0
  234. package/transcriber/app/prosody.py +128 -0
  235. package/transcriber/app/routes/__init__.py +1 -0
  236. package/transcriber/app/routes/legacy.py +117 -0
  237. package/transcriber/app/routes/transcribe.py +133 -0
  238. package/transcriber/app/shot_change.py +74 -0
  239. package/transcriber/app/silence_typer.py +144 -0
  240. package/transcriber/app/transcribe_cli.py +82 -0
  241. package/transcriber/app/vad.py +216 -0
  242. package/transcriber/pyproject.toml +56 -0
@@ -0,0 +1,74 @@
1
+ """Perceptual-hash shot-change detector (#15).
2
+
3
+ Samples the video at a fixed cadence, computes a perceptual hash per sampled
4
+ frame, and reports the timestamps where the hash distance to the previous
5
+ sample crosses a threshold — i.e. a scene cut, slide change, or camera move.
6
+
7
+ Output is a list of at_ms values consumed by the frame extractor (#14) as
8
+ `shot_change` triggers. No classification beyond "something changed".
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import subprocess
14
+ import tempfile
15
+ from pathlib import Path
16
+
17
+ import imagehash
18
+ from PIL import Image
19
+
20
+ # Default Hamming-distance threshold between consecutive perceptual hashes.
21
+ # Tunable via config ([frames].shot_change_phash_distance).
22
+ DEFAULT_PHASH_DISTANCE = 12
23
+ DEFAULT_SAMPLE_INTERVAL_MS = 1000
24
+
25
+
26
+ def _sample_frame(video_path: Path, at_ms: int, out_path: Path) -> bool:
27
+ ts = at_ms / 1000.0
28
+ cmd = [
29
+ "ffmpeg",
30
+ "-y",
31
+ "-ss",
32
+ f"{ts:.3f}",
33
+ "-i",
34
+ str(video_path),
35
+ "-frames:v",
36
+ "1",
37
+ "-vf",
38
+ "scale=160:90",
39
+ str(out_path),
40
+ ]
41
+ proc = subprocess.run(cmd, capture_output=True, text=True)
42
+ return proc.returncode == 0 and out_path.exists()
43
+
44
+
45
def detect_shot_changes(
    video_path: str | Path,
    duration_ms: int,
    threshold: int = DEFAULT_PHASH_DISTANCE,
    sample_interval_ms: int = DEFAULT_SAMPLE_INTERVAL_MS,
) -> list[int]:
    """Return at_ms timestamps where a shot change is detected.

    Samples one frame every ``sample_interval_ms`` from 0 up to (but excluding)
    ``duration_ms`` and compares each frame's perceptual hash with the hash of
    the previous successfully sampled frame. The first sampled frame is never a
    "change" (no predecessor). Distances at or above ``threshold`` mark a change.
    Samples that ffmpeg fails to extract are skipped.

    Raises:
        ValueError: if ``sample_interval_ms`` is not positive (the sampling
            loop would otherwise never advance).
    """
    if sample_interval_ms <= 0:
        raise ValueError(f"sample_interval_ms must be > 0, got {sample_interval_ms}")

    video_path = Path(video_path)
    changes: list[int] = []
    prev_hash: imagehash.ImageHash | None = None

    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        at = 0
        idx = 0
        while at < duration_ms:
            frame_path = tmp_dir / f"s{idx}.png"
            try:
                if _sample_frame(video_path, at, frame_path):
                    with Image.open(frame_path) as img:
                        h = imagehash.phash(img)
                    if prev_hash is not None and (h - prev_hash) >= threshold:
                        changes.append(at)
                    prev_hash = h
            finally:
                # Only the hash is needed from here on; drop the frame right
                # away so long videos don't pile up thousands of temp PNGs.
                frame_path.unlink(missing_ok=True)
            at += sample_interval_ms
            idx += 1
    return changes
@@ -0,0 +1,144 @@
1
+ """Silence typer — heuristic post-processor that assigns a semantic type to each
2
+ first-class silence (> threshold) from the surrounding utterance context.
3
+
4
+ Types (ROADMAP § Descriptive transcription A):
5
+ - after_question : the silence follows a moderator question
6
+ - mid_utterance : the silence sits inside one speaker's turn
7
+ - thinking : a pre-response pause that isn't clearly after a question
8
+ - interruption : the silence coincides with an overlap/turn-steal
9
+
10
+ Rules are versioned. Researchers can override any assignment downstream; an
11
+ override is recorded as a `researcher`-authored event in the provenance log
12
+ (see issue #12 / provenance writer #27).
13
+
14
+ CHANGELOG
15
+ v1 (2026-06-03): initial rule set.
16
+ - after_question: previous utterance is a moderator AND ends with '?'
17
+ (or a leading inverted '¿' question), and abuts the silence start.
18
+ - interruption: an overlap/interruption cue overlaps the silence window,
19
+ OR previous and next utterances are different speakers and the previous
20
+ did not end on sentence-final punctuation (cut off).
21
+ - mid_utterance: previous and next utterances are the same speaker.
22
+ - thinking: default.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import Any
28
+
29
+ RULES_VERSION = "1"
30
+
31
+ _SILENCE_TYPES = ("after_question", "mid_utterance", "thinking", "interruption")
32
+
33
+ # How close (ms) the previous utterance's end must be to the silence start for
34
+ # the silence to be considered "abutting" that utterance.
35
+ _ABUT_TOLERANCE_MS = 250
36
+
37
+ _SENTENCE_FINAL = (".", "!", "?", "…")
38
+
39
+
40
+ def _ends_question(text: str) -> bool:
41
+ stripped = text.rstrip()
42
+ if stripped.endswith("?"):
43
+ return True
44
+ # Spanish inverted question mark opening with no closing yet still reads as a question.
45
+ return "¿" in stripped and "?" in stripped
46
+
47
+
48
def _ends_sentence_final(text: str) -> bool:
    """Return True when ``text``, ignoring trailing whitespace, ends with one of
    the ``_SENTENCE_FINAL`` punctuation marks."""
    trimmed = text.rstrip()
    return any(trimmed.endswith(mark) for mark in _SENTENCE_FINAL)
51
+
52
+
53
+ def _speaker_type(speakers: list[dict[str, Any]], speaker_id: str | None) -> str | None:
54
+ if speaker_id is None:
55
+ return None
56
+ for s in speakers:
57
+ if s.get("id") == speaker_id:
58
+ return s.get("type")
59
+ return None
60
+
61
+
62
+ def _cue_overlaps(silence: dict[str, Any], cues: list[dict[str, Any]]) -> bool:
63
+ s_start = silence["start_ms"]
64
+ s_end = silence["end_ms"]
65
+ for cue in cues:
66
+ if cue.get("kind") not in ("overlap", "interruption"):
67
+ continue
68
+ # any temporal overlap between the cue and the silence window
69
+ if cue["start_ms"] <= s_end and cue["end_ms"] >= s_start:
70
+ return True
71
+ return False
72
+
73
+
74
def type_silence(
    silence: dict[str, Any],
    prev_utt: dict[str, Any] | None,
    next_utt: dict[str, Any] | None,
    speakers: list[dict[str, Any]],
    cues: list[dict[str, Any]] | None = None,
) -> str:
    """Classify one silence as one of the four silence types.

    Rules are evaluated in this order; the first match wins:
      1. an overlap/interruption cue intersects the silence -> "interruption"
      2. the previous utterance abuts the silence, was spoken by a moderator
         and reads as a question -> "after_question"
      3. previous and next utterances share a speaker_id -> "mid_utterance"
      4. previous and next utterances have different speaker_ids and the
         previous text does not end on sentence-final punctuation -> "interruption"
      5. anything else -> "thinking"
    """
    cue_list = cues if cues else []
    if _cue_overlaps(silence, cue_list):
        return "interruption"

    # Every remaining rule needs the utterance preceding the silence.
    if prev_utt is None:
        return "thinking"

    prev_text = prev_utt.get("text", "")
    gap_ms = abs(silence["start_ms"] - prev_utt["end_ms"])
    asked_by = _speaker_type(speakers, prev_utt.get("speaker_id"))
    if gap_ms <= _ABUT_TOLERANCE_MS and asked_by == "moderator" and _ends_question(prev_text):
        return "after_question"

    # The speaker comparisons below also need the utterance after the silence.
    if next_utt is None:
        return "thinking"

    if prev_utt.get("speaker_id") == next_utt.get("speaker_id"):
        return "mid_utterance"

    # Speaker changed: a previous turn that was cut off marks an interruption.
    return "thinking" if _ends_sentence_final(prev_text) else "interruption"
110
+
111
+
112
def _utterance_before(utterances: list[dict[str, Any]], at_ms: int) -> dict[str, Any] | None:
    """Return the latest-ending utterance that ends at or before
    ``at_ms + _ABUT_TOLERANCE_MS``, or None when there is none.

    When several utterances share the latest end time, the first one in list
    order is returned.
    """
    cutoff = at_ms + _ABUT_TOLERANCE_MS
    eligible = [utt for utt in utterances if utt["end_ms"] <= cutoff]
    return max(eligible, key=lambda utt: utt["end_ms"], default=None)
120
+
121
+
122
def _utterance_after(utterances: list[dict[str, Any]], at_ms: int) -> dict[str, Any] | None:
    """Return the earliest-starting utterance that starts at or after
    ``at_ms - _ABUT_TOLERANCE_MS``, or None when there is none.

    When several utterances share the earliest start time, the first one in
    list order is returned.
    """
    cutoff = at_ms - _ABUT_TOLERANCE_MS
    eligible = [utt for utt in utterances if utt["start_ms"] >= cutoff]
    return min(eligible, key=lambda utt: utt["start_ms"], default=None)
130
+
131
+
132
def type_all_silences(transcript: dict[str, Any]) -> dict[str, Any]:
    """Set the ``context`` type on every entry of ``transcript["silences"]``.

    The transcript is modified in place and also returned. Each silence is
    typed from the utterance found before its start and the one found after its
    end, so re-running simply overwrites ``context`` with the same result.
    Cost is O(silences × utterances).
    """
    all_utterances = transcript.get("utterances", [])
    all_cues = transcript.get("cues", [])
    roster = transcript.get("speakers", [])
    for entry in transcript.get("silences", []):
        before = _utterance_before(all_utterances, entry["start_ms"])
        after = _utterance_after(all_utterances, entry["end_ms"])
        entry["context"] = type_silence(entry, before, after, roster, all_cues)
    return transcript
@@ -0,0 +1,82 @@
1
+ """Native (host) transcription entrypoint (#176).
2
+
3
+ Runs the full pipeline ON THE HOST (no Docker) so Apple-Silicon ASR backends
4
+ (`parakeet-mlx` / Metal) and pyannote use the GPU/CPU directly — the Docker
5
+ container is CPU-only on macOS, which is the bottleneck this path removes. The
6
+ Node CLI shells out to this when `transcriber.runtime = native`; the Docker
7
+ `/transcribe` route stays the cross-platform fallback and shares the exact same
8
+ `run_pipeline` orchestration.
9
+
10
+ python -m app.transcribe_cli \
11
+ --seed-path <seed> --session-id S001 \
12
+ --source-path <seed>/sessions/S001/source.mp3 \
13
+ --engine parakeet --language en
14
+
15
+ Prints one JSON line mirroring the /transcribe response shape so the Node
16
+ caller parses both paths identically.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import json
23
+
24
+ from .asr import ASRConfig
25
+ from .pipeline import PipelineBackends, PipelineConfig, run_pipeline, write_transcript
26
+
27
+ _DEFAULT_MODEL = {
28
+ "parakeet": "mlx-community/parakeet-tdt-0.6b-v3",
29
+ "whisper": "large-v3-turbo",
30
+ }
31
+
32
+
33
def main(argv: list[str] | None = None) -> int:
    """Run the native transcription pipeline and print one JSON result line.

    Args:
        argv: command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 after the transcript has been written and a success line printed;
        1 when running the pipeline or writing the transcript raised, in which
        case a ``{"status": "failed", "error": ...}`` line is printed instead.
    """
    p = argparse.ArgumentParser(prog="compost-transcribe-native")
    p.add_argument("--seed-path", required=True)
    p.add_argument("--session-id", required=True)
    p.add_argument("--source-path", required=True)
    p.add_argument("--engine", default="parakeet", choices=["parakeet", "whisper"])
    p.add_argument("--model", default=None, help="ASR model id (engine default if omitted)")
    p.add_argument("--language", default=None)
    p.add_argument("--device", default="auto")
    p.add_argument("--compute-type", default="int8")
    args = p.parse_args(argv)

    asr = ASRConfig(
        model_name=args.model or _DEFAULT_MODEL[args.engine],
        device=args.device,
        compute_type=args.compute_type,
        language=args.language,
        engine=args.engine,
    )
    config = PipelineConfig(asr=asr, asr_model_tag=f"{asr.model_name} ({args.engine})")

    try:
        transcript = run_pipeline(
            seed_path=args.seed_path,
            session_id=args.session_id,
            source_path=args.source_path,
            config=config,
            backends=PipelineBackends(),  # all None → real lazy backends (Silero / engine ASR / pyannote)
        )
        # Writing stays inside the try so a failed write is also reported as
        # JSON rather than as a bare traceback.
        path = write_transcript(args.seed_path, args.session_id, transcript)
    except Exception as e:  # surface as JSON so the Node caller can report it
        # Some exceptions stringify to "", so fall back to the class name.
        print(json.dumps({"status": "failed", "error": str(e) or type(e).__name__}))
        return 1

    print(
        json.dumps(
            {
                "session_id": args.session_id,
                "transcript_path": path,
                "status": transcript.get("status", "ok"),
                "engine": args.engine,
                "model": asr.model_name,
            },
            # Stringify anything json cannot encode natively (e.g. a Path).
            default=str,
        )
    )
    return 0
79
+
80
+
81
+ if __name__ == "__main__":
82
+ raise SystemExit(main())
@@ -0,0 +1,216 @@
1
+ """Silero VAD integration + silence segmentation (#9).
2
+
3
+ Two outputs (ROADMAP § Descriptive transcription A):
4
+ (a) speech-segment boundaries → fed to ASR
5
+ (b) silence boundaries → fed to the silence typer (#12)
6
+
7
+ Silero v5 is loaded once per process (cold-start cached). The model call is
8
+ lazily imported so this module — and the silence-segmentation maths, which is
9
+ pure — works without torch installed. Install the `asr` extra for real VAD.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass
15
+ from functools import lru_cache
16
+ from typing import Any, Protocol
17
+
18
+ # Silences shorter than this are NOT first-class; they remain gaps only.
19
+ MIN_FIRST_CLASS_SILENCE_MS = 1500
20
+
21
+
22
@dataclass(frozen=True)
class Segment:
    """Immutable time span in milliseconds, optionally carrying an energy reading."""

    start_ms: int
    end_ms: int
    # Mean RMS energy of the segment's waveform (raw amplitude, roughly 0..1;
    # speech peaks well below 1.0). Left as None when the backend reports no
    # energy, in which case the prosody stage keeps volume at "normal" instead
    # of guessing.
    energy: float | None = None

    @property
    def duration_ms(self) -> int:
        """Length of the span: ``end_ms`` minus ``start_ms``."""
        span = self.end_ms - self.start_ms
        return span
34
+
35
+
36
class VADBackend(Protocol):
    """Structural interface that any voice-activity-detection backend must satisfy."""

    def speech_timestamps(self, audio_path: str) -> list[dict[str, Any]]:
        """Detect speech in the audio file at ``audio_path``.

        Each returned dict has ``"start_ms"`` and ``"end_ms"`` keys and may
        also carry a float ``"energy"`` (mean RMS over the segment).
        """
        ...
41
+
42
+
43
+ SILERO_SAMPLE_RATE = 16000
44
+
45
+
46
class SileroBackend:  # pragma: no cover - needs torch + weights
    """VADBackend implementation backed by the ``silero_vad`` package.

    The model is loaded in the constructor. Audio is read as a 16 kHz waveform
    through the package's ``read_audio`` helper, and detected speech segments
    are reported in milliseconds together with their RMS energy.
    """

    def __init__(self) -> None:
        try:
            from silero_vad import (  # type: ignore
                get_speech_timestamps,
                load_silero_vad,
                read_audio,
            )
        except ImportError as missing:
            raise RuntimeError(
                "silero-vad is not installed. Install the asr extra: pip install -e '.[asr]'"
            ) from missing

        self._model = load_silero_vad()
        self._read_audio = read_audio
        self._get_speech_timestamps = get_speech_timestamps

    def speech_timestamps(self, audio_path: str) -> list[dict[str, Any]]:
        """Return ``{"start_ms", "end_ms", "energy"}`` dicts for each speech segment."""
        waveform = self._read_audio(audio_path, sampling_rate=SILERO_SAMPLE_RATE)
        spans = self._get_speech_timestamps(
            waveform,
            self._model,
            sampling_rate=SILERO_SAMPLE_RATE,
            return_seconds=False,
        )
        # Spans are {start, end} in samples (return_seconds=False); this factor
        # converts a sample index to milliseconds.
        to_ms = 1000 / SILERO_SAMPLE_RATE

        def _rms(first_sample: int, last_sample: int) -> float:
            # RMS = sqrt(mean(x^2)) over the span's slice; an empty slice is 0.0.
            chunk = waveform[first_sample:last_sample]
            return float(chunk.pow(2).mean().sqrt()) if chunk.numel() else 0.0

        return [
            {
                "start_ms": int(span["start"] * to_ms),
                "end_ms": int(span["end"] * to_ms),
                "energy": _rms(int(span["start"]), int(span["end"])),
            }
            for span in spans
        ]
95
+
96
+
97
@lru_cache(maxsize=1)
def _load_silero() -> VADBackend:  # pragma: no cover - needs torch + weights
    """Build the Silero backend, cached so repeated calls reuse one instance.

    Raises RuntimeError with an install hint when ``torch`` cannot be imported.
    """
    try:
        import torch  # type: ignore # noqa: F401
    except ImportError as missing:
        hint = "torch/silero not installed. Install the asr extra: pip install -e '.[asr]'"
        raise RuntimeError(hint) from missing
    return SileroBackend()
106
+
107
+
108
def speech_to_silences(
    speech: list[Segment],
    total_duration_ms: int,
    min_silence_ms: int = MIN_FIRST_CLASS_SILENCE_MS,
) -> list[Segment]:
    """Compute the first-class silences implied by a set of speech segments.

    Pure function. Looks at the gap before the first speech segment, the gaps
    between consecutive segments, and the gap after the last one, keeping only
    gaps of at least ``min_silence_ms``. The input need not be sorted (it is
    ordered by start here) and overlapping speech segments are tolerated.
    """
    found: list[Segment] = []
    covered_until = 0
    for seg in sorted(speech, key=lambda s: s.start_ms):
        gap = seg.start_ms - covered_until
        if gap >= min_silence_ms:
            found.append(Segment(covered_until, seg.start_ms))
        # Never move backwards: an overlapping segment may end before the
        # point that earlier speech already covered.
        if seg.end_ms > covered_until:
            covered_until = seg.end_ms
    trailing = total_duration_ms - covered_until
    if trailing >= min_silence_ms:
        found.append(Segment(covered_until, total_duration_ms))
    return found
129
+
130
+
131
def silences_to_schema(silences: list[Segment]) -> list[dict[str, Any]]:
    """Convert silence segments into transcript.json ``silences[]`` entries.

    Ids are ``SIL-001``, ``SIL-002``, ... in input order. ``context`` is only a
    placeholder here; the silence typer (#12) assigns the real value later.
    """
    return [
        {
            "id": f"SIL-{number:03d}",
            "start_ms": gap.start_ms,
            "end_ms": gap.end_ms,
            "duration_ms": gap.duration_ms,
            "context": "thinking",  # placeholder until the typer runs
        }
        for number, gap in enumerate(silences, start=1)
    ]
146
+
147
+
148
def utterance_energies(
    speech: list[Segment],
    utterances: list[dict[str, Any]],
) -> dict[str, float]:
    """Return utterance id → VAD RMS energy, scaled so the loudest utterance is 1.0.

    Pure function. The result is meant for
    ``prosody.annotate_prosody(transcript, energies=...)``, which buckets volume
    as low|normal|high; without it every utterance is reported as "normal".

    An utterance's raw energy is the mean RMS of the speech segments it
    overlaps, weighted by overlap duration; segments whose energy is None are
    skipped. Raw speech RMS sits far below 1.0, so prosody's fixed 0.33/0.66
    split would label everything "low" — dividing by the session's loudest
    utterance makes that split meaningful and reproducible. Utterances without
    an id, or with no overlapping energy-bearing segment, are left out so the
    caller reports "normal" rather than guessing. An empty dict is returned
    when no utterance has a positive energy.

    TODO(#13): normalization is per-session (global max) and prosody's
    VOLUME_LOW/HIGH are global constants, so a soft speaker's loudest moment
    still reads quieter than a loud speaker's baseline. Per-speaker
    normalization (group by ``utterance["speaker_id"]`` and normalize within
    each speaker) would make the buckets speaker-relative. Out of scope here.
    """
    per_utterance: dict[str, float] = {}
    for utt in utterances:
        uid = utt.get("id")
        if uid is None:
            continue
        lo = utt.get("start_ms", 0)
        hi = utt.get("end_ms", 0)
        energy_time = 0.0
        covered_ms = 0
        for seg in speech:
            if seg.energy is None:
                continue
            shared = min(hi, seg.end_ms) - max(lo, seg.start_ms)
            if shared > 0:
                energy_time += seg.energy * shared
                covered_ms += shared
        if covered_ms > 0:
            per_utterance[uid] = energy_time / covered_ms

    if not per_utterance:
        return {}
    loudest = max(per_utterance.values())
    if loudest <= 0:
        return {}
    return {uid: level / loudest for uid, level in per_utterance.items()}
195
+
196
+
197
class VAD:
    """Front end that turns a VAD backend's raw output into ``Segment`` lists."""

    def __init__(self, backend: VADBackend | None = None):
        # None defers to the shared Silero backend, resolved on first use.
        self._backend = backend

    def _get_backend(self) -> VADBackend:
        """Return the injected backend, or the cached Silero one when none was given."""
        if self._backend is None:
            return _load_silero()
        return self._backend

    def segment(self, audio_path: str, total_duration_ms: int) -> tuple[list[Segment], list[Segment]]:
        """Return (speech_segments, first_class_silences)."""
        speech: list[Segment] = []
        for entry in self._get_backend().speech_timestamps(audio_path):
            level = entry.get("energy")
            speech.append(
                Segment(
                    int(entry["start_ms"]),
                    int(entry["end_ms"]),
                    None if level is None else float(level),
                )
            )
        return speech, speech_to_silences(speech, total_duration_ms)
@@ -0,0 +1,56 @@
1
+ [project]
2
+ name = "compost-transcriber"
3
+ version = "0.1.2"
4
+ description = "Compost descriptive transcriber: WhisperX + pyannote + Silero VAD + Whisper-event-tags, plus frame extraction and legacy ingest."
5
+ requires-python = ">=3.11,<3.13"
6
+ license = { text = "MIT" }
7
+ authors = [{ name = "Juan Reina" }]
8
+
9
+ dependencies = [
10
+ "fastapi>=0.115.0",
11
+ "uvicorn[standard]>=0.30.0",
12
+ "pydantic>=2.9.0",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ # Pinned in their own M1 issues (#9-#15) — kept out of the base install so
17
+ # the skeleton boots quickly without pulling multi-GB ML wheels.
18
+ asr = [
19
+ "whisperx",
20
+ "pyannote.audio>=3.3",
21
+ "silero-vad",
22
+ # WhisperX pulls these in transitively; the explicit minimum versions let the
23
+ # lock file record the M1-Mac-friendly compute path.
24
+ "torch>=2.3",
25
+ "torchaudio>=2.3",
26
+ "ffmpeg-python",
27
+ ]
28
+ # Native Apple-Silicon (Metal) path (#176/#183): Parakeet ASR + pyannote-on-MPS.
29
+ # No whisperx/ctranslate2 — the default native engine is Parakeet (parakeet-mlx).
30
+ # Provisioned by `compost setup --provision-native`.
31
+ native = [
32
+ "parakeet-mlx",
33
+ "pyannote.audio>=3.3",
34
+ "silero-vad",
35
+ "torchaudio>=2.3",
36
+ "ffmpeg-python",
37
+ ]
38
+ frames = [
39
+ "imagehash",
40
+ "Pillow",
41
+ ]
42
+ legacy = [
43
+ "pdfminer.six",
44
+ "pdfplumber",
45
+ "python-docx",
46
+ "python-pptx",
47
+ "openpyxl",
48
+ ]
49
+
50
+ [tool.ruff]
51
+ target-version = "py311"
52
+ line-length = 100
53
+
54
+ [tool.ruff.lint]
55
+ select = ["E", "F", "I", "W", "UP", "B", "SIM"]
56
+ ignore = ["E501"]