clipwright-silence 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ Metadata-Version: 2.3
2
+ Name: clipwright-silence
3
+ Version: 0.1.0
4
+ Summary: MCP tool for silence detection and OTIO timeline generation. Detects silence with ffmpeg silencedetect and generates timeline.otio with clip sequence for regions to keep.
5
+ Author: satoh-y-0323
6
+ Author-email: satoh-y-0323 <shoma.papa.0323@gmail.com>
7
+ License: MIT
8
+ Requires-Dist: clipwright>=0.1.0
9
+ Requires-Dist: mcp[cli]>=1.27.2
10
+ Requires-Dist: opentimelineio>=0.18
11
+ Requires-Dist: pydantic>=2
12
+ Requires-Dist: silero-vad>=5.1 ; extra == 'vad'
13
+ Requires-Dist: onnxruntime>=1.17 ; extra == 'vad'
14
+ Requires-Dist: numpy>=1.24 ; extra == 'vad'
15
+ Requires-Python: >=3.11
16
+ Provides-Extra: vad
17
+ Description-Content-Type: text/markdown
18
+
19
+ # clipwright-silence
20
+
21
+ MCP tool for silence detection and OTIO timeline generation.
@@ -0,0 +1,3 @@
1
+ # clipwright-silence
2
+
3
+ MCP tool for silence detection and OTIO timeline generation.
@@ -0,0 +1,102 @@
1
+ [project]
2
+ name = "clipwright-silence"
3
+ version = "0.1.0"
4
+ description = "MCP tool for silence detection and OTIO timeline generation. Detects silence with ffmpeg silencedetect and generates timeline.otio with clip sequence for regions to keep."
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ authors = [
8
+ { name = "satoh-y-0323", email = "shoma.papa.0323@gmail.com" }
9
+ ]
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "clipwright>=0.1.0",
13
+ "mcp[cli]>=1.27.2",
14
+ "opentimelineio>=0.18",
15
+ "pydantic>=2",
16
+ ]
17
+
18
+ [project.optional-dependencies]
19
+ vad = [
20
+ "silero-vad>=5.1",
21
+ "onnxruntime>=1.17",
22
+ # numpy is a transitive dependency of onnxruntime; it is listed explicitly to enforce a minimum supported version
23
+ "numpy>=1.24",
24
+ ]
25
+
26
+ [project.scripts]
27
+ clipwright-silence = "clipwright_silence.server:main"
28
+ clipwright-silence-vad = "clipwright_silence.vad_cli:main"
29
+
30
+ [build-system]
31
+ requires = ["uv_build>=0.11.19,<0.12.0"]
32
+ build-backend = "uv_build"
33
+
34
+ [dependency-groups]
35
+ dev = [
36
+ "mypy>=2.1.0",
37
+ "pytest>=9.0.3",
38
+ "pytest-cov>=7.1.0",
39
+ "pytest-mock>=3.15.1",
40
+ "ruff>=0.15.16",
41
+ ]
42
+
43
+ # Resolve clipwright (core) within workspace by path reference
44
+ [tool.uv.sources]
45
+ clipwright = { workspace = true }
46
+
47
+ # --- Ruff ---
48
+ [tool.ruff]
49
+ target-version = "py311"
50
+ line-length = 88
51
+
52
+ [tool.ruff.lint]
53
+ select = ["E", "F", "W", "I", "UP", "B", "C4", "SIM"]
54
+ ignore = []
55
+
56
+ [tool.ruff.lint.per-file-ignores]
57
+ # Allow E501 for English docstrings/comments in test files
58
+ "tests/*.py" = ["E501"]
59
+
60
+ [tool.ruff.format]
61
+ # Default ruff formatter is OK
62
+
63
+ # --- mypy ---
64
+ [tool.mypy]
65
+ python_version = "3.11"
66
+ strict = true
67
+ warn_return_any = true
68
+ warn_unused_configs = true
69
+ disallow_untyped_defs = true
70
+ disallow_any_generics = true
71
+
72
+ # opentimelineio has no stubs, ignored with mypy strict
73
+ [[tool.mypy.overrides]]
74
+ module = "opentimelineio.*"
75
+ ignore_missing_imports = true
76
+
77
+ # silero-vad / onnxruntime have no stubs, ignored as VAD is optional extra
78
+ [[tool.mypy.overrides]]
79
+ module = "silero_vad.*"
80
+ ignore_missing_imports = true
81
+
82
+ [[tool.mypy.overrides]]
83
+ module = "onnxruntime.*"
84
+ ignore_missing_imports = true
85
+
86
+ # --- pytest ---
87
+ [tool.pytest.ini_options]
88
+ testpaths = ["tests"]
89
+ addopts = "--strict-markers -q"
90
+ markers = [
91
+ "integration: integration test requiring actual ffmpeg/ffprobe binaries",
92
+ "slow: test with long execution time",
93
+ ]
94
+
95
+ # --- coverage ---
96
+ [tool.coverage.run]
97
+ source = ["clipwright_silence"]
98
+ omit = ["tests/*"]
99
+
100
+ [tool.coverage.report]
101
+ show_missing = true
102
+ skip_covered = false
@@ -0,0 +1,3 @@
1
+ """clipwright-silence: silence detection → OTIO timeline generation MCP tool."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,523 @@
1
+ """detect.py — clipwright-silence orchestration layer.
2
+
3
+ Handles the full flow: input validation -> inspect_media -> silencedetect
4
+ execution/parsing -> KEEP derivation -> OTIO construction/save -> envelope return.
5
+
6
+ Design decisions:
7
+ - _detect_silence_intervals() encapsulates ffmpeg execution and stderr parsing,
8
+ allowing future backend replacement (adapter abstraction, AD-1).
9
+ - _detect_vad_silence_intervals() handles spawning the VAD CLI as a separate
10
+ process and inverting speech -> silence. Both return (silence interval list)
11
+ with a common contract so derive_keep_ranges onward uses a shared flow.
12
+ - source_range rate is taken from inspect_media MediaInfo.duration.rate,
13
+ and value = seconds * rate (DC-AS-003).
14
+ - output is only permitted in the same directory as media (DC-AS-001).
15
+ - Error messages do not expose full paths or raw ffmpeg stderr (basename only, M-1).
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import json
21
+ import math
22
+ import re
23
+ import sys
24
+ from pathlib import Path
25
+ from typing import Any
26
+
27
+ from clipwright.envelope import error_result, ok_result
28
+ from clipwright.errors import ClipwrightError, ErrorCode
29
+ from clipwright.media import inspect_media
30
+ from clipwright.otio_utils import add_clip, new_timeline, save_timeline
31
+ from clipwright.process import resolve_tool, run
32
+ from clipwright.schemas import MediaRef, RationalTimeModel, TimeRangeModel
33
+
34
+ import clipwright_silence
35
+ from clipwright_silence.plan import derive_keep_ranges
36
+ from clipwright_silence.schemas import DetectSilenceOptions
37
+
38
# Regexes that pull the timestamp out of silencedetect's silence_start /
# silence_end log lines (DC-AM-003: '.' is always the decimal separator).
# The marker is searched for anywhere in the line (not anchored at the line
# start) because ffmpeg prefixes its log lines, e.g. "[silencedetect @ 0x...]".
# The number pattern accepts an optional leading '-' and an optional exponent:
# ffmpeg can report a slightly negative silence_start when the input opens
# with silence, and may print very small/large timestamps in scientific
# notation. Without this, such a start would be missed (or truncated) and the
# matching silence_end would be discarded as an orphan.
_RE_SILENCE_START = re.compile(
    r"silence_start:\s*(-?[0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?)"
)
_RE_SILENCE_END = re.compile(
    r"silence_end:\s*(-?[0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?)"
)
42
+
43
+
44
def _fmt_sec(sec: float) -> str:
    """Convert seconds to a human-readable minutes/seconds string (for summary).

    The value is rounded to one decimal place *before* it is split into
    minutes and seconds, so a remainder that would otherwise be displayed as
    "60.0s" is carried into the minutes (119.96 -> "2m0.0s", not "1m60.0s").

    Format examples: 90.0 -> "1m30.0s", 45.5 -> "45.5s"

    Args:
        sec: Duration in seconds.

    Returns:
        "<m>m<s.s>s" when at least one full minute is present, otherwise
        "<s.s>s".
    """
    rounded = round(sec, 1)
    minutes = int(rounded) // 60
    seconds = rounded - minutes * 60
    return f"{minutes}m{seconds:.1f}s" if minutes > 0 else f"{seconds:.1f}s"
52
+
53
+
54
def _parse_silence_intervals(
    stderr: str,
    total_duration_sec: float,
) -> list[tuple[float, float]]:
    """Extract the list of silence intervals from silencedetect stderr.

    The text is scanned line by line. Each line is searched (anywhere in the
    line, not only at its start) for a silence_start or silence_end marker
    using the module-level regexes, which expect '.' as the decimal separator
    (DC-AM-003). A trailing silence_start without a matching silence_end is
    completed with total_duration_sec (DC-AM-002).

    Args:
        stderr: ffmpeg standard error output string.
        total_duration_sec: Total duration of the source (seconds). Used to
            close a trailing, unterminated silence.

    Returns:
        List of silence intervals in the order they were reported. Each
        element is a (start_sec, end_sec) tuple with start_sec <= end_sec.
    """
    intervals: list[tuple[float, float]] = []
    pending_start: float | None = None

    for line in stderr.splitlines():
        m_start = _RE_SILENCE_START.search(line)
        if m_start:
            # A newer start replaces any start that never received its end.
            pending_start = float(m_start.group(1))
            continue

        m_end = _RE_SILENCE_END.search(line)
        # An isolated silence_end does not occur in normal silencedetect output
        # (start -> end are always paired). It is deliberately ignored rather
        # than treated as an error.
        if not m_end or pending_start is None:
            continue

        end = float(m_end.group(1))
        # SR L-3: drop abnormal intervals where end < start
        # (defensive check for future backend replacement).
        if end >= pending_start:
            intervals.append((pending_start, end))
        pending_start = None

    # Trailing silence_end missing -> complete with total_duration (DC-AM-002).
    # The same end >= start rule as above applies: a start located past the
    # probed duration would otherwise yield an inverted interval.
    if pending_start is not None and pending_start <= total_duration_sec:
        intervals.append((pending_start, total_duration_sec))

    return intervals
100
+
101
+
102
def _detect_silence_intervals(
    ffmpeg: str,
    source: str,
    options: DetectSilenceOptions,
    total_duration_sec: float,
) -> list[tuple[float, float]]:
    """Run the ffmpeg silencedetect filter and return the silence intervals.

    This is the adapter seam for the detection backend (AD-1): swapping the
    backend later only requires replacing this function.

    Args:
        ffmpeg: Path to the ffmpeg executable.
        source: Input media file path.
        options: DetectSilenceOptions; silence_threshold_db and
            min_silence_duration are forwarded to the filter.
        total_duration_sec: Total duration of the source (seconds). Drives the
            subprocess timeout and the completion of a trailing silence.

    Returns:
        List of silence intervals. Each element is (start_sec, end_sec).

    Raises:
        ClipwrightError: SUBPROCESS_FAILED / SUBPROCESS_TIMEOUT (raised by run).
    """
    # Fixed-precision formatting keeps the filter string locale-independent
    # (DC-AM-003).
    noise_arg = f"noise={options.silence_threshold_db:.3f}dB"
    duration_arg = f"d={options.min_silence_duration:.6f}"
    audio_filter = f"silencedetect={noise_arg}:{duration_arg}"

    # Allow twice the media duration, but never less than one minute.
    timeout_sec = float(max(60, math.ceil(total_duration_sec * 2)))

    command = [ffmpeg, "-hide_banner", "-nostats", "-i", source]
    command += ["-af", audio_filter]
    command += ["-f", "null", "-"]

    completed = run(command, timeout=timeout_sec)
    # The silence markers are read from the process's stderr.
    return _parse_silence_intervals(completed.stderr, total_duration_sec)
148
+
149
+
150
def _detect_vad_silence_intervals(
    source: str,
    options: DetectSilenceOptions,
    total_duration_sec: float,
) -> tuple[list[tuple[float, float]], int]:
    """Spawn VAD CLI as a separate process; return silence intervals and speech_count.

    VAD-AD-02/04: Inverts the speech intervals returned by the VAD CLI against
    total_duration_sec to produce silence intervals. speech_count is used only
    for VAD summary generation and is not passed to the common flow (§7.5).

    The CLI's stdout is treated as untrusted: anything that is not a JSON
    object of the expected shape is reported as a ClipwrightError instead of
    leaking AttributeError / TypeError to the caller.

    Args:
        source: Absolute path to the input media file.
        options: DetectSilenceOptions (references vad_* fields).
        total_duration_sec: Total duration of source (seconds). Used for the
            timeout, for clipping segments and for the inversion.

    Returns:
        Tuple of (silence interval list, speech_count). speech_count is the
        number of speech segments that survived clipping/validation.

    Raises:
        ClipwrightError: Maps VAD CLI error JSON to the corresponding ErrorCode
            (SUBPROCESS_FAILED for unknown codes). SUBPROCESS_FAILED is also
            raised when stdout is not valid JSON of the expected shape, and by
            run() when the process exits non-zero.
    """

    def _invalid_output() -> ClipwrightError:
        # Single definition of the "unusable stdout" error.
        return ClipwrightError(
            code=ErrorCode.SUBPROCESS_FAILED,
            message="VAD CLI returned invalid JSON output",
            hint=(
                "VAD CLI did not return the expected JSON. "
                "If the error persists, please report with reproduction steps."
            ),
        )

    timeout = float(max(60, math.ceil(total_duration_sec * 4)))
    cmd = [
        sys.executable,
        "-m",
        "clipwright_silence.vad_cli",
        "--media",
        source,
        "--threshold",
        f"{options.vad_threshold}",
        "--min-speech",
        f"{options.vad_min_speech_duration}",
        "--min-silence",
        f"{options.vad_min_silence_duration}",
        "--media-duration",
        # vad_cli uses ceil(total*2); float precision has no practical impact (NF-L-1)
        f"{total_duration_sec}",
    ]
    result = run(cmd, timeout=timeout)

    try:
        payload: Any = json.loads(result.stdout)
    except (json.JSONDecodeError, TypeError) as exc:
        raise _invalid_output() from exc
    # Valid JSON that is not an object (list, number, null, ...) is unusable.
    if not isinstance(payload, dict):
        raise _invalid_output()

    # If error JSON, map to ErrorCode and raise ClipwrightError (§7.1)
    if "error" in payload:
        err = payload["error"]
        if not isinstance(err, dict):
            # Malformed error body: fall back to the generic defaults below.
            err = {}
        raw_code: Any = err.get("code", "INTERNAL")
        message: str = err.get("message", "An error occurred in VAD CLI")
        hint: str = err.get("hint", "Please report with reproduction steps.")

        # Map to known ErrorCode; fall back to SUBPROCESS_FAILED for unknown codes
        try:
            error_code = ErrorCode(raw_code)
        except ValueError:
            error_code = ErrorCode.SUBPROCESS_FAILED

        raise ClipwrightError(code=error_code, message=message, hint=hint)

    # Pre-process speech intervals (§7.4): clip and remove degenerate intervals.
    # A missing key means "no speech"; a present but non-list value is malformed.
    raw_segments: Any = payload.get("speech_segments", [])
    if not isinstance(raw_segments, list):
        raise _invalid_output()

    total = total_duration_sec
    speech_segments: list[tuple[float, float]] = []
    for seg in raw_segments:
        try:
            # Accept both dict {"start": ..., "end": ...} and list [start, end] formats
            if isinstance(seg, (list, tuple)):
                start, end = float(seg[0]), float(seg[1])
            else:
                start, end = float(seg["start"]), float(seg["end"])
        except (TypeError, KeyError, ValueError, IndexError):
            # Skip malformed elements (null/string/empty dict, etc.) — SR L-3
            continue
        # NaN compares false with everything, so the max()/min() clamps below
        # would turn a NaN bound into 0.0 / total and mark the whole media as
        # speech. Treat such elements as malformed.
        if math.isnan(start) or math.isnan(end):
            continue
        # Clip start < 0 to 0, clip end > total to total
        start = max(0.0, start)
        end = min(total, end)
        # Remove degenerate intervals (start >= end)
        if start >= end:
            continue
        speech_segments.append((start, end))

    speech_count = len(speech_segments)

    # Invert speech intervals -> silence intervals (VAD-AD-04)
    # Sort speech intervals ascending and take the complement of [0, total]
    silence_intervals: list[tuple[float, float]] = []
    cursor = 0.0
    for s_start, s_end in sorted(speech_segments, key=lambda iv: iv[0]):
        if s_start > cursor:
            silence_intervals.append((cursor, s_start))
        # max() keeps the cursor monotonic when speech segments overlap.
        cursor = max(cursor, s_end)
    if cursor < total:
        silence_intervals.append((cursor, total))

    return silence_intervals, speech_count
256
+
257
+
258
def detect_silence(
    media: str,
    output: str,
    options: DetectSilenceOptions,
) -> dict[str, Any]:
    """Detect silence and write an OTIO timeline of the ranges to keep (AD-2/AD-5).

    Public entry point. All work is delegated to _detect_inner; this wrapper
    only converts a ClipwrightError raised there into an error envelope. Other
    exception types are not intercepted here.

    The input media file is never modified. On success the envelope's
    artifacts entry carries the path of the written timeline.

    Args:
        media: Input media file path.
        output: Output .otio timeline path (must be in the same directory as
            media).
        options: DetectSilenceOptions.

    Returns:
        Envelope dict: the result of _detect_inner on success, or the
        error_result built from the ClipwrightError's code, message and hint.
    """
    try:
        envelope = _detect_inner(media, output, options)
    except ClipwrightError as err:
        envelope = error_result(err.code, err.message, err.hint)
    return envelope
288
+
289
+
290
def _detect_inner(
    media: str,
    output: str,
    options: DetectSilenceOptions,
) -> dict[str, Any]:
    """Internal implementation of detect_silence. Raises ClipwrightError directly.

    Flow:
        1. Validate the output path (extension, parent directory, not the
           media file itself).
        2. inspect_media -> same-directory check, stream and duration checks.
        3. Run the detection backend selected by options.backend.
        4. Derive KEEP ranges with derive_keep_ranges.
        5. Build the OTIO timeline (one "keep" clip per range) and save it.
        6. Build the ok_result envelope (summary, counts, artifact, warnings).

    Args:
        media: Input media file path.
        output: Output .otio timeline path.
        options: DetectSilenceOptions.

    Returns:
        Envelope dict built by ok_result.

    Raises:
        ClipwrightError: INVALID_INPUT for output-path problems,
            FILE_NOT_FOUND / PROBE_FAILED from probing, UNSUPPORTED_OPERATION
            when a video or audio stream is missing, plus whatever the
            detection backend raises.
    """
    output_path = Path(output)
    media_path = Path(media)

    # --- 1. Output validation ---

    # Extension must be .otio (AD-5)
    if output_path.suffix.lower() != ".otio":
        raise ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message=(
                f"Invalid output file extension: {output_path.suffix!r}. "
                "Only .otio is allowed."
            ),
            hint="Change the output file path extension to .otio.",
        )

    # Verify parent directory exists (no auto-creation, AD-5).
    # is_dir() rather than exists(): a parent that is a regular file cannot
    # hold the timeline either and must be rejected here, not at save time.
    if not output_path.parent.is_dir():
        raise ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message=(
                "The output directory does not exist. "
                "Check the parent directory of the specified output path."
            ),
            hint="Create the output directory first, then re-run.",
        )

    # Prevent output == media (avoid overwriting the same path).
    # If resolve() fails, fall back to comparing the paths as given.
    resolve_error: OSError | None = None
    try:
        same_path = output_path.resolve() == media_path.resolve()
    except OSError as exc:
        same_path = str(output_path) == str(media_path)
        resolve_error = exc
    if same_path:
        identical = ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message="The output path and input media path are identical.",
            hint=(
                "Change the output file path to a path"
                " different from the input media."
            ),
        )
        if resolve_error is not None:
            raise identical from resolve_error
        raise identical

    # --- 2. inspect_media -> verify streams and duration ---

    # inspect_media raises FILE_NOT_FOUND (incl. symlink rejection) / PROBE_FAILED, etc.
    # SR L-2: Replace FILE_NOT_FOUND message with basename only
    # (prevents full path exposure; same policy as render._probe M-1).
    try:
        media_info = inspect_media(media)
    except ClipwrightError as exc:
        if exc.code == ErrorCode.FILE_NOT_FOUND:
            raise ClipwrightError(
                code=ErrorCode.FILE_NOT_FOUND,
                message=f"File not found: {media_path.name}",
                hint=exc.hint,
            ) from exc
        raise

    # Verify output is in the same directory as media (DC-AS-001).
    # Done after inspect_media so the path has been confirmed to exist before
    # resolve(). A resolve failure (network paths, etc.) skips the check on a
    # best-effort basis; only the resolution itself is guarded by the try.
    resolved_dirs: tuple[Path, Path] | None
    try:
        resolved_dirs = (media_path.resolve().parent, output_path.parent.resolve())
    except OSError:
        resolved_dirs = None
    if resolved_dirs is not None and resolved_dirs[0] != resolved_dirs[1]:
        raise ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message=(
                "The output timeline must be placed in the same"
                f" directory as the input media (input: {media_path.name})."
            ),
            hint=(
                "Change the output path to be in the same directory"
                " as the media file."
                " (e.g., output = same directory as media / timeline.otio)"
            ),
        )

    # Verify video / audio streams (DC-AS-002)
    has_video = any(s.codec_type == "video" for s in media_info.streams)
    has_audio = any(s.codec_type == "audio" for s in media_info.streams)

    if not has_video:
        raise ClipwrightError(
            code=ErrorCode.UNSUPPORTED_OPERATION,
            message=f"No video stream found: {media_path.name}",
            hint=(
                "This tool targets media with both video and audio streams. "
                "Specify a media file that contains video."
            ),
        )

    if not has_audio:
        raise ClipwrightError(
            code=ErrorCode.UNSUPPORTED_OPERATION,
            message=f"No audio stream found: {media_path.name}",
            hint=(
                "Silence detection requires an audio stream. "
                "Specify a media file that contains audio."
            ),
        )

    # Verify duration (DC-AS-004)
    if media_info.duration is None:
        raise ClipwrightError(
            code=ErrorCode.PROBE_FAILED,
            message=f"Could not retrieve media duration: {media_path.name}",
            hint=(
                "Check that the media file is not corrupted. "
                "You can also verify manually with ffprobe."
            ),
        )

    # Clip times below are expressed at the media's own rate (DC-AS-003).
    rate = media_info.duration.rate
    total_duration_sec = media_info.duration.value / rate

    # --- 3. Run detection (branch by backend) ---

    abs_media = str(media_path.resolve())

    # speech_count is for VAD summary only; silence interval list is shared
    speech_count: int | None = None

    if options.backend == "vad":
        # VAD path: spawn as separate process via sys.executable -m
        # resolve_tool is not used (sys.executable -m ensures same venv, VAD-AD-02)
        silence_intervals, speech_count = _detect_vad_silence_intervals(
            abs_media, options, total_duration_sec
        )
    else:
        # silencedetect path (existing; backend="silencedetect")
        ffmpeg = resolve_tool("ffmpeg", "CLIPWRIGHT_FFMPEG")
        silence_intervals = _detect_silence_intervals(
            ffmpeg, abs_media, options, total_duration_sec
        )

    # --- 4. Derive KEEP intervals ---

    keep_ranges = derive_keep_ranges(total_duration_sec, silence_intervals, options)

    # --- 5. Build and save OTIO timeline ---

    timeline = new_timeline(media_path.name)
    v1 = timeline.tracks[0]  # V1 (Video) track

    for start_sec, end_sec in keep_ranges:
        # value = seconds * rate (DC-AS-003)
        source_range = TimeRangeModel(
            start_time=RationalTimeModel(value=start_sec * rate, rate=rate),
            duration=RationalTimeModel(value=(end_sec - start_sec) * rate, rate=rate),
        )
        media_ref = MediaRef(target_url=abs_media)
        add_clip(
            v1,
            media_ref,
            source_range,
            name="keep",
            metadata={
                "tool": "clipwright-silence",
                "version": clipwright_silence.__version__,
                "kind": "keep",
                "backend": options.backend,  # VAD-AD-07
            },
        )

    save_timeline(timeline, output)

    # --- 6. Return envelope ---

    silence_count = len(silence_intervals)
    keep_count = len(keep_ranges)
    total_silence_seconds = sum(e - s for s, e in silence_intervals)
    total_keep_seconds = sum(e - s for s, e in keep_ranges)

    # Parts of the summary shared by both backends.
    silence_fmt = _fmt_sec(total_silence_seconds)
    keep_fmt = _fmt_sec(total_keep_seconds)
    generated = (
        f"Generated {output_path.name} with {keep_count} interval(s) to keep"
        f" (total {keep_fmt})."
    )

    # Differentiate summary by backend (VAD-AD-08, §7.5)
    if options.backend == "vad":
        # _detect_vad_silence_intervals always returns an int on this path;
        # the assert only narrows the type for mypy.
        assert speech_count is not None  # Always set on the VAD path
        summary = (
            f"Detected {speech_count} speech interval(s). "
            f"Removed {silence_count} non-speech interval(s) (total {silence_fmt}). "
        ) + generated
    else:
        summary = (
            f"Detected {silence_count} silence interval(s) (total {silence_fmt})"
            f" from source with duration {_fmt_sec(total_duration_sec)}. "
        ) + generated

    warnings: list[str] = []
    if keep_count == 0:
        warnings.append(
            "No intervals to keep (all intervals classified as silence). "
            "The V1 track of the generated timeline.otio is empty. "
            "Passing it to render will result in INVALID_INPUT."
        )

    return ok_result(
        summary,
        data={
            "silence_count": silence_count,
            "total_silence_seconds": total_silence_seconds,
            "keep_count": keep_count,
            "total_keep_seconds": total_keep_seconds,
        },
        artifacts=[{"role": "timeline", "path": str(output_path), "format": "otio"}],
        warnings=warnings,
    )
@@ -0,0 +1,107 @@
1
+ """plan.py — Pure logic for deriving KEEP intervals from silence intervals.
2
+
3
+ Does not execute ffmpeg at all. Performs interval arithmetic on float seconds
4
+ and delegates OTIO conversion to the detect layer (AD-2/AD-3 design policy).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from clipwright_silence.schemas import DetectSilenceOptions
10
+
11
# CR-Q-004: Floating-point comparison tolerance. Used to preserve boundary
# equality (DC-AM-001: min_keep equal preservation / _merge_intervals adjacent merging).
_EPSILON = 1e-9


def derive_keep_ranges(
    total_duration_sec: float,
    silence_intervals: list[tuple[float, float]],
    options: DetectSilenceOptions,
) -> list[tuple[float, float]]:
    """Derive KEEP intervals from a list of silence intervals.

    Processing flow (AD-3):
      1. Sort silence intervals by start time.
      2. Invert [0, total_duration_sec] against silence intervals to get KEEP
         intervals. Every KEEP is bounded by total_duration_sec, even when a
         silence interval is reported beyond the end of the media.
         - Zero silence -> [(0.0, total_duration_sec)] as a single interval.
         - All silence  -> empty list.
      3. Extend each KEEP by padding and clamp to [0, total].
      4. Merge overlapping KEEPs (DC-GP-001 short-silence fill-in).
      5. Discard intervals shorter than min_keep_duration (DC-AM-001 opt-in).

    Args:
        total_duration_sec: Total duration of the source media (seconds).
        silence_intervals: List of silence intervals. Each element is
            (start_sec, end_sec). Need not be sorted; may overlap.
        options: DetectSilenceOptions. Only padding and min_keep_duration are
            read here.

    Returns:
        List of KEEP intervals as (start_sec, end_sec) tuples, sorted by time,
        non-overlapping and contained in [0, total_duration_sec].
    """
    total = total_duration_sec
    padding = options.padding
    min_keep = options.min_keep_duration

    # 1. Sort silence intervals.
    sorted_silence = sorted(silence_intervals, key=lambda iv: iv[0])

    # 2. Invert: subtract silence intervals from [0, total] to get KEEPs.
    keeps: list[tuple[float, float]] = []
    cursor = 0.0
    for s_start, s_end in sorted_silence:
        # Bound the gap by total: a silence starting past the end of the media
        # must not stretch a KEEP beyond it. Without this bound the padding
        # clamp in step 3 could even produce start > end.
        keep_end = min(s_start, total)
        if keep_end > cursor:
            keeps.append((cursor, keep_end))
        # Advance cursor to end of silence (handles overlapping silences).
        cursor = max(cursor, s_end)
    # Trailing non-silent interval.
    if cursor < total:
        keeps.append((cursor, total))

    # 3. Padding extension + clamp.
    if padding > 0.0:
        keeps = [
            (max(0.0, start - padding), min(total, end + padding))
            for start, end in keeps
        ]

    # 4. Merge overlapping intervals (DC-GP-001).
    keeps = _merge_intervals(keeps)

    # 5. Discard intervals shorter than min_keep_duration (default 0.0 = no discard).
    if min_keep > 0.0:
        # DC-AM-001: Use _EPSILON to preserve intervals equal to min_keep
        keeps = [
            (start, end) for start, end in keeps if (end - start) >= min_keep - _EPSILON
        ]

    return keeps


def _merge_intervals(
    intervals: list[tuple[float, float]],
) -> list[tuple[float, float]]:
    """Merge overlapping intervals and return a sorted, non-overlapping list.

    Intervals do not need to be pre-sorted by start time (sorted internally).
    Two intervals are merged when the later one starts no more than _EPSILON
    after the earlier one ends, so touching intervals are merged as well.

    Args:
        intervals: Intervals as (start, end) tuples, in any order.

    Returns:
        New list of merged intervals sorted by start; empty for empty input.
    """
    if not intervals:
        return []

    sorted_ivs = sorted(intervals, key=lambda iv: iv[0])
    merged: list[tuple[float, float]] = [sorted_ivs[0]]

    for start, end in sorted_ivs[1:]:
        prev_start, prev_end = merged[-1]
        if start <= prev_end + _EPSILON:
            # Overlapping or adjacent -> merge. max() covers an interval that
            # is fully contained in the previous one.
            merged[-1] = (prev_start, max(prev_end, end))
        else:
            merged.append((start, end))

    return merged
@@ -0,0 +1,128 @@
1
+ """schemas.py — clipwright-silence specific Pydantic schemas.
2
+
3
+ Common types (MediaRef / Artifact / ToolResult, etc.) are centrally defined
4
+ in clipwright.schemas and are not redefined in this module.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Annotated, Literal
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+
14
class DetectSilenceOptions(BaseModel):
    """Options accepted by clipwright_detect_silence (AD-2/AD-3, DC-AM-001).

    Field groups:
      - silence_threshold_db / min_silence_duration: detection parameters
        handed to the ffmpeg silencedetect filter.
      - padding / min_keep_duration: post-processing parameters consumed by
        the KEEP derivation logic in plan.py.
      - backend: selects the detection backend.
      - vad_*: only effective when backend="vad" (VAD-AD-05).
    """

    # --- silencedetect backend ---

    silence_threshold_db: Annotated[
        float,
        Field(
            description=(
                "silencedetect backend only. "
                "Use vad_* when using VAD. "
                "Volume threshold (dB) for silence detection. "
                "Must be <= 0. "
                "Example: -30.0 dB (default), -40.0 dB (stricter detection)."
            ),
            le=0.0,
            default=-30.0,
        ),
    ] = -30.0

    min_silence_duration: Annotated[
        float,
        Field(
            description=(
                "silencedetect backend only. "
                "Use vad_* when using VAD. "
                "Minimum duration (seconds) to consider as silence. "
                "Must be > 0. "
                "Silences shorter than this value are ignored. "
                "Default is 0.5 seconds."
            ),
            gt=0.0,
            default=0.5,
        ),
    ] = 0.5

    # --- KEEP post-processing (both backends) ---

    padding: Annotated[
        float,
        Field(
            description=(
                "Padding width (seconds) to extend each KEEP interval"
                " on both sides. Must be >= 0."
                " If extension causes adjacent KEEPs to overlap,"
                " they are merged (prevents word cutoff)."
                " Default is 0.1 seconds."
            ),
            ge=0.0,
            default=0.1,
        ),
    ] = 0.1

    min_keep_duration: Annotated[
        float,
        Field(
            description=(
                "Minimum interval length (seconds) to retain as KEEP."
                " Must be >= 0."
                " KEEP intervals shorter than this value are discarded"
                " after padding and merging."
                " Default is 0.0 (no discard; DC-AM-001 opt-in guard)."
            ),
            ge=0.0,
            default=0.0,
        ),
    ] = 0.0

    # --- backend selection ---

    backend: Annotated[
        Literal["silencedetect", "vad"],
        Field(
            description=(
                "Detection backend to use. "
                '"silencedetect" (default) uses the ffmpeg silencedetect'
                " filter. "
                '"vad" uses Silero VAD (ONNX). '
                "VAD-AD-01 backward-compatible opt-in."
            ),
            default="silencedetect",
        ),
    ] = "silencedetect"

    # --- VAD backend ---

    vad_threshold: Annotated[
        float,
        Field(
            description=(
                "VAD backend only. Speech probability threshold (0.0-1.0)."
                " Values >= this are considered speech."
                " Default is 0.5."
            ),
            ge=0.0,
            le=1.0,
            default=0.5,
        ),
    ] = 0.5

    vad_min_speech_duration: Annotated[
        float,
        Field(
            description=(
                "VAD backend only. "
                "Minimum duration (seconds) to classify as speech. "
                "Must be > 0. "
                "Default is 0.25 seconds."
            ),
            gt=0.0,
            default=0.25,
        ),
    ] = 0.25

    vad_min_silence_duration: Annotated[
        float,
        Field(
            description=(
                "VAD backend only. "
                "Minimum silence duration (seconds) between speech"
                " intervals. Must be > 0."
                " Silences shorter than this value are absorbed into"
                " speech intervals. Default is 0.1 seconds."
            ),
            gt=0.0,
            default=0.1,
        ),
    ] = 0.1
@@ -0,0 +1,88 @@
1
+ """server.py — clipwright-silence MCP server + CLI entry point.
2
+
3
+ A thin wrapper that delegates business logic to detect.py.
4
+ ClipwrightError conversion is handled on the detect.py side;
5
+ no double conversion is done here.
6
+
7
+ Transport defaults to stdio (mcp.run(transport="stdio")).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Annotated, Any
13
+
14
+ from mcp.server.fastmcp import FastMCP
15
+ from mcp.types import ToolAnnotations
16
+ from pydantic import Field
17
+
18
+ from clipwright_silence.detect import detect_silence
19
+ from clipwright_silence.schemas import DetectSilenceOptions
20
+
21
# Module-level FastMCP server instance; "clipwright-silence" is the server name.
# Tools are registered on it with the @mcp.tool decorator, and main() starts it.
mcp = FastMCP("clipwright-silence")
23
+
24
+
25
+ # ===========================================================================
26
+ # clipwright_detect_silence MCP tool
27
+ # ===========================================================================
28
+
29
+
30
# NOTE(review): this tool creates the file named by ``output``. readOnlyHint=True
# appears to mean "the input media is never modified" — confirm that this matches
# the MCP definition of a read-only tool.
@mcp.tool(
    annotations=ToolAnnotations(
        readOnlyHint=True,
        destructiveHint=False,
        idempotentHint=True,
        openWorldHint=False,
    )
)
def clipwright_detect_silence(
    media: Annotated[
        str,
        Field(description="Input media file path (source containing video and audio)."),
    ],
    output: Annotated[
        str,
        Field(description="Output OTIO timeline file path (.otio extension)."),
    ],
    options: Annotated[
        DetectSilenceOptions | None,
        Field(
            description=(
                "Silence detection options (silence_threshold_db / min_silence_duration"
                " / padding / min_keep_duration / backend / vad_threshold"
                " / vad_min_speech_duration / vad_min_silence_duration)."
                " All values use defaults when omitted."
            )
        ),
    ] = None,
) -> dict[str, Any]:
    """MCP tool: detect silence intervals and generate a KEEP interval OTIO timeline.

    Thin wrapper: all business logic (and ClipwrightError conversion) lives in
    detect.detect_silence; nothing is converted a second time here.

    Args:
        media: Path of the input media file. The file itself is not modified.
        output: Path of the OTIO timeline file to create.
        options: Detection options. When None, a default-constructed
            DetectSilenceOptions() is used, so every option takes its default.

    Returns:
        The dict returned by detect.detect_silence, passed through unchanged.
    """
    if options is None:
        # Omitted options mean "all defaults", not "no options".
        options = DetectSilenceOptions()
    return detect_silence(media=media, output=output, options=options)
71
+
72
+
73
+ # ===========================================================================
74
+ # Entry point (MCP stdio launch / DC-GP-002)
75
+ # ===========================================================================
76
+
77
+
78
def main() -> None:
    """Start the clipwright-silence MCP server on the stdio transport (DC-GP-002).

    This function is the console-script target declared in pyproject.toml
    under [project.scripts]:
        clipwright-silence = "clipwright_silence.server:main"
    """
    mcp.run(transport="stdio")


# Also allow launching with ``python -m clipwright_silence.server``.
if __name__ == "__main__":
    main()
@@ -0,0 +1,284 @@
1
+ """vad_cli.py — Separate-process small CLI for the Silero VAD backend.
2
+
3
+ Not imported by the MCP server process (§2.4 subprocess loose coupling).
4
+ detect.py spawns this as a separate process via
5
+ sys.executable -m clipwright_silence.vad_cli.
6
+
7
+ CLI contract (§7.1 unified):
8
+ - main(argv) catches all exceptions at the top level, always writes stdout JSON,
9
+ and returns 0.
10
+ - Success: {"speech_segments": [[start_sec, end_sec], ...]}
11
+ - Error: {"error": {"code": str, "message": str, "hint": str}}
12
+ - stdout is JSON only. Logs and progress go to stderr.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import contextlib
19
+ import json
20
+ import math
21
+ import os
22
+ import sys
23
+ import tempfile
24
+ import wave
25
+ from typing import Any
26
+
27
+ from clipwright.errors import ClipwrightError, ErrorCode
28
+ from clipwright.process import resolve_tool, run
29
+
30
# Fixed sample rate in Hz (§7.3); passed to ffmpeg as the -ar value when
# extracting audio for VAD.
_SAMPLE_RATE = 16000
# Hint text attached to DEPENDENCY_MISSING errors when a VAD import fails.
_VAD_INSTALL_HINT = (
    "Install VAD dependencies with `pip install 'clipwright-silence[vad]'`."
)
# Sanitized generic message that replaces the original text of
# SUBPROCESS_FAILED / SUBPROCESS_TIMEOUT errors (SR M-1: keeps ffmpeg stderr,
# which may contain paths, out of the stdout JSON).
_SUBPROCESS_SAFE_MESSAGE = "internal subprocess failed"
39
+
40
+
41
def _error_output(code: str, message: str, hint: str) -> None:
    """Print one line of error JSON to stdout.

    The emitted document has the shape
    {"error": {"code": ..., "message": ..., "hint": ...}} and is serialized
    with ensure_ascii=False, so non-ASCII text is written as-is.

    No sanitizing happens here: the caller is responsible for removing path
    information from ``message`` and ``hint`` before calling.
    """
    payload: dict[str, Any] = {"error": dict(code=code, message=message, hint=hint)}
    serialized = json.dumps(payload, ensure_ascii=False)
    print(serialized, file=sys.stdout)
54
+
55
+
56
def _extract_pcm(ffmpeg: str, media: str, output_path: str, timeout: float) -> None:
    """Decode the audio of ``media`` into a mono 16-bit PCM file via ffmpeg.

    The command is handed to run() as an argument list (never a shell string):
    video is dropped (-vn), audio is encoded as pcm_s16le, resampled to
    _SAMPLE_RATE and downmixed to one channel, and ``output_path`` is
    overwritten if it already exists (-y).

    Args:
        ffmpeg: Path or name of the ffmpeg executable.
        media: Input media file path.
        output_path: Destination file for the extracted audio.
        timeout: Timeout forwarded to run().

    NOTE(review): failures presumably surface as ClipwrightError raised by
    run() — confirm against clipwright.process. If run() does not detach
    stdin, consider adding -nostdin.
    """
    global_flags = ["-hide_banner", "-nostats"]
    source_args = ["-i", media]
    audio_args = ["-vn", "-acodec", "pcm_s16le", "-ar", str(_SAMPLE_RATE), "-ac", "1"]
    sink_args = ["-y", output_path]
    run([ffmpeg, *global_flags, *source_args, *audio_args, *sink_args], timeout=timeout)
78
+
79
+
80
def _load_audio_as_float32(pcm_path: str) -> tuple[Any, int]:
    """Load a PCM WAV file as normalized float32 samples.

    The raw frame bytes are interpreted as int16 samples and scaled by
    1/32768.0, so full-scale negative input maps to -1.0.

    numpy is imported inside the function so that merely importing this
    module does not require it. When reached through main(), numpy has already
    been imported there, so this import is a sys.modules cache hit
    (NF-M-2 / CR L-2).

    Args:
        pcm_path: Path of the WAV file to read.

    Returns:
        A (samples, sample_rate) tuple: a float32 numpy array and the frame
        rate stored in the WAV header.
    """
    import numpy as np

    with wave.open(pcm_path, "rb") as reader:
        rate = reader.getframerate()
        frame_bytes = reader.readframes(reader.getnframes())

    samples: Any = np.frombuffer(frame_bytes, dtype=np.int16).astype(np.float32)
    return samples / 32768.0, rate
102
+
103
+
104
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the argparse parser that defines the VAD CLI options."""
    parser = argparse.ArgumentParser(
        description="Detect speech intervals using Silero VAD and output JSON to stdout."  # noqa: E501
    )
    parser.add_argument("--media", required=True, help="Input media file path")
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.5,
        help="Speech probability threshold (0.0-1.0, default: 0.5)",
    )
    parser.add_argument(
        "--min-speech",
        type=float,
        default=0.25,
        help="Minimum speech duration (seconds, default: 0.25)",
    )
    parser.add_argument(
        "--min-silence",
        type=float,
        default=0.1,
        help="Minimum silence duration between speech intervals (seconds, default: 0.1)",  # noqa: E501
    )
    parser.add_argument(
        "--media-duration",
        type=float,
        default=None,
        help=(
            "Total media duration (seconds). Used to calculate the inner"
            " ffmpeg timeout proportional to total duration (§7.7)."
            " Uses a safe default (60s) if omitted."
        ),
    )
    return parser


def _inner_ffmpeg_timeout(media_duration: float | None) -> float:
    """Return the ffmpeg timeout: max(30, ceil(2 * duration)), or 60 s if unknown."""
    if media_duration is None:
        return 60.0
    return float(max(30, math.ceil(media_duration * 2)))


def _seconds_to_ms(seconds: float) -> int:
    """Convert seconds to whole milliseconds, rounding to the nearest integer.

    Rounding (not truncating) keeps float artifacts such as
    1.005 * 1000 == 1004.9999999999999 from losing a millisecond.
    """
    return round(seconds * 1000)


def main(argv: list[str] | None = None) -> int:
    """VAD CLI entry point.

    Contract (§7.1): every outcome is reported as exactly one JSON document on
    stdout and the function returns 0.
      - Success: {"speech_segments": [[start_sec, end_sec], ...]}
      - Error:   {"error": {"code": ..., "message": ..., "hint": ...}}
    Logs, argparse usage/help text and tracebacks go to stderr only.

    Args:
        argv: Command-line argument list. Uses sys.argv[1:] if None.

    Returns:
        Exit code (always 0).
    """
    parser = _build_arg_parser()
    try:
        # argparse prints --help text to stdout before raising SystemExit.
        # Send it to stderr so stdout carries nothing but the JSON document.
        with contextlib.redirect_stdout(sys.stderr):
            args = parser.parse_args(argv)
    except SystemExit as exc:
        # Raised by argparse for --help or for missing/invalid arguments.
        _error_output(
            code=ErrorCode.INVALID_INPUT,
            message=f"Argument parsing failed: exit code {exc.code}",
            hint="Specify --media <path> as a required argument.",
        )
        return 0

    media: str = args.media
    threshold: float = args.threshold
    min_speech_sec: float = args.min_speech
    min_silence_sec: float = args.min_silence
    media_duration: float | None = args.media_duration

    try:
        # --- Lazy imports (keep heavy VAD deps out of the server process, §2.4) ---
        # numpy belongs to the optional "vad" extra, so its absence must be
        # reported as DEPENDENCY_MISSING JSON rather than escaping main().
        # Importing it here also caches it in sys.modules for
        # _load_audio_as_float32 (CR L-2).
        try:
            import numpy as np  # noqa: F401
        except ImportError:
            # Fixed message: str(exc) may contain internal paths (SR L-2).
            _error_output(
                code=ErrorCode.DEPENDENCY_MISSING,
                message="Failed to import numpy",
                hint=_VAD_INSTALL_HINT,
            )
            return 0

        try:
            import silero_vad
        except ImportError:
            # SR L-2: str(exc) may contain internal paths; use fixed message
            _error_output(
                code=ErrorCode.DEPENDENCY_MISSING,
                message="Failed to import silero_vad or onnxruntime",
                hint=_VAD_INSTALL_HINT,
            )
            return 0

        # --- Load Silero VAD model (before ffmpeg, to catch ImportError early) ---
        # An ImportError raised here (e.g. onnxruntime missing, §7.3) is handled
        # by the outer ``except ImportError`` below.
        model = silero_vad.load_silero_vad(onnx=True)

        # --- Resolve ffmpeg (§7.2) ---
        ffmpeg = resolve_tool("ffmpeg", "CLIPWRIGHT_FFMPEG")

        # Inner timeout is kept shorter than the caller's outer timeout (§7.7).
        ffmpeg_timeout = _inner_ffmpeg_timeout(media_duration)

        audio_float32: Any
        sample_rate: int

        # delete=False: only the name is needed; ffmpeg overwrites the file (-y)
        # and the finally block removes it (§7.3).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            tmp_path = tmp_file.name
        try:
            _extract_pcm(ffmpeg, media, tmp_path, timeout=ffmpeg_timeout)
            audio_float32, sample_rate = _load_audio_as_float32(tmp_path)
        finally:
            # Best-effort cleanup, also on failure; a missing file is fine.
            with contextlib.suppress(OSError):
                os.unlink(tmp_path)

        # With return_seconds=False the start/end values are sample indices;
        # the min_*_duration_ms parameters are in milliseconds.
        raw_segments: list[dict[str, Any]] = silero_vad.get_speech_timestamps(
            audio_float32,
            model,
            threshold=threshold,
            sampling_rate=sample_rate,
            min_speech_duration_ms=_seconds_to_ms(min_speech_sec),
            min_silence_duration_ms=_seconds_to_ms(min_silence_sec),
            return_seconds=False,
        )

        # Convert sample units -> seconds, in ascending start order.
        speech_segments: list[list[float]] = [
            [float(seg["start"]) / sample_rate, float(seg["end"]) / sample_rate]
            for seg in sorted(raw_segments, key=lambda s: s["start"])
        ]

        result: dict[str, Any] = {"speech_segments": speech_segments}
        print(json.dumps(result, ensure_ascii=False), file=sys.stdout)
        return 0

    except ClipwrightError as exc:
        # ClipwrightError from core run() (SUBPROCESS_FAILED/TIMEOUT) and from
        # resolve_tool (DEPENDENCY_MISSING) ends up here (§7.1/§7.2).
        # SR M-1: SUBPROCESS_FAILED/TIMEOUT may embed ffmpeg stderr in message;
        # replace with a generic message to prevent path leakage.
        if exc.code in (ErrorCode.SUBPROCESS_FAILED, ErrorCode.SUBPROCESS_TIMEOUT):
            safe_message = f"{_SUBPROCESS_SAFE_MESSAGE} (code: {exc.code})"
        else:
            safe_message = exc.message
        _error_output(
            code=str(exc.code),
            message=safe_message,
            hint=exc.hint,
        )
        return 0

    except ImportError:
        # ImportError propagated from load_silero_vad etc. when onnxruntime is
        # missing. SR L-2: str(exc) may contain internal paths; use fixed message.
        _error_output(
            code=ErrorCode.DEPENDENCY_MISSING,
            message="Failed to import silero_vad or onnxruntime",
            hint=_VAD_INSTALL_HINT,
        )
        return 0

    except Exception:
        # Top-level boundary: report every unexpected failure as error JSON (§7.1).
        # SR NF-L-1: str(exc) may contain internal paths; use fixed message.
        # Debug details are stderr-only; do not leak into stdout JSON (MCP response).
        import traceback

        traceback.print_exc(file=sys.stderr)
        _error_output(
            code=ErrorCode.INTERNAL,
            message="An unexpected error occurred in VAD CLI",
            hint="Please report with reproduction steps.",
        )
        return 0


if __name__ == "__main__":
    sys.exit(main())