clipwright-silence 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clipwright_silence-0.1.0/PKG-INFO +21 -0
- clipwright_silence-0.1.0/README.md +3 -0
- clipwright_silence-0.1.0/pyproject.toml +102 -0
- clipwright_silence-0.1.0/src/clipwright_silence/__init__.py +3 -0
- clipwright_silence-0.1.0/src/clipwright_silence/detect.py +523 -0
- clipwright_silence-0.1.0/src/clipwright_silence/plan.py +107 -0
- clipwright_silence-0.1.0/src/clipwright_silence/py.typed +0 -0
- clipwright_silence-0.1.0/src/clipwright_silence/schemas.py +128 -0
- clipwright_silence-0.1.0/src/clipwright_silence/server.py +88 -0
- clipwright_silence-0.1.0/src/clipwright_silence/vad_cli.py +284 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: clipwright-silence
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP tool for silence detection and OTIO timeline generation. Detects silence with ffmpeg silencedetect and generates timeline.otio with clip sequence for regions to keep.
|
|
5
|
+
Author: satoh-y-0323
|
|
6
|
+
Author-email: satoh-y-0323 <shoma.papa.0323@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Dist: clipwright>=0.1.0
|
|
9
|
+
Requires-Dist: mcp[cli]>=1.27.2
|
|
10
|
+
Requires-Dist: opentimelineio>=0.18
|
|
11
|
+
Requires-Dist: pydantic>=2
|
|
12
|
+
Requires-Dist: silero-vad>=5.1 ; extra == 'vad'
|
|
13
|
+
Requires-Dist: onnxruntime>=1.17 ; extra == 'vad'
|
|
14
|
+
Requires-Dist: numpy>=1.24 ; extra == 'vad'
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Provides-Extra: vad
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# clipwright-silence
|
|
20
|
+
|
|
21
|
+
MCP tool for silence detection and OTIO timeline generation.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "clipwright-silence"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MCP tool for silence detection and OTIO timeline generation. Detects silence with ffmpeg silencedetect and generates timeline.otio with clip sequence for regions to keep."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "satoh-y-0323", email = "shoma.papa.0323@gmail.com" }
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"clipwright>=0.1.0",
|
|
13
|
+
"mcp[cli]>=1.27.2",
|
|
14
|
+
"opentimelineio>=0.18",
|
|
15
|
+
"pydantic>=2",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
vad = [
|
|
20
|
+
"silero-vad>=5.1",
|
|
21
|
+
"onnxruntime>=1.17",
|
|
22
|
+
# numpy is a transitive dependency of onnxruntime, but explicitly listed to pin minimum version
|
|
23
|
+
"numpy>=1.24",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[project.scripts]
|
|
27
|
+
clipwright-silence = "clipwright_silence.server:main"
|
|
28
|
+
clipwright-silence-vad = "clipwright_silence.vad_cli:main"
|
|
29
|
+
|
|
30
|
+
[build-system]
|
|
31
|
+
requires = ["uv_build>=0.11.19,<0.12.0"]
|
|
32
|
+
build-backend = "uv_build"
|
|
33
|
+
|
|
34
|
+
[dependency-groups]
|
|
35
|
+
dev = [
|
|
36
|
+
"mypy>=2.1.0",
|
|
37
|
+
"pytest>=9.0.3",
|
|
38
|
+
"pytest-cov>=7.1.0",
|
|
39
|
+
"pytest-mock>=3.15.1",
|
|
40
|
+
"ruff>=0.15.16",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
# Resolve clipwright (core) within workspace by path reference
|
|
44
|
+
[tool.uv.sources]
|
|
45
|
+
clipwright = { workspace = true }
|
|
46
|
+
|
|
47
|
+
# --- Ruff ---
|
|
48
|
+
[tool.ruff]
|
|
49
|
+
target-version = "py311"
|
|
50
|
+
line-length = 88
|
|
51
|
+
|
|
52
|
+
[tool.ruff.lint]
|
|
53
|
+
select = ["E", "F", "W", "I", "UP", "B", "C4", "SIM"]
|
|
54
|
+
ignore = []
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint.per-file-ignores]
|
|
57
|
+
# Allow E501 for English docstrings/comments in test files
|
|
58
|
+
"tests/*.py" = ["E501"]
|
|
59
|
+
|
|
60
|
+
[tool.ruff.format]
|
|
61
|
+
# Default ruff formatter is OK
|
|
62
|
+
|
|
63
|
+
# --- mypy ---
|
|
64
|
+
[tool.mypy]
|
|
65
|
+
python_version = "3.11"
|
|
66
|
+
strict = true
|
|
67
|
+
warn_return_any = true
|
|
68
|
+
warn_unused_configs = true
|
|
69
|
+
disallow_untyped_defs = true
|
|
70
|
+
disallow_any_generics = true
|
|
71
|
+
|
|
72
|
+
# opentimelineio has no stubs, ignored with mypy strict
|
|
73
|
+
[[tool.mypy.overrides]]
|
|
74
|
+
module = "opentimelineio.*"
|
|
75
|
+
ignore_missing_imports = true
|
|
76
|
+
|
|
77
|
+
# silero-vad / onnxruntime have no stubs, ignored as VAD is optional extra
|
|
78
|
+
[[tool.mypy.overrides]]
|
|
79
|
+
module = "silero_vad.*"
|
|
80
|
+
ignore_missing_imports = true
|
|
81
|
+
|
|
82
|
+
[[tool.mypy.overrides]]
|
|
83
|
+
module = "onnxruntime.*"
|
|
84
|
+
ignore_missing_imports = true
|
|
85
|
+
|
|
86
|
+
# --- pytest ---
|
|
87
|
+
[tool.pytest.ini_options]
|
|
88
|
+
testpaths = ["tests"]
|
|
89
|
+
addopts = "--strict-markers -q"
|
|
90
|
+
markers = [
|
|
91
|
+
"integration: integration test requiring actual ffmpeg/ffprobe binaries",
|
|
92
|
+
"slow: test with long execution time",
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
# --- coverage ---
|
|
96
|
+
[tool.coverage.run]
|
|
97
|
+
source = ["clipwright_silence"]
|
|
98
|
+
omit = ["tests/*"]
|
|
99
|
+
|
|
100
|
+
[tool.coverage.report]
|
|
101
|
+
show_missing = true
|
|
102
|
+
skip_covered = false
|
|
@@ -0,0 +1,523 @@
|
|
|
1
|
+
"""detect.py — clipwright-silence orchestration layer.
|
|
2
|
+
|
|
3
|
+
Handles the full flow: input validation -> inspect_media -> silencedetect
|
|
4
|
+
execution/parsing -> KEEP derivation -> OTIO construction/save -> envelope return.
|
|
5
|
+
|
|
6
|
+
Design decisions:
|
|
7
|
+
- _detect_silence_intervals() encapsulates ffmpeg execution and stderr parsing,
|
|
8
|
+
allowing future backend replacement (adapter abstraction, AD-1).
|
|
9
|
+
- _detect_vad_silence_intervals() handles spawning the VAD CLI as a separate
|
|
10
|
+
process and inverting speech -> silence. Both return (silence interval list)
|
|
11
|
+
with a common contract so derive_keep_ranges onward uses a shared flow.
|
|
12
|
+
- source_range rate is taken from inspect_media MediaInfo.duration.rate,
|
|
13
|
+
and value = seconds * rate (DC-AS-003).
|
|
14
|
+
- output is only permitted in the same directory as media (DC-AS-001).
|
|
15
|
+
- Error messages do not expose full paths or raw ffmpeg stderr (basename only, M-1).
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import math
|
|
22
|
+
import re
|
|
23
|
+
import sys
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from clipwright.envelope import error_result, ok_result
|
|
28
|
+
from clipwright.errors import ClipwrightError, ErrorCode
|
|
29
|
+
from clipwright.media import inspect_media
|
|
30
|
+
from clipwright.otio_utils import add_clip, new_timeline, save_timeline
|
|
31
|
+
from clipwright.process import resolve_tool, run
|
|
32
|
+
from clipwright.schemas import MediaRef, RationalTimeModel, TimeRangeModel
|
|
33
|
+
|
|
34
|
+
import clipwright_silence
|
|
35
|
+
from clipwright_silence.plan import derive_keep_ranges
|
|
36
|
+
from clipwright_silence.schemas import DetectSilenceOptions
|
|
37
|
+
|
|
38
|
+
# Regexes that capture the timestamp of silencedetect's "silence_start: <t>" and
# "silence_end: <t>" log lines. They are applied with .search(), so the marker
# may appear anywhere in the line (ffmpeg prefixes it with a log tag).
# The decimal separator is always '.' (DC-AM-003).
# A leading '-' and an optional exponent are accepted: ffmpeg can report a
# slightly negative start (e.g. "-0.00133333") and, in versions that format
# timestamps with "%.6g", scientific notation (e.g. "2.08333e-05"). Without
# this a negative start is not matched at all and "2.08333e-05" is read as
# 2.08333 seconds. float() parses every string the group can capture.
_SILENCE_TIMESTAMP = r"(-?[0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?)"
_RE_SILENCE_START = re.compile(r"silence_start:\s*" + _SILENCE_TIMESTAMP)
_RE_SILENCE_END = re.compile(r"silence_end:\s*" + _SILENCE_TIMESTAMP)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _fmt_sec(sec: float) -> str:
    """Convert seconds to a human-readable minutes/seconds string (for summary).

    The value is rounded to tenths of a second before it is split into
    minutes and seconds, so the seconds part is never displayed as "60.0"
    (119.96 -> "2m0.0s" rather than "1m60.0s").

    Format examples: 90.0 -> "1m30.0s", 45.5 -> "45.5s"

    Args:
        sec: Duration in seconds. Expected to be finite and non-negative.

    Returns:
        "<m>m<s>s" when the rounded value is at least one minute, otherwise
        "<s>s". Seconds always carry exactly one decimal place.
    """
    tenths = round(sec * 10)
    if tenths < 600:
        # Still under one minute after rounding: no minutes component.
        return f"{sec:.1f}s"
    minutes, rem_tenths = divmod(tenths, 600)
    return f"{minutes}m{rem_tenths / 10:.1f}s"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _parse_silence_intervals(
    stderr: str,
    total_duration_sec: float,
) -> list[tuple[float, float]]:
    """Extract silence interval list from silencedetect stderr.

    Scans stderr line by line, searching each line for a silence_start or
    silence_end marker (fixed '.' decimal, DC-AM-003). A silence_start opens
    an interval and the next silence_end closes it. A trailing silence_start
    with no matching silence_end is completed using total_duration_sec
    (DC-AM-002).

    Inverted intervals (end < start) are never returned. This applies both
    to paired markers and to the trailing completion, e.g. when the last
    silence_start lies beyond total_duration_sec.

    Args:
        stderr: ffmpeg standard error output string.
        total_duration_sec: Total duration of the source (seconds).
            Used to close a trailing open interval.

    Returns:
        List of silence intervals in order of appearance.
        Each element is a (start_sec, end_sec) tuple with start_sec <= end_sec.
    """
    intervals: list[tuple[float, float]] = []
    pending_start: float | None = None

    for line in stderr.splitlines():
        m_start = _RE_SILENCE_START.search(line)
        if m_start:
            # A new start replaces any start that was never closed.
            pending_start = float(m_start.group(1))
            continue

        m_end = _RE_SILENCE_END.search(line)
        # An isolated silence_end (no open start) is skipped as abnormal
        # output; this is an intentional ignore, not error suppression.
        if not m_end or pending_start is None:
            continue

        end = float(m_end.group(1))
        # SR L-3: drop abnormal intervals where end < start
        # (defensive check for future backend replacement).
        if end >= pending_start:
            intervals.append((pending_start, end))
        pending_start = None

    # Trailing silence_end missing -> complete with total_duration (DC-AM-002).
    # Same end >= start rule as above, so a start past the reported duration
    # does not produce an inverted interval.
    if pending_start is not None and total_duration_sec >= pending_start:
        intervals.append((pending_start, total_duration_sec))

    return intervals
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _detect_silence_intervals(
    ffmpeg: str,
    source: str,
    options: DetectSilenceOptions,
    total_duration_sec: float,
) -> list[tuple[float, float]]:
    """Run ffmpeg's silencedetect filter on source and parse its stderr.

    This is the silencedetect adapter (AD-1): swapping the detection backend
    only requires replacing this function.

    ffmpeg is invoked with the null muxer ("-f null -"), so no media file is
    written. The subprocess timeout is twice the media duration, with a floor
    of 60 seconds.

    Args:
        ffmpeg: Path to ffmpeg executable.
        source: Input media file path.
        options: DetectSilenceOptions; silence_threshold_db and
            min_silence_duration are passed to the filter.
        total_duration_sec: Total duration of source (seconds). Used for the
            timeout and to close a trailing open silence interval.

    Returns:
        List of silence intervals. Each element is (start_sec, end_sec).

    Raises:
        ClipwrightError: SUBPROCESS_FAILED / SUBPROCESS_TIMEOUT (raised by run).
    """
    # Fixed-precision formatting keeps the filter string locale-independent
    # (DC-AM-003).
    threshold_part = f"silencedetect=noise={options.silence_threshold_db:.3f}dB"
    duration_part = f":d={options.min_silence_duration:.6f}"
    audio_filter = threshold_part + duration_part

    timeout_sec = float(max(60, math.ceil(total_duration_sec * 2)))

    completed = run(
        [
            ffmpeg,
            "-hide_banner",
            "-nostats",
            "-i",
            source,
            "-af",
            audio_filter,
            "-f",
            "null",
            "-",
        ],
        timeout=timeout_sec,
    )
    return _parse_silence_intervals(completed.stderr, total_duration_sec)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _detect_vad_silence_intervals(
    source: str,
    options: DetectSilenceOptions,
    total_duration_sec: float,
) -> tuple[list[tuple[float, float]], int]:
    """Spawn VAD CLI as a separate process; return silence intervals and speech_count.

    VAD-AD-02/04: Runs ``clipwright_silence.vad_cli`` with the current
    interpreter (``sys.executable -m``), parses the JSON printed on stdout and
    inverts the reported speech intervals against [0, total_duration_sec] to
    produce silence intervals. speech_count is used only for VAD summary
    generation and is not passed to the common flow (§7.5).

    Output that is valid JSON but has the wrong shape (top level is not an
    object, or "speech_segments" is not a list) is reported as
    SUBPROCESS_FAILED instead of escaping as AttributeError/TypeError.
    Individual malformed segments, including NaN bounds, are skipped.

    Args:
        source: Absolute path to the input media file.
        options: DetectSilenceOptions (references vad_* fields).
        total_duration_sec: Total duration of source (seconds). Used for the
            timeout (4x duration, at least 60 s) and for inversion.

    Returns:
        Tuple of (silence interval list, speech_count), where speech_count is
        the number of speech segments left after clipping and filtering.

    Raises:
        ClipwrightError: Maps VAD CLI error JSON to corresponding ErrorCode
            (SUBPROCESS_FAILED for unknown codes). SUBPROCESS_FAILED if the
            output is not JSON of the expected shape, or if run() fails.
    """

    def _invalid_output_error() -> ClipwrightError:
        # Shared by the "not JSON" and "JSON of the wrong shape" paths.
        return ClipwrightError(
            code=ErrorCode.SUBPROCESS_FAILED,
            message="VAD CLI returned invalid JSON output",
            hint=(
                "VAD CLI did not return the expected JSON. "
                "If the error persists, please report with reproduction steps."
            ),
        )

    timeout = float(max(60, math.ceil(total_duration_sec * 4)))
    cmd = [
        sys.executable,
        "-m",
        "clipwright_silence.vad_cli",
        "--media",
        source,
        "--threshold",
        f"{options.vad_threshold}",
        "--min-speech",
        f"{options.vad_min_speech_duration}",
        "--min-silence",
        f"{options.vad_min_silence_duration}",
        "--media-duration",
        # vad_cli uses ceil(total*2); float precision has no practical impact (NF-L-1)
        f"{total_duration_sec}",
    ]
    result = run(cmd, timeout=timeout)

    try:
        payload: Any = json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        raise _invalid_output_error() from exc

    # json.loads may legally return a list/str/number/None; only an object
    # matches the CLI contract used below.
    if not isinstance(payload, dict):
        raise _invalid_output_error()

    # If error JSON, map to ErrorCode and raise ClipwrightError (§7.1)
    if "error" in payload:
        err = payload["error"]
        if not isinstance(err, dict):
            # Unstructured error value: fall back to the generic defaults.
            err = {}
        raw_code: str = err.get("code", "INTERNAL")
        message: str = err.get("message", "An error occurred in VAD CLI")
        hint: str = err.get("hint", "Please report with reproduction steps.")

        # Map to known ErrorCode; fall back to SUBPROCESS_FAILED for unknown codes
        try:
            error_code = ErrorCode(raw_code)
        except ValueError:
            error_code = ErrorCode.SUBPROCESS_FAILED

        raise ClipwrightError(code=error_code, message=message, hint=hint)

    # Pre-process speech intervals (§7.4): clip and remove degenerate intervals
    raw_segments = payload.get("speech_segments", [])
    if not isinstance(raw_segments, list):
        raise _invalid_output_error()

    total = total_duration_sec
    speech_segments: list[tuple[float, float]] = []
    for seg in raw_segments:
        try:
            # Accept both dict {"start": ..., "end": ...} and list [start, end] formats
            if isinstance(seg, (list, tuple)):
                start, end = float(seg[0]), float(seg[1])
            else:
                start, end = float(seg["start"]), float(seg["end"])
        except (TypeError, KeyError, ValueError, IndexError):
            # Skip malformed elements (null/string/empty dict, etc.) — SR L-3
            continue
        # NaN compares false with everything, so the clipping below would
        # turn a NaN bound into 0 / total and mark the whole file as speech.
        if math.isnan(start) or math.isnan(end):
            continue
        # Clip start < 0 to 0, clip end > total to total
        start = max(0.0, start)
        end = min(total, end)
        # Remove degenerate intervals (start >= end)
        if start >= end:
            continue
        speech_segments.append((start, end))

    speech_count = len(speech_segments)

    # Invert speech intervals -> silence intervals (VAD-AD-04)
    # Sort speech intervals ascending and take the complement of [0, total]
    sorted_speech = sorted(speech_segments, key=lambda iv: iv[0])
    silence_intervals: list[tuple[float, float]] = []
    cursor = 0.0
    for s_start, s_end in sorted_speech:
        if s_start > cursor:
            silence_intervals.append((cursor, s_start))
        # max() keeps the cursor monotonic when speech segments overlap.
        cursor = max(cursor, s_end)
    if cursor < total:
        silence_intervals.append((cursor, total))

    return silence_intervals, speech_count
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def detect_silence(
    media: str,
    output: str,
    options: DetectSilenceOptions,
) -> dict[str, Any]:
    """Detect silence and write a KEEP-interval OTIO timeline (AD-2/AD-5).

    Public entry point. All work is delegated to _detect_inner; any
    ClipwrightError it raises is converted into an error envelope, so callers
    receive a dict in both the success and the handled-failure case.

    Flow implemented by _detect_inner:
        1. Output validation (extension, parent directory, output==media)
        2. inspect_media -> same-directory check, audio/video streams, duration
        3. Silence detection with the backend selected in options
        4. Derive KEEP intervals with derive_keep_ranges
        5. Build and save OTIO timeline
        6. Return envelope

    Args:
        media: Input media file path.
        output: Output timeline.otio file path (must be in the same directory
            as media).
        options: DetectSilenceOptions.

    Returns:
        Envelope dict from ok_result or error_result.
    """
    try:
        envelope = _detect_inner(media, output, options)
    except ClipwrightError as err:
        # Expected failures are reported through the envelope, not raised.
        envelope = error_result(err.code, err.message, err.hint)
    return envelope
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _detect_inner(
    media: str,
    output: str,
    options: DetectSilenceOptions,
) -> dict[str, Any]:
    """Internal implementation of detect_silence. Raises ClipwrightError directly.

    Stages, in order: validate the output path, probe the media with
    inspect_media, detect silence with the backend named in options.backend,
    derive KEEP ranges, add one "keep" clip per range to the first track of a
    new timeline, save it to output, and build the success envelope.

    Args:
        media: Input media file path.
        output: Output .otio path; must resolve to the media's directory.
        options: DetectSilenceOptions.

    Returns:
        ok_result envelope with counts/totals in data, the timeline artifact,
        and a warning when no KEEP range remains.

    Raises:
        ClipwrightError: INVALID_INPUT for a bad output path,
            UNSUPPORTED_OPERATION when a video or audio stream is missing,
            PROBE_FAILED when the duration is unknown, plus whatever
            inspect_media, resolve_tool and the detection backends raise.
    """
    out_path = Path(output)
    src_path = Path(media)

    def _identical_paths_error() -> ClipwrightError:
        # Needed on both the normal and the resolve-failure path below.
        return ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message="The output path and input media path are identical.",
            hint=(
                "Change the output file path to a path"
                " different from the input media."
            ),
        )

    # --- 1. Output validation ---

    # Extension must be .otio (AD-5)
    suffix = out_path.suffix
    if suffix.lower() != ".otio":
        raise ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message=(
                f"Invalid output file extension: {suffix!r}. "
                "Only .otio is allowed."
            ),
            hint="Change the output file path extension to .otio.",
        )

    # Parent directory must already exist (no auto-creation, AD-5)
    if not out_path.parent.exists():
        raise ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message=(
                "The output directory does not exist. "
                "Check the parent directory of the specified output path."
            ),
            hint="Create the output directory first, then re-run.",
        )

    # Refuse output == media so the source can never be overwritten.
    # If resolve() fails, fall back to comparing the raw path strings.
    try:
        if out_path.resolve() == src_path.resolve():
            raise _identical_paths_error()
    except OSError as exc:
        if str(out_path) == str(src_path):
            raise _identical_paths_error() from exc

    # --- 2. inspect_media -> verify streams and duration ---

    # SR L-2: a FILE_NOT_FOUND from inspect_media is re-raised with a message
    # that contains the basename only (prevents full path exposure; same
    # policy as render._probe M-1). Other errors propagate unchanged.
    try:
        info = inspect_media(media)
    except ClipwrightError as exc:
        if exc.code == ErrorCode.FILE_NOT_FOUND:
            raise ClipwrightError(
                code=ErrorCode.FILE_NOT_FOUND,
                message=f"File not found: {src_path.name}",
                hint=exc.hint,
            ) from exc
        raise

    # Output must live in the same directory as media (DC-AS-001).
    # Checked after inspect_media, i.e. once the media has been probed.
    try:
        if src_path.resolve().parent != out_path.parent.resolve():
            raise ClipwrightError(
                code=ErrorCode.INVALID_INPUT,
                message=(
                    "The output timeline must be placed in the same"
                    f" directory as the input media (input: {src_path.name})."
                ),
                hint=(
                    "Change the output path to be in the same directory"
                    " as the media file."
                    " (e.g., output = same directory as media / timeline.otio)"
                ),
            )
    except ClipwrightError:
        raise
    except OSError:
        # resolve failure (network paths, etc.) is skipped on best-effort basis
        pass

    def _has_stream(kind: str) -> bool:
        return any(stream.codec_type == kind for stream in info.streams)

    # Both flags are computed before either is checked (DC-AS-002).
    has_video = _has_stream("video")
    has_audio = _has_stream("audio")

    if not has_video:
        raise ClipwrightError(
            code=ErrorCode.UNSUPPORTED_OPERATION,
            message=f"No video stream found: {src_path.name}",
            hint=(
                "This tool targets media with both video and audio streams. "
                "Specify a media file that contains video."
            ),
        )

    if not has_audio:
        raise ClipwrightError(
            code=ErrorCode.UNSUPPORTED_OPERATION,
            message=f"No audio stream found: {src_path.name}",
            hint=(
                "Silence detection requires an audio stream. "
                "Specify a media file that contains audio."
            ),
        )

    # Duration is required for timeouts, inversion and OTIO values (DC-AS-004)
    duration = info.duration
    if duration is None:
        raise ClipwrightError(
            code=ErrorCode.PROBE_FAILED,
            message=f"Could not retrieve media duration: {src_path.name}",
            hint=(
                "Check that the media file is not corrupted. "
                "You can also verify manually with ffprobe."
            ),
        )

    total_duration_sec = duration.value / duration.rate
    rate = duration.rate

    # --- 3. Run detection (branch by backend) ---

    abs_media = str(src_path.resolve())
    use_vad = options.backend == "vad"

    # speech_count is for VAD summary only; silence interval list is shared
    speech_count: int | None = None

    if use_vad:
        # VAD path: spawned via sys.executable -m, so resolve_tool is not
        # used (same venv as this process, VAD-AD-02).
        silence_intervals, speech_count = _detect_vad_silence_intervals(
            abs_media, options, total_duration_sec
        )
    else:
        # silencedetect path (backend="silencedetect")
        ffmpeg = resolve_tool("ffmpeg", "CLIPWRIGHT_FFMPEG")
        silence_intervals = _detect_silence_intervals(
            ffmpeg, abs_media, options, total_duration_sec
        )

    # --- 4. Derive KEEP intervals ---

    keep_ranges = derive_keep_ranges(total_duration_sec, silence_intervals, options)

    # --- 5. Build and save OTIO timeline ---

    timeline = new_timeline(src_path.name)
    video_track = timeline.tracks[0]  # V1 (Video) track

    for start_sec, end_sec in keep_ranges:
        # Seconds are converted to rate units: value = seconds * rate (DC-AS-003)
        source_range = TimeRangeModel(
            start_time=RationalTimeModel(value=start_sec * rate, rate=rate),
            duration=RationalTimeModel(value=(end_sec - start_sec) * rate, rate=rate),
        )
        add_clip(
            video_track,
            MediaRef(target_url=abs_media),
            source_range,
            name="keep",
            metadata={
                "tool": "clipwright-silence",
                "version": clipwright_silence.__version__,
                "kind": "keep",
                "backend": options.backend,  # VAD-AD-07
            },
        )

    save_timeline(timeline, output)

    # --- 6. Return envelope ---

    silence_count = len(silence_intervals)
    keep_count = len(keep_ranges)
    total_silence_seconds = sum(end - start for start, end in silence_intervals)
    total_keep_seconds = sum(end - start for start, end in keep_ranges)

    silence_fmt = _fmt_sec(total_silence_seconds)
    keep_fmt = _fmt_sec(total_keep_seconds)

    # The closing sentence is identical for both backends; only the leading
    # "Detected ..." part differs (VAD-AD-08, §7.5).
    generated_part = (
        f"Generated {out_path.name} with {keep_count} interval(s) to keep"
        f" (total {keep_fmt})."
    )
    if use_vad:
        # _detect_vad_silence_intervals always returns an int; the assert
        # only narrows the type for mypy.
        assert speech_count is not None  # Always set on the VAD path
        detected_part = (
            f"Detected {speech_count} speech interval(s). "
            f"Removed {silence_count} non-speech interval(s) (total {silence_fmt}). "
        )
    else:
        detected_part = (
            f"Detected {silence_count} silence interval(s) (total {silence_fmt})"
            f" from source with duration {_fmt_sec(total_duration_sec)}. "
        )
    summary = detected_part + generated_part

    warnings: list[str] = (
        [
            "No intervals to keep (all intervals classified as silence). "
            "The V1 track of the generated timeline.otio is empty. "
            "Passing it to render will result in INVALID_INPUT."
        ]
        if keep_count == 0
        else []
    )

    return ok_result(
        summary,
        data={
            "silence_count": silence_count,
            "total_silence_seconds": total_silence_seconds,
            "keep_count": keep_count,
            "total_keep_seconds": total_keep_seconds,
        },
        artifacts=[{"role": "timeline", "path": str(out_path), "format": "otio"}],
        warnings=warnings,
    )
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""plan.py — Pure logic for deriving KEEP intervals from silence intervals.
|
|
2
|
+
|
|
3
|
+
Does not execute ffmpeg at all. Performs interval arithmetic on float seconds
|
|
4
|
+
and delegates OTIO conversion to the detect layer (AD-2/AD-3 design policy).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from clipwright_silence.schemas import DetectSilenceOptions
|
|
10
|
+
|
|
11
|
+
# CR-Q-004: Floating-point comparison tolerance. Used to preserve boundary
# equality (DC-AM-001: min_keep equal preservation / _merge_intervals adjacent merging).
_EPSILON = 1e-9


def derive_keep_ranges(
    total_duration_sec: float,
    silence_intervals: list[tuple[float, float]],
    options: DetectSilenceOptions,
) -> list[tuple[float, float]]:
    """Derive KEEP intervals from a list of silence intervals.

    Processing flow (AD-3):
    1. Sort silence intervals by start time.
    2. Clamp each silence interval to [0, total_duration_sec] and invert
       [0, total_duration_sec] against them to get KEEP intervals.
       - Zero silence -> [(0.0, total_duration_sec)] as a single interval.
       - All silence  -> empty list.
    3. Extend each KEEP by padding and clamp to [0, total].
    4. Merge overlapping KEEPs (DC-GP-001 short-silence fill-in).
    5. Discard intervals shorter than min_keep_duration (DC-AM-001 opt-in).

    The clamp in step 2 guarantees that no KEEP interval leaves [0, total]
    even when padding is 0 and a detector reports a silence boundary slightly
    outside the media duration.

    Args:
        total_duration_sec: Total duration of the source media (seconds).
        silence_intervals: List of silence intervals. Each element is
            (start_sec, end_sec). Need not be sorted; may overlap.
        options: DetectSilenceOptions. Uses padding / min_keep_duration.
            (silence_threshold_db / min_silence_duration are the silencedetect
            layer's responsibility and are not referenced in this function.)

    Returns:
        List of KEEP intervals. Each element is a tuple[float, float]
        of (start_sec, end_sec).
        Sorted by time, non-overlapping.
    """
    total = total_duration_sec
    padding = options.padding
    min_keep = options.min_keep_duration

    # 1. Sort silence intervals.
    sorted_silence = sorted(silence_intervals, key=lambda iv: iv[0])

    # 2. Invert: subtract silence intervals from [0, total] to get KEEPs.
    keeps: list[tuple[float, float]] = []
    cursor = 0.0
    for s_start, s_end in sorted_silence:
        # Clamping is monotonic, so the sort order above is preserved.
        s_start = min(max(s_start, 0.0), total)
        s_end = min(max(s_end, 0.0), total)
        if s_start > cursor:
            keeps.append((cursor, s_start))
        # Advance cursor to end of silence (handles overlapping silences).
        cursor = max(cursor, s_end)
    # Trailing speech interval.
    if cursor < total:
        keeps.append((cursor, total))

    # 3. Padding extension + clamp.
    if padding > 0.0:
        keeps = [
            (max(0.0, start - padding), min(total, end + padding))
            for start, end in keeps
        ]

    # 4. Merge overlapping intervals (DC-GP-001).
    keeps = _merge_intervals(keeps)

    # 5. Discard intervals shorter than min_keep_duration (default 0.0 = no discard).
    if min_keep > 0.0:
        # DC-AM-001: Use _EPSILON to preserve intervals equal to min_keep
        keeps = [
            (start, end) for start, end in keeps if (end - start) >= min_keep - _EPSILON
        ]

    return keeps


def _merge_intervals(
    intervals: list[tuple[float, float]],
) -> list[tuple[float, float]]:
    """Merge overlapping intervals and return a sorted, non-overlapping list.

    Intervals do not need to be pre-sorted by start time (sorted internally).
    Intervals whose gap is at most _EPSILON are treated as adjacent and merged.

    Args:
        intervals: (start_sec, end_sec) tuples in any order.

    Returns:
        New list sorted by start time; empty when intervals is empty.
    """
    if not intervals:
        return []

    sorted_ivs = sorted(intervals, key=lambda iv: iv[0])
    merged: list[tuple[float, float]] = [sorted_ivs[0]]

    for start, end in sorted_ivs[1:]:
        prev_start, prev_end = merged[-1]
        if start <= prev_end + _EPSILON:
            # Overlapping or adjacent -> merge. max() covers an interval
            # fully contained in the previous one.
            merged[-1] = (prev_start, max(prev_end, end))
        else:
            merged.append((start, end))

    return merged
|
|
File without changes
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""schemas.py — clipwright-silence specific Pydantic schemas.
|
|
2
|
+
|
|
3
|
+
Common types (MediaRef / Artifact / ToolResult, etc.) are centrally defined
|
|
4
|
+
in clipwright.schemas and are not redefined in this module.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Annotated, Literal
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, Field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Shared description prefixes marking which backend a field applies to.
_SILENCEDETECT_ONLY = "silencedetect backend only. Use vad_* when using VAD. "
_VAD_ONLY = "VAD backend only. "


class DetectSilenceOptions(BaseModel):
    """Tunable options accepted by clipwright_detect_silence (AD-2/AD-3, DC-AM-001).

    The fields fall into three groups:

    * Detection (silencedetect backend): ``silence_threshold_db`` and
      ``min_silence_duration`` are handed straight to the ffmpeg
      silencedetect filter.
    * Post-processing: ``padding`` and ``min_keep_duration`` drive the KEEP
      derivation logic in plan.py.
    * VAD: ``backend`` selects the detector, and the ``vad_*`` fields only
      take effect when ``backend="vad"`` (VAD-AD-05).
    """

    # Upper-bounded at 0 dB; default -30 dB.
    silence_threshold_db: Annotated[
        float,
        Field(
            default=-30.0,
            le=0.0,
            description=(
                _SILENCEDETECT_ONLY
                + "Volume threshold (dB) for silence detection. Must be <= 0. "
                "Example: -30.0 dB (default), -40.0 dB (stricter detection)."
            ),
        ),
    ] = -30.0

    # Strictly positive; default 0.5 s.
    min_silence_duration: Annotated[
        float,
        Field(
            default=0.5,
            gt=0.0,
            description=(
                _SILENCEDETECT_ONLY
                + "Minimum duration (seconds) to consider as silence. Must be > 0. "
                "Silences shorter than this value are ignored. Default is 0.5 seconds."
            ),
        ),
    ] = 0.5

    # Non-negative; default 0.1 s.
    padding: Annotated[
        float,
        Field(
            default=0.1,
            ge=0.0,
            description=(
                "Padding width (seconds) to extend each KEEP interval on both sides."
                " Must be >= 0. If extension causes adjacent KEEPs to overlap,"
                " they are merged (prevents word cutoff). Default is 0.1 seconds."
            ),
        ),
    ] = 0.1

    # Non-negative; the 0.0 default disables the discard step.
    min_keep_duration: Annotated[
        float,
        Field(
            default=0.0,
            ge=0.0,
            description=(
                "Minimum interval length (seconds) to retain as KEEP. Must be >= 0."
                " KEEP intervals shorter than this value are discarded after"
                " padding and merging."
                " Default is 0.0 (no discard; DC-AM-001 opt-in guard)."
            ),
        ),
    ] = 0.0

    # Detector selection; "silencedetect" remains the default (VAD is opt-in).
    backend: Annotated[
        Literal["silencedetect", "vad"],
        Field(
            default="silencedetect",
            description=(
                "Detection backend to use. "
                '"silencedetect" (default) uses the ffmpeg silencedetect filter. '
                '"vad" uses Silero VAD (ONNX). VAD-AD-01 backward-compatible opt-in.'
            ),
        ),
    ] = "silencedetect"

    # Constrained to the closed range [0.0, 1.0]; default 0.5.
    vad_threshold: Annotated[
        float,
        Field(
            default=0.5,
            ge=0.0,
            le=1.0,
            description=(
                _VAD_ONLY
                + "Speech probability threshold (0.0-1.0)."
                " Values >= this are considered speech. Default is 0.5."
            ),
        ),
    ] = 0.5

    # Strictly positive; default 0.25 s.
    vad_min_speech_duration: Annotated[
        float,
        Field(
            default=0.25,
            gt=0.0,
            description=(
                _VAD_ONLY
                + "Minimum duration (seconds) to classify as speech. Must be > 0. "
                "Default is 0.25 seconds."
            ),
        ),
    ] = 0.25

    # Strictly positive; default 0.1 s.
    vad_min_silence_duration: Annotated[
        float,
        Field(
            default=0.1,
            gt=0.0,
            description=(
                _VAD_ONLY
                + "Minimum silence duration (seconds) between speech intervals."
                " Must be > 0. Silences shorter than this value are absorbed"
                " into speech intervals. Default is 0.1 seconds."
            ),
        ),
    ] = 0.1
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""server.py — clipwright-silence MCP server + CLI entry point.
|
|
2
|
+
|
|
3
|
+
A thin wrapper that delegates business logic to detect.py.
|
|
4
|
+
ClipwrightError conversion is handled on the detect.py side;
|
|
5
|
+
no double conversion is done here.
|
|
6
|
+
|
|
7
|
+
Transport defaults to stdio (mcp.run(transport="stdio")).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Annotated, Any
|
|
13
|
+
|
|
14
|
+
from mcp.server.fastmcp import FastMCP
|
|
15
|
+
from mcp.types import ToolAnnotations
|
|
16
|
+
from pydantic import Field
|
|
17
|
+
|
|
18
|
+
from clipwright_silence.detect import detect_silence
|
|
19
|
+
from clipwright_silence.schemas import DetectSilenceOptions
|
|
20
|
+
|
|
21
|
+
# FastMCP instance (server name)
mcp = FastMCP("clipwright-silence")


# ===========================================================================
# clipwright_detect_silence MCP tool
# ===========================================================================


@mcp.tool(
    annotations=ToolAnnotations(
        # NOTE(review): the tool writes a new OTIO file at `output`; the
        # read-only hint here refers to the input media being left untouched.
        # Confirm this matches the intended MCP annotation semantics.
        readOnlyHint=True,
        destructiveHint=False,
        idempotentHint=True,
        openWorldHint=False,
    )
)
def clipwright_detect_silence(
    media: Annotated[
        str,
        Field(description="Input media file path (source containing video and audio)."),
    ],
    output: Annotated[
        str,
        Field(description="Output OTIO timeline file path (.otio extension)."),
    ],
    options: Annotated[
        DetectSilenceOptions | None,
        Field(
            description=(
                "Silence detection options (silence_threshold_db / min_silence_duration"
                " / padding / min_keep_duration / backend / vad_threshold"
                " / vad_min_speech_duration / vad_min_silence_duration)."
                " All values use defaults when omitted."
            )
        ),
    ] = None,
) -> dict[str, Any]:
    """MCP tool: detect silence intervals and generate a KEEP interval OTIO timeline.

    Does not modify the input media file (non-destructive, readOnly).
    Output returns the path of the newly created timeline.otio in artifacts.

    This is a thin wrapper: all business logic lives in
    detect.detect_silence, and its return value is passed through unchanged.

    Args:
        media: Input media file path.
        output: Output OTIO timeline file path.
        options: Detection options. When None, a default-constructed
            DetectSilenceOptions() is used.

    Returns:
        The result dictionary produced by detect.detect_silence.
    """
    resolved_options = options if options is not None else DetectSilenceOptions()
    return detect_silence(
        media=media,
        output=output,
        options=resolved_options,
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ===========================================================================
|
|
74
|
+
# Entry point (MCP stdio launch / DC-GP-002)
|
|
75
|
+
# ===========================================================================
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def main() -> None:
    """Start the clipwright-silence MCP server on the stdio transport (DC-GP-002).

    This function is the console-script target declared in pyproject.toml
    under [project.scripts]:
        clipwright-silence = "clipwright_silence.server:main"
    """
    mcp.run(transport="stdio")


# Allow running the module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""vad_cli.py — Separate-process small CLI for the Silero VAD backend.
|
|
2
|
+
|
|
3
|
+
Not imported by the MCP server process (§2.4 subprocess loose coupling).
|
|
4
|
+
detect.py spawns this as a separate process via
|
|
5
|
+
sys.executable -m clipwright_silence.vad_cli.
|
|
6
|
+
|
|
7
|
+
CLI contract (§7.1 unified):
|
|
8
|
+
- main(argv) catches all exceptions at the top level, always writes stdout JSON,
|
|
9
|
+
and returns 0.
|
|
10
|
+
- Success: {"speech_segments": [[start_sec, end_sec], ...]}
|
|
11
|
+
- Error: {"error": {"code": str, "message": str, "hint": str}}
|
|
12
|
+
- stdout is JSON only. Logs and progress go to stderr.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import contextlib
|
|
19
|
+
import json
|
|
20
|
+
import math
|
|
21
|
+
import os
|
|
22
|
+
import sys
|
|
23
|
+
import tempfile
|
|
24
|
+
import wave
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from clipwright.errors import ClipwrightError, ErrorCode
|
|
28
|
+
from clipwright.process import resolve_tool, run
|
|
29
|
+
|
|
30
|
+
# Fixed sample rate (§7.3)
|
|
31
|
+
_SAMPLE_RATE = 16000
|
|
32
|
+
# pip install hint string
|
|
33
|
+
_VAD_INSTALL_HINT = (
|
|
34
|
+
"Install VAD dependencies with `pip install 'clipwright-silence[vad]'`."
|
|
35
|
+
)
|
|
36
|
+
# Sanitized generic message for SUBPROCESS_FAILED/TIMEOUT
|
|
37
|
+
# (SR M-1: prevents ffmpeg stderr leakage)
|
|
38
|
+
_SUBPROCESS_SAFE_MESSAGE = "internal subprocess failed"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _error_output(code: str, message: str, hint: str) -> None:
|
|
42
|
+
"""Write error JSON to stdout.
|
|
43
|
+
|
|
44
|
+
The caller must sanitize path information before passing it here.
|
|
45
|
+
"""
|
|
46
|
+
result: dict[str, Any] = {
|
|
47
|
+
"error": {
|
|
48
|
+
"code": code,
|
|
49
|
+
"message": message,
|
|
50
|
+
"hint": hint,
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
print(json.dumps(result, ensure_ascii=False), file=sys.stdout)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _extract_pcm(ffmpeg: str, media: str, output_path: str, timeout: float) -> None:
    """Write 16kHz mono s16le PCM to a temporary file using ffmpeg.

    Executed with shell=False and argument array only (subprocess discipline).

    Args:
        ffmpeg: Path or name of the ffmpeg executable.
        media: Input media file path.
        output_path: Destination file; overwritten if it exists (``-y``).
        timeout: Timeout (seconds) forwarded to ``run``.

    Errors are whatever ``run`` raises; nothing is caught here.
    """
    cmd = [
        ffmpeg,
        "-hide_banner",
        "-nostats",
        # Disable ffmpeg's interactive stdin handling so it cannot consume
        # bytes from an inherited stdin (the parent server speaks MCP over
        # stdio). NOTE(review): run() may already detach stdin; the flag is
        # harmless in that case.
        "-nostdin",
        "-i",
        media,
        "-vn",  # drop video; only the audio stream is needed
        "-acodec",
        "pcm_s16le",
        "-ar",
        str(_SAMPLE_RATE),
        "-ac",
        "1",
        "-y",
        output_path,
    ]
    run(cmd, timeout=timeout)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _load_audio_as_float32(
|
|
81
|
+
pcm_path: str,
|
|
82
|
+
) -> tuple[Any, int]:
|
|
83
|
+
"""Read a PCM WAV file and return a float32 numpy array and sample_rate.
|
|
84
|
+
|
|
85
|
+
Normalizes int16 -> float32 (/32768.0).
|
|
86
|
+
|
|
87
|
+
Precondition: This function must always be called via main().
|
|
88
|
+
main() lazily imports numpy and registers it in sys.modules, so the import
|
|
89
|
+
inside this function is effectively a cache lookup (NF-M-2: document precondition).
|
|
90
|
+
Calling directly from tests or utilities breaks the loose-coupling intent of CR L-2.
|
|
91
|
+
"""
|
|
92
|
+
import numpy as np # See docstring (cached in sys.modules by main())
|
|
93
|
+
|
|
94
|
+
with wave.open(pcm_path, "rb") as wf:
|
|
95
|
+
n_frames = wf.getnframes()
|
|
96
|
+
sample_rate = wf.getframerate()
|
|
97
|
+
raw = wf.readframes(n_frames)
|
|
98
|
+
|
|
99
|
+
audio_int16 = np.frombuffer(raw, dtype=np.int16)
|
|
100
|
+
audio_float32: Any = audio_int16.astype(np.float32) / 32768.0
|
|
101
|
+
return audio_float32, sample_rate
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def main(argv: list[str] | None = None) -> int:
    """VAD CLI entry point.

    Parses the command line, extracts audio from the media with ffmpeg into
    a temporary WAV file, runs Silero VAD on it and prints
    ``{"speech_segments": [[start_sec, end_sec], ...]}`` on stdout.

    Every failure, including argument errors and missing optional
    dependencies (numpy, silero_vad, onnxruntime), is reported as an error
    JSON document on stdout and the function still returns 0 (§7.1).
    stdout carries JSON only; argparse help text and tracebacks go to stderr.

    Args:
        argv: Command-line argument list. Uses sys.argv[1:] if None.

    Returns:
        Exit code (always 0).
    """
    # --- Parse arguments ---
    parser = argparse.ArgumentParser(
        description="Detect speech intervals using Silero VAD and output JSON to stdout."  # noqa: E501
    )
    parser.add_argument("--media", required=True, help="Input media file path")
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.5,
        help="Speech probability threshold (0.0-1.0, default: 0.5)",
    )
    parser.add_argument(
        "--min-speech",
        type=float,
        default=0.25,
        help="Minimum speech duration (seconds, default: 0.25)",
    )
    parser.add_argument(
        "--min-silence",
        type=float,
        default=0.1,
        help="Minimum silence duration between speech intervals (seconds, default: 0.1)",  # noqa: E501
    )
    parser.add_argument(
        "--media-duration",
        type=float,
        default=None,
        help=(
            "Total media duration (seconds). Used to calculate the inner"
            " ffmpeg timeout proportional to total duration (§7.7)."
            " Uses a safe default (60s) if omitted."
        ),
    )

    try:
        # argparse prints --help text to stdout before exiting; send it to
        # stderr instead so that stdout stays JSON-only.
        with contextlib.redirect_stdout(sys.stderr):
            args = parser.parse_args(argv)
    except SystemExit as exc:
        # Catch SystemExit from argparse (--help or missing required args)
        _error_output(
            code=ErrorCode.INVALID_INPUT,
            message=f"Argument parsing failed: exit code {exc.code}",
            hint="Specify --media <path> as a required argument.",
        )
        return 0

    media: str = args.media
    threshold: float = args.threshold
    min_speech_sec: float = args.min_speech
    min_silence_sec: float = args.min_silence
    media_duration: float | None = args.media_duration

    try:
        # --- Lazy import of the optional VAD dependencies (§2.4) ---
        # Kept out of the module top level so the server process never loads
        # them (CR L-2). numpy is imported here, inside the guarded region,
        # because it is only installed with the "vad" extra: a missing numpy
        # must produce error JSON, not an uncaught ImportError. The import
        # also caches numpy in sys.modules for _load_audio_as_float32.
        try:
            import numpy as np  # noqa: F401
            import silero_vad
        except ImportError:
            # SR L-2: str(exc) may contain internal paths; use fixed message.
            # The install hint covers every package of the "vad" extra.
            _error_output(
                code=ErrorCode.DEPENDENCY_MISSING,
                message="Failed to import silero_vad or onnxruntime",
                hint=_VAD_INSTALL_HINT,
            )
            return 0

        # --- Load Silero VAD model (before ffmpeg, to catch ImportError early) ---
        # load_silero_vad raises ImportError if onnxruntime is missing (§7.3)
        model = silero_vad.load_silero_vad(onnx=True)

        # --- Resolve ffmpeg (§7.2) ---
        ffmpeg = resolve_tool("ffmpeg", "CLIPWRIGHT_FFMPEG")

        # --- Extract 16kHz mono s16le PCM to a temp file using ffmpeg (§7.3) ---
        # Inner timeout must always be shorter than outer (§7.7)
        #   max(60, ceil(total*4)) for outer; proportional for inner
        #   When --media-duration given: max(30, ceil(total * 2))
        #   When omitted: safe default of 60 seconds
        if media_duration is not None:
            ffmpeg_timeout = float(max(30, math.ceil(media_duration * 2)))
        else:
            ffmpeg_timeout = 60.0

        audio_float32: Any
        sample_rate: int

        # Open with delete=False only to reserve a name; the handle is closed
        # before ffmpeg writes to the path, and the file is removed in the
        # finally block below (§7.3).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            tmp_path: str = tmp_file.name
        try:
            _extract_pcm(ffmpeg, media, tmp_path, timeout=ffmpeg_timeout)
            audio_float32, sample_rate = _load_audio_as_float32(tmp_path)
        finally:
            # Best-effort deletion even on exception (§7.3). An already
            # missing file raises FileNotFoundError, which is an OSError.
            with contextlib.suppress(OSError):
                os.unlink(tmp_path)

        # get_speech_timestamps returns sample-unit values
        # min_speech_duration_ms / min_silence_duration_ms are in milliseconds
        raw_segments: list[dict[str, Any]] = silero_vad.get_speech_timestamps(
            audio_float32,
            model,
            threshold=threshold,
            sampling_rate=sample_rate,
            min_speech_duration_ms=int(min_speech_sec * 1000),
            min_silence_duration_ms=int(min_silence_sec * 1000),
            return_seconds=False,
        )

        # Convert sample units -> seconds (built in ascending order)
        speech_segments: list[list[float]] = [
            [float(seg["start"]) / sample_rate, float(seg["end"]) / sample_rate]
            for seg in sorted(raw_segments, key=lambda s: s["start"])
        ]

        result: dict[str, Any] = {"speech_segments": speech_segments}
        print(json.dumps(result, ensure_ascii=False), file=sys.stdout)
        return 0

    except ClipwrightError as exc:
        # Catch ClipwrightError from core run() (SUBPROCESS_FAILED/TIMEOUT) and
        # resolve_tool DEPENDENCY_MISSING here (§7.1/§7.2)
        # SR M-1: SUBPROCESS_FAILED/TIMEOUT may embed ffmpeg stderr in message;
        # replace with generic message to prevent path leakage
        if exc.code in (ErrorCode.SUBPROCESS_FAILED, ErrorCode.SUBPROCESS_TIMEOUT):
            safe_message = f"{_SUBPROCESS_SAFE_MESSAGE} (code: {exc.code})"
        else:
            safe_message = exc.message
        # NOTE(review): the other call sites pass ErrorCode members directly
        # while this one passes str(exc.code); confirm both serialize to the
        # same wire value for the ErrorCode type in use.
        _error_output(
            code=str(exc.code),
            message=safe_message,
            hint=exc.hint,
        )
        return 0

    except ImportError:
        # ImportError propagated from load_silero_vad etc. when onnxruntime is missing
        # SR L-2: str(exc) may contain internal paths; use fixed message
        _error_output(
            code=ErrorCode.DEPENDENCY_MISSING,
            message="Failed to import silero_vad or onnxruntime",
            hint=_VAD_INSTALL_HINT,
        )
        return 0

    except Exception:
        # Catch all unexpected exceptions and return error JSON (§7.1)
        # SR NF-L-1: str(exc) may contain internal paths; use fixed message.
        # Debug details are stderr-only; do not leak into stdout JSON (MCP response).
        import traceback

        traceback.print_exc(file=sys.stderr)
        _error_output(
            code=ErrorCode.INTERNAL,
            message="An unexpected error occurred in VAD CLI",
            hint="Please report with reproduction steps.",
        )
        return 0


if __name__ == "__main__":
    sys.exit(main())
|