clipwright-transcribe 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,65 @@
1
+ Metadata-Version: 2.3
2
+ Name: clipwright-transcribe
3
+ Version: 0.1.1
4
+ Summary: MCP tool to transcribe audio/video files with whisper.cpp binary and generate SRT/VTT captions and OTIO timeline.
5
+ Author: satoh-y-0323
6
+ Author-email: satoh-y-0323 <shoma.papa.0323@gmail.com>
7
+ License: MIT
8
+ Requires-Dist: clipwright>=0.1.1
9
+ Requires-Dist: mcp[cli]>=1.27.2
10
+ Requires-Dist: opentimelineio>=0.18
11
+ Requires-Dist: pydantic>=2
12
+ Requires-Python: >=3.11
13
+ Description-Content-Type: text/markdown
14
+
15
+ # clipwright-transcribe
16
+
17
+ MCP tool to transcribe audio/video files and generate SRT/VTT captions and OTIO timeline.
18
+
19
+ ## External Binaries / Files
20
+
21
+ This tool requires the following external binaries/files to exist in the execution environment. **They are not installed via pip**, so obtain them separately.
22
+
23
+ ### whisper.cpp Binary
24
+
25
+ Used for transcription.
26
+
27
+ - Place `whisper-cli` (or the binary name appropriate for your environment) on PATH, or specify the full path in the `CLIPWRIGHT_WHISPER` environment variable.
28
+ - Obtain: Build from https://github.com/ggerganov/whisper.cpp, or use release binaries.
29
+
30
+ ```
31
+ export CLIPWRIGHT_WHISPER=/path/to/whisper-cli
32
+ ```
33
+
34
+ ### ggml Model File
35
+
36
+ Speech recognition model (`.bin` file) used by whisper.cpp.
37
+
38
+ - Specify the full path to the model file in the `CLIPWRIGHT_WHISPER_MODEL` environment variable. Can be overridden by the `model_path` parameter at tool invocation.
39
+ - Obtain: Download from https://huggingface.co/ggerganov/whisper.cpp etc.
40
+
41
+ ```
42
+ export CLIPWRIGHT_WHISPER_MODEL=/path/to/ggml-base.bin
43
+ ```
44
+
45
+ ### ffmpeg
46
+
47
+ Required to convert audio to 16kHz mono WAV (input format for whisper.cpp).
48
+
49
+ - Place `ffmpeg` on PATH, or specify the full path in the `CLIPWRIGHT_FFMPEG` environment variable.
50
+
51
+ ```
52
+ export CLIPWRIGHT_FFMPEG=/path/to/ffmpeg
53
+ ```
54
+
55
+ ## Environment Variables Summary
56
+
57
+ | Environment Variable | Purpose | Required |
58
+ |---|---|---|
59
+ | `CLIPWRIGHT_WHISPER` | Path to whisper.cpp binary (required if not on PATH) | Conditional |
60
+ | `CLIPWRIGHT_WHISPER_MODEL` | Path to ggml model file (`model_path` parameter takes precedence) | Conditional |
61
+ | `CLIPWRIGHT_FFMPEG` | Path to ffmpeg binary (required if not on PATH) | Conditional |
62
+
63
+ ## MCP Tool
64
+
65
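+ `clipwright_transcribe(media, output, options?)` — Transcribe an audio/video file. `output` must be a `.otio` path in the same directory as `media`; the tool writes the OTIO timeline there and places `.srt` and `.vtt` caption files next to it with the same base name.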
@@ -0,0 +1,51 @@
1
+ # clipwright-transcribe
2
+
3
+ MCP tool to transcribe audio/video files and generate SRT/VTT captions and OTIO timeline.
4
+
5
+ ## External Binaries / Files
6
+
7
+ This tool requires the following external binaries/files to exist in the execution environment. **They are not installed via pip**, so obtain them separately.
8
+
9
+ ### whisper.cpp Binary
10
+
11
+ Used for transcription.
12
+
13
+ - Place `whisper-cli` (or the binary name appropriate for your environment) on PATH, or specify the full path in the `CLIPWRIGHT_WHISPER` environment variable.
14
+ - Obtain: Build from https://github.com/ggerganov/whisper.cpp, or use release binaries.
15
+
16
+ ```
17
+ export CLIPWRIGHT_WHISPER=/path/to/whisper-cli
18
+ ```
19
+
20
+ ### ggml Model File
21
+
22
+ Speech recognition model (`.bin` file) used by whisper.cpp.
23
+
24
+ - Specify the full path to the model file in the `CLIPWRIGHT_WHISPER_MODEL` environment variable. Can be overridden by the `model_path` parameter at tool invocation.
25
+ - Obtain: Download from https://huggingface.co/ggerganov/whisper.cpp etc.
26
+
27
+ ```
28
+ export CLIPWRIGHT_WHISPER_MODEL=/path/to/ggml-base.bin
29
+ ```
30
+
31
+ ### ffmpeg
32
+
33
+ Required to convert audio to 16kHz mono WAV (input format for whisper.cpp).
34
+
35
+ - Place `ffmpeg` on PATH, or specify the full path in the `CLIPWRIGHT_FFMPEG` environment variable.
36
+
37
+ ```
38
+ export CLIPWRIGHT_FFMPEG=/path/to/ffmpeg
39
+ ```
40
+
41
+ ## Environment Variables Summary
42
+
43
+ | Environment Variable | Purpose | Required |
44
+ |---|---|---|
45
+ | `CLIPWRIGHT_WHISPER` | Path to whisper.cpp binary (required if not on PATH) | Conditional |
46
+ | `CLIPWRIGHT_WHISPER_MODEL` | Path to ggml model file (`model_path` parameter takes precedence) | Conditional |
47
+ | `CLIPWRIGHT_FFMPEG` | Path to ffmpeg binary (required if not on PATH) | Conditional |
48
+
49
+ ## MCP Tool
50
+
51
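+ `clipwright_transcribe(media, output, options?)` — Transcribe an audio/video file. `output` must be a `.otio` path in the same directory as `media`; the tool writes the OTIO timeline there and places `.srt` and `.vtt` caption files next to it with the same base name.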
@@ -0,0 +1,84 @@
1
+ [project]
2
+ name = "clipwright-transcribe"
3
+ version = "0.1.1"
4
+ description = "MCP tool to transcribe audio/video files with whisper.cpp binary and generate SRT/VTT captions and OTIO timeline."
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ authors = [
8
+ { name = "satoh-y-0323", email = "shoma.papa.0323@gmail.com" }
9
+ ]
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "clipwright>=0.1.1",
13
+ "mcp[cli]>=1.27.2",
14
+ "opentimelineio>=0.18",
15
+ "pydantic>=2",
16
+ ]
17
+
18
+ [project.scripts]
19
+ clipwright-transcribe = "clipwright_transcribe.server:main"
20
+
21
+ [build-system]
22
+ requires = ["uv_build>=0.11.19,<0.12.0"]
23
+ build-backend = "uv_build"
24
+
25
+ [dependency-groups]
26
+ dev = [
27
+ "mypy>=2.1.0",
28
+ "pytest>=9.0.3",
29
+ "pytest-cov>=7.1.0",
30
+ "pytest-mock>=3.15.1",
31
+ "ruff>=0.15.16",
32
+ ]
33
+
34
+ # Resolve clipwright (core) within workspace by path reference
35
+ [tool.uv.sources]
36
+ clipwright = { workspace = true }
37
+
38
+ # --- Ruff ---
39
+ [tool.ruff]
40
+ target-version = "py311"
41
+ line-length = 88
42
+
43
+ [tool.ruff.lint]
44
+ select = ["E", "F", "W", "I", "UP", "B", "C4", "SIM"]
45
+ ignore = []
46
+
47
+ [tool.ruff.lint.per-file-ignores]
48
+ # Allow E501 for English docstrings/comments in test files
49
+ "tests/*.py" = ["E501"]
50
+
51
+ [tool.ruff.format]
52
+ # Default ruff formatter is OK
53
+
54
+ # --- mypy ---
55
+ [tool.mypy]
56
+ python_version = "3.11"
57
+ strict = true
58
+ warn_return_any = true
59
+ warn_unused_configs = true
60
+ disallow_untyped_defs = true
61
+ disallow_any_generics = true
62
+
63
+ # opentimelineio has no stubs, ignored with mypy strict
64
+ [[tool.mypy.overrides]]
65
+ module = "opentimelineio.*"
66
+ ignore_missing_imports = true
67
+
68
+ # --- pytest ---
69
+ [tool.pytest.ini_options]
70
+ testpaths = ["tests"]
71
+ addopts = "--strict-markers -q"
72
+ markers = [
73
+ "integration: integration test requiring actual ffmpeg/ffprobe binaries",
74
+ "slow: test with long execution time",
75
+ ]
76
+
77
+ # --- coverage ---
78
+ [tool.coverage.run]
79
+ source = ["clipwright_transcribe"]
80
+ omit = ["tests/*"]
81
+
82
+ [tool.coverage.report]
83
+ show_missing = true
84
+ skip_covered = false
@@ -0,0 +1 @@
1
+ __version__ = "0.1.1"
@@ -0,0 +1,156 @@
1
+ """captions.py — clipwright-transcribe pure logic layer (mirrors plan.py structure).
2
+
3
+ Converts whisper.cpp `-oj` JSON (transcription[].offsets.from/to in ms, text) into
4
+ normalised segments and generates SRT/VTT strings.
5
+
6
+ Design decisions:
7
+ - Pure functions; no external processes are executed (target: 100% contract coverage).
8
+ - SRT/VTT timecodes are derived from the same second value to guarantee consistency
9
+ (DC-AS-005). Only the separator differs (SRT="HH:MM:SS,mmm" / VTT="HH:MM:SS.mmm").
10
+ - When segments is empty, to_srt returns an empty string and to_vtt returns only the
11
+ "WEBVTT" header (DC-GP-002).
12
+ - Defensive handling of whisper output: removes entries with empty text, degenerate
13
+ intervals (start>=end), or missing keys.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import Any, TypedDict
19
+
20
+
21
class Segment(TypedDict):
    """One caption cue after normalisation.

    Both times are floats expressed in seconds; ``text`` carries no leading or
    trailing whitespace.
    """

    start_sec: float  # cue start, in seconds
    end_sec: float  # cue end, in seconds
    text: str  # caption text, already stripped
31
+
32
+
33
def normalize_segments(whisper_json: dict[str, Any]) -> list[Segment]:
    """Convert a whisper `-oj` JSON dict into normalised segments.

    Reads ``transcription[].offsets.from/to`` (milliseconds), converts them to
    seconds and strips whitespace from ``text``.
    Defensive cleanup (DC-GP-002 supplement) drops an entry when:
    - the entry or its ``offsets`` value is not a dict
    - the ``from`` / ``to`` / ``text`` keys are missing
    - ``from`` / ``to`` cannot be converted to float
    - an offset is NaN or infinite, or the start is negative
    - ``text`` is null, empty or whitespace-only
    - the interval is degenerate (start_sec >= end_sec)

    Returns an empty list when the ``transcription`` key is absent or not a list.

    Args:
        whisper_json: dict loaded from a whisper.cpp `-oj` JSON file.

    Returns:
        List of normalised Segment objects, in input order.
    """
    transcription = whisper_json.get("transcription")
    if not isinstance(transcription, list):
        return []

    segments: list[Segment] = []
    for entry in transcription:
        if not isinstance(entry, dict):
            continue

        offsets = entry.get("offsets")
        if not isinstance(offsets, dict):
            continue
        if "from" not in offsets or "to" not in offsets:
            continue

        # Covers both a missing key and a JSON null; str(None) would otherwise
        # turn a null into the literal caption text "None".
        raw_text = entry.get("text")
        if raw_text is None:
            continue

        try:
            start_sec = float(offsets["from"]) / 1000.0
            end_sec = float(offsets["to"]) / 1000.0
        except (TypeError, ValueError, OverflowError):
            # OverflowError: an integer too large to be represented as a float.
            continue

        # One chained comparison rejects negative starts, degenerate intervals
        # (start >= end) and non-finite values: every comparison involving NaN is
        # False, and an infinite end fails the final bound.
        if not (0.0 <= start_sec < end_sec < float("inf")):
            continue

        text = str(raw_text).strip()
        if not text:
            continue

        segments.append({"start_sec": start_sec, "end_sec": end_sec, "text": text})

    return segments
87
+
88
+
89
+ def _format_timecode(total_seconds: float, *, ms_separator: str) -> str:
90
+ """Format seconds as "HH:MM:SS{sep}mmm" timecode.
91
+
92
+ The ms_separator switches between SRT (",") and VTT (".").
93
+ Both formats share the same second and millisecond values for consistency
94
+ (DC-AS-005).
95
+ Milliseconds are computed with round-half-up (round → int conversion).
96
+
97
+ Args:
98
+ total_seconds: Duration in seconds.
99
+ ms_separator: Separator between seconds and milliseconds ("," or ".").
100
+
101
+ Returns:
102
+ Formatted timecode string.
103
+ """
104
+ total_ms = int(round(total_seconds * 1000.0))
105
+ hours, rem_ms = divmod(total_ms, 3_600_000)
106
+ minutes, rem_ms = divmod(rem_ms, 60_000)
107
+ seconds, milliseconds = divmod(rem_ms, 1000)
108
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d}{ms_separator}{milliseconds:03d}"
109
+
110
+
111
def to_srt(segments: list[Segment]) -> str:
    """Render normalised segments as SubRip (SRT) text.

    Cues are numbered from 1, use "HH:MM:SS,mmm" timecodes and are separated by a
    blank line. An empty segment list yields an empty string (DC-GP-002).

    Args:
        segments: List of normalised Segment objects.

    Returns:
        SRT-formatted string.
    """

    def render(number: int, cue: Segment) -> str:
        # One SRT cue: index line, timing line, text line.
        begin = _format_timecode(cue["start_sec"], ms_separator=",")
        finish = _format_timecode(cue["end_sec"], ms_separator=",")
        return f"{number}\n{begin} --> {finish}\n{cue['text']}\n"

    # Joining zero cues produces "", which is the documented empty-input result.
    return "\n".join(
        render(number, cue) for number, cue in enumerate(segments, start=1)
    )
133
+
134
+
135
def to_vtt(segments: list[Segment]) -> str:
    """Render normalised segments as WebVTT text.

    Output starts with the "WEBVTT" header and uses "HH:MM:SS.mmm" timecodes (dot
    separator). An empty segment list yields the header alone (DC-GP-002).

    Args:
        segments: List of normalised Segment objects.

    Returns:
        WebVTT-formatted string.
    """

    def render(cue: Segment) -> str:
        # One WebVTT cue: timing line followed by the text line.
        begin = _format_timecode(cue["start_sec"], ms_separator=".")
        finish = _format_timecode(cue["end_sec"], ms_separator=".")
        return f"{begin} --> {finish}\n{cue['text']}\n"

    # With no cues the join returns the header element unchanged: "WEBVTT\n".
    return "\n".join(["WEBVTT\n", *(render(cue) for cue in segments)])
@@ -0,0 +1,63 @@
1
+ """schemas.py — clipwright-transcribe-specific Pydantic schemas.
2
+
3
+ Common types (MediaRef / Artifact / ToolResult, etc.) are defined in clipwright.schemas;
4
+ this module does not redefine them.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Annotated
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+
14
class TranscribeOptions(BaseModel):
    """Optional settings accepted by clipwright_transcribe (TR-AD-06).

    Every field defaults to None:
    - language: whisper language code; None requests auto-detection.
    - model_path: ggml model file path; None falls back to the
      CLIPWRIGHT_WHISPER_MODEL environment variable.
    - initial_prompt: context hint intended to improve recognition accuracy.
    """

    # Two or more ASCII letters (or "auto"), at most 10 characters.
    language: str | None = Field(
        default=None,
        max_length=10,
        pattern=r"^[a-zA-Z]{2,}$|^auto$",
        description=(
            'Transcription language code (e.g. "ja", "en"). '
            "None (default) lets whisper auto-detect the language. "
            "ISO 639-1 compatible: 2 or more ASCII letters, or 'auto'. "
            "Anything else is rejected."
        ),
    )

    # Only the length is validated here; existence is checked at resolution time.
    model_path: str | None = Field(
        default=None,
        max_length=4096,
        description=(
            "Path to the whisper.cpp ggml model file"
            " (max 4096 chars, equivalent to OS path length limit)."
            " None (default) uses the CLIPWRIGHT_WHISPER_MODEL"
            " environment variable."
            " If neither is set or the file does not exist, an error is raised."
        ),
    )

    # Free-form hint, capped at 2048 characters.
    initial_prompt: str | None = Field(
        default=None,
        max_length=2048,
        description=(
            "Context hint passed to whisper (proper nouns, technical terms, etc.)."
            " None (default) means no prompt. Used to tune recognition accuracy."
            " Maximum 2048 characters (equivalent to whisper.cpp context length)."
        ),
    )
@@ -0,0 +1,90 @@
1
+ """server.py — clipwright-transcribe MCP server + CLI entry point.
2
+
3
+ Thin wrapper that delegates business logic to transcribe.py.
4
+ ClipwrightError conversion is handled in transcribe.py; no double conversion here.
5
+
6
+ Transport defaults to stdio (mcp.run(transport="stdio")).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Annotated, Any
12
+
13
+ from mcp.server.fastmcp import FastMCP
14
+ from mcp.types import ToolAnnotations
15
+ from pydantic import Field
16
+
17
+ from clipwright_transcribe.schemas import TranscribeOptions
18
+ from clipwright_transcribe.transcribe import transcribe_media
19
+
20
+ # FastMCP instance (server name)
21
+ mcp = FastMCP("clipwright-transcribe")
22
+
23
+
24
+ # ===========================================================================
25
+ # clipwright_transcribe MCP tool
26
+ # ===========================================================================
27
+
28
+
29
@mcp.tool(
    annotations=ToolAnnotations(
        # The tool creates <output>.otio / .srt / .vtt on disk, so it modifies its
        # environment and must not be advertised as read-only. Only the *input*
        # media is left untouched.
        readOnlyHint=False,
        destructiveHint=False,
        idempotentHint=True,
        openWorldHint=False,
    )
)
def clipwright_transcribe(
    media: Annotated[
        str,
        Field(
            description="Input media file path (must contain audio; video is optional)."
        ),
    ],
    output: Annotated[
        str,
        Field(description="Output OTIO timeline file path (.otio extension required)."),
    ],
    options: Annotated[
        TranscribeOptions | None,
        Field(
            description=(
                "Transcription options (language / model_path / initial_prompt). "
                "When omitted, all fields use their defaults "
                "(auto language detection, model from env)."
            )
        ),
    ] = None,
) -> dict[str, Any]:
    """MCP tool: transcribe audio and produce SRT/VTT captions and an OTIO timeline.

    Non-destructive: the input media file is never modified.
    Outputs are written as files; their paths are returned in artifacts.

    Business logic is delegated to transcribe.transcribe_media.
    When options is None, default TranscribeOptions() is used.
    """
    resolved_options = options if options is not None else TranscribeOptions()
    return transcribe_media(
        media=media,
        output=output,
        options=resolved_options,
    )
73
+
74
+
75
+ # ===========================================================================
76
+ # Entry point (MCP stdio)
77
+ # ===========================================================================
78
+
79
+
80
def main() -> None:
    """Run the MCP server on the stdio transport.

    This is the console-script target declared under [project.scripts] in
    pyproject.toml:
        clipwright-transcribe = "clipwright_transcribe.server:main"
    """
    mcp.run(transport="stdio")
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
@@ -0,0 +1,483 @@
1
+ """transcribe.py — clipwright-transcribe orchestration layer (mirrors detect.py).
2
+
3
+ Flow: input validation -> inspect_media -> model resolution -> ffmpeg WAV extraction ->
4
+ whisper-cli execution -> SRT/VTT generation via captions -> OTIO construction/save ->
5
+ envelope return.
6
+
7
+ Design decisions:
8
+ - _run_whisper() is the single adapter function (TR-AD-01) that encapsulates ffmpeg WAV
9
+ extraction, whisper-cli invocation, and JSON loading. To swap backends
10
+ (e.g. faster-whisper), replace only this function.
11
+ - The whisper binary name and language auto-detect flag are isolated as module constants
12
+ (spike-whisper confirmed values, replaceable via e2e; DC-AS-003/DC-AM-002).
13
+ - Model resolution uses os.path.isfile rather than resolve_tool (the model is not an
14
+ executable; DC-AS-003). Resolution order: options.model_path -> env
15
+ CLIPWRIGHT_WHISPER_MODEL.
16
+ - marker.marked_range uses whisper second values (media coordinates) directly.
17
+ Coordinates match because the clip is full-length with source_range.start_time=0
18
+ (DC-AM-001).
19
+ - SRT/VTT timecodes and marker second values share the same origin (DC-AS-005).
20
+ - Error messages expose only basename; raw whisper/ffmpeg stderr fragments are replaced
21
+ with a sanitised generic message (TR-AD-09, following VAD M-1 precedent).
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ import math
28
+ import os
29
+ import tempfile
30
+ from pathlib import Path
31
+ from typing import Any
32
+
33
+ from clipwright.envelope import error_result, ok_result
34
+ from clipwright.errors import ClipwrightError, ErrorCode
35
+ from clipwright.media import inspect_media
36
+ from clipwright.otio_utils import add_clip, add_marker, new_timeline, save_timeline
37
+ from clipwright.process import resolve_tool, run, safe_subprocess_message
38
+ from clipwright.schemas import MediaRef, RationalTimeModel, TimeRangeModel
39
+
40
+ import clipwright_transcribe
41
+ from clipwright_transcribe.captions import Segment, normalize_segments, to_srt, to_vtt
42
+ from clipwright_transcribe.schemas import TranscribeOptions
43
+
44
+ # whisper.cpp executable name (spike-whisper confirmed; latest = whisper-cli,
45
+ # legacy = main).
46
+ # Must match the binary name pointed to by env CLIPWRIGHT_WHISPER (DC-AS-003-R).
47
+ # Verified by e2e test.
48
+ WHISPER_BINARY_NAME = "whisper-cli"
49
+ # Language auto-detect flag (spike confirmed; replaceable via e2e as a list; DC-AM-002)
50
+ LANG_AUTO_FLAG: list[str] = ["-l", "auto"]
51
+ # Maximum display length for marker name (excess is truncated; full text kept in
52
+ # metadata.text; DC-GP-003).
53
+ _MARKER_NAME_MAX = 40
54
+ # Hint shown when the whisper model cannot be resolved (actionable; TR-AD-05).
55
+ _MODEL_MISSING_HINT = (
56
+ "Specify the ggml model file path via options.model_path or "
57
+ "set the CLIPWRIGHT_WHISPER_MODEL environment variable. "
58
+ "Download the model from the whisper.cpp distribution (e.g. ggml-base.bin)."
59
+ )
60
+
61
+
62
+ def _fmt_sec(sec: float) -> str:
63
+ """Convert seconds to a human-readable "Xm Ys" string (used in summary)."""
64
+ m = int(sec) // 60
65
+ s = sec - m * 60
66
+ return f"{m}m{s:.1f}s" if m > 0 else f"{s:.1f}s"
67
+
68
+
69
+ def _truncate_name(text: str) -> str:
70
+ """Truncate text to the first _MARKER_NAME_MAX characters for use as a marker name
71
+ (DC-GP-003).
72
+
73
+ Appends an ellipsis "…" when truncated. The full text is preserved in metadata.text.
74
+ """
75
+ if len(text) <= _MARKER_NAME_MAX:
76
+ return text
77
+ return text[:_MARKER_NAME_MAX] + "…"
78
+
79
+
80
+ def _sanitize_subprocess_error(exc: ClipwrightError) -> ClipwrightError:
81
+ """Replace SUBPROCESS_FAILED/TIMEOUT message with a generic string (TR-AD-09).
82
+
83
+ run() messages may contain stderr fragments and executable paths; this function
84
+ substitutes a fixed string to prevent leakage into MCP responses. hint is
85
+ preserved. Other error codes are returned unchanged.
86
+ """
87
+ if exc.code in (ErrorCode.SUBPROCESS_FAILED, ErrorCode.SUBPROCESS_TIMEOUT):
88
+ return ClipwrightError(
89
+ code=exc.code,
90
+ message=safe_subprocess_message(exc),
91
+ hint=exc.hint,
92
+ )
93
+ return exc
94
+
95
+
96
+ def _resolve_model_path(options: TranscribeOptions) -> str:
97
+ """Resolve the whisper model file path (DC-AS-003).
98
+
99
+ Resolution order: options.model_path -> env CLIPWRIGHT_WHISPER_MODEL.
100
+ Uses os.path.isfile rather than resolve_tool (the model is not an executable).
101
+ Raises DEPENDENCY_MISSING when neither candidate exists.
102
+
103
+ Args:
104
+ options: TranscribeOptions (model_path field is inspected).
105
+
106
+ Returns:
107
+ Absolute or relative path to an existing model file.
108
+
109
+ Raises:
110
+ ClipwrightError: When the model file cannot be found (DEPENDENCY_MISSING).
111
+ """
112
+ candidates: list[str] = []
113
+ if options.model_path is not None:
114
+ candidates.append(options.model_path)
115
+ env_model = os.environ.get("CLIPWRIGHT_WHISPER_MODEL")
116
+ if env_model is not None:
117
+ candidates.append(env_model)
118
+
119
+ for candidate in candidates:
120
+ if os.path.isfile(candidate):
121
+ return candidate
122
+
123
+ raise ClipwrightError(
124
+ code=ErrorCode.DEPENDENCY_MISSING,
125
+ message="whisper model file not found",
126
+ hint=_MODEL_MISSING_HINT,
127
+ )
128
+
129
+
130
+ def _extract_wav(ffmpeg: str, media: str, output_path: str, timeout: float) -> None:
131
+ """Extract a 16 kHz mono s16le WAV to a temporary file using ffmpeg (TR-AD-01).
132
+
133
+ whisper.cpp requires 16 kHz mono WAV; this conversion satisfies that requirement.
134
+ Executed with shell=False and an argument list (subprocess discipline).
135
+ """
136
+ cmd = [
137
+ ffmpeg,
138
+ "-hide_banner",
139
+ "-nostats",
140
+ "-i",
141
+ media,
142
+ "-vn",
143
+ "-acodec",
144
+ "pcm_s16le",
145
+ "-ar",
146
+ "16000",
147
+ "-ac",
148
+ "1",
149
+ "-y",
150
+ output_path,
151
+ ]
152
+ run(cmd, timeout=timeout)
153
+
154
+
155
+ def _build_whisper_cmd(
156
+ whisper: str,
157
+ model_path: str,
158
+ wav_path: str,
159
+ prefix: str,
160
+ options: TranscribeOptions,
161
+ ) -> list[str]:
162
+ """Build the whisper-cli argument list (TR-AD-01, DC-AM-002/003).
163
+
164
+ `-oj` writes JSON to `<prefix>.json`. Language None uses auto-detection
165
+ (LANG_AUTO_FLAG); an explicit code uses `-l <code>`. initial_prompt is passed
166
+ via `--prompt`.
167
+ """
168
+ cmd = [whisper, "-m", model_path, "-f", wav_path, "-oj", "-of", prefix]
169
+ if options.language is None:
170
+ cmd.extend(LANG_AUTO_FLAG)
171
+ else:
172
+ cmd.extend(["-l", options.language])
173
+ if options.initial_prompt is not None:
174
+ cmd.extend(["--prompt", options.initial_prompt])
175
+ return cmd
176
+
177
+
178
+ def _run_whisper(
179
+ media: str,
180
+ options: TranscribeOptions,
181
+ total_duration_sec: float,
182
+ model_path: str,
183
+ ) -> tuple[list[Segment], str | None]:
184
+ """Single adapter: ffmpeg WAV extraction -> whisper-cli -> JSON normalisation
185
+ (TR-AD-01).
186
+
187
+ Replace only this function to swap the backend (e.g. faster-whisper).
188
+ WAV and JSON are written to a temporary directory so the source media directory
189
+ is not polluted.
190
+
191
+ Args:
192
+ media: Absolute path to the input media file.
193
+ options: TranscribeOptions.
194
+ total_duration_sec: Total duration of the media (seconds); used to compute
195
+ timeouts.
196
+ model_path: Resolved model file path.
197
+
198
+ Returns:
199
+ Tuple of (normalised Segment list, detected language code or None).
200
+
201
+ Raises:
202
+ ClipwrightError: DEPENDENCY_MISSING (missing tool), sanitised
203
+ SUBPROCESS_FAILED/SUBPROCESS_TIMEOUT, or SUBPROCESS_FAILED on JSON parse
204
+ failure.
205
+ """
206
+ ffmpeg = resolve_tool("ffmpeg", "CLIPWRIGHT_FFMPEG")
207
+ whisper = resolve_tool(WHISPER_BINARY_NAME, "CLIPWRIGHT_WHISPER")
208
+
209
+ # Timeouts scale with duration; whisper is computationally expensive (TR-AD-10).
210
+ ffmpeg_timeout = float(max(60, math.ceil(total_duration_sec * 2)))
211
+ whisper_timeout = float(max(300, math.ceil(total_duration_sec * 30)))
212
+
213
+ with tempfile.TemporaryDirectory() as tmpdir:
214
+ wav_path = os.path.join(tmpdir, "audio.wav")
215
+ prefix = os.path.join(tmpdir, "transcript")
216
+
217
+ try:
218
+ _extract_wav(ffmpeg, media, wav_path, ffmpeg_timeout)
219
+ except ClipwrightError as exc:
220
+ raise _sanitize_subprocess_error(exc) from None
221
+
222
+ cmd = _build_whisper_cmd(whisper, model_path, wav_path, prefix, options)
223
+ try:
224
+ run(cmd, timeout=whisper_timeout)
225
+ except ClipwrightError as exc:
226
+ raise _sanitize_subprocess_error(exc) from None
227
+
228
+ # whisper `-oj -of <prefix>` produces <prefix>.json (DC-AM-003).
229
+ json_path = prefix + ".json"
230
+ try:
231
+ with open(json_path, encoding="utf-8") as f:
232
+ whisper_json: dict[str, Any] = json.load(f)
233
+ except (OSError, json.JSONDecodeError):
234
+ raise ClipwrightError(
235
+ code=ErrorCode.SUBPROCESS_FAILED,
236
+ message="failed to read whisper output JSON",
237
+ hint=(
238
+ "Check the whisper.cpp version and arguments. "
239
+ "Please report with reproduction steps."
240
+ ),
241
+ ) from None
242
+
243
+ # Complete JSON loading and normalisation inside the with block so that the
244
+ # temporary directory still exists while data is accessed (CR M-2).
245
+ segments = normalize_segments(whisper_json)
246
+ result = whisper_json.get("result")
247
+ language = result.get("language") if isinstance(result, dict) else None
248
+
249
+ return segments, language
250
+
251
+
252
+ def transcribe_media(
253
+ media: str,
254
+ output: str,
255
+ options: TranscribeOptions,
256
+ ) -> dict[str, Any]:
257
+ """Transcribe audio and produce SRT/VTT captions and an OTIO timeline (TR-AD-04).
258
+
259
+ Non-destructive: the input media file is never modified.
260
+ Outputs are newly created files; their paths are returned in artifacts.
261
+
262
+ Args:
263
+ media: Input media file path (audio required; video optional).
264
+ output: Output timeline.otio file path (must be in the same directory as media).
265
+ options: TranscribeOptions.
266
+
267
+ Returns:
268
+ ok_result or error_result envelope dict.
269
+ """
270
+ try:
271
+ return _transcribe_inner(media, output, options)
272
+ except ClipwrightError as exc:
273
+ return error_result(exc.code, exc.message, exc.hint)
274
+
275
+
276
+ def _transcribe_inner(
277
+ media: str,
278
+ output: str,
279
+ options: TranscribeOptions,
280
+ ) -> dict[str, Any]:
281
+ """Internal implementation of transcribe_media. Raises ClipwrightError directly."""
282
+ output_path = Path(output)
283
+ media_path = Path(media)
284
+
285
+ # --- 1. Output validation ---
286
+
287
+ if output_path.suffix.lower() != ".otio":
288
+ raise ClipwrightError(
289
+ code=ErrorCode.INVALID_INPUT,
290
+ message=(
291
+ f"Invalid output file extension: {output_path.suffix!r}. "
292
+ "Only .otio is allowed."
293
+ ),
294
+ hint="Change the output file path extension to .otio.",
295
+ )
296
+
297
+ if not output_path.parent.exists():
298
+ raise ClipwrightError(
299
+ code=ErrorCode.INVALID_INPUT,
300
+ message=(
301
+ "Output directory does not exist. "
302
+ "Check the parent directory of the specified output path."
303
+ ),
304
+ hint="Create the output directory before running again.",
305
+ )
306
+
307
+ try:
308
+ if output_path.resolve() == media_path.resolve():
309
+ raise ClipwrightError(
310
+ code=ErrorCode.INVALID_INPUT,
311
+ message="Output path and input media path are identical.",
312
+ hint="Change the output file path to differ from the input media.",
313
+ )
314
+ except OSError as exc:
315
+ if str(output_path) == str(media_path):
316
+ raise ClipwrightError(
317
+ code=ErrorCode.INVALID_INPUT,
318
+ message="Output path and input media path are identical.",
319
+ hint="Change the output file path to differ from the input media.",
320
+ ) from exc
321
+
322
+ # --- 2. inspect_media -> stream and duration check ---
323
+
324
+ # Replace FILE_NOT_FOUND message with basename only (TR-AD-09; no full path
325
+ # exposure).
326
+ try:
327
+ media_info = inspect_media(media)
328
+ except ClipwrightError as exc:
329
+ if exc.code == ErrorCode.FILE_NOT_FOUND:
330
+ raise ClipwrightError(
331
+ code=ErrorCode.FILE_NOT_FOUND,
332
+ message=f"File not found: {media_path.name}",
333
+ hint=exc.hint,
334
+ ) from exc
335
+ raise
336
+
337
+ # Verify that output is in the same directory as media (TR-AD-08).
338
+ # ClipwrightError propagates; OSError is skipped best-effort.
339
+ try:
340
+ media_dir = media_path.resolve().parent
341
+ output_dir = output_path.parent.resolve()
342
+ if media_dir != output_dir:
343
+ raise ClipwrightError(
344
+ code=ErrorCode.INVALID_INPUT,
345
+ message=(
346
+ "Output timeline must be placed in the same directory as "
347
+ f"the input media (input: {media_path.name})."
348
+ ),
349
+ hint=(
350
+ "Change the output path to a location inside the same directory as "
351
+ "the media file."
352
+ ),
353
+ )
354
+ except OSError:
355
+ # resolve() may fail for network paths; skip best-effort.
356
+ pass
357
+
358
+ # Audio stream check (TR-AD-03). Video is optional (audio-only sources accepted).
359
+ has_audio = any(s.codec_type == "audio" for s in media_info.streams)
360
+ if not has_audio:
361
+ raise ClipwrightError(
362
+ code=ErrorCode.UNSUPPORTED_OPERATION,
363
+ message=f"No audio stream found: {media_path.name}",
364
+ hint=(
365
+ "Transcription requires an audio stream. "
366
+ "Provide a media file that contains audio."
367
+ ),
368
+ )
369
+
370
+ # Duration check
371
+ if media_info.duration is None:
372
+ raise ClipwrightError(
373
+ code=ErrorCode.PROBE_FAILED,
374
+ message=f"Could not retrieve media duration: {media_path.name}",
375
+ hint=(
376
+ "Check that the media file is not corrupted. "
377
+ "You can also inspect it manually with ffprobe."
378
+ ),
379
+ )
380
+
381
+ total_duration_sec = media_info.duration.value / media_info.duration.rate
382
+ rate = media_info.duration.rate
383
+ abs_media = str(media_path.resolve())
384
+
385
+ # --- 3. Model resolution (DC-AS-003) ---
386
+
387
+ model_path = _resolve_model_path(options)
388
+
389
+ # --- 4. whisper execution (adapter) ---
390
+
391
+ segments, detected_language = _run_whisper(
392
+ abs_media, options, total_duration_sec, model_path
393
+ )
394
+
395
+ # Language priority: detected result > explicit option > unknown
396
+ language = detected_language or options.language or "unknown"
397
+
398
+ # --- 5. SRT/VTT generation and write (TR-AD-08) ---
399
+
400
+ srt_path = output_path.with_suffix(".srt")
401
+ vtt_path = output_path.with_suffix(".vtt")
402
+ srt_path.write_text(to_srt(segments), encoding="utf-8")
403
+ vtt_path.write_text(to_vtt(segments), encoding="utf-8")
404
+
405
+ # --- 6. OTIO construction and save (TR-AD-04/DC-AM-001/DC-AM-101) ---
406
+
407
+ timeline = new_timeline(media_path.name)
408
+ v1 = timeline.tracks[0] # V1 (Video) track
409
+
410
+ # Full-length single clip (source_range.start_time=0)
411
+ full_source_range = TimeRangeModel(
412
+ start_time=RationalTimeModel(value=0.0, rate=rate),
413
+ duration=RationalTimeModel(value=media_info.duration.value, rate=rate),
414
+ )
415
+ add_clip(
416
+ v1,
417
+ MediaRef(target_url=abs_media),
418
+ full_source_range,
419
+ name=media_path.name,
420
+ metadata={
421
+ "tool": "clipwright-transcribe",
422
+ "version": clipwright_transcribe.__version__,
423
+ "kind": "transcript-source",
424
+ },
425
+ )
426
+
427
+ # Attach each segment as a marker on the V1 track (DC-AM-101).
428
+ # marked_range uses whisper second values directly (media coord = track coord;
429
+ # DC-AM-001).
430
+ for seg in segments:
431
+ start_value = seg["start_sec"] * rate
432
+ dur_value = (seg["end_sec"] - seg["start_sec"]) * rate
433
+ marked_range = TimeRangeModel(
434
+ start_time=RationalTimeModel(value=start_value, rate=rate),
435
+ duration=RationalTimeModel(value=dur_value, rate=rate),
436
+ )
437
+ add_marker(
438
+ item=v1,
439
+ marked_range=marked_range,
440
+ name=_truncate_name(seg["text"]),
441
+ metadata={
442
+ "tool": "clipwright-transcribe",
443
+ "version": clipwright_transcribe.__version__,
444
+ "kind": "caption",
445
+ "text": seg["text"],
446
+ "language": language,
447
+ },
448
+ )
449
+
450
+ save_timeline(timeline, output)
451
+
452
+ # --- 7. Return envelope ---
453
+
454
+ segment_count = len(segments)
455
+ summary = (
456
+ f"Language {language} · {segment_count} segment(s) · "
457
+ f"total duration {_fmt_sec(total_duration_sec)} transcribed. "
458
+ f"Generated {srt_path.name} / {vtt_path.name} / {output_path.name}."
459
+ )
460
+
461
+ warnings: list[str] = []
462
+ if segment_count == 0:
463
+ warnings.append(
464
+ "No transcription segments found "
465
+ "(possible silence or recognition failure). "
466
+ "SRT is empty, VTT has header only, no markers were added. "
467
+ "The timeline contains only the full-length clip."
468
+ )
469
+
470
+ return ok_result(
471
+ summary,
472
+ data={
473
+ "segment_count": segment_count,
474
+ "language": language,
475
+ "total_duration_seconds": total_duration_sec,
476
+ },
477
+ artifacts=[
478
+ {"role": "timeline", "path": str(output_path), "format": "otio"},
479
+ {"role": "captions", "path": str(srt_path), "format": "srt"},
480
+ {"role": "captions", "path": str(vtt_path), "format": "vtt"},
481
+ ],
482
+ warnings=warnings,
483
+ )