clipwright-transcribe 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clipwright_transcribe-0.1.1/PKG-INFO +65 -0
- clipwright_transcribe-0.1.1/README.md +51 -0
- clipwright_transcribe-0.1.1/pyproject.toml +84 -0
- clipwright_transcribe-0.1.1/src/clipwright_transcribe/__init__.py +1 -0
- clipwright_transcribe-0.1.1/src/clipwright_transcribe/captions.py +156 -0
- clipwright_transcribe-0.1.1/src/clipwright_transcribe/py.typed +0 -0
- clipwright_transcribe-0.1.1/src/clipwright_transcribe/schemas.py +63 -0
- clipwright_transcribe-0.1.1/src/clipwright_transcribe/server.py +90 -0
- clipwright_transcribe-0.1.1/src/clipwright_transcribe/transcribe.py +483 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: clipwright-transcribe
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: MCP tool to transcribe audio/video files with whisper.cpp binary and generate SRT/VTT captions and OTIO timeline.
|
|
5
|
+
Author: satoh-y-0323
|
|
6
|
+
Author-email: satoh-y-0323 <shoma.papa.0323@gmail.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Dist: clipwright>=0.1.1
|
|
9
|
+
Requires-Dist: mcp[cli]>=1.27.2
|
|
10
|
+
Requires-Dist: opentimelineio>=0.18
|
|
11
|
+
Requires-Dist: pydantic>=2
|
|
12
|
+
Requires-Python: >=3.11
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# clipwright-transcribe
|
|
16
|
+
|
|
17
|
+
MCP tool to transcribe audio/video files and generate SRT/VTT captions and OTIO timeline.
|
|
18
|
+
|
|
19
|
+
## External Binaries / Files
|
|
20
|
+
|
|
21
|
+
This tool requires the following external binaries/files to exist in the execution environment. **They are not installed via pip**, so obtain them separately.
|
|
22
|
+
|
|
23
|
+
### whisper.cpp Binary
|
|
24
|
+
|
|
25
|
+
Used for transcription.
|
|
26
|
+
|
|
27
|
+
- Place `whisper-cli` on PATH, or specify the full path to the binary in the `CLIPWRIGHT_WHISPER` environment variable (use the variable when your whisper.cpp binary has a different name, e.g. the legacy `main`).
|
|
28
|
+
- Obtain: Build from https://github.com/ggerganov/whisper.cpp, or use release binaries.
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
export CLIPWRIGHT_WHISPER=/path/to/whisper-cli
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### ggml Model File
|
|
35
|
+
|
|
36
|
+
Speech recognition model (`.bin` file) used by whisper.cpp.
|
|
37
|
+
|
|
38
|
+
- Specify the full path to the model file in the `CLIPWRIGHT_WHISPER_MODEL` environment variable. Can be overridden by the `model_path` parameter at tool invocation.
|
|
39
|
+
- Obtain: Download from https://huggingface.co/ggerganov/whisper.cpp etc.
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
export CLIPWRIGHT_WHISPER_MODEL=/path/to/ggml-base.bin
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### ffmpeg
|
|
46
|
+
|
|
47
|
+
Required to convert audio to 16kHz mono WAV (input format for whisper.cpp).
|
|
48
|
+
|
|
49
|
+
- Place `ffmpeg` on PATH, or specify the full path in the `CLIPWRIGHT_FFMPEG` environment variable.
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
export CLIPWRIGHT_FFMPEG=/path/to/ffmpeg
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Environment Variables Summary
|
|
56
|
+
|
|
57
|
+
| Environment Variable | Purpose | Required |
|
|
58
|
+
|---|---|---|
|
|
59
|
+
| `CLIPWRIGHT_WHISPER` | Path to whisper.cpp binary (required if not on PATH) | Conditional |
|
|
60
|
+
| `CLIPWRIGHT_WHISPER_MODEL` | Path to ggml model file (`model_path` parameter takes precedence) | Conditional |
|
|
61
|
+
| `CLIPWRIGHT_FFMPEG` | Path to ffmpeg binary (required if not on PATH) | Conditional |
|
|
62
|
+
|
|
63
|
+
## MCP Tool
|
|
64
|
+
|
|
65
|
+
`clipwright_transcribe(media, output, options?)` — Transcribe audio/video file and generate `output.otio` / `output.srt` / `output.vtt`.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# clipwright-transcribe
|
|
2
|
+
|
|
3
|
+
MCP tool to transcribe audio/video files and generate SRT/VTT captions and OTIO timeline.
|
|
4
|
+
|
|
5
|
+
## External Binaries / Files
|
|
6
|
+
|
|
7
|
+
This tool requires the following external binaries/files to exist in the execution environment. **They are not installed via pip**, so obtain them separately.
|
|
8
|
+
|
|
9
|
+
### whisper.cpp Binary
|
|
10
|
+
|
|
11
|
+
Used for transcription.
|
|
12
|
+
|
|
13
|
+
- Place `whisper-cli` on PATH, or specify the full path to the binary in the `CLIPWRIGHT_WHISPER` environment variable (use the variable when your whisper.cpp binary has a different name, e.g. the legacy `main`).
|
|
14
|
+
- Obtain: Build from https://github.com/ggerganov/whisper.cpp, or use release binaries.
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
export CLIPWRIGHT_WHISPER=/path/to/whisper-cli
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### ggml Model File
|
|
21
|
+
|
|
22
|
+
Speech recognition model (`.bin` file) used by whisper.cpp.
|
|
23
|
+
|
|
24
|
+
- Specify the full path to the model file in the `CLIPWRIGHT_WHISPER_MODEL` environment variable. Can be overridden by the `model_path` parameter at tool invocation.
|
|
25
|
+
- Obtain: Download from https://huggingface.co/ggerganov/whisper.cpp etc.
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
export CLIPWRIGHT_WHISPER_MODEL=/path/to/ggml-base.bin
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### ffmpeg
|
|
32
|
+
|
|
33
|
+
Required to convert audio to 16kHz mono WAV (input format for whisper.cpp).
|
|
34
|
+
|
|
35
|
+
- Place `ffmpeg` on PATH, or specify the full path in the `CLIPWRIGHT_FFMPEG` environment variable.
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
export CLIPWRIGHT_FFMPEG=/path/to/ffmpeg
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Environment Variables Summary
|
|
42
|
+
|
|
43
|
+
| Environment Variable | Purpose | Required |
|
|
44
|
+
|---|---|---|
|
|
45
|
+
| `CLIPWRIGHT_WHISPER` | Path to whisper.cpp binary (required if not on PATH) | Conditional |
|
|
46
|
+
| `CLIPWRIGHT_WHISPER_MODEL` | Path to ggml model file (`model_path` parameter takes precedence) | Conditional |
|
|
47
|
+
| `CLIPWRIGHT_FFMPEG` | Path to ffmpeg binary (required if not on PATH) | Conditional |
|
|
48
|
+
|
|
49
|
+
## MCP Tool
|
|
50
|
+
|
|
51
|
+
`clipwright_transcribe(media, output, options?)` — Transcribe audio/video file and generate `output.otio` / `output.srt` / `output.vtt`.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "clipwright-transcribe"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
description = "MCP tool to transcribe audio/video files with whisper.cpp binary and generate SRT/VTT captions and OTIO timeline."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "satoh-y-0323", email = "shoma.papa.0323@gmail.com" }
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"clipwright>=0.1.1",
|
|
13
|
+
"mcp[cli]>=1.27.2",
|
|
14
|
+
"opentimelineio>=0.18",
|
|
15
|
+
"pydantic>=2",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
clipwright-transcribe = "clipwright_transcribe.server:main"
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["uv_build>=0.11.19,<0.12.0"]
|
|
23
|
+
build-backend = "uv_build"
|
|
24
|
+
|
|
25
|
+
[dependency-groups]
|
|
26
|
+
dev = [
|
|
27
|
+
"mypy>=2.1.0",
|
|
28
|
+
"pytest>=9.0.3",
|
|
29
|
+
"pytest-cov>=7.1.0",
|
|
30
|
+
"pytest-mock>=3.15.1",
|
|
31
|
+
"ruff>=0.15.16",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
# Resolve clipwright (core) within workspace by path reference
|
|
35
|
+
[tool.uv.sources]
|
|
36
|
+
clipwright = { workspace = true }
|
|
37
|
+
|
|
38
|
+
# --- Ruff ---
|
|
39
|
+
[tool.ruff]
|
|
40
|
+
target-version = "py311"
|
|
41
|
+
line-length = 88
|
|
42
|
+
|
|
43
|
+
[tool.ruff.lint]
|
|
44
|
+
select = ["E", "F", "W", "I", "UP", "B", "C4", "SIM"]
|
|
45
|
+
ignore = []
|
|
46
|
+
|
|
47
|
+
[tool.ruff.lint.per-file-ignores]
|
|
48
|
+
# Allow E501 for English docstrings/comments in test files
|
|
49
|
+
"tests/*.py" = ["E501"]
|
|
50
|
+
|
|
51
|
+
[tool.ruff.format]
|
|
52
|
+
# Default ruff formatter is OK
|
|
53
|
+
|
|
54
|
+
# --- mypy ---
|
|
55
|
+
[tool.mypy]
|
|
56
|
+
python_version = "3.11"
|
|
57
|
+
strict = true
|
|
58
|
+
warn_return_any = true
|
|
59
|
+
warn_unused_configs = true
|
|
60
|
+
disallow_untyped_defs = true
|
|
61
|
+
disallow_any_generics = true
|
|
62
|
+
|
|
63
|
+
# opentimelineio has no stubs, ignored with mypy strict
|
|
64
|
+
[[tool.mypy.overrides]]
|
|
65
|
+
module = "opentimelineio.*"
|
|
66
|
+
ignore_missing_imports = true
|
|
67
|
+
|
|
68
|
+
# --- pytest ---
|
|
69
|
+
[tool.pytest.ini_options]
|
|
70
|
+
testpaths = ["tests"]
|
|
71
|
+
addopts = "--strict-markers -q"
|
|
72
|
+
markers = [
|
|
73
|
+
"integration: integration test requiring actual ffmpeg/ffprobe binaries",
|
|
74
|
+
"slow: test with long execution time",
|
|
75
|
+
]
|
|
76
|
+
|
|
77
|
+
# --- coverage ---
|
|
78
|
+
[tool.coverage.run]
|
|
79
|
+
source = ["clipwright_transcribe"]
|
|
80
|
+
omit = ["tests/*"]
|
|
81
|
+
|
|
82
|
+
[tool.coverage.report]
|
|
83
|
+
show_missing = true
|
|
84
|
+
skip_covered = false
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.1"
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""captions.py — clipwright-transcribe pure logic layer (mirrors plan.py structure).
|
|
2
|
+
|
|
3
|
+
Converts whisper.cpp `-oj` JSON (transcription[].offsets.from/to in ms, text) into
|
|
4
|
+
normalised segments and generates SRT/VTT strings.
|
|
5
|
+
|
|
6
|
+
Design decisions:
|
|
7
|
+
- Pure functions; no external processes are executed (target: 100% contract coverage).
|
|
8
|
+
- SRT/VTT timecodes are derived from the same second value to guarantee consistency
|
|
9
|
+
(DC-AS-005). Only the separator differs (SRT="HH:MM:SS,mmm" / VTT="HH:MM:SS.mmm").
|
|
10
|
+
- When segments is empty, to_srt returns an empty string and to_vtt returns only the
|
|
11
|
+
"WEBVTT" header (DC-GP-002).
|
|
12
|
+
- Defensive handling of whisper output: removes entries with empty text, degenerate
|
|
13
|
+
intervals (start>=end), or missing keys.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from typing import Any, TypedDict
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Segment(TypedDict):
    """Normalised caption segment.

    Built by normalize_segments() and consumed by to_srt() / to_vtt().
    start_sec / end_sec are in seconds (float); text has leading/trailing whitespace
    stripped.
    """

    # Segment start time, in seconds.
    start_sec: float
    # Segment end time, in seconds.
    end_sec: float
    # Caption text with surrounding whitespace removed.
    text: str
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def normalize_segments(whisper_json: dict[str, Any]) -> list[Segment]:
    """Convert a whisper `-oj` JSON dict into normalised segments.

    Converts transcription[].offsets.from/to (milliseconds) to seconds and strips
    text whitespace.
    Defensive cleanup (DC-GP-002 supplement) removes entries where:
    - offsets / from / to / text keys are missing
    - from / to cannot be converted to a finite float (non-numeric, NaN, infinity)
    - text is null, empty or whitespace-only
    - the interval is degenerate (start_sec >= end_sec)

    Returns an empty list when the payload is not a dict, or when the
    transcription key is absent or not a list.

    Args:
        whisper_json: dict loaded from a whisper.cpp `-oj` JSON file.

    Returns:
        List of normalised Segment objects, in input order.
    """
    # Function-scope import: this module's top-level imports do not include math.
    import math

    if not isinstance(whisper_json, dict):
        return []

    transcription = whisper_json.get("transcription")
    if not isinstance(transcription, list):
        return []

    segments: list[Segment] = []
    for entry in transcription:
        if not isinstance(entry, dict):
            continue

        offsets = entry.get("offsets")
        if not isinstance(offsets, dict):
            continue
        if "from" not in offsets or "to" not in offsets:
            continue
        # A JSON null text would otherwise be stringified to the caption "None".
        if entry.get("text") is None:
            continue

        try:
            start_ms = float(offsets["from"])
            end_ms = float(offsets["to"])
        except (TypeError, ValueError, OverflowError):
            # OverflowError: integers too large to be represented as a float.
            continue

        # NaN compares False against everything, so it would slip through the
        # start >= end check below and later break timecode formatting.
        if not (math.isfinite(start_ms) and math.isfinite(end_ms)):
            continue

        text = str(entry["text"]).strip()
        if not text:
            continue

        start_sec = start_ms / 1000.0
        end_sec = end_ms / 1000.0
        # Remove degenerate intervals (start >= end).
        if start_sec >= end_sec:
            continue

        segments.append({"start_sec": start_sec, "end_sec": end_sec, "text": text})

    return segments
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _format_timecode(total_seconds: float, *, ms_separator: str) -> str:
    """Format seconds as "HH:MM:SS{sep}mmm" timecode.

    The ms_separator switches between SRT (",") and VTT (".").
    Both formats share the same second and millisecond values for consistency
    (DC-AS-005).
    Milliseconds are computed with round-half-up. Negative durations are clamped
    to zero because neither SRT nor VTT can express a negative timecode.

    Args:
        total_seconds: Duration in seconds.
        ms_separator: Separator between seconds and milliseconds ("," or ".").

    Returns:
        Formatted timecode string. Hours are zero-padded to at least two digits.
    """
    # Function-scope import: this module's top-level imports do not include math.
    import math

    # floor(x + 0.5) rounds ties upwards; the built-in round() rounds ties to the
    # nearest even integer (62.5 -> 62), which contradicts the documented contract.
    total_ms = max(0, math.floor(total_seconds * 1000.0 + 0.5))
    hours, rem_ms = divmod(total_ms, 3_600_000)
    minutes, rem_ms = divmod(rem_ms, 60_000)
    seconds, milliseconds = divmod(rem_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}{ms_separator}{milliseconds:03d}"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def to_srt(segments: list[Segment]) -> str:
    """Render normalised segments as SubRip (SRT) text.

    Each cue consists of a 1-based counter line, a
    "HH:MM:SS,mmm --> HH:MM:SS,mmm" timing line and the caption text; cues are
    separated by one blank line. An empty segment list yields an empty string
    (DC-GP-002).

    Args:
        segments: List of normalised Segment objects.

    Returns:
        SRT-formatted string.
    """
    if not segments:
        return ""

    cues = [
        "{}\n{} --> {}\n{}\n".format(
            number,
            _format_timecode(cue["start_sec"], ms_separator=","),
            _format_timecode(cue["end_sec"], ms_separator=","),
            cue["text"],
        )
        for number, cue in enumerate(segments, start=1)
    ]
    # Every cue already ends with a newline, so joining on "\n" produces the
    # blank separator line between consecutive cues.
    return "\n".join(cues)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def to_vtt(segments: list[Segment]) -> str:
    """Render normalised segments as WebVTT text.

    The output starts with the "WEBVTT" header line, followed by one cue per
    segment using "HH:MM:SS.mmm" timecodes (dot separator). With no segments only
    the header line is returned (DC-GP-002).

    Args:
        segments: List of normalised Segment objects.

    Returns:
        WebVTT-formatted string.
    """
    header = "WEBVTT\n"
    if not segments:
        return header

    parts: list[str] = [header]
    parts.extend(
        "{} --> {}\n{}\n".format(
            _format_timecode(cue["start_sec"], ms_separator="."),
            _format_timecode(cue["end_sec"], ms_separator="."),
            cue["text"],
        )
        for cue in segments
    )
    # Header and cues each end with a newline; joining on "\n" inserts the blank
    # line between them.
    return "\n".join(parts)
|
|
File without changes
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""schemas.py — clipwright-transcribe-specific Pydantic schemas.
|
|
2
|
+
|
|
3
|
+
Common types (MediaRef / Artifact / ToolResult, etc.) are defined in clipwright.schemas;
|
|
4
|
+
this module does not redefine them.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Annotated
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, Field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TranscribeOptions(BaseModel):
    """Options for clipwright_transcribe (TR-AD-06).

    All three fields are optional and default to None.

    language: whisper language code (None = auto-detect). model_path: path to the ggml
    model file (None falls back to env CLIPWRIGHT_WHISPER_MODEL).
    initial_prompt: context hint to improve whisper recognition accuracy.
    """

    # Language code: at most 10 characters, two or more ASCII letters.
    # NOTE: "auto" already satisfies the first regex alternative; the explicit
    # second alternative only spells the intent out.
    language: Annotated[
        str | None,
        Field(
            default=None,
            max_length=10,
            pattern=r"^[a-zA-Z]{2,}$|^auto$",
            description=(
                'Transcription language code (e.g. "ja", "en"). '
                "None (default) lets whisper auto-detect the language. "
                "ISO 639-1 compatible: 2 or more ASCII letters, or 'auto'. "
                "Anything else is rejected."
            ),
        ),
    ] = None

    # Model file path; only the length (<= 4096) is validated here. Existence is
    # not checked by this schema.
    model_path: Annotated[
        str | None,
        Field(
            default=None,
            max_length=4096,
            description=(
                "Path to the whisper.cpp ggml model file"
                " (max 4096 chars, equivalent to OS path length limit)."
                " None (default) uses the CLIPWRIGHT_WHISPER_MODEL"
                " environment variable."
                " If neither is set or the file does not exist, an error is raised."
            ),
        ),
    ] = None

    # Free-text prompt, capped at 2048 characters.
    initial_prompt: Annotated[
        str | None,
        Field(
            default=None,
            max_length=2048,
            description=(
                "Context hint passed to whisper (proper nouns, technical terms, etc.)."
                " None (default) means no prompt. Used to tune recognition accuracy."
                " Maximum 2048 characters (equivalent to whisper.cpp context length)."
            ),
        ),
    ] = None
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""server.py — clipwright-transcribe MCP server + CLI entry point.
|
|
2
|
+
|
|
3
|
+
Thin wrapper that delegates business logic to transcribe.py.
|
|
4
|
+
ClipwrightError conversion is handled in transcribe.py; no double conversion here.
|
|
5
|
+
|
|
6
|
+
Transport defaults to stdio (mcp.run(transport="stdio")).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Annotated, Any
|
|
12
|
+
|
|
13
|
+
from mcp.server.fastmcp import FastMCP
|
|
14
|
+
from mcp.types import ToolAnnotations
|
|
15
|
+
from pydantic import Field
|
|
16
|
+
|
|
17
|
+
from clipwright_transcribe.schemas import TranscribeOptions
|
|
18
|
+
from clipwright_transcribe.transcribe import transcribe_media
|
|
19
|
+
|
|
20
|
+
# FastMCP server instance; the string argument is the server name.
# Tools are registered on it through the @mcp.tool decorator and main() starts it.
mcp = FastMCP("clipwright-transcribe")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ===========================================================================
|
|
25
|
+
# clipwright_transcribe MCP tool
|
|
26
|
+
# ===========================================================================
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@mcp.tool(
    annotations=ToolAnnotations(
        # The tool writes new caption/timeline files, so it modifies its
        # environment and must not be advertised as read-only: MCP clients may
        # skip user confirmation for read-only tools.
        readOnlyHint=False,
        # Additive only: the input media file is never modified.
        destructiveHint=False,
        idempotentHint=True,
        openWorldHint=False,
    )
)
def clipwright_transcribe(
    media: Annotated[
        str,
        Field(
            description="Input media file path (must contain audio; video is optional)."
        ),
    ],
    output: Annotated[
        str,
        Field(description="Output OTIO timeline file path (.otio extension required)."),
    ],
    options: Annotated[
        TranscribeOptions | None,
        Field(
            description=(
                "Transcription options (language / model_path / initial_prompt). "
                "When omitted, all fields use their defaults "
                "(auto language detection, model from env)."
            )
        ),
    ] = None,
) -> dict[str, Any]:
    """MCP tool: transcribe audio and produce SRT/VTT captions and an OTIO timeline.

    Non-destructive: the input media file is never modified.
    Outputs are newly created files; their paths are returned in artifacts.

    Business logic is delegated to transcribe.transcribe_media.
    When options is None, default TranscribeOptions() is used.
    """
    resolved_options = options if options is not None else TranscribeOptions()
    return transcribe_media(
        media=media,
        output=output,
        options=resolved_options,
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# ===========================================================================
|
|
76
|
+
# Entry point (MCP stdio)
|
|
77
|
+
# ===========================================================================
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def main() -> None:
    """CLI entry point. Starts the MCP server over stdio.

    Registered in pyproject.toml [project.scripts] as:
        clipwright-transcribe = "clipwright_transcribe.server:main"
    """
    # Run the module-level FastMCP instance using the stdio transport.
    mcp.run(transport="stdio")


# Also start the server when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
"""transcribe.py — clipwright-transcribe orchestration layer (mirrors detect.py).
|
|
2
|
+
|
|
3
|
+
Flow: input validation -> inspect_media -> model resolution -> ffmpeg WAV extraction ->
|
|
4
|
+
whisper-cli execution -> SRT/VTT generation via captions -> OTIO construction/save ->
|
|
5
|
+
envelope return.
|
|
6
|
+
|
|
7
|
+
Design decisions:
|
|
8
|
+
- _run_whisper() is the single adapter function (TR-AD-01) that encapsulates ffmpeg WAV
|
|
9
|
+
extraction, whisper-cli invocation, and JSON loading. To swap backends
|
|
10
|
+
(e.g. faster-whisper), replace only this function.
|
|
11
|
+
- The whisper binary name and language auto-detect flag are isolated as module constants
|
|
12
|
+
(spike-whisper confirmed values, replaceable via e2e; DC-AS-003/DC-AM-002).
|
|
13
|
+
- Model resolution uses os.path.isfile rather than resolve_tool (the model is not an
|
|
14
|
+
executable; DC-AS-003). Resolution order: options.model_path -> env
|
|
15
|
+
CLIPWRIGHT_WHISPER_MODEL.
|
|
16
|
+
- marker.marked_range uses whisper second values (media coordinates) directly.
|
|
17
|
+
Coordinates match because the clip is full-length with source_range.start_time=0
|
|
18
|
+
(DC-AM-001).
|
|
19
|
+
- SRT/VTT timecodes and marker second values share the same origin (DC-AS-005).
|
|
20
|
+
- Error messages expose only basename; raw whisper/ffmpeg stderr fragments are replaced
|
|
21
|
+
with a sanitised generic message (TR-AD-09, following VAD M-1 precedent).
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import json
|
|
27
|
+
import math
|
|
28
|
+
import os
|
|
29
|
+
import tempfile
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Any
|
|
32
|
+
|
|
33
|
+
from clipwright.envelope import error_result, ok_result
|
|
34
|
+
from clipwright.errors import ClipwrightError, ErrorCode
|
|
35
|
+
from clipwright.media import inspect_media
|
|
36
|
+
from clipwright.otio_utils import add_clip, add_marker, new_timeline, save_timeline
|
|
37
|
+
from clipwright.process import resolve_tool, run, safe_subprocess_message
|
|
38
|
+
from clipwright.schemas import MediaRef, RationalTimeModel, TimeRangeModel
|
|
39
|
+
|
|
40
|
+
import clipwright_transcribe
|
|
41
|
+
from clipwright_transcribe.captions import Segment, normalize_segments, to_srt, to_vtt
|
|
42
|
+
from clipwright_transcribe.schemas import TranscribeOptions
|
|
43
|
+
|
|
44
|
+
# whisper.cpp executable name (spike-whisper confirmed; latest = whisper-cli,
# legacy = main).
# Passed to resolve_tool() together with the CLIPWRIGHT_WHISPER variable name.
# Must match the binary name pointed to by env CLIPWRIGHT_WHISPER (DC-AS-003-R).
# Verified by e2e test.
WHISPER_BINARY_NAME = "whisper-cli"
# Language auto-detect flag (spike confirmed; replaceable via e2e as a list; DC-AM-002).
# Appended to the whisper command line when options.language is None.
LANG_AUTO_FLAG: list[str] = ["-l", "auto"]
# Maximum display length for marker name (excess is truncated; full text kept in
# metadata.text; DC-GP-003).
_MARKER_NAME_MAX = 40
# Hint shown when the whisper model cannot be resolved (actionable; TR-AD-05).
_MODEL_MISSING_HINT = (
    "Specify the ggml model file path via options.model_path or "
    "set the CLIPWRIGHT_WHISPER_MODEL environment variable. "
    "Download the model from the whisper.cpp distribution (e.g. ggml-base.bin)."
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _fmt_sec(sec: float) -> str:
    """Convert seconds to a human-readable "Xm Ys" string (used in summary).

    Seconds are shown with one decimal place. When rounding to one decimal would
    display 60.0 seconds (e.g. 119.97), the value is carried into the minutes so
    the result reads "2m0.0s" rather than "1m60.0s".

    Args:
        sec: Duration in seconds.

    Returns:
        "<m>m<s.s>s" when at least one full minute is present, otherwise "<s.s>s".
    """
    minutes = int(sec) // 60
    seconds = sec - minutes * 60
    # round(x, 1) matches the rounding applied by the ".1f" format below, so this
    # detects exactly the inputs that would otherwise print as "60.0".
    if round(seconds, 1) >= 60.0:
        minutes += 1
        seconds = 0.0
    return f"{minutes}m{seconds:.1f}s" if minutes > 0 else f"{seconds:.1f}s"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _truncate_name(text: str) -> str:
    """Shorten text for use as a marker name (DC-GP-003).

    Text of at most _MARKER_NAME_MAX characters is returned unchanged. Longer text
    is cut to its first _MARKER_NAME_MAX characters and an ellipsis "…" is
    appended. The full text is preserved in metadata.text.
    """
    is_too_long = len(text) > _MARKER_NAME_MAX
    return f"{text[:_MARKER_NAME_MAX]}…" if is_too_long else text
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _sanitize_subprocess_error(exc: ClipwrightError) -> ClipwrightError:
    """Swap the message of subprocess errors for a sanitised one (TR-AD-09).

    run() messages may contain stderr fragments and executable paths. For
    SUBPROCESS_FAILED and SUBPROCESS_TIMEOUT a new ClipwrightError is built with
    the same code and hint, and with the message produced by
    safe_subprocess_message(), to prevent leakage into MCP responses. Errors
    with any other code are returned unchanged.
    """
    codes_to_sanitize = (ErrorCode.SUBPROCESS_FAILED, ErrorCode.SUBPROCESS_TIMEOUT)
    if exc.code not in codes_to_sanitize:
        return exc

    return ClipwrightError(
        code=exc.code,
        message=safe_subprocess_message(exc),
        hint=exc.hint,
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _resolve_model_path(options: TranscribeOptions) -> str:
    """Locate the whisper model file (DC-AS-003).

    Candidates are tried in order: options.model_path, then the
    CLIPWRIGHT_WHISPER_MODEL environment variable. Unset candidates are skipped
    and the first one that is an existing regular file wins. os.path.isfile is
    used rather than resolve_tool because the model is not an executable.

    Args:
        options: TranscribeOptions (model_path field is inspected).

    Returns:
        The matching candidate exactly as given (absolute or relative path).

    Raises:
        ClipwrightError: DEPENDENCY_MISSING when no candidate is an existing file.
    """
    search_order = (
        options.model_path,
        os.environ.get("CLIPWRIGHT_WHISPER_MODEL"),
    )
    found = next(
        (
            path
            for path in search_order
            if path is not None and os.path.isfile(path)
        ),
        None,
    )
    if found is None:
        raise ClipwrightError(
            code=ErrorCode.DEPENDENCY_MISSING,
            message="whisper model file not found",
            hint=_MODEL_MISSING_HINT,
        )
    return found
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _extract_wav(ffmpeg: str, media: str, output_path: str, timeout: float) -> None:
    """Convert the audio of media into a 16 kHz mono s16le WAV file (TR-AD-01).

    whisper.cpp requires 16 kHz mono WAV input; this ffmpeg invocation produces
    it. The command is passed to run() as an argument list (subprocess
    discipline).

    Args:
        ffmpeg: Path or name of the ffmpeg executable.
        media: Input media file path.
        output_path: Destination WAV path; "-y" lets ffmpeg overwrite it.
        timeout: Timeout forwarded to run().
    """
    quiet_flags = ["-hide_banner", "-nostats"]
    # Drop video (-vn) and encode 16-bit PCM, 16 kHz, one channel.
    audio_flags = ["-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    argv = [ffmpeg, *quiet_flags, "-i", media, *audio_flags, "-y", output_path]
    run(argv, timeout=timeout)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _build_whisper_cmd(
    whisper: str,
    model_path: str,
    wav_path: str,
    prefix: str,
    options: TranscribeOptions,
) -> list[str]:
    """Assemble the whisper-cli argument list (TR-AD-01, DC-AM-002/003).

    `-oj` together with `-of <prefix>` writes JSON to `<prefix>.json`. A language
    of None selects auto-detection (LANG_AUTO_FLAG); an explicit code is passed
    as `-l <code>`. initial_prompt, when not None, is passed via `--prompt`.

    Returns:
        A new argument list; LANG_AUTO_FLAG itself is never mutated.
    """
    if options.language is None:
        language_args = LANG_AUTO_FLAG
    else:
        language_args = ["-l", options.language]

    if options.initial_prompt is None:
        prompt_args: list[str] = []
    else:
        prompt_args = ["--prompt", options.initial_prompt]

    return [
        whisper,
        "-m",
        model_path,
        "-f",
        wav_path,
        "-oj",
        "-of",
        prefix,
        *language_args,
        *prompt_args,
    ]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _run_whisper(
    media: str,
    options: TranscribeOptions,
    total_duration_sec: float,
    model_path: str,
) -> tuple[list[Segment], str | None]:
    """Single adapter: ffmpeg WAV extraction -> whisper-cli -> JSON normalisation
    (TR-AD-01).

    Replace only this function to swap the backend (e.g. faster-whisper).
    WAV and JSON are written to a temporary directory so the source media directory
    is not polluted.

    Args:
        media: Absolute path to the input media file.
        options: TranscribeOptions.
        total_duration_sec: Total duration of the media (seconds); used to compute
            timeouts.
        model_path: Resolved model file path.

    Returns:
        Tuple of (normalised Segment list, detected language code or None).
        The language is None when whisper's JSON carries no string
        result.language value.

    Raises:
        ClipwrightError: DEPENDENCY_MISSING (missing tool), sanitised
            SUBPROCESS_FAILED/SUBPROCESS_TIMEOUT, or SUBPROCESS_FAILED when the
            whisper output JSON is missing, undecodable, malformed or not a JSON
            object.
    """
    ffmpeg = resolve_tool("ffmpeg", "CLIPWRIGHT_FFMPEG")
    whisper = resolve_tool(WHISPER_BINARY_NAME, "CLIPWRIGHT_WHISPER")

    # Timeouts scale with duration; whisper is computationally expensive (TR-AD-10).
    ffmpeg_timeout = float(max(60, math.ceil(total_duration_sec * 2)))
    whisper_timeout = float(max(300, math.ceil(total_duration_sec * 30)))

    with tempfile.TemporaryDirectory() as tmpdir:
        wav_path = os.path.join(tmpdir, "audio.wav")
        prefix = os.path.join(tmpdir, "transcript")

        try:
            _extract_wav(ffmpeg, media, wav_path, ffmpeg_timeout)
        except ClipwrightError as exc:
            raise _sanitize_subprocess_error(exc) from None

        cmd = _build_whisper_cmd(whisper, model_path, wav_path, prefix, options)
        try:
            run(cmd, timeout=whisper_timeout)
        except ClipwrightError as exc:
            raise _sanitize_subprocess_error(exc) from None

        # whisper `-oj -of <prefix>` produces <prefix>.json (DC-AM-003).
        json_path = prefix + ".json"
        unreadable_output = ClipwrightError(
            code=ErrorCode.SUBPROCESS_FAILED,
            message="failed to read whisper output JSON",
            hint=(
                "Check the whisper.cpp version and arguments. "
                "Please report with reproduction steps."
            ),
        )
        try:
            with open(json_path, encoding="utf-8") as f:
                loaded: Any = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
            # (output that is not valid UTF-8); either one would otherwise escape
            # as a raw, non-ClipwrightError exception.
            raise unreadable_output from None
        if not isinstance(loaded, dict):
            # Valid JSON but not an object: the .get() calls below would fail.
            raise unreadable_output
        whisper_json: dict[str, Any] = loaded

        # Complete JSON loading and normalisation inside the with block so that the
        # temporary directory still exists while data is accessed (CR M-2).
        segments = normalize_segments(whisper_json)
        result = whisper_json.get("result")
        detected = result.get("language") if isinstance(result, dict) else None
        # Honour the declared return type: only a string counts as a language code.
        language = detected if isinstance(detected, str) else None

    return segments, language
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def transcribe_media(
    media: str,
    output: str,
    options: TranscribeOptions,
) -> dict[str, Any]:
    """Public entry point: transcribe media into captions plus an OTIO timeline (TR-AD-04).

    All of the work is delegated to ``_transcribe_inner``; this wrapper only
    converts a raised ``ClipwrightError`` into an error envelope so callers
    receive a dict instead of that exception.

    Args:
        media: Path of the input media file. An audio stream is required;
            video is optional.
        output: Path of the ``.otio`` timeline to write. It must live in the
            same directory as ``media``.
        options: TranscribeOptions controlling the transcription run.

    Returns:
        The ``ok_result`` envelope produced by ``_transcribe_inner`` on
        success, or an ``error_result`` envelope built from the code, message
        and hint of the ``ClipwrightError`` that was raised.
    """
    try:
        envelope = _transcribe_inner(media, output, options)
    except ClipwrightError as err:
        # Only ClipwrightError is translated; any other exception propagates.
        return error_result(err.code, err.message, err.hint)
    return envelope
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _transcribe_inner(
    media: str,
    output: str,
    options: TranscribeOptions,
) -> dict[str, Any]:
    """Validate the request, run whisper, and write SRT/VTT/OTIO outputs.

    This is the raising counterpart of ``transcribe_media``: failures surface
    as ``ClipwrightError`` instead of an error envelope.

    Steps, in order:
        1. Validate the output path (``.otio`` suffix, existing parent
           directory, not the same file as the input).
        2. Probe the media with ``inspect_media``; require the output to sit
           in the media's directory, an audio stream, and a known duration.
        3. Resolve the model path and run whisper via ``_run_whisper``.
        4. Write ``<output stem>.srt`` and ``<output stem>.vtt`` next to the
           timeline.
        5. Build a timeline with one full-length clip and one marker per
           segment, then save it to ``output``.

    Args:
        media: Input media file path.
        output: Destination ``.otio`` timeline path.
        options: TranscribeOptions for model resolution and whisper.

    Returns:
        An ``ok_result`` envelope carrying the summary text, segment count,
        language, total duration in seconds, the three artifact paths, and
        any warnings.

    Raises:
        ClipwrightError: ``INVALID_INPUT`` for a bad output path,
            ``FILE_NOT_FOUND`` when the probe reports a missing file,
            ``UNSUPPORTED_OPERATION`` when there is no audio stream,
            ``PROBE_FAILED`` when the duration is unknown. Errors raised by
            ``inspect_media``, ``_resolve_model_path`` and ``_run_whisper``
            propagate unchanged (apart from the FILE_NOT_FOUND rewrite).
    """
    timeline_path = Path(output)
    source_path = Path(media)
    source_name = source_path.name

    # --- 1. Output validation ---

    suffix = timeline_path.suffix
    if suffix.lower() != ".otio":
        raise ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message=f"Invalid output file extension: {suffix!r}. Only .otio is allowed.",
            hint="Change the output file path extension to .otio.",
        )

    if not timeline_path.parent.exists():
        raise ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message=(
                "Output directory does not exist. "
                "Check the parent directory of the specified output path."
            ),
            hint="Create the output directory before running again.",
        )

    def identical_paths_error() -> ClipwrightError:
        # Shared by the resolved-path comparison and its string fallback.
        return ClipwrightError(
            code=ErrorCode.INVALID_INPUT,
            message="Output path and input media path are identical.",
            hint="Change the output file path to differ from the input media.",
        )

    try:
        paths_match = timeline_path.resolve() == source_path.resolve()
    except OSError as exc:
        # resolve() failed: fall back to comparing the raw path strings.
        if str(timeline_path) == str(source_path):
            raise identical_paths_error() from exc
    else:
        if paths_match:
            raise identical_paths_error()

    # --- 2. inspect_media -> stream and duration check ---

    try:
        media_info = inspect_media(media)
    except ClipwrightError as exc:
        if exc.code != ErrorCode.FILE_NOT_FOUND:
            raise
        # Report only the basename so the full path is not exposed (TR-AD-09).
        raise ClipwrightError(
            code=ErrorCode.FILE_NOT_FOUND,
            message=f"File not found: {source_name}",
            hint=exc.hint,
        ) from exc

    # The timeline must sit beside the media (TR-AD-08). If resolve() raises
    # OSError (e.g. network paths) the check is skipped best-effort.
    try:
        source_dir = source_path.resolve().parent
        target_dir = timeline_path.parent.resolve()
    except OSError:
        pass
    else:
        if source_dir != target_dir:
            raise ClipwrightError(
                code=ErrorCode.INVALID_INPUT,
                message=(
                    "Output timeline must be placed in the same directory as "
                    f"the input media (input: {source_name})."
                ),
                hint=(
                    "Change the output path to a location inside the same "
                    "directory as the media file."
                ),
            )

    # Audio is mandatory (TR-AD-03); video is optional.
    if all(stream.codec_type != "audio" for stream in media_info.streams):
        raise ClipwrightError(
            code=ErrorCode.UNSUPPORTED_OPERATION,
            message=f"No audio stream found: {source_name}",
            hint=(
                "Transcription requires an audio stream. "
                "Provide a media file that contains audio."
            ),
        )

    duration = media_info.duration
    if duration is None:
        raise ClipwrightError(
            code=ErrorCode.PROBE_FAILED,
            message=f"Could not retrieve media duration: {source_name}",
            hint=(
                "Check that the media file is not corrupted. "
                "You can also inspect it manually with ffprobe."
            ),
        )

    total_duration_sec = duration.value / duration.rate
    rate = duration.rate
    abs_media = str(source_path.resolve())

    # --- 3. Model resolution (DC-AS-003) ---

    model_path = _resolve_model_path(options)

    # --- 4. whisper execution (adapter) ---

    segments, detected_language = _run_whisper(
        abs_media, options, total_duration_sec, model_path
    )

    # Prefer what whisper detected, then the caller's explicit option.
    language = detected_language or options.language or "unknown"

    # --- 5. SRT/VTT generation and write (TR-AD-08) ---

    srt_path = timeline_path.with_suffix(".srt")
    vtt_path = timeline_path.with_suffix(".vtt")
    srt_path.write_text(to_srt(segments), encoding="utf-8")
    vtt_path.write_text(to_vtt(segments), encoding="utf-8")

    # --- 6. OTIO construction and save (TR-AD-04/DC-AM-001/DC-AM-101) ---

    timeline = new_timeline(source_name)
    video_track = timeline.tracks[0]  # V1 (Video) track

    # One clip spanning the whole source, starting at time zero.
    whole_clip_range = TimeRangeModel(
        start_time=RationalTimeModel(value=0.0, rate=rate),
        duration=RationalTimeModel(value=duration.value, rate=rate),
    )
    clip_metadata = {
        "tool": "clipwright-transcribe",
        "version": clipwright_transcribe.__version__,
        "kind": "transcript-source",
    }
    add_clip(
        video_track,
        MediaRef(target_url=abs_media),
        whole_clip_range,
        name=source_name,
        metadata=clip_metadata,
    )

    # One marker per segment on V1 (DC-AM-101). Segment seconds are scaled by
    # the media rate and used directly as track coordinates (DC-AM-001).
    for segment in segments:
        start_sec = segment["start_sec"]
        marker_start = RationalTimeModel(value=start_sec * rate, rate=rate)
        marker_length = RationalTimeModel(
            value=(segment["end_sec"] - start_sec) * rate, rate=rate
        )
        add_marker(
            item=video_track,
            marked_range=TimeRangeModel(
                start_time=marker_start, duration=marker_length
            ),
            name=_truncate_name(segment["text"]),
            metadata={
                "tool": "clipwright-transcribe",
                "version": clipwright_transcribe.__version__,
                "kind": "caption",
                "text": segment["text"],
                "language": language,
            },
        )

    save_timeline(timeline, output)

    # --- 7. Return envelope ---

    segment_count = len(segments)
    summary = (
        f"Language {language} · {segment_count} segment(s) · "
        f"total duration {_fmt_sec(total_duration_sec)} transcribed. "
        f"Generated {srt_path.name} / {vtt_path.name} / {timeline_path.name}."
    )

    warnings: list[str] = (
        [
            "No transcription segments found "
            "(possible silence or recognition failure). "
            "SRT is empty, VTT has header only, no markers were added. "
            "The timeline contains only the full-length clip."
        ]
        if segment_count == 0
        else []
    )

    artifacts = [
        {"role": "timeline", "path": str(timeline_path), "format": "otio"},
        {"role": "captions", "path": str(srt_path), "format": "srt"},
        {"role": "captions", "path": str(vtt_path), "format": "vtt"},
    ]
    return ok_result(
        summary,
        data={
            "segment_count": segment_count,
            "language": language,
            "total_duration_seconds": total_duration_sec,
        },
        artifacts=artifacts,
        warnings=warnings,
    )
|