vidgrid 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vidgrid/__init__.py +9 -0
- vidgrid/__main__.py +7 -0
- vidgrid/api.py +166 -0
- vidgrid/assets/fonts/SourceSans3-Semibold.ttf +0 -0
- vidgrid/captions.py +374 -0
- vidgrid/cli.py +231 -0
- vidgrid/compose.py +414 -0
- vidgrid/llm.py +227 -0
- vidgrid/models.py +130 -0
- vidgrid/output.py +108 -0
- vidgrid/presets.py +165 -0
- vidgrid/probe.py +118 -0
- vidgrid/sample.py +133 -0
- vidgrid-0.1.0.dist-info/METADATA +315 -0
- vidgrid-0.1.0.dist-info/RECORD +19 -0
- vidgrid-0.1.0.dist-info/WHEEL +5 -0
- vidgrid-0.1.0.dist-info/entry_points.txt +2 -0
- vidgrid-0.1.0.dist-info/licenses/LICENSE +21 -0
- vidgrid-0.1.0.dist-info/top_level.txt +1 -0
vidgrid/__init__.py
ADDED
vidgrid/__main__.py
ADDED
vidgrid/api.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Public Python API.
|
|
2
|
+
|
|
3
|
+
Integration scripts and other packages should import from here:
|
|
4
|
+
|
|
5
|
+
from vidgrid import render
|
|
6
|
+
|
|
7
|
+
storyboard = render(
|
|
8
|
+
input_path="clip.mp4",
|
|
9
|
+
output_path="grid.png",
|
|
10
|
+
grid="3x3", # or "2x2", "4x4", "5x5", or None for auto
|
|
11
|
+
)
|
|
12
|
+
print(storyboard.board_paths)
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import shutil
|
|
18
|
+
import tempfile
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import List, Optional
|
|
21
|
+
|
|
22
|
+
from vidgrid.captions import (
|
|
23
|
+
SUPPORTED_FORMATS,
|
|
24
|
+
load_captions,
|
|
25
|
+
phrase_for_timestamp,
|
|
26
|
+
transcribe_video,
|
|
27
|
+
write_captions,
|
|
28
|
+
)
|
|
29
|
+
from vidgrid.models import Board, Cell, Storyboard
|
|
30
|
+
from vidgrid.output import write_boards, write_sidecar
|
|
31
|
+
from vidgrid.presets import DEFAULT_MAX_BOARDS, MAX_DURATION_SECONDS, preset_for
|
|
32
|
+
from vidgrid.probe import probe
|
|
33
|
+
from vidgrid.sample import sample_video
|
|
34
|
+
|
|
35
|
+
# Transcript sidecar format render() uses when the caller does not pass
# transcript_format explicitly.
DEFAULT_TRANSCRIPT_FORMAT = "json"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def render(
    input_path: str,
    *,
    output_path: Optional[str] = None,
    grid: Optional[str] = None,
    fps: Optional[float] = None,
    max_boards: int = DEFAULT_MAX_BOARDS,
    max_duration_seconds: int = MAX_DURATION_SECONDS,
    captions_path: Optional[str] = None,
    transcribe: bool = False,
    burn_captions: bool = False,
    transcript_format: str = DEFAULT_TRANSCRIPT_FORMAT,
    work_dir: Optional[str] = None,
    cleanup_work_dir: bool = True,
) -> Storyboard:
    """Render a video into one or more annotated storyboards.

    Each cell represents exactly 1 second of video. Longer videos produce
    more boards, not denser grids.

    Args:
        input_path: path to the source video file.
        output_path: where to write the PNG. Pass None to skip writing and
            only get the in-memory Storyboard (no boards, sidecar or
            transcript file are written in that case).
        grid: '2x2', '3x3', '4x4', '5x5', or None for auto (<= 4s -> 2x2,
            otherwise -> 3x3).
        fps: forwarded unchanged to ``preset_for``.
            NOTE(review): None presumably lets the preset pick the sampling
            rate - confirm against vidgrid.presets.
        max_boards: forwarded unchanged to ``preset_for``; defaults to
            DEFAULT_MAX_BOARDS.
        max_duration_seconds: reject videos longer than this. Defaults to
            MAX_DURATION_SECONDS (documented as 300, i.e. 5 min). Raise it
            explicitly to override.
        captions_path: path to an existing captions file (.json, .srt or
            .txt, as understood by ``load_captions``) used for burn-in or
            transcript correlation. Mutually exclusive with ``transcribe``.
        transcribe: run faster-whisper on the video's audio to generate
            captions automatically.
        burn_captions: render a caption strip below each cell (one-layer
            mode).
        transcript_format: 'json' (default, Remotion-compatible), 'srt', or
            'txt'. Controls the extension and format of the transcript
            sidecar file written next to ``output_path``.
        work_dir: directory for intermediate frames. It is created if it
            does not exist. A temp dir is created if not supplied.
        cleanup_work_dir: delete the work dir after rendering. Only applies
            to the temp dir created by this function; a caller-supplied
            ``work_dir`` is never removed.

    Returns:
        Storyboard describing the result. board.png_path is populated if
        output_path was provided (presumably by ``write_boards``).

    Raises:
        ValueError: if ``transcript_format`` is not supported, or if both
            ``transcribe`` and ``captions_path`` are given.
    """
    # Reject invalid argument combinations up front so a bad call fails
    # immediately instead of after the video has been probed.
    if transcript_format not in SUPPORTED_FORMATS:
        raise ValueError(
            f"Unknown transcript format '{transcript_format}'. "
            f"Supported: {', '.join(SUPPORTED_FORMATS)}"
        )
    if transcribe and captions_path:
        raise ValueError("Use either --transcribe or --captions, not both")

    video = probe(input_path)
    preset = preset_for(
        video,
        grid=grid,
        fps=fps,
        max_boards=max_boards,
        max_duration_seconds=max_duration_seconds,
    )

    # --- captions ---
    captions_data: List[dict] = []
    transcript_out_path: Optional[str] = None

    if transcribe:
        if output_path:
            transcript_out_path = str(
                _transcript_path_for(output_path, transcript_format)
            )
        # transcribe_video writes the transcript itself when given a path.
        captions_data = transcribe_video(
            input_path,
            output_path=transcript_out_path,
            format=transcript_format,
        )
    elif captions_path:
        captions_data = load_captions(captions_path)
        # Re-emit the loaded captions next to the board in the requested
        # format, but only when something was actually loaded.
        if output_path and captions_data:
            dest = _transcript_path_for(output_path, transcript_format)
            write_captions(captions_data, str(dest), format=transcript_format)
            transcript_out_path = str(dest)

    # --- frame extraction ---
    managed_work_dir = work_dir is None
    if managed_work_dir:
        work_dir = tempfile.mkdtemp(prefix="vidgrid-")
    else:
        Path(work_dir).mkdir(parents=True, exist_ok=True)

    try:
        per_board_samples = sample_video(video, preset, work_dir)

        # --- assemble Storyboard ---
        boards: List[Board] = []
        for i, samples in enumerate(per_board_samples, start=1):
            cells: List[Cell] = []
            for sample in samples:
                phrase = None
                if captions_data:
                    phrase = phrase_for_timestamp(
                        captions_data, sample.timestamp_ms
                    )
                cells.append(Cell(sample=sample, caption=phrase))
            boards.append(
                Board(index=i, layout=preset.layout, cells=cells)
            )

        storyboard = Storyboard(
            source=video,
            preset=preset,
            boards=boards,
            transcript_path=transcript_out_path,
        )

        if output_path:
            write_boards(
                storyboard, output_path, burn_captions=burn_captions
            )
            write_sidecar(storyboard, output_path)

        return storyboard
    finally:
        # Only the temp dir created above is ever removed.
        if managed_work_dir and cleanup_work_dir and work_dir:
            shutil.rmtree(work_dir, ignore_errors=True)


def _transcript_path_for(output_path: str, transcript_format: str) -> Path:
    """Return the transcript sidecar path that sits next to *output_path*.

    The sidecar keeps the board's directory and stem and appends
    ``-transcript.<format>``, e.g. ``grid.png`` -> ``grid-transcript.srt``.
    """
    board_path = Path(output_path)
    return board_path.with_name(
        board_path.stem + f"-transcript.{transcript_format}"
    )
|
|
Binary file
|
vidgrid/captions.py
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
"""Caption loading, writing, phrase windowing, and optional Whisper transcription.
|
|
2
|
+
|
|
3
|
+
Captions are represented internally as a list of Remotion-style dicts:
|
|
4
|
+
[{"text": " word", "startMs": int, "endMs": int, "timestampMs": int, "confidence": float}, ...]
|
|
5
|
+
|
|
6
|
+
vidgrid can read and write three on-disk formats:
|
|
7
|
+
|
|
8
|
+
1. **json** (default, Remotion-compatible)
|
|
9
|
+
Verbose but preserves word-level timing and confidence. Required by the
|
|
10
|
+
indiehacker-news pipeline because Remotion reads this exact shape.
|
|
11
|
+
|
|
12
|
+
2. **srt** (SubRip subtitle, industry standard)
|
|
13
|
+
Every video editor understands it. One entry per word. Easy to share,
|
|
14
|
+
easy to edit in a text editor.
|
|
15
|
+
|
|
16
|
+
3. **txt** (plain timestamped text)
|
|
17
|
+
The simplest possible format. One word per line, prefixed by its start
|
|
18
|
+
time in seconds. Trivial to parse, smallest file on disk.
|
|
19
|
+
|
|
20
|
+
Auto-detection on load uses the file extension. Writes either use an
|
|
21
|
+
explicit format argument or infer from the output path's extension.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import json
|
|
27
|
+
import re
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import List, Optional
|
|
30
|
+
|
|
31
|
+
from vidgrid.models import CaptionPhrase
|
|
32
|
+
|
|
33
|
+
# On-disk caption formats this module can write; write_captions() falls back
# to "json" when an output path's extension is not one of these.
SUPPORTED_FORMATS = ("json", "srt", "txt")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------- loading ----------
|
|
37
|
+
|
|
38
|
+
def load_captions(captions_path: str) -> List[dict]:
    """Load captions from disk, choosing the parser from the file extension.

    ``.srt`` and ``.txt`` files go through their dedicated parsers; any
    other extension is treated as JSON.

    The file is decoded as UTF-8 regardless of the platform's locale
    encoding. A leading byte-order mark is dropped and undecodable bytes
    are replaced rather than raising.

    Args:
        captions_path: path of the caption file to read.

    Returns:
        Caption dicts in the internal format. An empty list is returned
        when the path is not an existing regular file or when nothing could
        be parsed from it.
    """
    path = Path(captions_path)
    # is_file() (not exists()) so that a directory path also yields [].
    if not path.is_file():
        return []

    ext = path.suffix.lower().lstrip(".")
    # utf-8-sig strips a BOM, which would otherwise make json.loads fail
    # and hide the first SRT cue index.
    content = path.read_text(encoding="utf-8-sig", errors="replace")

    if ext == "srt":
        return _parse_srt(content)
    if ext == "txt":
        return _parse_txt(content)
    return _parse_json(content)


def _parse_json(content: str) -> List[dict]:
    """Parse a JSON caption list.

    Returns an empty list when the text is not valid JSON or its top-level
    value is not a list. List items that are not JSON objects are dropped,
    because every consumer in this module accesses captions via ``.get``.
    """
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        return []
    if not isinstance(data, list):
        return []
    return [entry for entry in data if isinstance(entry, dict)]


# Matches an SRT timing line: "HH:MM:SS,mmm --> HH:MM:SS,mmm". A '.' is
# accepted in place of ',' and the hour / millisecond fields may be short.
_SRT_TIME_RE = re.compile(
    r"(\d{1,2}):(\d{2}):(\d{2})[,\.](\d{1,3})\s*-->\s*"
    r"(\d{1,2}):(\d{2}):(\d{2})[,\.](\d{1,3})"
)


def _parse_srt(content: str) -> List[dict]:
    """Parse SRT text into the internal caption dict format.

    Cues are separated by blank lines. A cue may start with a numeric index
    line or go straight to the timing line. Cues without a recognisable
    timing line or without text are skipped. Multi-line cue text is joined
    with single spaces. Every caption gets ``confidence`` 1.0, and every
    caption after the first has its text prefixed with one space.
    """
    # Drop a stray BOM so the first index line is still recognised as digits
    # when this parser is handed raw text.
    content = content.lstrip("\ufeff")
    blocks = re.split(r"\n\s*\n", content.strip())
    captions: List[dict] = []
    for block in blocks:
        lines = [ln for ln in block.strip().splitlines() if ln.strip()]
        if len(lines) < 2:
            continue
        # First line is usually an index, but some SRT files skip it
        if lines[0].isdigit():
            # index + timing with no text line: nothing to emit
            time_line = lines[1] if len(lines) >= 3 else None
            text_lines = lines[2:]
        else:
            time_line = lines[0]
            text_lines = lines[1:]

        if not time_line:
            continue
        match = _SRT_TIME_RE.search(time_line)
        if not match:
            continue
        h1, m1, s1, ms1, h2, m2, s2, ms2 = (int(x) for x in match.groups())
        start_ms = h1 * 3600_000 + m1 * 60_000 + s1 * 1000 + ms1
        end_ms = h2 * 3600_000 + m2 * 60_000 + s2 * 1000 + ms2
        text = " ".join(text_lines).strip()
        if not text:
            continue
        captions.append(
            {
                "text": f" {text}" if captions else text,
                "startMs": start_ms,
                "endMs": end_ms,
                "timestampMs": start_ms,
                "confidence": 1.0,
            }
        )
    return captions


def _parse_txt(content: str) -> List[dict]:
    """Parse plain timestamped text (`<seconds> <word>` per line).

    Each line looks like:
        0.00 hello
        0.50 world

    Lines that are blank, have no text after the timestamp, or whose first
    field is not a finite number are skipped.

    The end timestamp of word N is set to the start of word N+1 (or
    start + 500ms for the last word).
    """
    captions: List[dict] = []
    for raw in content.splitlines():
        line = raw.strip()
        if not line:
            continue
        parts = line.split(None, 1)
        if len(parts) < 2:
            continue
        try:
            # float() accepts "inf"/"nan" (e.g. a line starting with the
            # word "inf"); round() then raises OverflowError / ValueError,
            # so the conversion must happen inside the try.
            start_ms = int(round(float(parts[0]) * 1000))
        except (ValueError, OverflowError):
            continue
        text = parts[1]
        captions.append(
            {
                "text": f" {text}" if captions else text,
                "startMs": start_ms,
                "endMs": start_ms,  # fixed below
                "timestampMs": start_ms,
                "confidence": 1.0,
            }
        )
    # Fill in endMs by using the next word's start, last word gets +500ms
    for current, following in zip(captions, captions[1:]):
        current["endMs"] = following["startMs"]
    if captions:
        captions[-1]["endMs"] = captions[-1]["startMs"] + 500
    return captions
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ---------- writing ----------
|
|
155
|
+
|
|
156
|
+
def write_captions(
    captions: List[dict],
    output_path: str,
    format: Optional[str] = None,
) -> str:
    """Write captions to disk in the requested format.

    The file is always written as UTF-8 so that non-ASCII caption text does
    not depend on (or fail under) the platform's locale encoding.

    Args:
        captions: caption dicts in the internal format.
        output_path: destination file path.
        format: 'json', 'srt' or 'txt'. If None, it is inferred from the
            extension of ``output_path``, falling back to 'json' for
            unrecognised extensions.

    Returns:
        The output path as a string (unchanged from input).

    Raises:
        ValueError: if an explicit ``format`` is not one of the supported
            formats. Nothing is written in that case.
    """
    path = Path(output_path)
    if format is None:
        ext = path.suffix.lower().lstrip(".")
        format = ext if ext in SUPPORTED_FORMATS else "json"

    if format == "json":
        payload = json.dumps(captions, indent=2)
    elif format == "srt":
        payload = _to_srt(captions)
    elif format == "txt":
        payload = _to_txt(captions)
    else:
        raise ValueError(
            f"Unknown caption format '{format}'. "
            f"Supported: {', '.join(SUPPORTED_FORMATS)}"
        )

    path.write_text(payload, encoding="utf-8")
    return str(path)


def _ms_to_srt_time(ms: int) -> str:
    """Convert milliseconds to SRT time format HH:MM:SS,mmm.

    The value is truncated to an int and negative values are clamped to 0.
    """
    ms = max(0, int(ms))
    h, ms = divmod(ms, 3600_000)
    m, ms = divmod(ms, 60_000)
    s, ms = divmod(ms, 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def _to_srt(captions: List[dict]) -> str:
    """Serialise captions as SRT text, one cue per caption.

    Captions whose text is empty after stripping are skipped. Cue indices
    count only the cues actually emitted, so they stay consecutive (1, 2,
    3, ...) even when entries are skipped. A caption without ``endMs`` ends
    500 ms after its start. The result always ends with a single newline.
    """
    cues: List[str] = []
    for cap in captions:
        text = str(cap.get("text", "")).strip()
        if not text:
            continue
        start = _ms_to_srt_time(cap.get("startMs", 0))
        end = _ms_to_srt_time(cap.get("endMs", cap.get("startMs", 0) + 500))
        cues.append(f"{len(cues) + 1}\n{start} --> {end}\n{text}\n")
    # Cues are separated by one blank line.
    return "\n".join(cues).rstrip() + "\n"


def _to_txt(captions: List[dict]) -> str:
    """Serialise captions as `<seconds> <text>` lines.

    Captions whose text is empty after stripping are skipped. Line breaks
    inside a caption's text are replaced by single spaces, because the
    format is strictly one entry per line. Returns an empty string when no
    caption has text.
    """
    lines: List[str] = []
    for cap in captions:
        pieces = [part.strip() for part in str(cap.get("text", "")).splitlines()]
        text = " ".join(piece for piece in pieces if piece)
        if not text:
            continue
        seconds = cap.get("startMs", 0) / 1000.0
        lines.append(f"{seconds:.2f} {text}")
    return "\n".join(lines) + ("\n" if lines else "")
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# ---------- phrase windowing ----------
|
|
222
|
+
|
|
223
|
+
def phrase_for_timestamp(
    captions: List[dict],
    t_ms: int,
    *,
    before_ms: int = 900,
    after_ms: int = 1400,
    max_words: int = 12,
    max_chars: int = 80,
) -> Optional[CaptionPhrase]:
    """Return a phrase window around a frame timestamp.

    Single isolated words look terrible under cells. Instead, we grab the
    words that overlap a window around the frame's timestamp and trim to a
    reasonable word/char count so the caption strip stays legible.

    Args:
        captions: caption dicts in the internal format.
        t_ms: frame timestamp in milliseconds.
        before_ms: how far the window reaches before ``t_ms``.
        after_ms: how far the window reaches after ``t_ms``.
        max_words: keep at most this many words; extra words are dropped
            and the text is marked with an ellipsis.
        max_chars: maximum length of the phrase text, including the
            trailing ellipsis added when the text is cut.

    Returns:
        A CaptionPhrase whose text is the joined words and whose
        start_ms / end_ms come from the first and last caption overlapping
        the window, or None when there are no captions, nothing overlaps
        the window, or the overlapping captions carry no visible text.
    """
    if not captions:
        return None

    window_start = t_ms - before_ms
    window_end = t_ms + after_ms

    in_window: List[dict] = []
    for cap in captions:
        start = cap.get("startMs", 0)
        end = cap.get("endMs", start)
        # Keep any caption whose [start, end] span touches the window.
        if end >= window_start and start <= window_end:
            in_window.append(cap)

    if not in_window:
        return None

    # Filter AFTER stripping: whitespace-only text is truthy but contributes
    # nothing, and would otherwise yield doubled spaces or an empty phrase.
    stripped = (str(cap.get("text") or "").strip() for cap in in_window)
    words = [word for word in stripped if word]
    if not words:
        return None

    if len(words) > max_words:
        words = words[:max_words]
        ellipsis = "…"
    else:
        ellipsis = ""

    text = " ".join(words).strip()
    if len(text) > max_chars:
        # Reserve one character for the ellipsis itself.
        text = text[: max_chars - 1].rstrip() + "…"
    elif ellipsis:
        text = text + ellipsis

    return CaptionPhrase(
        text=text,
        start_ms=in_window[0].get("startMs", 0),
        end_ms=in_window[-1].get("endMs", 0),
    )
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ---------- optional: transcribe a video directly ----------
|
|
278
|
+
|
|
279
|
+
def transcribe_video(
    video_path: str,
    output_path: Optional[str] = None,
    format: Optional[str] = None,
) -> List[dict]:
    """Transcribe a video's audio with faster-whisper.

    Uses the "base" model with int8 compute and word-level timestamps, and
    emits one caption dict per non-empty word. Every caption after the
    first has its text prefixed with one space.

    Args:
        video_path: path of the media file handed to faster-whisper.
        output_path: if provided, the captions are also written there via
            ``write_captions``.
        format: format passed to ``write_captions`` (auto-detected from the
            path extension if None).

    Returns:
        The captions list in the internal format.

    Raises:
        ImportError: if faster-whisper is not installed
        RuntimeError: if transcription fails
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as e:
        raise ImportError(
            "faster-whisper is required for --transcribe. "
            "Install with: pip install vidgrid[transcribe]"
        ) from e

    model = WhisperModel("base", compute_type="int8")

    captions: List[dict] = []
    try:
        segments, _ = model.transcribe(video_path, word_timestamps=True)
        # `segments` is a lazy generator: the audio is only decoded while it
        # is being iterated, so the loop must live inside the try for
        # failures to surface as the documented RuntimeError.
        for segment in segments:
            if not segment.words:
                continue
            for word_info in segment.words:
                word_text = word_info.word.strip()
                if not word_text:
                    continue
                caption_text = f" {word_text}" if captions else word_text
                start_ms = round(word_info.start * 1000)
                captions.append(
                    {
                        "text": caption_text,
                        "startMs": start_ms,
                        "endMs": round(word_info.end * 1000),
                        "timestampMs": start_ms,
                        # Missing or falsy probability falls back to 1.0.
                        "confidence": float(
                            getattr(word_info, "probability", 1.0) or 1.0
                        ),
                    }
                )
    except Exception as e:
        raise RuntimeError(f"Whisper transcription failed: {e}") from e

    if output_path:
        write_captions(captions, output_path, format=format)

    return captions
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def captions_to_prompt_text(captions: List[dict]) -> str:
    """Format captions as a plain-text transcript for LLM prompts.

    Groups words into short lines with timestamps at the start of each line.
    A line is closed once it reaches 80 characters or when a word ends with
    '.', '!' or '?'. The timestamp is the start of the line's first word,
    shown as minutes:seconds.

    Example:
        [0:00] we just hit three hundred thousand in MRR
        [0:05] and we still have no funding

    Args:
        captions: caption dicts in the internal format. Entries whose text
            is empty after stripping are ignored.

    Returns:
        The transcript lines joined by newlines; an empty string when there
        are no captions.
    """
    if not captions:
        return ""

    lines: List[str] = []
    current_words: List[str] = []
    current_start_ms: Optional[int] = None
    line_char_cap = 80

    def flush():
        """Emit the pending words as one timestamped line and reset."""
        nonlocal current_words, current_start_ms
        if not current_words:
            return
        # int() because startMs read back from JSON may be a float, and the
        # ':02d' format below rejects floats.
        ts_s = int(current_start_ms or 0) // 1000
        m, s = divmod(ts_s, 60)
        prefix = f"[{m}:{s:02d}] "
        lines.append(prefix + " ".join(current_words).strip())
        current_words = []
        current_start_ms = None

    for cap in captions:
        text = str(cap.get("text", "")).strip()
        if not text:
            continue
        if current_start_ms is None:
            current_start_ms = cap.get("startMs", 0)
        current_words.append(text)
        running = " ".join(current_words)
        if len(running) >= line_char_cap or text.endswith((".", "!", "?")):
            flush()

    # Emit whatever is left after the last sentence break.
    flush()
    return "\n".join(lines)
|