vidgrid 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vidgrid/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """vidgrid: convert video clips into annotated image grids for vision LLM analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ from vidgrid.api import render
8
+
9
+ __all__ = ["render", "__version__"]
vidgrid/__main__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Enable `python3 -m vidgrid` as a fallback when the console script is
2
+ not on PATH."""
3
+
4
+ from vidgrid.cli import main
5
+
6
+ if __name__ == "__main__":
7
+ raise SystemExit(main())
vidgrid/api.py ADDED
@@ -0,0 +1,166 @@
1
+ """Public Python API.
2
+
3
+ Integration scripts and other packages should import from here:
4
+
5
+ from vidgrid import render
6
+
7
+ storyboard = render(
8
+ input_path="clip.mp4",
9
+ output_path="grid.png",
10
+ grid="3x3", # or "2x2", "4x4", "5x5", or None for auto
11
+ )
12
+ print(storyboard.board_paths)
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import shutil
18
+ import tempfile
19
+ from pathlib import Path
20
+ from typing import List, Optional
21
+
22
+ from vidgrid.captions import (
23
+ SUPPORTED_FORMATS,
24
+ load_captions,
25
+ phrase_for_timestamp,
26
+ transcribe_video,
27
+ write_captions,
28
+ )
29
+ from vidgrid.models import Board, Cell, Storyboard
30
+ from vidgrid.output import write_boards, write_sidecar
31
+ from vidgrid.presets import DEFAULT_MAX_BOARDS, MAX_DURATION_SECONDS, preset_for
32
+ from vidgrid.probe import probe
33
+ from vidgrid.sample import sample_video
34
+
35
+ DEFAULT_TRANSCRIPT_FORMAT = "json"
36
+
37
+
38
def render(
    input_path: str,
    *,
    output_path: Optional[str] = None,
    grid: Optional[str] = None,
    fps: Optional[float] = None,
    max_boards: int = DEFAULT_MAX_BOARDS,
    max_duration_seconds: int = MAX_DURATION_SECONDS,
    captions_path: Optional[str] = None,
    transcribe: bool = False,
    burn_captions: bool = False,
    transcript_format: str = DEFAULT_TRANSCRIPT_FORMAT,
    work_dir: Optional[str] = None,
    cleanup_work_dir: bool = True,
) -> Storyboard:
    """Turn a video file into a Storyboard, optionally writing it to disk.

    Steps, in order: probe the input, resolve a preset, validate the
    caption arguments, obtain captions (by transcription or from a file),
    sample frames into a work directory, pair every sampled frame with a
    caption phrase, and finally write the boards plus sidecar when an
    output path was given.

    Args:
        input_path: path to the source video file.
        output_path: destination for the rendered output. When falsy,
            nothing is written and only the in-memory Storyboard is
            returned.
        grid: grid layout forwarded to ``preset_for`` (None lets it choose).
        fps: forwarded to ``preset_for``.
        max_boards: forwarded to ``preset_for``.
        max_duration_seconds: forwarded to ``preset_for``.
        captions_path: captions file to load with ``load_captions``.
            Mutually exclusive with ``transcribe``.
        transcribe: generate captions with ``transcribe_video``.
        burn_captions: forwarded to ``write_boards``.
        transcript_format: one of ``SUPPORTED_FORMATS``; also used as the
            extension of the ``<stem>-transcript.<format>`` sidecar that is
            written next to ``output_path``.
        work_dir: directory for intermediate frames. Created if missing;
            a temporary directory is made when None.
        cleanup_work_dir: remove the work directory afterwards. Only
            applies to the temporary directory created here; a
            caller-supplied ``work_dir`` is never deleted.

    Returns:
        The assembled Storyboard.

    Raises:
        ValueError: unknown ``transcript_format``, or both ``transcribe``
            and ``captions_path`` were given.
    """
    video = probe(input_path)
    preset = preset_for(
        video,
        grid=grid,
        fps=fps,
        max_boards=max_boards,
        max_duration_seconds=max_duration_seconds,
    )

    if transcript_format not in SUPPORTED_FORMATS:
        raise ValueError(
            f"Unknown transcript format '{transcript_format}'. "
            f"Supported: {', '.join(SUPPORTED_FORMATS)}"
        )
    if transcribe and captions_path:
        raise ValueError("Use either --transcribe or --captions, not both")

    # Transcript sidecar lives next to the output: "<stem>-transcript.<fmt>".
    sidecar_target: Optional[str] = None
    if output_path:
        out = Path(output_path)
        sidecar_target = str(
            out.with_name(out.stem + f"-transcript.{transcript_format}")
        )

    # --- captions ---
    captions_data: List[dict] = []
    transcript_out_path: Optional[str] = None
    if transcribe:
        transcript_out_path = sidecar_target
        captions_data = transcribe_video(
            input_path,
            output_path=transcript_out_path,
            format=transcript_format,
        )
    elif captions_path:
        captions_data = load_captions(captions_path)
        # Only re-export the transcript when there is something to export.
        if sidecar_target and captions_data:
            write_captions(
                captions_data, sidecar_target, format=transcript_format
            )
            transcript_out_path = sidecar_target

    # --- frame extraction ---
    owns_work_dir = work_dir is None
    if owns_work_dir:
        work_dir = tempfile.mkdtemp(prefix="vidgrid-")
    else:
        Path(work_dir).mkdir(parents=True, exist_ok=True)

    try:
        # --- assemble Storyboard ---
        boards: List[Board] = []
        board_number = 0
        for samples in sample_video(video, preset, work_dir):
            board_number += 1
            cells: List[Cell] = [
                Cell(
                    sample=frame,
                    caption=(
                        phrase_for_timestamp(captions_data, frame.timestamp_ms)
                        if captions_data
                        else None
                    ),
                )
                for frame in samples
            ]
            boards.append(
                Board(index=board_number, layout=preset.layout, cells=cells)
            )

        storyboard = Storyboard(
            source=video,
            preset=preset,
            boards=boards,
            transcript_path=transcript_out_path,
        )

        if output_path:
            write_boards(storyboard, output_path, burn_captions=burn_captions)
            write_sidecar(storyboard, output_path)

        return storyboard
    finally:
        # Never delete a directory the caller handed to us.
        if owns_work_dir and cleanup_work_dir and work_dir:
            shutil.rmtree(work_dir, ignore_errors=True)
vidgrid/captions.py ADDED
@@ -0,0 +1,374 @@
1
+ """Caption loading, writing, phrase windowing, and optional Whisper transcription.
2
+
3
+ Captions are represented internally as a list of Remotion-style dicts:
4
+ [{"text": " word", "startMs": int, "endMs": int, "timestampMs": int, "confidence": float}, ...]
5
+
6
+ vidgrid can read and write three on-disk formats:
7
+
8
+ 1. **json** (default, Remotion-compatible)
9
+ Verbose but preserves word-level timing and confidence. Required by the
10
+ indiehacker-news pipeline because Remotion reads this exact shape.
11
+
12
+ 2. **srt** (SubRip subtitle, industry standard)
13
+ Every video editor understands it. One entry per word. Easy to share,
14
+ easy to edit in a text editor.
15
+
16
+ 3. **txt** (plain timestamped text)
17
+ The simplest possible format. One word per line, prefixed by its start
18
+ time in seconds. Trivial to parse, smallest file on disk.
19
+
20
+ Auto-detection on load uses the file extension. Writes either use an
21
+ explicit format argument or infer from the output path's extension.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ import re
28
+ from pathlib import Path
29
+ from typing import List, Optional
30
+
31
+ from vidgrid.models import CaptionPhrase
32
+
33
+ SUPPORTED_FORMATS = ("json", "srt", "txt")
34
+
35
+
36
+ # ---------- loading ----------
37
+
38
def load_captions(captions_path: str) -> List[dict]:
    """Load captions from disk, picking the parser from the file extension.

    ``.srt`` and ``.txt`` get their dedicated parsers; every other
    extension is treated as JSON.

    The file is decoded as UTF-8 regardless of the platform locale, and a
    leading byte-order mark is dropped (``utf-8-sig``) so that files saved
    by Windows editors parse the same as BOM-less ones. Undecodable bytes
    are replaced rather than raising.

    Args:
        captions_path: path to the captions file.

    Returns:
        The parsed caption dicts, or an empty list when the path does not
        point to a regular file (missing, or a directory) or the content
        cannot be parsed.
    """
    path = Path(captions_path)
    # is_file() also rejects directories, which read_text() would choke on.
    if not path.is_file():
        return []

    ext = path.suffix.lower().lstrip(".")
    content = path.read_text(encoding="utf-8-sig", errors="replace")

    if ext == "srt":
        return _parse_srt(content)
    if ext == "txt":
        return _parse_txt(content)
    return _parse_json(content)
56
+
57
+
58
def _parse_json(content: str) -> List[dict]:
    """Parse a JSON captions document into a list of caption dicts.

    Args:
        content: the raw JSON text. A leading byte-order mark is ignored.

    Returns:
        The top-level JSON array with every non-object entry removed, or
        an empty list when the text is not valid JSON or its top-level
        value is not an array.
    """
    try:
        # json.loads rejects a str that starts with U+FEFF, so drop it first.
        data = json.loads(content.lstrip("\ufeff"))
    except json.JSONDecodeError:
        return []
    if not isinstance(data, list):
        return []
    # Downstream code calls .get() on every entry; keep only dicts so a
    # stray string/number/null in the array cannot crash it later.
    return [entry for entry in data if isinstance(entry, dict)]
66
+
67
+
68
# Matches an SRT/WebVTT-style timing line: "HH:MM:SS,mmm --> HH:MM:SS,mmm".
# Both "," and "." are accepted as the fraction separator, and the fraction
# may be 1-3 digits long.
_SRT_TIME_RE = re.compile(
    r"(\d{1,2}):(\d{2}):(\d{2})[,\.](\d{1,3})\s*-->\s*"
    r"(\d{1,2}):(\d{2}):(\d{2})[,\.](\d{1,3})"
)


def _srt_stamp_to_ms(hours: str, minutes: str, seconds: str, fraction: str) -> int:
    """Convert the captured pieces of one SRT timestamp to milliseconds."""
    # The fraction is a decimal fraction of a second, so ".5" means 500 ms,
    # not 5 ms: right-pad to three digits before converting.
    millis = int(fraction.ljust(3, "0"))
    return (
        int(hours) * 3600_000
        + int(minutes) * 60_000
        + int(seconds) * 1000
        + millis
    )


def _parse_srt(content: str) -> List[dict]:
    """Parse SRT text into the internal caption dict format.

    Blocks are separated by blank lines. A block may start with a numeric
    index line; if it does not, its first line is taken as the timing
    line. Blocks without a recognisable timing line or without any text
    are skipped. Multi-line cue text is joined with single spaces.

    Args:
        content: the SRT document. A leading byte-order mark is ignored.

    Returns:
        One dict per cue with ``text``, ``startMs``, ``endMs``,
        ``timestampMs`` (equal to ``startMs``) and ``confidence`` (always
        1.0). Every cue after the first has a leading space in ``text``.
    """
    # str.strip() does not remove U+FEFF, and a BOM glued to the first
    # index line would make isdigit() fail and drop the first cue.
    cleaned = content.lstrip("\ufeff").strip()
    captions: List[dict] = []
    for block in re.split(r"\n\s*\n", cleaned):
        lines = [ln for ln in block.strip().splitlines() if ln.strip()]
        if len(lines) < 2:
            continue
        # First line is usually an index, but some SRT files skip it.
        if lines[0].isdigit():
            if len(lines) < 3:
                # Index + timing but no text: nothing to caption.
                continue
            time_line, text_lines = lines[1], lines[2:]
        else:
            time_line, text_lines = lines[0], lines[1:]

        match = _SRT_TIME_RE.search(time_line)
        if not match:
            continue
        parts = match.groups()
        start_ms = _srt_stamp_to_ms(*parts[:4])
        end_ms = _srt_stamp_to_ms(*parts[4:])

        text = " ".join(ln.strip() for ln in text_lines).strip()
        if not text:
            continue
        captions.append(
            {
                "text": f" {text}" if captions else text,
                "startMs": start_ms,
                "endMs": end_ms,
                "timestampMs": start_ms,
                "confidence": 1.0,
            }
        )
    return captions
111
+
112
+
113
def _parse_txt(content: str) -> List[dict]:
    """Parse plain timestamped text (`<seconds> <word>` per line).

    Each line looks like:
        0.00 hello
        0.50 world

    Lines that are blank, have no text after the timestamp, or whose
    timestamp is not a finite number are skipped.

    The end timestamp of entry N is set to the start of entry N+1 (or
    start + 500ms for the last entry).

    Args:
        content: the document text. A leading byte-order mark is ignored.

    Returns:
        One dict per accepted line with ``text``, ``startMs``, ``endMs``,
        ``timestampMs`` (equal to ``startMs``) and ``confidence`` (always
        1.0). Every entry after the first has a leading space in ``text``.
    """
    captions: List[dict] = []
    for raw in content.lstrip("\ufeff").splitlines():
        parts = raw.strip().split(None, 1)
        if len(parts) < 2:
            continue
        try:
            # float() accepts "nan"/"inf"; converting those to int raises
            # ValueError/OverflowError, so the conversion stays inside
            # the try and such lines are skipped like any other bad one.
            start_ms = int(round(float(parts[0]) * 1000))
        except (ValueError, OverflowError):
            continue
        text = parts[1]
        captions.append(
            {
                "text": f" {text}" if captions else text,
                "startMs": start_ms,
                "endMs": start_ms,  # fixed below
                "timestampMs": start_ms,
                "confidence": 1.0,
            }
        )

    # Each entry ends where the next one starts; the last gets +500ms.
    for current, following in zip(captions, captions[1:]):
        current["endMs"] = following["startMs"]
    if captions:
        captions[-1]["endMs"] = captions[-1]["startMs"] + 500
    return captions
152
+
153
+
154
+ # ---------- writing ----------
155
+
156
def write_captions(
    captions: List[dict],
    output_path: str,
    format: Optional[str] = None,
) -> str:
    """Write captions to disk in the requested format.

    The file is always written as UTF-8 so that non-ASCII caption text
    does not depend on (or fail under) the platform's locale encoding.
    The payload is fully serialized before the file is opened, so an
    unknown format never leaves a truncated or empty file behind.

    Args:
        captions: caption dicts to serialize.
        output_path: destination file path.
        format: ``"json"``, ``"srt"`` or ``"txt"``. When None it is
            inferred from ``output_path``'s extension, falling back to
            JSON for unrecognised extensions.

    Returns:
        ``output_path`` as a string.

    Raises:
        ValueError: if an explicit ``format`` is not supported.
    """
    path = Path(output_path)
    if format is None:
        ext = path.suffix.lower().lstrip(".")
        format = ext if ext in SUPPORTED_FORMATS else "json"

    if format == "json":
        payload = json.dumps(captions, indent=2)
    elif format == "srt":
        payload = _to_srt(captions)
    elif format == "txt":
        payload = _to_txt(captions)
    else:
        raise ValueError(
            f"Unknown caption format '{format}'. "
            f"Supported: {', '.join(SUPPORTED_FORMATS)}"
        )

    path.write_text(payload, encoding="utf-8")
    return str(path)
184
+
185
+
186
def _ms_to_srt_time(ms: int) -> str:
    """Convert milliseconds to SRT time format HH:MM:SS,mmm.

    Negative inputs are clamped to zero and non-integer inputs are
    truncated with ``int()``.
    """
    ms = max(0, int(ms))
    h, ms = divmod(ms, 3600_000)
    m, ms = divmod(ms, 60_000)
    s, ms = divmod(ms, 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def _to_srt(captions: List[dict]) -> str:
    """Serialize captions as SubRip text, one cue per caption.

    Captions whose text is empty after stripping are omitted. Cue numbers
    are assigned to the cues that are actually written, so the sequence
    is always 1, 2, 3, ... with no gaps where a caption was skipped.

    A caption without ``endMs`` ends 500ms after its ``startMs``.

    Args:
        captions: caption dicts with ``text``, ``startMs`` and optionally
            ``endMs``.

    Returns:
        The SRT document, terminated by a single newline.
    """
    lines: List[str] = []
    cue_number = 0
    for cap in captions:
        text = str(cap.get("text", "")).strip()
        if not text:
            continue
        cue_number += 1
        start = _ms_to_srt_time(cap.get("startMs", 0))
        end = _ms_to_srt_time(cap.get("endMs", cap.get("startMs", 0) + 500))
        lines.append(str(cue_number))
        lines.append(f"{start} --> {end}")
        lines.append(text)
        lines.append("")
    return "\n".join(lines).rstrip() + "\n"
208
+
209
+
210
def _to_txt(captions: List[dict]) -> str:
    """Serialize captions as plain timestamped text.

    Each caption with non-empty text becomes one ``<seconds> <text>``
    line, where seconds is ``startMs`` (0 when absent) divided by 1000
    and formatted with two decimals. Returns an empty string when no
    caption has text; otherwise the result ends with a newline.
    """
    rows: List[str] = []
    for entry in captions:
        word = str(entry.get("text", "")).strip()
        if word:
            start_seconds = entry.get("startMs", 0) / 1000.0
            rows.append(f"{start_seconds:.2f} {word}")
    if not rows:
        return ""
    return "\n".join(rows) + "\n"
219
+
220
+
221
+ # ---------- phrase windowing ----------
222
+
223
def phrase_for_timestamp(
    captions: List[dict],
    t_ms: int,
    *,
    before_ms: int = 900,
    after_ms: int = 1400,
    max_words: int = 12,
    max_chars: int = 80,
) -> Optional["CaptionPhrase"]:
    """Return a phrase window around a frame timestamp.

    Single isolated words look terrible under cells. Instead, we grab the
    words whose time span overlaps ``[t_ms - before_ms, t_ms + after_ms]``
    and trim to a reasonable word/char count so the caption strip stays
    legible.

    Args:
        captions: caption dicts with ``text``, ``startMs`` and optionally
            ``endMs`` (a missing ``endMs`` is treated as ``startMs``).
        t_ms: frame timestamp in milliseconds.
        before_ms: how far before ``t_ms`` the window reaches.
        after_ms: how far after ``t_ms`` the window reaches.
        max_words: keep at most this many words (the first ones in the
            window); an ellipsis marks the cut.
        max_chars: hard cap on the phrase length, ellipsis included.

    Returns:
        A CaptionPhrase whose ``start_ms``/``end_ms`` span the captions
        found in the window, or None when there are no captions, none
        overlap the window, or none of the overlapping ones carries any
        visible text.
    """
    if not captions:
        return None

    window_start = t_ms - before_ms
    window_end = t_ms + after_ms

    in_window: List[dict] = []
    for cap in captions:
        start = cap.get("startMs", 0)
        end = cap.get("endMs", start)
        if end >= window_start and start <= window_end:
            in_window.append(cap)

    if not in_window:
        return None

    # Strip first, then filter: whitespace-only entries must not survive
    # as empty "words", otherwise the result is an empty/double-spaced
    # phrase instead of "no caption".
    stripped = (str(cap.get("text") or "").strip() for cap in in_window)
    words = [word for word in stripped if word]
    if not words:
        return None

    truncated = len(words) > max_words
    if truncated:
        words = words[:max_words]

    text = " ".join(words)
    if len(text) > max_chars:
        text = text[: max_chars - 1].rstrip() + "…"
    elif truncated:
        text += "…"

    first, last = in_window[0], in_window[-1]
    # Same fallback as the window scan above: no endMs means "ends where
    # it starts", never 0 (which would put the end before the start).
    last_start = last.get("startMs", 0)
    return CaptionPhrase(
        text=text,
        start_ms=first.get("startMs", 0),
        end_ms=last.get("endMs", last_start),
    )
275
+
276
+
277
+ # ---------- optional: transcribe a video directly ----------
278
+
279
def transcribe_video(
    video_path: str,
    output_path: Optional[str] = None,
    format: Optional[str] = None,
) -> List[dict]:
    """Transcribe a video's audio with faster-whisper.

    Uses the ``base`` model with ``int8`` compute and word-level
    timestamps. Words that are empty after stripping are dropped; every
    word after the first gets a leading space in ``text``.

    Args:
        video_path: path handed to ``WhisperModel.transcribe``.
        output_path: when given, the captions are also written there via
            ``write_captions``.
        format: output format for ``write_captions`` (auto-detected from
            the path extension if None).

    Returns:
        The list of word-level caption dicts.

    Raises:
        ImportError: if faster-whisper is not installed
        RuntimeError: if transcription fails
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as e:
        raise ImportError(
            "faster-whisper is required for --transcribe. "
            "Install with: pip install vidgrid[transcribe]"
        ) from e

    model = WhisperModel("base", compute_type="int8")

    captions: List[dict] = []
    try:
        segments, _ = model.transcribe(video_path, word_timestamps=True)
        # `segments` is a lazy generator: the audio is only decoded while
        # it is being iterated, so the loop has to live inside the try
        # for decode errors to surface as the documented RuntimeError.
        for segment in segments:
            if not segment.words:
                continue
            for word_info in segment.words:
                word_text = word_info.word.strip()
                if not word_text:
                    continue
                start_ms = round(word_info.start * 1000)
                # Only a missing probability defaults to 1.0; a genuine
                # 0.0 must not be promoted to full confidence.
                probability = getattr(word_info, "probability", None)
                captions.append(
                    {
                        "text": f" {word_text}" if captions else word_text,
                        "startMs": start_ms,
                        "endMs": round(word_info.end * 1000),
                        "timestampMs": start_ms,
                        "confidence": (
                            1.0 if probability is None else float(probability)
                        ),
                    }
                )
    except Exception as e:
        raise RuntimeError(f"Whisper transcription failed: {e}") from e

    if output_path:
        write_captions(captions, output_path, format=format)

    return captions
333
+
334
+
335
def captions_to_prompt_text(captions: List[dict]) -> str:
    """Render captions as a timestamped plain-text transcript for LLM prompts.

    Words are accumulated into a line until the line reaches 80
    characters or a word ends with ``.``, ``!`` or ``?``; each finished
    line is prefixed with the ``[m:ss]`` start time of its first word.
    Captions with empty text are ignored.

    Example:
        [0:00] we just hit three hundred thousand in MRR
        [0:05] and we still have no funding

    Returns:
        The lines joined by newlines (no trailing newline), or an empty
        string for empty input.
    """
    if not captions:
        return ""

    max_line_len = 80
    sentence_enders = (".", "!", "?")

    rendered: List[str] = []
    pending: List[str] = []
    pending_start: Optional[int] = None

    def emit() -> None:
        """Move the pending words into `rendered` as one stamped line."""
        nonlocal pending, pending_start
        if pending:
            minutes, seconds = divmod((pending_start or 0) // 1000, 60)
            body = " ".join(pending).strip()
            rendered.append(f"[{minutes}:{seconds:02d}] " + body)
            pending = []
            pending_start = None

    for entry in captions:
        word = str(entry.get("text", "")).strip()
        if not word:
            continue
        if pending_start is None:
            # First word of a new line fixes the line's timestamp.
            pending_start = entry.get("startMs", 0)
        pending.append(word)
        line_full = len(" ".join(pending)) >= max_line_len
        if line_full or word.endswith(sentence_enders):
            emit()

    emit()  # whatever is left after the last caption
    return "\n".join(rendered)