vidgrid 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vidgrid-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pawel Kozlowski
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
vidgrid-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,315 @@
1
+ Metadata-Version: 2.4
2
+ Name: vidgrid
3
+ Version: 0.1.0
4
+ Summary: Convert video clips into annotated image grids for vision LLM analysis
5
+ Author: Paw Vej
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/pawvej/vidgrid
8
+ Project-URL: Issues, https://github.com/pawvej/vidgrid/issues
9
+ Keywords: video,llm,vision,storyboard,claude,gpt,gemini,ffmpeg
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Multimedia :: Video
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Intended Audience :: Developers
19
+ Classifier: Operating System :: OS Independent
20
+ Requires-Python: >=3.9
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: Pillow>=10.0.0
24
+ Provides-Extra: transcribe
25
+ Requires-Dist: faster-whisper>=1.0.0; extra == "transcribe"
26
+ Provides-Extra: anthropic
27
+ Requires-Dist: anthropic>=0.39.0; extra == "anthropic"
28
+ Provides-Extra: openai
29
+ Requires-Dist: openai>=1.50.0; extra == "openai"
30
+ Provides-Extra: gemini
31
+ Requires-Dist: google-genai>=0.3.0; extra == "gemini"
32
+ Provides-Extra: llm
33
+ Requires-Dist: anthropic>=0.39.0; extra == "llm"
34
+ Requires-Dist: openai>=1.50.0; extra == "llm"
35
+ Requires-Dist: google-genai>=0.3.0; extra == "llm"
36
+ Provides-Extra: all
37
+ Requires-Dist: faster-whisper>=1.0.0; extra == "all"
38
+ Requires-Dist: anthropic>=0.39.0; extra == "all"
39
+ Requires-Dist: openai>=1.50.0; extra == "all"
40
+ Requires-Dist: google-genai>=0.3.0; extra == "all"
41
+ Provides-Extra: dev
42
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
43
+ Requires-Dist: pytest-mock>=3.14.0; extra == "dev"
44
+ Dynamic: license-file
45
+
46
+ # vidgrid
47
+
48
+ [![PyPI](https://img.shields.io/pypi/v/vidgrid?style=flat-square&color=1f4fd1)](https://pypi.org/project/vidgrid/)
49
+ [![Python](https://img.shields.io/pypi/pyversions/vidgrid?style=flat-square)](https://pypi.org/project/vidgrid/)
50
+ [![License: MIT](https://img.shields.io/badge/license-MIT-black.svg?style=flat-square)](LICENSE)
51
+ [![Downloads](https://img.shields.io/pypi/dm/vidgrid?style=flat-square&color=e03127)](https://pypi.org/project/vidgrid/)
52
+ [![Hosted](https://img.shields.io/badge/hosted-vidgrid.site-e03127?style=flat-square)](https://vidgrid.site)
53
+
54
+ > Convert video clips into annotated image grids for vision LLM analysis.
55
+ > **One cell = one second, by default.**
56
+
57
+ ![vidgrid example — a 3×3 grid generated from "Me at the zoo" with numbered cells, top-right timestamps, and real auto-captions burned in](docs/img/hero.jpg)
58
+
59
+ LLMs can't watch video, but they can analyze a single image. `vidgrid` samples
60
+ one frame per second from a video, tiles them into a numbered storyboard with
61
+ timestamps, and optionally sends the result to Claude, GPT, or Gemini with a
62
+ prompt. The result is something close to "my LLM just watched a video" for
63
+ the cost of a handful of image uploads.
64
+
65
+ **Don't want to install?** Use the hosted version at
66
+ [vidgrid.site](https://vidgrid.site) — drop a file, get the grid in the
67
+ browser. 3 free renders, $5 lifetime after that. Free forever on the CLI.
68
+
69
+ ## The model
70
+
71
+ **One cell = one second, by default.** The auto-picker chooses the smallest
72
+ grid (biggest, most-legible cells) whose board count stays under
73
+ `--max-boards` (default 10). When that's not enough for a long clip, it
74
+ bumps the grid up; as a last resort, it reduces the sampling rate. Override
75
+ with `--fps` and `--max-boards` for full control.
76
+
77
+ - Grid size determines how many seconds fit in one photo
78
+ - Default sampling is 1fps; drops below 1fps only when needed to stay under the max-boards cap
79
+ - Videos over 5 minutes are rejected (chop them up first)
80
+
81
+ | Grid | Cells | Seconds per photo | Best for |
82
+ |---|---|---|---|
83
+ | `2x2` | 4 | 4 | Very short clips (2–4s) |
84
+ | `3x3` | **9** | **9** | **Default — best overall readability** |
85
+ | `4x4` | 16 | 16 | More compact, cells get smaller |
86
+ | `5x5` | 25 | 25 | Experimental — cells small, LLM accuracy drops |
87
+
88
+ **Quality degrades with bigger grids.** Cells shrink, detail is lost, and the
89
+ LLM has a harder time reading fine content like text or UI elements. Stick
90
+ with 3×3 unless you specifically need to pack more seconds into one photo.
91
+ 5×5 exists mostly as a "let me see what happens" option.
92
+
93
+ ## How many photos a video produces
94
+
95
+ At 1fps sampling, the board count at each grid size:
96
+
97
+ | Video length | 2×2 | 3×3 | 4×4 | 5×5 |
98
+ |---|---|---|---|---|
99
+ | 3s | **1** (partial) | 1 (partial) | 1 (partial) | 1 (partial) |
100
+ | 9s | **3** | 1 | 1 (partial) | 1 (partial) |
101
+ | 25s | **7** | 3 | 2 | 1 |
102
+ | 60s | 15 | **7** | 4 | 3 |
103
+ | 186s (~3 min) | 47 | 21 | **12** | 8 |
104
+ | 300s (5 min, cap) | 75 | 34 | **19**¹ | 12 |
105
+
106
+ **Bold** = what `auto` picks — the smallest grid (biggest cells) that
107
+ keeps the board count under `--max-boards` (default 10).
108
+
109
+ ¹ Where the bolded count is above 10, even 4×4 exceeds 10 boards at 1fps, so
110
+ auto drops the sampling rate (≈1 cell per 1.9s at the 5-min cap) to land at the
111
+ 10-board limit. Use `--fps 1.0 --max-boards 20` to preserve 1fps and accept more boards.
112
+
113
+ Most vision LLMs accept ~10–20 images per request, so auto's default
114
+ ceiling of 10 keeps a full video inside a single model call.
115
+
116
+ ## Install
117
+
118
+ ```bash
119
+ pip install vidgrid # core renderer only
120
+ pip install vidgrid[transcribe] # + faster-whisper for --transcribe
121
+ pip install vidgrid[anthropic] # + Claude support via --ask
122
+ pip install vidgrid[llm] # + Claude + GPT + Gemini
123
+ pip install vidgrid[all] # everything
124
+ ```
125
+
126
+ Requires Python 3.9+ and `ffmpeg` on your `PATH`.
127
+
128
+ ## Quick start
129
+
130
+ ```bash
131
+ # 1. Auto-pick grid and sampling rate — smallest grid that fits in 10 boards
132
+ vidgrid clip.mp4 -o grid.png
133
+
134
+ # 2. Force a specific grid
135
+ vidgrid clip.mp4 -o grid.png --grid 4x4
136
+
137
+ # 3. Force a sampling rate — 0.5fps = 1 cell every 2 seconds
138
+ vidgrid long-clip.mp4 -o grid.png --fps 0.5
139
+
140
+ # 4. Raise the max-boards ceiling (default 10) if you want more boards
141
+ vidgrid lecture.mp4 -o grid.png --max-boards 20
142
+
143
+ # 5. Render + auto-transcribe + send to Claude in one call
144
+ vidgrid lecture.mp4 --transcribe --ask "bullet-point summary"
145
+
146
+ # 6. Use existing Whisper captions, burn them onto the grid
147
+ vidgrid interview.mp4 -o grid.png --captions whisper.json --burn-captions
148
+
149
+ # 7. Run via python -m if the vidgrid console script isn't on PATH
150
+ python3 -m vidgrid clip.mp4 -o grid.png
151
+ ```
152
+
153
+ ## Three things you can do with it
154
+
155
+ ### 1. Summarize a talk without watching it
156
+
157
+ ```bash
158
+ vidgrid "team-meeting.mp4" \
159
+ --transcribe \
160
+ --ask "list the decisions made and who owns each" \
161
+ --model claude-opus-4-7
162
+ ```
163
+
164
+ vidgrid samples one frame per second, runs Whisper on the audio, sends the
165
+ grid + transcript to Claude, and prints the answer. The model correlates
166
+ frames and words via the burned-in timestamps.
167
+
168
+ ### 2. Find a specific moment in a screen recording
169
+
170
+ ```bash
171
+ vidgrid bug-repro.mp4 --grid 3x3 \
172
+ --ask "at which numbered frame does the error dialog appear?" \
173
+ --model gpt-5
174
+ ```
175
+
176
+ Because cells are globally numbered (1, 2, 3...) and tagged with timestamps,
177
+ the model can point you at the exact moment. No scrubbing.
178
+
179
+ ### 3. Rank a pile of stock footage
180
+
181
+ ```bash
182
+ for clip in broll/*.mp4; do
183
+ vidgrid "$clip" -o "grids/$(basename "$clip" .mp4).png"
184
+ done
185
+ ```
186
+
187
+ Send the PNGs to Claude in a single request and ask it to rank or reject
188
+ clips against your shot list. This is the workflow vidgrid was built for.
189
+
190
+ ## Portrait vs landscape
191
+
192
+ vidgrid keeps the grid shape square (N×N) regardless of source orientation
193
+ and preserves the source aspect inside each cell. Landscape sources produce
194
+ wide boards; portrait sources produce tall boards. Cells are never cropped.
195
+
196
+ ## Two-layer captions (default)
197
+
198
+ The default mode gives the LLM **two correlated inputs**: the rendered grid
199
+ image AND the Whisper transcript as separate text. The model correlates them
200
+ via the timestamps printed on each cell.
201
+
202
+ This beats burning captions into the image because:
203
+
204
+ 1. Frames keep their pixels for actual content
205
+ 2. Text is higher fidelity as tokens than as baked-in pixels
206
+ 3. The grid stays clean and shareable
207
+
208
+ Add `--burn-captions` if you want a self-contained image (useful for sharing
209
+ or offline analysis).
210
+
211
+ ## Caption file formats
212
+
213
+ vidgrid reads and writes three caption formats. The `--captions` flag
214
+ auto-detects from the file extension. The `--transcript-format` flag
215
+ controls what `--transcribe` writes.
216
+
217
+ | Format | Extension | Size (36 words) | When to use |
218
+ |---|---|---|---|
219
+ | `json` | `.json` | ~4.8 KB | Remotion pipelines, tools that need word confidence |
220
+ | `srt` | `.srt` | ~1.4 KB | Video editors, universal subtitle format |
221
+ | `txt` | `.txt` | ~0.4 KB | Smallest, grep-friendly, trivial to parse |
222
+
223
+ **JSON** (default, Remotion-compatible):
224
+ ```json
225
+ [
226
+ {"text": "hello", "startMs": 0, "endMs": 500, "timestampMs": 0, "confidence": 0.98},
227
+ ...
228
+ ]
229
+ ```
230
+
231
+ **SRT** (SubRip subtitles):
232
+ ```
233
+ 1
234
+ 00:00:00,000 --> 00:00:00,500
235
+ hello
236
+
237
+ 2
238
+ 00:00:00,500 --> 00:00:01,000
239
+ world
240
+ ```
241
+
242
+ **TXT** (plain timestamped text, one word per line):
243
+ ```
244
+ 0.00 hello
245
+ 0.50 world
246
+ ```
247
+
248
+ Use any format as input, output, or both. You can mix — read an `.srt` and
249
+ write a `.txt` with `--captions foo.srt --transcript-format txt`.
250
+
251
+ ## Python API
252
+
253
+ ```python
254
+ from vidgrid import render
255
+
256
+ storyboard = render(
257
+ input_path="interview.mp4",
258
+ output_path="grid.png",
259
+ grid="3x3", # or "2x2", "4x4", "5x5", or None for auto
260
+ transcribe=True,
261
+ )
262
+
263
+ print(storyboard.board_paths) # ['grid-1.png', 'grid-2.png', ...]
264
+ print(storyboard.transcript_path) # 'grid-transcript.json'
265
+ print(storyboard.all_samples) # list[Sample] with timestamps
266
+ ```
267
+
268
+ Modules: `vidgrid.probe`, `vidgrid.sample`, `vidgrid.compose`,
269
+ `vidgrid.captions`, `vidgrid.llm`, `vidgrid.presets`.
270
+
271
+ ## Output structure
272
+
273
+ **Single-board run:**
274
+ ```
275
+ grid.png # the storyboard
276
+ grid.json # sidecar: timestamps, layout, source info
277
+ grid-transcript.json # only if --transcribe or --captions was used
278
+ ```
279
+
280
+ **Multi-board run:**
281
+ ```
282
+ grid-1.png, grid-2.png, grid-3.png, ...
283
+ grid.json # index covering all boards + global cell numbering
284
+ grid-transcript.json
285
+ ```
286
+
287
+ Cells are numbered **globally** across boards. A 3-board 3×3 run has cells 1–27,
288
+ so the LLM can reference any frame without ambiguity.
289
+
290
+ ## Limits and caveats
291
+
292
+ - **5-minute hard cap on video length.** Longer videos are rejected. Chop
293
+ them up with `ffmpeg -ss START -t 300 -i input.mp4 chunk.mp4`.
294
+ - **No scene detection.** v1 samples at a strictly uniform rate (1 frame per
295
+ second by default). No dedupe, no shifting — the spacing between cells is constant.
296
+ - **Variable-framerate videos** may have sub-frame seek drift (≤1 frame),
297
+ which is acceptable at 1fps sampling.
298
+ - **Bigger grids hurt legibility.** A 5×5 grid has cells ~300px wide; fine
299
+ for people and objects, marginal for dense text or code. Stick with 3×3.
300
+ - **LLM integration** uses the official SDKs (anthropic, openai, google-genai)
301
+ and won't be installed unless you request them as extras.
302
+
303
+ ## Prior art
304
+
305
+ - [IG-VLM](https://arxiv.org/abs/2403.18406) — research paper proving the grid trick works
306
+ - [llm-video-frames](https://github.com/simonw/llm-video-frames) — Simon Willison's per-frame approach
307
+ - [vcsi](https://github.com/amietn/vcsi) — contact sheets without LLMs
308
+ - [byjlw/video-analyzer](https://github.com/byjlw/video-analyzer) — whisper + sequential frames
309
+
310
+ vidgrid's differentiator: **1 cell = 1 second, numbered cells, simple CLI,
311
+ multi-provider LLM integration in one package**.
312
+
313
+ ## License
314
+
315
+ MIT. The bundled Source Sans 3 font is licensed under [SIL OFL 1.1](vidgrid/assets/fonts/LICENSE-SourceSans3.md).
@@ -0,0 +1,270 @@
1
+ # vidgrid
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/vidgrid?style=flat-square&color=1f4fd1)](https://pypi.org/project/vidgrid/)
4
+ [![Python](https://img.shields.io/pypi/pyversions/vidgrid?style=flat-square)](https://pypi.org/project/vidgrid/)
5
+ [![License: MIT](https://img.shields.io/badge/license-MIT-black.svg?style=flat-square)](LICENSE)
6
+ [![Downloads](https://img.shields.io/pypi/dm/vidgrid?style=flat-square&color=e03127)](https://pypi.org/project/vidgrid/)
7
+ [![Hosted](https://img.shields.io/badge/hosted-vidgrid.site-e03127?style=flat-square)](https://vidgrid.site)
8
+
9
+ > Convert video clips into annotated image grids for vision LLM analysis.
10
+ > **One cell = one second, by default.**
11
+
12
+ ![vidgrid example — a 3×3 grid generated from "Me at the zoo" with numbered cells, top-right timestamps, and real auto-captions burned in](docs/img/hero.jpg)
13
+
14
+ LLMs can't watch video, but they can analyze a single image. `vidgrid` samples
15
+ one frame per second from a video, tiles them into a numbered storyboard with
16
+ timestamps, and optionally sends the result to Claude, GPT, or Gemini with a
17
+ prompt. The result is something close to "my LLM just watched a video" for
18
+ the cost of a handful of image uploads.
19
+
20
+ **Don't want to install?** Use the hosted version at
21
+ [vidgrid.site](https://vidgrid.site) — drop a file, get the grid in the
22
+ browser. 3 free renders, $5 lifetime after that. Free forever on the CLI.
23
+
24
+ ## The model
25
+
26
+ **One cell = one second, by default.** The auto-picker chooses the smallest
27
+ grid (biggest, most-legible cells) whose board count stays under
28
+ `--max-boards` (default 10). When that's not enough for a long clip, it
29
+ bumps the grid up; as a last resort, it reduces the sampling rate. Override
30
+ with `--fps` and `--max-boards` for full control.
31
+
32
+ - Grid size determines how many seconds fit in one photo
33
+ - Default sampling is 1fps; drops below 1fps only when needed to stay under the max-boards cap
34
+ - Videos over 5 minutes are rejected (chop them up first)
35
+
36
+ | Grid | Cells | Seconds per photo | Best for |
37
+ |---|---|---|---|
38
+ | `2x2` | 4 | 4 | Very short clips (2–4s) |
39
+ | `3x3` | **9** | **9** | **Default — best overall readability** |
40
+ | `4x4` | 16 | 16 | More compact, cells get smaller |
41
+ | `5x5` | 25 | 25 | Experimental — cells small, LLM accuracy drops |
42
+
43
+ **Quality degrades with bigger grids.** Cells shrink, detail is lost, and the
44
+ LLM has a harder time reading fine content like text or UI elements. Stick
45
+ with 3×3 unless you specifically need to pack more seconds into one photo.
46
+ 5×5 exists mostly as a "let me see what happens" option.
47
+
48
+ ## How many photos a video produces
49
+
50
+ At 1fps sampling, the board count at each grid size:
51
+
52
+ | Video length | 2×2 | 3×3 | 4×4 | 5×5 |
53
+ |---|---|---|---|---|
54
+ | 3s | **1** (partial) | 1 (partial) | 1 (partial) | 1 (partial) |
55
+ | 9s | **3** | 1 | 1 (partial) | 1 (partial) |
56
+ | 25s | **7** | 3 | 2 | 1 |
57
+ | 60s | 15 | **7** | 4 | 3 |
58
+ | 186s (~3 min) | 47 | 21 | **12** | 8 |
59
+ | 300s (5 min, cap) | 75 | 34 | **19**¹ | 12 |
60
+
61
+ **Bold** = what `auto` picks — the smallest grid (biggest cells) that
62
+ keeps the board count under `--max-boards` (default 10).
63
+
64
+ ¹ Where the bolded count is above 10, even 4×4 exceeds 10 boards at 1fps, so
65
+ auto drops the sampling rate (≈1 cell per 1.9s at the 5-min cap) to land at the
66
+ 10-board limit. Use `--fps 1.0 --max-boards 20` to preserve 1fps and accept more boards.
67
+
68
+ Most vision LLMs accept ~10–20 images per request, so auto's default
69
+ ceiling of 10 keeps a full video inside a single model call.
70
+
71
+ ## Install
72
+
73
+ ```bash
74
+ pip install vidgrid # core renderer only
75
+ pip install vidgrid[transcribe] # + faster-whisper for --transcribe
76
+ pip install vidgrid[anthropic] # + Claude support via --ask
77
+ pip install vidgrid[llm] # + Claude + GPT + Gemini
78
+ pip install vidgrid[all] # everything
79
+ ```
80
+
81
+ Requires Python 3.9+ and `ffmpeg` on your `PATH`.
82
+
83
+ ## Quick start
84
+
85
+ ```bash
86
+ # 1. Auto-pick grid and sampling rate — smallest grid that fits in 10 boards
87
+ vidgrid clip.mp4 -o grid.png
88
+
89
+ # 2. Force a specific grid
90
+ vidgrid clip.mp4 -o grid.png --grid 4x4
91
+
92
+ # 3. Force a sampling rate — 0.5fps = 1 cell every 2 seconds
93
+ vidgrid long-clip.mp4 -o grid.png --fps 0.5
94
+
95
+ # 4. Raise the max-boards ceiling (default 10) if you want more boards
96
+ vidgrid lecture.mp4 -o grid.png --max-boards 20
97
+
98
+ # 5. Render + auto-transcribe + send to Claude in one call
99
+ vidgrid lecture.mp4 --transcribe --ask "bullet-point summary"
100
+
101
+ # 6. Use existing Whisper captions, burn them onto the grid
102
+ vidgrid interview.mp4 -o grid.png --captions whisper.json --burn-captions
103
+
104
+ # 7. Run via python -m if the vidgrid console script isn't on PATH
105
+ python3 -m vidgrid clip.mp4 -o grid.png
106
+ ```
107
+
108
+ ## Three things you can do with it
109
+
110
+ ### 1. Summarize a talk without watching it
111
+
112
+ ```bash
113
+ vidgrid "team-meeting.mp4" \
114
+ --transcribe \
115
+ --ask "list the decisions made and who owns each" \
116
+ --model claude-opus-4-7
117
+ ```
118
+
119
+ vidgrid samples one frame per second, runs Whisper on the audio, sends the
120
+ grid + transcript to Claude, and prints the answer. The model correlates
121
+ frames and words via the burned-in timestamps.
122
+
123
+ ### 2. Find a specific moment in a screen recording
124
+
125
+ ```bash
126
+ vidgrid bug-repro.mp4 --grid 3x3 \
127
+ --ask "at which numbered frame does the error dialog appear?" \
128
+ --model gpt-5
129
+ ```
130
+
131
+ Because cells are globally numbered (1, 2, 3...) and tagged with timestamps,
132
+ the model can point you at the exact moment. No scrubbing.
133
+
134
+ ### 3. Rank a pile of stock footage
135
+
136
+ ```bash
137
+ for clip in broll/*.mp4; do
138
+ vidgrid "$clip" -o "grids/$(basename "$clip" .mp4).png"
139
+ done
140
+ ```
141
+
142
+ Send the PNGs to Claude in a single request and ask it to rank or reject
143
+ clips against your shot list. This is the workflow vidgrid was built for.
144
+
145
+ ## Portrait vs landscape
146
+
147
+ vidgrid keeps the grid shape square (N×N) regardless of source orientation
148
+ and preserves the source aspect inside each cell. Landscape sources produce
149
+ wide boards; portrait sources produce tall boards. Cells are never cropped.
150
+
151
+ ## Two-layer captions (default)
152
+
153
+ The default mode gives the LLM **two correlated inputs**: the rendered grid
154
+ image AND the Whisper transcript as separate text. The model correlates them
155
+ via the timestamps printed on each cell.
156
+
157
+ This beats burning captions into the image because:
158
+
159
+ 1. Frames keep their pixels for actual content
160
+ 2. Text is higher fidelity as tokens than as baked-in pixels
161
+ 3. The grid stays clean and shareable
162
+
163
+ Add `--burn-captions` if you want a self-contained image (useful for sharing
164
+ or offline analysis).
165
+
166
+ ## Caption file formats
167
+
168
+ vidgrid reads and writes three caption formats. The `--captions` flag
169
+ auto-detects from the file extension. The `--transcript-format` flag
170
+ controls what `--transcribe` writes.
171
+
172
+ | Format | Extension | Size (36 words) | When to use |
173
+ |---|---|---|---|
174
+ | `json` | `.json` | ~4.8 KB | Remotion pipelines, tools that need word confidence |
175
+ | `srt` | `.srt` | ~1.4 KB | Video editors, universal subtitle format |
176
+ | `txt` | `.txt` | ~0.4 KB | Smallest, grep-friendly, trivial to parse |
177
+
178
+ **JSON** (default, Remotion-compatible):
179
+ ```json
180
+ [
181
+ {"text": "hello", "startMs": 0, "endMs": 500, "timestampMs": 0, "confidence": 0.98},
182
+ ...
183
+ ]
184
+ ```
185
+
186
+ **SRT** (SubRip subtitles):
187
+ ```
188
+ 1
189
+ 00:00:00,000 --> 00:00:00,500
190
+ hello
191
+
192
+ 2
193
+ 00:00:00,500 --> 00:00:01,000
194
+ world
195
+ ```
196
+
197
+ **TXT** (plain timestamped text, one word per line):
198
+ ```
199
+ 0.00 hello
200
+ 0.50 world
201
+ ```
202
+
203
+ Use any format as input, output, or both. You can mix — read an `.srt` and
204
+ write a `.txt` with `--captions foo.srt --transcript-format txt`.
205
+
206
+ ## Python API
207
+
208
+ ```python
209
+ from vidgrid import render
210
+
211
+ storyboard = render(
212
+ input_path="interview.mp4",
213
+ output_path="grid.png",
214
+ grid="3x3", # or "2x2", "4x4", "5x5", or None for auto
215
+ transcribe=True,
216
+ )
217
+
218
+ print(storyboard.board_paths) # ['grid-1.png', 'grid-2.png', ...]
219
+ print(storyboard.transcript_path) # 'grid-transcript.json'
220
+ print(storyboard.all_samples) # list[Sample] with timestamps
221
+ ```
222
+
223
+ Modules: `vidgrid.probe`, `vidgrid.sample`, `vidgrid.compose`,
224
+ `vidgrid.captions`, `vidgrid.llm`, `vidgrid.presets`.
225
+
226
+ ## Output structure
227
+
228
+ **Single-board run:**
229
+ ```
230
+ grid.png # the storyboard
231
+ grid.json # sidecar: timestamps, layout, source info
232
+ grid-transcript.json # only if --transcribe or --captions was used
233
+ ```
234
+
235
+ **Multi-board run:**
236
+ ```
237
+ grid-1.png, grid-2.png, grid-3.png, ...
238
+ grid.json # index covering all boards + global cell numbering
239
+ grid-transcript.json
240
+ ```
241
+
242
+ Cells are numbered **globally** across boards. A 3-board 3×3 run has cells 1–27,
243
+ so the LLM can reference any frame without ambiguity.
244
+
245
+ ## Limits and caveats
246
+
247
+ - **5-minute hard cap on video length.** Longer videos are rejected. Chop
248
+ them up with `ffmpeg -ss START -t 300 -i input.mp4 chunk.mp4`.
249
+ - **No scene detection.** v1 samples at a strictly uniform rate (1 frame per
250
+ second by default). No dedupe, no shifting — the spacing between cells is constant.
251
+ - **Variable-framerate videos** may have sub-frame seek drift (≤1 frame),
252
+ which is acceptable at 1fps sampling.
253
+ - **Bigger grids hurt legibility.** A 5×5 grid has cells ~300px wide; fine
254
+ for people and objects, marginal for dense text or code. Stick with 3×3.
255
+ - **LLM integration** uses the official SDKs (anthropic, openai, google-genai)
256
+ and won't be installed unless you request them as extras.
257
+
258
+ ## Prior art
259
+
260
+ - [IG-VLM](https://arxiv.org/abs/2403.18406) — research paper proving the grid trick works
261
+ - [llm-video-frames](https://github.com/simonw/llm-video-frames) — Simon Willison's per-frame approach
262
+ - [vcsi](https://github.com/amietn/vcsi) — contact sheets without LLMs
263
+ - [byjlw/video-analyzer](https://github.com/byjlw/video-analyzer) — whisper + sequential frames
264
+
265
+ vidgrid's differentiator: **1 cell = 1 second, numbered cells, simple CLI,
266
+ multi-provider LLM integration in one package**.
267
+
268
+ ## License
269
+
270
+ MIT. The bundled Source Sans 3 font is licensed under [SIL OFL 1.1](vidgrid/assets/fonts/LICENSE-SourceSans3.md).
@@ -0,0 +1,70 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "vidgrid"
7
+ version = "0.1.0"
8
+ description = "Convert video clips into annotated image grids for vision LLM analysis"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Paw Vej" }]
13
+ keywords = ["video", "llm", "vision", "storyboard", "claude", "gpt", "gemini", "ffmpeg"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3.9",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Topic :: Multimedia :: Video",
22
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
23
+ "Intended Audience :: Developers",
24
+ "Operating System :: OS Independent",
25
+ ]
26
+ dependencies = [
27
+ "Pillow>=10.0.0",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ transcribe = ["faster-whisper>=1.0.0"]
32
+ anthropic = ["anthropic>=0.39.0"]
33
+ openai = ["openai>=1.50.0"]
34
+ gemini = ["google-genai>=0.3.0"]
35
+ llm = [
36
+ "anthropic>=0.39.0",
37
+ "openai>=1.50.0",
38
+ "google-genai>=0.3.0",
39
+ ]
40
+ all = [
41
+ "faster-whisper>=1.0.0",
42
+ "anthropic>=0.39.0",
43
+ "openai>=1.50.0",
44
+ "google-genai>=0.3.0",
45
+ ]
46
+ dev = [
47
+ "pytest>=8.0.0",
48
+ "pytest-mock>=3.14.0",
49
+ ]
50
+
51
+ [project.scripts]
52
+ vidgrid = "vidgrid.cli:main"
53
+
54
+ [project.urls]
55
+ Homepage = "https://github.com/pawvej/vidgrid"
56
+ Issues = "https://github.com/pawvej/vidgrid/issues"
57
+
58
+ [tool.setuptools]
59
+ include-package-data = true
60
+
61
+ [tool.setuptools.packages.find]
62
+ include = ["vidgrid*"]
63
+
64
+ [tool.setuptools.package-data]
65
+ vidgrid = ["assets/fonts/*.ttf"]
66
+
67
+ [tool.pytest.ini_options]
68
+ testpaths = ["tests"]
69
+ addopts = "-q"
70
+ markers = ["integration: tests that require ffmpeg"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+