vidgrid 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vidgrid-0.1.0/LICENSE +21 -0
- vidgrid-0.1.0/PKG-INFO +315 -0
- vidgrid-0.1.0/README.md +270 -0
- vidgrid-0.1.0/pyproject.toml +70 -0
- vidgrid-0.1.0/setup.cfg +4 -0
- vidgrid-0.1.0/tests/test_captions.py +265 -0
- vidgrid-0.1.0/tests/test_compose.py +193 -0
- vidgrid-0.1.0/tests/test_llm.py +51 -0
- vidgrid-0.1.0/tests/test_presets.py +224 -0
- vidgrid-0.1.0/tests/test_sample.py +98 -0
- vidgrid-0.1.0/vidgrid/__init__.py +9 -0
- vidgrid-0.1.0/vidgrid/__main__.py +7 -0
- vidgrid-0.1.0/vidgrid/api.py +166 -0
- vidgrid-0.1.0/vidgrid/assets/fonts/SourceSans3-Semibold.ttf +0 -0
- vidgrid-0.1.0/vidgrid/captions.py +374 -0
- vidgrid-0.1.0/vidgrid/cli.py +231 -0
- vidgrid-0.1.0/vidgrid/compose.py +414 -0
- vidgrid-0.1.0/vidgrid/llm.py +227 -0
- vidgrid-0.1.0/vidgrid/models.py +130 -0
- vidgrid-0.1.0/vidgrid/output.py +108 -0
- vidgrid-0.1.0/vidgrid/presets.py +165 -0
- vidgrid-0.1.0/vidgrid/probe.py +118 -0
- vidgrid-0.1.0/vidgrid/sample.py +133 -0
- vidgrid-0.1.0/vidgrid.egg-info/PKG-INFO +315 -0
- vidgrid-0.1.0/vidgrid.egg-info/SOURCES.txt +27 -0
- vidgrid-0.1.0/vidgrid.egg-info/dependency_links.txt +1 -0
- vidgrid-0.1.0/vidgrid.egg-info/entry_points.txt +2 -0
- vidgrid-0.1.0/vidgrid.egg-info/requires.txt +28 -0
- vidgrid-0.1.0/vidgrid.egg-info/top_level.txt +1 -0
vidgrid-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pawel Kozlowski
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
vidgrid-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vidgrid
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert video clips into annotated image grids for vision LLM analysis
|
|
5
|
+
Author: Paw Vej
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/pawvej/vidgrid
|
|
8
|
+
Project-URL: Issues, https://github.com/pawvej/vidgrid/issues
|
|
9
|
+
Keywords: video,llm,vision,storyboard,claude,gpt,gemini,ffmpeg
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Multimedia :: Video
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Intended Audience :: Developers
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: Pillow>=10.0.0
|
|
24
|
+
Provides-Extra: transcribe
|
|
25
|
+
Requires-Dist: faster-whisper>=1.0.0; extra == "transcribe"
|
|
26
|
+
Provides-Extra: anthropic
|
|
27
|
+
Requires-Dist: anthropic>=0.39.0; extra == "anthropic"
|
|
28
|
+
Provides-Extra: openai
|
|
29
|
+
Requires-Dist: openai>=1.50.0; extra == "openai"
|
|
30
|
+
Provides-Extra: gemini
|
|
31
|
+
Requires-Dist: google-genai>=0.3.0; extra == "gemini"
|
|
32
|
+
Provides-Extra: llm
|
|
33
|
+
Requires-Dist: anthropic>=0.39.0; extra == "llm"
|
|
34
|
+
Requires-Dist: openai>=1.50.0; extra == "llm"
|
|
35
|
+
Requires-Dist: google-genai>=0.3.0; extra == "llm"
|
|
36
|
+
Provides-Extra: all
|
|
37
|
+
Requires-Dist: faster-whisper>=1.0.0; extra == "all"
|
|
38
|
+
Requires-Dist: anthropic>=0.39.0; extra == "all"
|
|
39
|
+
Requires-Dist: openai>=1.50.0; extra == "all"
|
|
40
|
+
Requires-Dist: google-genai>=0.3.0; extra == "all"
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
43
|
+
Requires-Dist: pytest-mock>=3.14.0; extra == "dev"
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# vidgrid
|
|
47
|
+
|
|
48
|
+
[](https://pypi.org/project/vidgrid/)
|
|
49
|
+
[](https://pypi.org/project/vidgrid/)
|
|
50
|
+
[](LICENSE)
|
|
51
|
+
[](https://pypi.org/project/vidgrid/)
|
|
52
|
+
[](https://vidgrid.site)
|
|
53
|
+
|
|
54
|
+
> Convert video clips into annotated image grids for vision LLM analysis.
|
|
55
|
+
> **One cell = one second, by default.**
|
|
56
|
+
|
|
57
|
+

|
|
58
|
+
|
|
59
|
+
LLMs can't watch video, but they can analyze a single image. `vidgrid` samples
|
|
60
|
+
one frame per second from a video, tiles them into a numbered storyboard with
|
|
61
|
+
timestamps, and optionally sends the result to Claude, GPT, or Gemini with a
|
|
62
|
+
prompt. The result is something close to "my LLM just watched a video" for
|
|
63
|
+
the cost of a handful of image uploads.
|
|
64
|
+
|
|
65
|
+
**Don't want to install?** Use the hosted version at
|
|
66
|
+
[vidgrid.site](https://vidgrid.site) — drop a file, get the grid in the
|
|
67
|
+
browser. 3 free renders, $5 lifetime after that. Free forever on the CLI.
|
|
68
|
+
|
|
69
|
+
## The model
|
|
70
|
+
|
|
71
|
+
**One cell = one second, by default.** The auto-picker chooses the smallest
|
|
72
|
+
grid (biggest, most-legible cells) whose board count stays under
|
|
73
|
+
`--max-boards` (default 10). When that's not enough for a long clip, it
|
|
74
|
+
bumps the grid up; as a last resort, it reduces the sampling rate. Override
|
|
75
|
+
with `--fps` and `--max-boards` for full control.
|
|
76
|
+
|
|
77
|
+
- Grid size determines how many seconds fit in one photo
|
|
78
|
+
- Default sampling is 1fps; drops below 1fps only when needed to stay under the max-boards cap
|
|
79
|
+
- Videos over 5 minutes are rejected (chop them up first)
|
|
80
|
+
|
|
81
|
+
| Grid | Cells | Seconds per photo | Best for |
|
|
82
|
+
|---|---|---|---|
|
|
83
|
+
| `2x2` | 4 | 4 | Very short clips (2–4s) |
|
|
84
|
+
| `3x3` | **9** | **9** | **Default — best overall readability** |
|
|
85
|
+
| `4x4` | 16 | 16 | More compact, cells get smaller |
|
|
86
|
+
| `5x5` | 25 | 25 | Experimental — cells small, LLM accuracy drops |
|
|
87
|
+
|
|
88
|
+
**Quality degrades with bigger grids.** Cells shrink, detail is lost, and the
|
|
89
|
+
LLM has a harder time reading fine content like text or UI elements. Stick
|
|
90
|
+
with 3×3 unless you specifically need to pack more seconds into one photo.
|
|
91
|
+
5×5 exists mostly as a "let me see what happens" option.
|
|
92
|
+
|
|
93
|
+
## How many photos a video produces
|
|
94
|
+
|
|
95
|
+
At 1fps sampling, the board count at each grid size:
|
|
96
|
+
|
|
97
|
+
| Video length | 2×2 | 3×3 | 4×4 | 5×5 |
|
|
98
|
+
|---|---|---|---|---|
|
|
99
|
+
| 3s | **1** (partial) | 1 (partial) | 1 (partial) | 1 (partial) |
|
|
100
|
+
| 9s | **3** | 1 | 1 (partial) | 1 (partial) |
|
|
101
|
+
| 25s | **7** | 3 | 2 | 1 |
|
|
102
|
+
| 60s | 15 | **7** | 4 | 3 |
|
|
103
|
+
| 186s (3 min) | 47 | 21 | **12** | 8 |
|
|
104
|
+
| 300s (5 min, cap) | 75 | 34 | **19**¹ | 12 |
|
|
105
|
+
|
|
106
|
+
**Bold** = what `auto` picks — the smallest grid (biggest cells) that
|
|
107
|
+
keeps the board count under `--max-boards` (default 10).
|
|
108
|
+
|
|
109
|
+
¹ At the 5-min cap, even 4×4 exceeds 10 boards at 1fps, so auto drops
|
|
110
|
+
the sampling rate (≈1 cell per 1.9s) to land at the 10-board limit. Use
|
|
111
|
+
`--fps 1.0 --max-boards 20` to preserve 1fps and accept more boards.
|
|
112
|
+
|
|
113
|
+
Most vision LLMs accept ~10–20 images per request, so auto's default
|
|
114
|
+
ceiling of 10 keeps a full video inside a single model call.
|
|
115
|
+
|
|
116
|
+
## Install
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
pip install vidgrid # core renderer only
|
|
120
|
+
pip install vidgrid[transcribe] # + faster-whisper for --transcribe
|
|
121
|
+
pip install vidgrid[anthropic] # + Claude support via --ask
|
|
122
|
+
pip install vidgrid[llm] # + Claude + GPT + Gemini
|
|
123
|
+
pip install vidgrid[all] # everything
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Requires Python 3.9+ and `ffmpeg` on your `PATH`.
|
|
127
|
+
|
|
128
|
+
## Quick start
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
# 1. Auto-pick grid and sampling rate — smallest grid that fits in 10 boards
|
|
132
|
+
vidgrid clip.mp4 -o grid.png
|
|
133
|
+
|
|
134
|
+
# 2. Force a specific grid
|
|
135
|
+
vidgrid clip.mp4 -o grid.png --grid 4x4
|
|
136
|
+
|
|
137
|
+
# 3. Force a sampling rate — 0.5fps = 1 cell every 2 seconds
|
|
138
|
+
vidgrid long-clip.mp4 -o grid.png --fps 0.5
|
|
139
|
+
|
|
140
|
+
# 4. Raise the max-boards ceiling (default 10) if you want more boards
|
|
141
|
+
vidgrid lecture.mp4 -o grid.png --max-boards 20
|
|
142
|
+
|
|
143
|
+
# 5. Render + auto-transcribe + send to Claude in one call
|
|
144
|
+
vidgrid lecture.mp4 --transcribe --ask "bullet-point summary"
|
|
145
|
+
|
|
146
|
+
# 6. Use existing Whisper captions, burn them onto the grid
|
|
147
|
+
vidgrid interview.mp4 -o grid.png --captions whisper.json --burn-captions
|
|
148
|
+
|
|
149
|
+
# 7. Run as a module with python -m if the vidgrid console script isn't on PATH
|
|
150
|
+
python3 -m vidgrid clip.mp4 -o grid.png
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Three things you can do with it
|
|
154
|
+
|
|
155
|
+
### 1. Summarize a talk without watching it
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
vidgrid "team-meeting.mp4" \
|
|
159
|
+
--transcribe \
|
|
160
|
+
--ask "list the decisions made and who owns each" \
|
|
161
|
+
--model claude-opus-4-7
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
vidgrid samples one frame per second, runs Whisper on the audio, sends the
|
|
165
|
+
grid + transcript to Claude, and prints the answer. The model correlates
|
|
166
|
+
frames and words via the burned-in timestamps.
|
|
167
|
+
|
|
168
|
+
### 2. Find a specific moment in a screen recording
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
vidgrid bug-repro.mp4 --grid 3x3 \
|
|
172
|
+
--ask "at which numbered frame does the error dialog appear?" \
|
|
173
|
+
--model gpt-5
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
Because cells are globally numbered (1, 2, 3...) and tagged with timestamps,
|
|
177
|
+
the model can point you at the exact moment. No scrubbing.
|
|
178
|
+
|
|
179
|
+
### 3. Rank a pile of stock footage
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
for clip in broll/*.mp4; do
|
|
183
|
+
vidgrid "$clip" -o "grids/$(basename "$clip" .mp4).png"
|
|
184
|
+
done
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Send the PNGs to Claude in a single request and ask it to rank or reject
|
|
188
|
+
clips against your shot list. This is the workflow vidgrid was built for.
|
|
189
|
+
|
|
190
|
+
## Portrait vs landscape
|
|
191
|
+
|
|
192
|
+
vidgrid keeps the grid shape square (N×N) regardless of source orientation
|
|
193
|
+
and preserves the source aspect inside each cell. Landscape sources produce
|
|
194
|
+
wide boards; portrait sources produce tall boards. Cells are never cropped.
|
|
195
|
+
|
|
196
|
+
## Two-layer captions (default)
|
|
197
|
+
|
|
198
|
+
The default mode gives the LLM **two correlated inputs**: the rendered grid
|
|
199
|
+
image AND the Whisper transcript as separate text. The model correlates them
|
|
200
|
+
via the timestamps printed on each cell.
|
|
201
|
+
|
|
202
|
+
This beats burning captions into the image because:
|
|
203
|
+
|
|
204
|
+
1. Frames keep their pixels for actual content
|
|
205
|
+
2. Text is higher fidelity as tokens than as baked-in pixels
|
|
206
|
+
3. The grid stays clean and shareable
|
|
207
|
+
|
|
208
|
+
Add `--burn-captions` if you want a self-contained image (useful for sharing
|
|
209
|
+
or offline analysis).
|
|
210
|
+
|
|
211
|
+
## Caption file formats
|
|
212
|
+
|
|
213
|
+
vidgrid reads and writes three caption formats. The `--captions` flag
|
|
214
|
+
auto-detects from the file extension. The `--transcript-format` flag
|
|
215
|
+
controls what `--transcribe` writes.
|
|
216
|
+
|
|
217
|
+
| Format | Extension | Size (36 words) | When to use |
|
|
218
|
+
|---|---|---|---|
|
|
219
|
+
| `json` | `.json` | ~4.8 KB | Remotion pipelines, tools that need word confidence |
|
|
220
|
+
| `srt` | `.srt` | ~1.4 KB | Video editors, universal subtitle format |
|
|
221
|
+
| `txt` | `.txt` | ~0.4 KB | Smallest, grep-friendly, trivial to parse |
|
|
222
|
+
|
|
223
|
+
**JSON** (default, Remotion-compatible):
|
|
224
|
+
```json
|
|
225
|
+
[
|
|
226
|
+
{"text": "hello", "startMs": 0, "endMs": 500, "timestampMs": 0, "confidence": 0.98},
|
|
227
|
+
...
|
|
228
|
+
]
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
**SRT** (SubRip subtitles):
|
|
232
|
+
```
|
|
233
|
+
1
|
|
234
|
+
00:00:00,000 --> 00:00:00,500
|
|
235
|
+
hello
|
|
236
|
+
|
|
237
|
+
2
|
|
238
|
+
00:00:00,500 --> 00:00:01,000
|
|
239
|
+
world
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
**TXT** (plain timestamped text, one word per line):
|
|
243
|
+
```
|
|
244
|
+
0.00 hello
|
|
245
|
+
0.50 world
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
Use any format as input, output, or both. You can mix — read an `.srt` and
|
|
249
|
+
write a `.txt` with `--captions foo.srt --transcript-format txt`.
|
|
250
|
+
|
|
251
|
+
## Python API
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
from vidgrid import render
|
|
255
|
+
|
|
256
|
+
storyboard = render(
|
|
257
|
+
input_path="interview.mp4",
|
|
258
|
+
output_path="grid.png",
|
|
259
|
+
grid="3x3", # or "2x2", "4x4", "5x5", or None for auto
|
|
260
|
+
transcribe=True,
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
print(storyboard.board_paths) # ['grid-1.png', 'grid-2.png', ...]
|
|
264
|
+
print(storyboard.transcript_path) # 'grid-transcript.json'
|
|
265
|
+
print(storyboard.all_samples) # list[Sample] with timestamps
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
Modules: `vidgrid.probe`, `vidgrid.sample`, `vidgrid.compose`,
|
|
269
|
+
`vidgrid.captions`, `vidgrid.llm`, `vidgrid.presets`.
|
|
270
|
+
|
|
271
|
+
## Output structure
|
|
272
|
+
|
|
273
|
+
**Single-board run:**
|
|
274
|
+
```
|
|
275
|
+
grid.png # the storyboard
|
|
276
|
+
grid.json # sidecar: timestamps, layout, source info
|
|
277
|
+
grid-transcript.json # only if --transcribe or --captions was used
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
**Multi-board run:**
|
|
281
|
+
```
|
|
282
|
+
grid-1.png, grid-2.png, grid-3.png, ...
|
|
283
|
+
grid.json # index covering all boards + global cell numbering
|
|
284
|
+
grid-transcript.json
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
Cells are numbered **globally** across boards. A 3-board run at 3×3 has cells 1–27
|
|
288
|
+
so the LLM can reference any frame without ambiguity.
|
|
289
|
+
|
|
290
|
+
## Limits and caveats
|
|
291
|
+
|
|
292
|
+
- **5-minute hard cap on video length.** Longer videos are rejected. Chop
|
|
293
|
+
them up with `ffmpeg -ss START -t 300 input.mp4 chunk.mp4`.
|
|
294
|
+
- **No scene detection.** v1 samples uniformly at a fixed rate (1 frame per second by default).
|
|
295
|
+
No dedupe, no shifting — samples are always evenly spaced (exactly 1 second apart at the default 1fps).
|
|
296
|
+
- **Variable-framerate videos** may have sub-frame seek drift (≤1 frame),
|
|
297
|
+
which is acceptable at 1fps sampling.
|
|
298
|
+
- **Bigger grids hurt legibility.** A 5×5 grid has cells ~300px wide; fine
|
|
299
|
+
for people and objects, marginal for dense text or code. Stick with 3×3.
|
|
300
|
+
- **LLM integration** uses the official SDKs (anthropic, openai, google-genai)
|
|
301
|
+
and won't be installed unless you request them as extras.
|
|
302
|
+
|
|
303
|
+
## Prior art
|
|
304
|
+
|
|
305
|
+
- [IG-VLM](https://arxiv.org/abs/2403.18406) — research paper proving the grid trick works
|
|
306
|
+
- [llm-video-frames](https://github.com/simonw/llm-video-frames) — Simon Willison's per-frame approach
|
|
307
|
+
- [vcsi](https://github.com/amietn/vcsi) — contact sheets without LLMs
|
|
308
|
+
- [byjlw/video-analyzer](https://github.com/byjlw/video-analyzer) — whisper + sequential frames
|
|
309
|
+
|
|
310
|
+
vidgrid's differentiator: **1 cell = 1 second, numbered cells, simple CLI,
|
|
311
|
+
multi-provider LLM integration in one package**.
|
|
312
|
+
|
|
313
|
+
## License
|
|
314
|
+
|
|
315
|
+
MIT. The bundled Source Sans 3 font is licensed under [SIL OFL 1.1](vidgrid/assets/fonts/LICENSE-SourceSans3.md).
|
vidgrid-0.1.0/README.md
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
# vidgrid
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/vidgrid/)
|
|
4
|
+
[](https://pypi.org/project/vidgrid/)
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
[](https://pypi.org/project/vidgrid/)
|
|
7
|
+
[](https://vidgrid.site)
|
|
8
|
+
|
|
9
|
+
> Convert video clips into annotated image grids for vision LLM analysis.
|
|
10
|
+
> **One cell = one second, by default.**
|
|
11
|
+
|
|
12
|
+

|
|
13
|
+
|
|
14
|
+
LLMs can't watch video, but they can analyze a single image. `vidgrid` samples
|
|
15
|
+
one frame per second from a video, tiles them into a numbered storyboard with
|
|
16
|
+
timestamps, and optionally sends the result to Claude, GPT, or Gemini with a
|
|
17
|
+
prompt. The result is something close to "my LLM just watched a video" for
|
|
18
|
+
the cost of a handful of image uploads.
|
|
19
|
+
|
|
20
|
+
**Don't want to install?** Use the hosted version at
|
|
21
|
+
[vidgrid.site](https://vidgrid.site) — drop a file, get the grid in the
|
|
22
|
+
browser. 3 free renders, $5 lifetime after that. Free forever on the CLI.
|
|
23
|
+
|
|
24
|
+
## The model
|
|
25
|
+
|
|
26
|
+
**One cell = one second, by default.** The auto-picker chooses the smallest
|
|
27
|
+
grid (biggest, most-legible cells) whose board count stays under
|
|
28
|
+
`--max-boards` (default 10). When that's not enough for a long clip, it
|
|
29
|
+
bumps the grid up; as a last resort, it reduces the sampling rate. Override
|
|
30
|
+
with `--fps` and `--max-boards` for full control.
|
|
31
|
+
|
|
32
|
+
- Grid size determines how many seconds fit in one photo
|
|
33
|
+
- Default sampling is 1fps; drops below 1fps only when needed to stay under the max-boards cap
|
|
34
|
+
- Videos over 5 minutes are rejected (chop them up first)
|
|
35
|
+
|
|
36
|
+
| Grid | Cells | Seconds per photo | Best for |
|
|
37
|
+
|---|---|---|---|
|
|
38
|
+
| `2x2` | 4 | 4 | Very short clips (2–4s) |
|
|
39
|
+
| `3x3` | **9** | **9** | **Default — best overall readability** |
|
|
40
|
+
| `4x4` | 16 | 16 | More compact, cells get smaller |
|
|
41
|
+
| `5x5` | 25 | 25 | Experimental — cells small, LLM accuracy drops |
|
|
42
|
+
|
|
43
|
+
**Quality degrades with bigger grids.** Cells shrink, detail is lost, and the
|
|
44
|
+
LLM has a harder time reading fine content like text or UI elements. Stick
|
|
45
|
+
with 3×3 unless you specifically need to pack more seconds into one photo.
|
|
46
|
+
5×5 exists mostly as a "let me see what happens" option.
|
|
47
|
+
|
|
48
|
+
## How many photos a video produces
|
|
49
|
+
|
|
50
|
+
At 1fps sampling, the board count at each grid size:
|
|
51
|
+
|
|
52
|
+
| Video length | 2×2 | 3×3 | 4×4 | 5×5 |
|
|
53
|
+
|---|---|---|---|---|
|
|
54
|
+
| 3s | **1** (partial) | 1 (partial) | 1 (partial) | 1 (partial) |
|
|
55
|
+
| 9s | **3** | 1 | 1 (partial) | 1 (partial) |
|
|
56
|
+
| 25s | **7** | 3 | 2 | 1 |
|
|
57
|
+
| 60s | 15 | **7** | 4 | 3 |
|
|
58
|
+
| 186s (3 min) | 47 | 21 | **12** | 8 |
|
|
59
|
+
| 300s (5 min, cap) | 75 | 34 | **19**¹ | 12 |
|
|
60
|
+
|
|
61
|
+
**Bold** = what `auto` picks — the smallest grid (biggest cells) that
|
|
62
|
+
keeps the board count under `--max-boards` (default 10).
|
|
63
|
+
|
|
64
|
+
¹ At the 5-min cap, even 4×4 exceeds 10 boards at 1fps, so auto drops
|
|
65
|
+
the sampling rate (≈1 cell per 1.9s) to land at the 10-board limit. Use
|
|
66
|
+
`--fps 1.0 --max-boards 20` to preserve 1fps and accept more boards.
|
|
67
|
+
|
|
68
|
+
Most vision LLMs accept ~10–20 images per request, so auto's default
|
|
69
|
+
ceiling of 10 keeps a full video inside a single model call.
|
|
70
|
+
|
|
71
|
+
## Install
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install vidgrid # core renderer only
|
|
75
|
+
pip install vidgrid[transcribe] # + faster-whisper for --transcribe
|
|
76
|
+
pip install vidgrid[anthropic] # + Claude support via --ask
|
|
77
|
+
pip install vidgrid[llm] # + Claude + GPT + Gemini
|
|
78
|
+
pip install vidgrid[all] # everything
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Requires Python 3.9+ and `ffmpeg` on your `PATH`.
|
|
82
|
+
|
|
83
|
+
## Quick start
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# 1. Auto-pick grid and sampling rate — smallest grid that fits in 10 boards
|
|
87
|
+
vidgrid clip.mp4 -o grid.png
|
|
88
|
+
|
|
89
|
+
# 2. Force a specific grid
|
|
90
|
+
vidgrid clip.mp4 -o grid.png --grid 4x4
|
|
91
|
+
|
|
92
|
+
# 3. Force a sampling rate — 0.5fps = 1 cell every 2 seconds
|
|
93
|
+
vidgrid long-clip.mp4 -o grid.png --fps 0.5
|
|
94
|
+
|
|
95
|
+
# 4. Raise the max-boards ceiling (default 10) if you want more boards
|
|
96
|
+
vidgrid lecture.mp4 -o grid.png --max-boards 20
|
|
97
|
+
|
|
98
|
+
# 5. Render + auto-transcribe + send to Claude in one call
|
|
99
|
+
vidgrid lecture.mp4 --transcribe --ask "bullet-point summary"
|
|
100
|
+
|
|
101
|
+
# 6. Use existing Whisper captions, burn them onto the grid
|
|
102
|
+
vidgrid interview.mp4 -o grid.png --captions whisper.json --burn-captions
|
|
103
|
+
|
|
104
|
+
# 7. Run as a module with python -m if the vidgrid console script isn't on PATH
|
|
105
|
+
python3 -m vidgrid clip.mp4 -o grid.png
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Three things you can do with it
|
|
109
|
+
|
|
110
|
+
### 1. Summarize a talk without watching it
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
vidgrid "team-meeting.mp4" \
|
|
114
|
+
--transcribe \
|
|
115
|
+
--ask "list the decisions made and who owns each" \
|
|
116
|
+
--model claude-opus-4-7
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
vidgrid samples one frame per second, runs Whisper on the audio, sends the
|
|
120
|
+
grid + transcript to Claude, and prints the answer. The model correlates
|
|
121
|
+
frames and words via the burned-in timestamps.
|
|
122
|
+
|
|
123
|
+
### 2. Find a specific moment in a screen recording
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
vidgrid bug-repro.mp4 --grid 3x3 \
|
|
127
|
+
--ask "at which numbered frame does the error dialog appear?" \
|
|
128
|
+
--model gpt-5
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Because cells are globally numbered (1, 2, 3...) and tagged with timestamps,
|
|
132
|
+
the model can point you at the exact moment. No scrubbing.
|
|
133
|
+
|
|
134
|
+
### 3. Rank a pile of stock footage
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
for clip in broll/*.mp4; do
|
|
138
|
+
vidgrid "$clip" -o "grids/$(basename "$clip" .mp4).png"
|
|
139
|
+
done
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Send the PNGs to Claude in a single request and ask it to rank or reject
|
|
143
|
+
clips against your shot list. This is the workflow vidgrid was built for.
|
|
144
|
+
|
|
145
|
+
## Portrait vs landscape
|
|
146
|
+
|
|
147
|
+
vidgrid keeps the grid shape square (N×N) regardless of source orientation
|
|
148
|
+
and preserves the source aspect inside each cell. Landscape sources produce
|
|
149
|
+
wide boards; portrait sources produce tall boards. Cells are never cropped.
|
|
150
|
+
|
|
151
|
+
## Two-layer captions (default)
|
|
152
|
+
|
|
153
|
+
The default mode gives the LLM **two correlated inputs**: the rendered grid
|
|
154
|
+
image AND the Whisper transcript as separate text. The model correlates them
|
|
155
|
+
via the timestamps printed on each cell.
|
|
156
|
+
|
|
157
|
+
This beats burning captions into the image because:
|
|
158
|
+
|
|
159
|
+
1. Frames keep their pixels for actual content
|
|
160
|
+
2. Text is higher fidelity as tokens than as baked-in pixels
|
|
161
|
+
3. The grid stays clean and shareable
|
|
162
|
+
|
|
163
|
+
Add `--burn-captions` if you want a self-contained image (useful for sharing
|
|
164
|
+
or offline analysis).
|
|
165
|
+
|
|
166
|
+
## Caption file formats
|
|
167
|
+
|
|
168
|
+
vidgrid reads and writes three caption formats. The `--captions` flag
|
|
169
|
+
auto-detects from the file extension. The `--transcript-format` flag
|
|
170
|
+
controls what `--transcribe` writes.
|
|
171
|
+
|
|
172
|
+
| Format | Extension | Size (36 words) | When to use |
|
|
173
|
+
|---|---|---|---|
|
|
174
|
+
| `json` | `.json` | ~4.8 KB | Remotion pipelines, tools that need word confidence |
|
|
175
|
+
| `srt` | `.srt` | ~1.4 KB | Video editors, universal subtitle format |
|
|
176
|
+
| `txt` | `.txt` | ~0.4 KB | Smallest, grep-friendly, trivial to parse |
|
|
177
|
+
|
|
178
|
+
**JSON** (default, Remotion-compatible):
|
|
179
|
+
```json
|
|
180
|
+
[
|
|
181
|
+
{"text": "hello", "startMs": 0, "endMs": 500, "timestampMs": 0, "confidence": 0.98},
|
|
182
|
+
...
|
|
183
|
+
]
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**SRT** (SubRip subtitles):
|
|
187
|
+
```
|
|
188
|
+
1
|
|
189
|
+
00:00:00,000 --> 00:00:00,500
|
|
190
|
+
hello
|
|
191
|
+
|
|
192
|
+
2
|
|
193
|
+
00:00:00,500 --> 00:00:01,000
|
|
194
|
+
world
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
**TXT** (plain timestamped text, one word per line):
|
|
198
|
+
```
|
|
199
|
+
0.00 hello
|
|
200
|
+
0.50 world
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Use any format as input, output, or both. You can mix — read an `.srt` and
|
|
204
|
+
write a `.txt` with `--captions foo.srt --transcript-format txt`.
|
|
205
|
+
|
|
206
|
+
## Python API
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from vidgrid import render
|
|
210
|
+
|
|
211
|
+
storyboard = render(
|
|
212
|
+
input_path="interview.mp4",
|
|
213
|
+
output_path="grid.png",
|
|
214
|
+
grid="3x3", # or "2x2", "4x4", "5x5", or None for auto
|
|
215
|
+
transcribe=True,
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
print(storyboard.board_paths) # ['grid-1.png', 'grid-2.png', ...]
|
|
219
|
+
print(storyboard.transcript_path) # 'grid-transcript.json'
|
|
220
|
+
print(storyboard.all_samples) # list[Sample] with timestamps
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
Modules: `vidgrid.probe`, `vidgrid.sample`, `vidgrid.compose`,
|
|
224
|
+
`vidgrid.captions`, `vidgrid.llm`, `vidgrid.presets`.
|
|
225
|
+
|
|
226
|
+
## Output structure
|
|
227
|
+
|
|
228
|
+
**Single-board run:**
|
|
229
|
+
```
|
|
230
|
+
grid.png # the storyboard
|
|
231
|
+
grid.json # sidecar: timestamps, layout, source info
|
|
232
|
+
grid-transcript.json # only if --transcribe or --captions was used
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
**Multi-board run:**
|
|
236
|
+
```
|
|
237
|
+
grid-1.png, grid-2.png, grid-3.png, ...
|
|
238
|
+
grid.json # index covering all boards + global cell numbering
|
|
239
|
+
grid-transcript.json
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
Cells are numbered **globally** across boards. A 3-board run at 3×3 has cells 1–27
|
|
243
|
+
so the LLM can reference any frame without ambiguity.
|
|
244
|
+
|
|
245
|
+
## Limits and caveats
|
|
246
|
+
|
|
247
|
+
- **5-minute hard cap on video length.** Longer videos are rejected. Chop
|
|
248
|
+
them up with `ffmpeg -ss START -t 300 input.mp4 chunk.mp4`.
|
|
249
|
+
- **No scene detection.** v1 samples uniformly at a fixed rate (1 frame per second by default).
|
|
250
|
+
No dedupe, no shifting — samples are always evenly spaced (exactly 1 second apart at the default 1fps).
|
|
251
|
+
- **Variable-framerate videos** may have sub-frame seek drift (≤1 frame),
|
|
252
|
+
which is acceptable at 1fps sampling.
|
|
253
|
+
- **Bigger grids hurt legibility.** A 5×5 grid has cells ~300px wide; fine
|
|
254
|
+
for people and objects, marginal for dense text or code. Stick with 3×3.
|
|
255
|
+
- **LLM integration** uses the official SDKs (anthropic, openai, google-genai)
|
|
256
|
+
and won't be installed unless you request them as extras.
|
|
257
|
+
|
|
258
|
+
## Prior art
|
|
259
|
+
|
|
260
|
+
- [IG-VLM](https://arxiv.org/abs/2403.18406) — research paper proving the grid trick works
|
|
261
|
+
- [llm-video-frames](https://github.com/simonw/llm-video-frames) — Simon Willison's per-frame approach
|
|
262
|
+
- [vcsi](https://github.com/amietn/vcsi) — contact sheets without LLMs
|
|
263
|
+
- [byjlw/video-analyzer](https://github.com/byjlw/video-analyzer) — whisper + sequential frames
|
|
264
|
+
|
|
265
|
+
vidgrid's differentiator: **1 cell = 1 second, numbered cells, simple CLI,
|
|
266
|
+
multi-provider LLM integration in one package**.
|
|
267
|
+
|
|
268
|
+
## License
|
|
269
|
+
|
|
270
|
+
MIT. The bundled Source Sans 3 font is licensed under [SIL OFL 1.1](vidgrid/assets/fonts/LICENSE-SourceSans3.md).
|
vidgrid-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "vidgrid"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Convert video clips into annotated image grids for vision LLM analysis"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Paw Vej" }]
|
|
13
|
+
keywords = ["video", "llm", "vision", "storyboard", "claude", "gpt", "gemini", "ffmpeg"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3.9",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Topic :: Multimedia :: Video",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
"Intended Audience :: Developers",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"Pillow>=10.0.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
transcribe = ["faster-whisper>=1.0.0"]
|
|
32
|
+
anthropic = ["anthropic>=0.39.0"]
|
|
33
|
+
openai = ["openai>=1.50.0"]
|
|
34
|
+
gemini = ["google-genai>=0.3.0"]
|
|
35
|
+
llm = [
|
|
36
|
+
"anthropic>=0.39.0",
|
|
37
|
+
"openai>=1.50.0",
|
|
38
|
+
"google-genai>=0.3.0",
|
|
39
|
+
]
|
|
40
|
+
all = [
|
|
41
|
+
"faster-whisper>=1.0.0",
|
|
42
|
+
"anthropic>=0.39.0",
|
|
43
|
+
"openai>=1.50.0",
|
|
44
|
+
"google-genai>=0.3.0",
|
|
45
|
+
]
|
|
46
|
+
dev = [
|
|
47
|
+
"pytest>=8.0.0",
|
|
48
|
+
"pytest-mock>=3.14.0",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[project.scripts]
|
|
52
|
+
vidgrid = "vidgrid.cli:main"
|
|
53
|
+
|
|
54
|
+
[project.urls]
|
|
55
|
+
Homepage = "https://github.com/pawvej/vidgrid"
|
|
56
|
+
Issues = "https://github.com/pawvej/vidgrid/issues"
|
|
57
|
+
|
|
58
|
+
[tool.setuptools]
|
|
59
|
+
include-package-data = true
|
|
60
|
+
|
|
61
|
+
[tool.setuptools.packages.find]
|
|
62
|
+
include = ["vidgrid*"]
|
|
63
|
+
|
|
64
|
+
[tool.setuptools.package-data]
|
|
65
|
+
vidgrid = ["assets/fonts/*.ttf"]
|
|
66
|
+
|
|
67
|
+
[tool.pytest.ini_options]
|
|
68
|
+
testpaths = ["tests"]
|
|
69
|
+
addopts = "-q"
|
|
70
|
+
markers = ["integration: tests that require ffmpeg"]
|
vidgrid-0.1.0/setup.cfg
ADDED