vidwise 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vidwise-0.1.0/.claude-plugin/marketplace.json +16 -0
- vidwise-0.1.0/.github/workflows/ci.yml +31 -0
- vidwise-0.1.0/.github/workflows/publish.yml +20 -0
- vidwise-0.1.0/.gitignore +23 -0
- vidwise-0.1.0/CHANGELOG.md +19 -0
- vidwise-0.1.0/LICENSE +21 -0
- vidwise-0.1.0/PKG-INFO +240 -0
- vidwise-0.1.0/README.md +205 -0
- vidwise-0.1.0/assets/banner.png +0 -0
- vidwise-0.1.0/assets/banner.svg +63 -0
- vidwise-0.1.0/assets/logo.svg +34 -0
- vidwise-0.1.0/plugin/.claude-plugin/plugin.json +12 -0
- vidwise-0.1.0/plugin/agents/frame-analyzer.md +43 -0
- vidwise-0.1.0/plugin/commands/vidwise.md +100 -0
- vidwise-0.1.0/plugin/skills/video-analysis/SKILL.md +78 -0
- vidwise-0.1.0/pyproject.toml +62 -0
- vidwise-0.1.0/src/vidwise/__init__.py +6 -0
- vidwise-0.1.0/src/vidwise/__main__.py +5 -0
- vidwise-0.1.0/src/vidwise/cli.py +144 -0
- vidwise-0.1.0/src/vidwise/downloader.py +76 -0
- vidwise-0.1.0/src/vidwise/extractor.py +88 -0
- vidwise-0.1.0/src/vidwise/frames.py +80 -0
- vidwise-0.1.0/src/vidwise/guide.py +140 -0
- vidwise-0.1.0/src/vidwise/providers/__init__.py +1 -0
- vidwise-0.1.0/src/vidwise/providers/base.py +46 -0
- vidwise-0.1.0/src/vidwise/providers/claude.py +122 -0
- vidwise-0.1.0/src/vidwise/providers/openai.py +124 -0
- vidwise-0.1.0/src/vidwise/transcriber.py +124 -0
- vidwise-0.1.0/src/vidwise/utils.py +67 -0
- vidwise-0.1.0/tests/__init__.py +0 -0
- vidwise-0.1.0/tests/test_utils.py +31 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "vidwise",
|
|
3
|
+
"description": "LLMs can't watch videos. vidwise gives them eyes. Extract transcripts, frames, and visual guides from any video.",
|
|
4
|
+
"owner": {
|
|
5
|
+
"name": "Juan Pablo Djeredjian",
|
|
6
|
+
"email": "jpdjeredjian@gmail.com"
|
|
7
|
+
},
|
|
8
|
+
"plugins": [
|
|
9
|
+
{
|
|
10
|
+
"name": "vidwise",
|
|
11
|
+
"source": "./plugin",
|
|
12
|
+
"description": "Make any video AI-readable. Extract transcripts, frames, and visual guides from videos using Claude Code's native multimodal AI — no API key needed.",
|
|
13
|
+
"version": "0.1.0"
|
|
14
|
+
}
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- run: pip install ruff
|
|
18
|
+
- run: ruff check src/
|
|
19
|
+
|
|
20
|
+
test:
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
strategy:
|
|
23
|
+
matrix:
|
|
24
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v4
|
|
27
|
+
- uses: actions/setup-python@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: ${{ matrix.python-version }}
|
|
30
|
+
- run: pip install -e ".[dev]"
|
|
31
|
+
- run: pytest
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
publish:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.12"
|
|
18
|
+
- run: pip install build
|
|
19
|
+
- run: python -m build
|
|
20
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
vidwise-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.egg-info/
|
|
5
|
+
dist/
|
|
6
|
+
build/
|
|
7
|
+
.eggs/
|
|
8
|
+
*.egg
|
|
9
|
+
.venv/
|
|
10
|
+
venv/
|
|
11
|
+
env/
|
|
12
|
+
.mypy_cache/
|
|
13
|
+
.ruff_cache/
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
*.wav
|
|
16
|
+
*.mp4
|
|
17
|
+
*.webm
|
|
18
|
+
*.mkv
|
|
19
|
+
*.avi
|
|
20
|
+
*.mov
|
|
21
|
+
output-*/
|
|
22
|
+
vidwise-output-*/
|
|
23
|
+
TODO.md
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-02-26
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- Core CLI with `vidwise <source>` command
|
|
13
|
+
- Whisper-powered transcription (`.txt`, `.srt`, `.json` outputs)
|
|
14
|
+
- Frame extraction every N seconds with timestamp-based naming
|
|
15
|
+
- Smart key frame selection via pixel-difference analysis
|
|
16
|
+
- URL support via yt-dlp (YouTube, Loom, 1000+ sites)
|
|
17
|
+
- AI-powered visual guide generation (Claude and OpenAI providers)
|
|
18
|
+
- Claude Code plugin with `/vidwise` slash command
|
|
19
|
+
- Parallel frame analysis via Claude Code subagents (no API key needed)
|
vidwise-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Juan Pablo Djeredjian
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
vidwise-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vidwise
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLMs can't watch videos. vidwise gives them eyes.
|
|
5
|
+
Project-URL: Homepage, https://github.com/jpdjere/vidwise
|
|
6
|
+
Project-URL: Repository, https://github.com/jpdjere/vidwise
|
|
7
|
+
Project-URL: Issues, https://github.com/jpdjere/vidwise/issues
|
|
8
|
+
Author: Juan Pablo Djeredjian
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai,frames,knowledge,llm,transcript,video,whisper
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Multimedia :: Video
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: anthropic>=0.40
|
|
24
|
+
Requires-Dist: click>=8.0
|
|
25
|
+
Requires-Dist: openai-whisper>=20231117
|
|
26
|
+
Requires-Dist: openai>=1.0
|
|
27
|
+
Requires-Dist: yt-dlp>=2023.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
32
|
+
Provides-Extra: fast
|
|
33
|
+
Requires-Dist: faster-whisper>=1.0; extra == 'fast'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
<p align="center">
|
|
37
|
+
<img src="https://raw.githubusercontent.com/jpdjere/vidwise/main/assets/banner.png" alt="vidwise — LLMs can't watch videos. vidwise gives them eyes." width="700">
|
|
38
|
+
</p>
|
|
39
|
+
|
|
40
|
+
<p align="center">
|
|
41
|
+
<a href="https://pypi.org/project/vidwise/"><img src="https://img.shields.io/pypi/v/vidwise?color=blue" alt="PyPI"></a>
|
|
42
|
+
<a href="https://pypi.org/project/vidwise/"><img src="https://img.shields.io/pypi/pyversions/vidwise" alt="Python"></a>
|
|
43
|
+
<a href="https://github.com/jpdjere/vidwise/blob/main/LICENSE"><img src="https://img.shields.io/github/license/jpdjere/vidwise" alt="License"></a>
|
|
44
|
+
<a href="https://github.com/jpdjere/vidwise/actions/workflows/ci.yml"><img src="https://github.com/jpdjere/vidwise/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
|
|
45
|
+
</p>
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
Videos are the biggest blind spot for AI. A 5-minute Loom bug report, a 30-minute tutorial, a conference talk — all completely opaque to your LLM. You either watch the whole thing yourself or lose the knowledge.
|
|
50
|
+
|
|
51
|
+
**vidwise** extracts the visual and audio knowledge from any video into structured, LLM-consumable markdown. Feed the output to any LLM and it instantly "understands" the video.
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
Video ─→ vidwise ─→ Transcript + Key Frames + Visual Guide ─→ LLM Context
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## What can you do with it?
|
|
58
|
+
|
|
59
|
+
| Scenario | What happens |
|
|
60
|
+
|----------|-------------|
|
|
61
|
+
| **Debug a Loom bug report** | Feed the output to Claude → it "sees" the bug, the UI state, the error messages |
|
|
62
|
+
| **Absorb a tutorial** | 30-min coding video → structured knowledge your LLM can answer questions about |
|
|
63
|
+
| **Process a meeting** | Extract decisions, action items, and what was on screen |
|
|
64
|
+
| **Learn from a talk** | Turn any conference presentation into searchable, queryable knowledge |
|
|
65
|
+
| **Onboard faster** | Training videos become AI-queryable — new hires get instant answers |
|
|
66
|
+
|
|
67
|
+
## Why vidwise?
|
|
68
|
+
|
|
69
|
+
| | |
|
|
70
|
+
|---|---|
|
|
71
|
+
| **See the whole picture** | Most tools only extract audio. vidwise captures both what was *said* and what was *shown* — UI states, error messages, slides, code, diagrams. |
|
|
72
|
+
| **Process once, query forever** | The output is a self-contained artifact. Feed it to any LLM, any number of times, at zero additional cost. No re-uploading, no re-processing. |
|
|
73
|
+
| **Works with any LLM** | Standard markdown + images. Claude, GPT, Gemini, Llama, Mistral — whatever you use. No vendor lock-in. |
|
|
74
|
+
| **Your video stays local** | Whisper and ffmpeg run on your machine. Nothing leaves your computer unless you opt into AI guide generation. |
|
|
75
|
+
| **Smart, not brute-force** | Pixel-difference analysis keeps only frames where the visual content actually changed. Less noise, better LLM understanding. |
|
|
76
|
+
| **Human-readable AND machine-readable** | The output isn't just for LLMs — `guide.md` is a visual walkthrough you can read, share, and bookmark. One command, two audiences. |
|
|
77
|
+
| **One command** | `vidwise recording.mp4` → transcript, key frames, and visual guide in a single portable directory. |
|
|
78
|
+
|
|
79
|
+
> **Not just for LLMs.** The visual guide vidwise generates is a fully readable document with embedded screenshots — open it in VS Code, Obsidian, or GitHub and you have a skimmable walkthrough of the entire video. Share it with your team, bookmark it for later, or feed it to any LLM. One artifact, two audiences.
|
|
80
|
+
|
|
81
|
+
## Quick Start
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
# Install
|
|
85
|
+
pip install vidwise
|
|
86
|
+
|
|
87
|
+
# Process a local video
|
|
88
|
+
vidwise recording.mp4
|
|
89
|
+
|
|
90
|
+
# Process a YouTube video
|
|
91
|
+
vidwise https://youtube.com/watch?v=abc
|
|
92
|
+
|
|
93
|
+
# With AI-powered visual guide
|
|
94
|
+
export ANTHROPIC_API_KEY=sk-... # or OPENAI_API_KEY
|
|
95
|
+
vidwise recording.mp4 --provider claude
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Prerequisites
|
|
99
|
+
|
|
100
|
+
- **Python 3.10+**
|
|
101
|
+
- **ffmpeg** — `brew install ffmpeg` (macOS) or `apt install ffmpeg` (Linux)
|
|
102
|
+
|
|
103
|
+
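> **Faster transcription?** `pip install "vidwise[fast]"` adds faster-whisper (~200MB) alongside the default openai-whisper (~2GB) for 3-4x faster transcription, but without Apple Metal GPU support. vidwise auto-detects which backend is installed. Note: openai-whisper is a required dependency of vidwise 0.1.0, so this extra does not make the install smaller.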
|
|
104
|
+
|
|
105
|
+
## Usage
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
vidwise <source> [options]
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
| Option | Default | Description |
|
|
112
|
+
|--------|---------|-------------|
|
|
113
|
+
| `--model`, `-m` | `medium` | Whisper model: `tiny`, `base`, `small`, `medium`, `large` |
|
|
114
|
+
| `--output-dir`, `-o` | auto | Output directory path |
|
|
115
|
+
| `--no-guide` | off | Skip AI guide generation |
|
|
116
|
+
| `--provider`, `-p` | `auto` | AI provider: `auto`, `claude`, `openai` |
|
|
117
|
+
| `--frame-interval` | `2` | Seconds between frame captures |
|
|
118
|
+
| `--frame-threshold` | `0.05` | Pixel diff threshold for key frame selection |
|
|
119
|
+
|
|
120
|
+
### Examples
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
# Fast transcription of a short video
|
|
124
|
+
vidwise demo.mp4 --model tiny --no-guide
|
|
125
|
+
|
|
126
|
+
# YouTube tutorial with Claude-powered guide
|
|
127
|
+
vidwise https://youtube.com/watch?v=abc --model small --provider claude
|
|
128
|
+
|
|
129
|
+
# Loom bug report — default settings
|
|
130
|
+
vidwise https://loom.com/share/abc123def
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Output
|
|
134
|
+
|
|
135
|
+
vidwise creates a single self-contained directory:
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
vidwise-abc123-2026-02-26/
|
|
139
|
+
├── video.mp4 # Source video
|
|
140
|
+
├── audio.wav # Extracted audio (16kHz mono)
|
|
141
|
+
├── transcript.txt # Plain text transcript
|
|
142
|
+
├── transcript.srt # Timestamped subtitles
|
|
143
|
+
├── transcript.json # Full Whisper output with segments
|
|
144
|
+
├── frames/ # Key frames every 2 seconds
|
|
145
|
+
│ ├── frame_0m00s.png
|
|
146
|
+
│ ├── frame_0m02s.png
|
|
147
|
+
│ ├── frame_0m04s.png
|
|
148
|
+
│ └── ...
|
|
149
|
+
└── guide.md # Visual guide with embedded frames (if AI enabled)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
The `guide.md` uses relative image paths — open it in any markdown viewer (VS Code, GitHub, Obsidian) and the images render inline.
|
|
153
|
+
|
|
154
|
+
## How It Works
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
┌──────────────┐
|
|
158
|
+
│ Video URL │──→ yt-dlp download
|
|
159
|
+
│ or local │
|
|
160
|
+
└──────┬───────┘
|
|
161
|
+
│
|
|
162
|
+
▼
|
|
163
|
+
┌──────────────┐ ┌──────────────────┐
|
|
164
|
+
│ ffmpeg │────→│ audio.wav │──→ Whisper ──→ transcript.*
|
|
165
|
+
│ (parallel) │ │ (16kHz mono) │
|
|
166
|
+
│ │────→│ frames/ │──→ Key frame selection
|
|
167
|
+
│ │ │ (every 2 sec) │ (pixel diff filtering)
|
|
168
|
+
└──────────────┘ └──────────────────┘
|
|
169
|
+
│
|
|
170
|
+
▼
|
|
171
|
+
┌──────────────────┐
|
|
172
|
+
│ AI Analysis │ Claude API, OpenAI API,
|
|
173
|
+
│ (optional) │ or Claude Code (free)
|
|
174
|
+
└────────┬─────────┘
|
|
175
|
+
│
|
|
176
|
+
▼
|
|
177
|
+
┌──────────────────┐
|
|
178
|
+
│ guide.md │ Structured markdown with
|
|
179
|
+
│ │ embedded frame images
|
|
180
|
+
└──────────────────┘
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Smart frame selection:** Not every frame matters. vidwise compares consecutive frames using pixel-difference analysis and only keeps frames where the visual content actually changed. A 10-minute video might have 300 raw frames but only ~40 meaningful ones.
|
|
184
|
+
|
|
185
|
+
## Claude Code Plugin
|
|
186
|
+
|
|
187
|
+
If you use [Claude Code](https://docs.anthropic.com/en/docs/claude-code), install vidwise as a plugin for **AI-powered guide generation without needing an API key** — Claude Code's native multimodal AI handles the analysis:
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
# Add the vidwise marketplace and install the plugin
|
|
191
|
+
/plugin marketplace add jpdjere/vidwise
|
|
192
|
+
/plugin install vidwise@vidwise
|
|
193
|
+
|
|
194
|
+
# Then use it:
|
|
195
|
+
/vidwise:vidwise recording.mp4
|
|
196
|
+
/vidwise:vidwise https://loom.com/share/abc123
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
For local development or testing, you can also load directly:
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
claude --plugin-dir /path/to/vidwise/plugin
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
The plugin runs `vidwise --no-guide` for extraction, then uses Claude Code's built-in vision capabilities to analyze frames in parallel — completely free, no API key needed.
|
|
206
|
+
|
|
207
|
+
## Whisper Model Sizes
|
|
208
|
+
|
|
209
|
+
| Model | Speed | Quality | Best For |
|
|
210
|
+
|-------|-------|---------|----------|
|
|
211
|
+
| `tiny` | ~1 min/min | Basic | Quick tests, long videos |
|
|
212
|
+
| `base` | ~2 min/min | Good | Short videos |
|
|
213
|
+
| `small` | ~4 min/min | Better | Videos >30 min |
|
|
214
|
+
| `medium` | ~8 min/min | Recommended | Default for most content |
|
|
215
|
+
| `large` | ~16 min/min | Best | When accuracy is critical |
|
|
216
|
+
|
|
217
|
+
*Speed is the approximate processing time per minute of video, estimated on Apple M-series. First run downloads model weights (one-time).*
|
|
218
|
+
|
|
219
|
+
## Contributing
|
|
220
|
+
|
|
221
|
+
Contributions are welcome! Please open an issue first to discuss what you'd like to change.
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
# Development setup
|
|
225
|
+
git clone https://github.com/jpdjere/vidwise
|
|
226
|
+
cd vidwise
|
|
227
|
+
python -m venv .venv
|
|
228
|
+
source .venv/bin/activate
|
|
229
|
+
pip install -e ".[dev]"
|
|
230
|
+
|
|
231
|
+
# Run tests
|
|
232
|
+
pytest
|
|
233
|
+
|
|
234
|
+
# Lint
|
|
235
|
+
ruff check src/
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## License
|
|
239
|
+
|
|
240
|
+
[MIT](https://github.com/jpdjere/vidwise/blob/main/LICENSE)
|
vidwise-0.1.0/README.md
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/jpdjere/vidwise/main/assets/banner.png" alt="vidwise — LLMs can't watch videos. vidwise gives them eyes." width="700">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<a href="https://pypi.org/project/vidwise/"><img src="https://img.shields.io/pypi/v/vidwise?color=blue" alt="PyPI"></a>
|
|
7
|
+
<a href="https://pypi.org/project/vidwise/"><img src="https://img.shields.io/pypi/pyversions/vidwise" alt="Python"></a>
|
|
8
|
+
<a href="https://github.com/jpdjere/vidwise/blob/main/LICENSE"><img src="https://img.shields.io/github/license/jpdjere/vidwise" alt="License"></a>
|
|
9
|
+
<a href="https://github.com/jpdjere/vidwise/actions/workflows/ci.yml"><img src="https://github.com/jpdjere/vidwise/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
Videos are the biggest blind spot for AI. A 5-minute Loom bug report, a 30-minute tutorial, a conference talk — all completely opaque to your LLM. You either watch the whole thing yourself or lose the knowledge.
|
|
15
|
+
|
|
16
|
+
**vidwise** extracts the visual and audio knowledge from any video into structured, LLM-consumable markdown. Feed the output to any LLM and it instantly "understands" the video.
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
Video ─→ vidwise ─→ Transcript + Key Frames + Visual Guide ─→ LLM Context
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## What can you do with it?
|
|
23
|
+
|
|
24
|
+
| Scenario | What happens |
|
|
25
|
+
|----------|-------------|
|
|
26
|
+
| **Debug a Loom bug report** | Feed the output to Claude → it "sees" the bug, the UI state, the error messages |
|
|
27
|
+
| **Absorb a tutorial** | 30-min coding video → structured knowledge your LLM can answer questions about |
|
|
28
|
+
| **Process a meeting** | Extract decisions, action items, and what was on screen |
|
|
29
|
+
| **Learn from a talk** | Turn any conference presentation into searchable, queryable knowledge |
|
|
30
|
+
| **Onboard faster** | Training videos become AI-queryable — new hires get instant answers |
|
|
31
|
+
|
|
32
|
+
## Why vidwise?
|
|
33
|
+
|
|
34
|
+
| | |
|
|
35
|
+
|---|---|
|
|
36
|
+
| **See the whole picture** | Most tools only extract audio. vidwise captures both what was *said* and what was *shown* — UI states, error messages, slides, code, diagrams. |
|
|
37
|
+
| **Process once, query forever** | The output is a self-contained artifact. Feed it to any LLM, any number of times, at zero additional cost. No re-uploading, no re-processing. |
|
|
38
|
+
| **Works with any LLM** | Standard markdown + images. Claude, GPT, Gemini, Llama, Mistral — whatever you use. No vendor lock-in. |
|
|
39
|
+
| **Your video stays local** | Whisper and ffmpeg run on your machine. Nothing leaves your computer unless you opt into AI guide generation. |
|
|
40
|
+
| **Smart, not brute-force** | Pixel-difference analysis keeps only frames where the visual content actually changed. Less noise, better LLM understanding. |
|
|
41
|
+
| **Human-readable AND machine-readable** | The output isn't just for LLMs — `guide.md` is a visual walkthrough you can read, share, and bookmark. One command, two audiences. |
|
|
42
|
+
| **One command** | `vidwise recording.mp4` → transcript, key frames, and visual guide in a single portable directory. |
|
|
43
|
+
|
|
44
|
+
> **Not just for LLMs.** The visual guide vidwise generates is a fully readable document with embedded screenshots — open it in VS Code, Obsidian, or GitHub and you have a skimmable walkthrough of the entire video. Share it with your team, bookmark it for later, or feed it to any LLM. One artifact, two audiences.
|
|
45
|
+
|
|
46
|
+
## Quick Start
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Install
|
|
50
|
+
pip install vidwise
|
|
51
|
+
|
|
52
|
+
# Process a local video
|
|
53
|
+
vidwise recording.mp4
|
|
54
|
+
|
|
55
|
+
# Process a YouTube video
|
|
56
|
+
vidwise https://youtube.com/watch?v=abc
|
|
57
|
+
|
|
58
|
+
# With AI-powered visual guide
|
|
59
|
+
export ANTHROPIC_API_KEY=sk-... # or OPENAI_API_KEY
|
|
60
|
+
vidwise recording.mp4 --provider claude
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Prerequisites
|
|
64
|
+
|
|
65
|
+
- **Python 3.10+**
|
|
66
|
+
- **ffmpeg** — `brew install ffmpeg` (macOS) or `apt install ffmpeg` (Linux)
|
|
67
|
+
|
|
68
|
+
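> **Faster transcription?** `pip install "vidwise[fast]"` adds faster-whisper (~200MB) alongside the default openai-whisper (~2GB) for 3-4x faster transcription, but without Apple Metal GPU support. vidwise auto-detects which backend is installed. Note: openai-whisper is a required dependency of vidwise 0.1.0, so this extra does not make the install smaller.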
|
|
69
|
+
|
|
70
|
+
## Usage
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
vidwise <source> [options]
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
| Option | Default | Description |
|
|
77
|
+
|--------|---------|-------------|
|
|
78
|
+
| `--model`, `-m` | `medium` | Whisper model: `tiny`, `base`, `small`, `medium`, `large` |
|
|
79
|
+
| `--output-dir`, `-o` | auto | Output directory path |
|
|
80
|
+
| `--no-guide` | off | Skip AI guide generation |
|
|
81
|
+
| `--provider`, `-p` | `auto` | AI provider: `auto`, `claude`, `openai` |
|
|
82
|
+
| `--frame-interval` | `2` | Seconds between frame captures |
|
|
83
|
+
| `--frame-threshold` | `0.05` | Pixel diff threshold for key frame selection |
|
|
84
|
+
|
|
85
|
+
### Examples
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Fast transcription of a short video
|
|
89
|
+
vidwise demo.mp4 --model tiny --no-guide
|
|
90
|
+
|
|
91
|
+
# YouTube tutorial with Claude-powered guide
|
|
92
|
+
vidwise https://youtube.com/watch?v=abc --model small --provider claude
|
|
93
|
+
|
|
94
|
+
# Loom bug report — default settings
|
|
95
|
+
vidwise https://loom.com/share/abc123def
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Output
|
|
99
|
+
|
|
100
|
+
vidwise creates a single self-contained directory:
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
vidwise-abc123-2026-02-26/
|
|
104
|
+
├── video.mp4 # Source video
|
|
105
|
+
├── audio.wav # Extracted audio (16kHz mono)
|
|
106
|
+
├── transcript.txt # Plain text transcript
|
|
107
|
+
├── transcript.srt # Timestamped subtitles
|
|
108
|
+
├── transcript.json # Full Whisper output with segments
|
|
109
|
+
├── frames/ # Key frames every 2 seconds
|
|
110
|
+
│ ├── frame_0m00s.png
|
|
111
|
+
│ ├── frame_0m02s.png
|
|
112
|
+
│ ├── frame_0m04s.png
|
|
113
|
+
│ └── ...
|
|
114
|
+
└── guide.md # Visual guide with embedded frames (if AI enabled)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
The `guide.md` uses relative image paths — open it in any markdown viewer (VS Code, GitHub, Obsidian) and the images render inline.
|
|
118
|
+
|
|
119
|
+
## How It Works
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
┌──────────────┐
|
|
123
|
+
│ Video URL │──→ yt-dlp download
|
|
124
|
+
│ or local │
|
|
125
|
+
└──────┬───────┘
|
|
126
|
+
│
|
|
127
|
+
▼
|
|
128
|
+
┌──────────────┐ ┌──────────────────┐
|
|
129
|
+
│ ffmpeg │────→│ audio.wav │──→ Whisper ──→ transcript.*
|
|
130
|
+
│ (parallel) │ │ (16kHz mono) │
|
|
131
|
+
│ │────→│ frames/ │──→ Key frame selection
|
|
132
|
+
│ │ │ (every 2 sec) │ (pixel diff filtering)
|
|
133
|
+
└──────────────┘ └──────────────────┘
|
|
134
|
+
│
|
|
135
|
+
▼
|
|
136
|
+
┌──────────────────┐
|
|
137
|
+
│ AI Analysis │ Claude API, OpenAI API,
|
|
138
|
+
│ (optional) │ or Claude Code (free)
|
|
139
|
+
└────────┬─────────┘
|
|
140
|
+
│
|
|
141
|
+
▼
|
|
142
|
+
┌──────────────────┐
|
|
143
|
+
│ guide.md │ Structured markdown with
|
|
144
|
+
│ │ embedded frame images
|
|
145
|
+
└──────────────────┘
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
**Smart frame selection:** Not every frame matters. vidwise compares consecutive frames using pixel-difference analysis and only keeps frames where the visual content actually changed. A 10-minute video might have 300 raw frames but only ~40 meaningful ones.
|
|
149
|
+
|
|
150
|
+
## Claude Code Plugin
|
|
151
|
+
|
|
152
|
+
If you use [Claude Code](https://docs.anthropic.com/en/docs/claude-code), install vidwise as a plugin for **AI-powered guide generation without needing an API key** — Claude Code's native multimodal AI handles the analysis:
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
# Add the vidwise marketplace and install the plugin
|
|
156
|
+
/plugin marketplace add jpdjere/vidwise
|
|
157
|
+
/plugin install vidwise@vidwise
|
|
158
|
+
|
|
159
|
+
# Then use it:
|
|
160
|
+
/vidwise:vidwise recording.mp4
|
|
161
|
+
/vidwise:vidwise https://loom.com/share/abc123
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
For local development or testing, you can also load directly:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
claude --plugin-dir /path/to/vidwise/plugin
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
The plugin runs `vidwise --no-guide` for extraction, then uses Claude Code's built-in vision capabilities to analyze frames in parallel — completely free, no API key needed.
|
|
171
|
+
|
|
172
|
+
## Whisper Model Sizes
|
|
173
|
+
|
|
174
|
+
| Model | Speed | Quality | Best For |
|
|
175
|
+
|-------|-------|---------|----------|
|
|
176
|
+
| `tiny` | ~1 min/min | Basic | Quick tests, long videos |
|
|
177
|
+
| `base` | ~2 min/min | Good | Short videos |
|
|
178
|
+
| `small` | ~4 min/min | Better | Videos >30 min |
|
|
179
|
+
| `medium` | ~8 min/min | Recommended | Default for most content |
|
|
180
|
+
| `large` | ~16 min/min | Best | When accuracy is critical |
|
|
181
|
+
|
|
182
|
+
*Speed is the approximate processing time per minute of video, estimated on Apple M-series. First run downloads model weights (one-time).*
|
|
183
|
+
|
|
184
|
+
## Contributing
|
|
185
|
+
|
|
186
|
+
Contributions are welcome! Please open an issue first to discuss what you'd like to change.
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
# Development setup
|
|
190
|
+
git clone https://github.com/jpdjere/vidwise
|
|
191
|
+
cd vidwise
|
|
192
|
+
python -m venv .venv
|
|
193
|
+
source .venv/bin/activate
|
|
194
|
+
pip install -e ".[dev]"
|
|
195
|
+
|
|
196
|
+
# Run tests
|
|
197
|
+
pytest
|
|
198
|
+
|
|
199
|
+
# Lint
|
|
200
|
+
ruff check src/
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## License
|
|
204
|
+
|
|
205
|
+
[MIT](https://github.com/jpdjere/vidwise/blob/main/LICENSE)
|
|
Binary file
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 900 320" fill="none">
|
|
2
|
+
<defs>
|
|
3
|
+
<linearGradient id="g1" x1="0%" y1="0%" x2="100%" y2="100%">
|
|
4
|
+
<stop offset="0%" style="stop-color:#6366f1" />
|
|
5
|
+
<stop offset="100%" style="stop-color:#8b5cf6" />
|
|
6
|
+
</linearGradient>
|
|
7
|
+
<linearGradient id="g2" x1="0%" y1="0%" x2="100%" y2="100%">
|
|
8
|
+
<stop offset="0%" style="stop-color:#8b5cf6" />
|
|
9
|
+
<stop offset="100%" style="stop-color:#a78bfa" />
|
|
10
|
+
</linearGradient>
|
|
11
|
+
<linearGradient id="g3" x1="0%" y1="0%" x2="100%" y2="0%">
|
|
12
|
+
<stop offset="0%" style="stop-color:#6366f1" />
|
|
13
|
+
<stop offset="50%" style="stop-color:#8b5cf6" />
|
|
14
|
+
<stop offset="100%" style="stop-color:#a78bfa" />
|
|
15
|
+
</linearGradient>
|
|
16
|
+
</defs>
|
|
17
|
+
|
|
18
|
+
<!-- Centered icon group -->
|
|
19
|
+
<g transform="translate(290, 30)">
|
|
20
|
+
<!-- Video frame -->
|
|
21
|
+
<rect x="0" y="0" width="140" height="140" rx="20" fill="url(#g1)" />
|
|
22
|
+
|
|
23
|
+
<!-- Film strip notches -->
|
|
24
|
+
<rect x="8" y="8" width="12" height="8" rx="2" fill="white" opacity="0.3" />
|
|
25
|
+
<rect x="28" y="8" width="12" height="8" rx="2" fill="white" opacity="0.3" />
|
|
26
|
+
<rect x="48" y="8" width="12" height="8" rx="2" fill="white" opacity="0.3" />
|
|
27
|
+
<rect x="8" y="124" width="12" height="8" rx="2" fill="white" opacity="0.3" />
|
|
28
|
+
<rect x="28" y="124" width="12" height="8" rx="2" fill="white" opacity="0.3" />
|
|
29
|
+
<rect x="48" y="124" width="12" height="8" rx="2" fill="white" opacity="0.3" />
|
|
30
|
+
|
|
31
|
+
<!-- Play triangle -->
|
|
32
|
+
<polygon points="52,38 100,70 52,102" fill="white" opacity="0.95" />
|
|
33
|
+
|
|
34
|
+
<!-- Eye overlapping the video frame -->
|
|
35
|
+
<g transform="translate(85, 32)">
|
|
36
|
+
<path d="M0,38 Q55,-15 110,38 Q55,91 0,38 Z" fill="url(#g2)" />
|
|
37
|
+
<circle cx="55" cy="38" r="20" fill="white" />
|
|
38
|
+
<circle cx="55" cy="38" r="10" fill="#4f46e5" />
|
|
39
|
+
<circle cx="51" cy="33" r="3.5" fill="white" />
|
|
40
|
+
</g>
|
|
41
|
+
|
|
42
|
+
<!-- Subtle sparkle / AI indicator -->
|
|
43
|
+
<g transform="translate(178, 12)" fill="#a78bfa">
|
|
44
|
+
<path d="M8,0 L10,6 L16,8 L10,10 L8,16 L6,10 L0,8 L6,6 Z" opacity="0.7" />
|
|
45
|
+
</g>
|
|
46
|
+
<g transform="translate(200, 28)" fill="#c4b5fd">
|
|
47
|
+
<path d="M5,0 L6.2,3.8 L10,5 L6.2,6.2 L5,10 L3.8,6.2 L0,5 L3.8,3.8 Z" opacity="0.5" />
|
|
48
|
+
</g>
|
|
49
|
+
</g>
|
|
50
|
+
|
|
51
|
+
<!-- Title -->
|
|
52
|
+
<text x="450" y="230" text-anchor="middle" font-family="system-ui, -apple-system, 'Segoe UI', Roboto, sans-serif" font-size="72" font-weight="800" fill="#1e1b4b" letter-spacing="-2">
|
|
53
|
+
vid<tspan fill="url(#g1)">wise</tspan>
|
|
54
|
+
</text>
|
|
55
|
+
|
|
56
|
+
<!-- Tagline -->
|
|
57
|
+
<text x="450" y="270" text-anchor="middle" font-family="system-ui, -apple-system, 'Segoe UI', Roboto, sans-serif" font-size="21" fill="#6b7280" font-weight="500" letter-spacing="0.3">
|
|
58
|
+
LLMs can't watch videos. vidwise gives them eyes.
|
|
59
|
+
</text>
|
|
60
|
+
|
|
61
|
+
<!-- Accent line -->
|
|
62
|
+
<rect x="350" y="285" width="200" height="3" rx="1.5" fill="url(#g3)" opacity="0.4" />
|
|
63
|
+
</svg>
|