videopython 0.30.0__tar.gz → 0.31.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.30.0 → videopython-0.31.0}/PKG-INFO +72 -43
- {videopython-0.30.0 → videopython-0.31.0}/README.md +71 -42
- {videopython-0.30.0 → videopython-0.31.0}/pyproject.toml +2 -1
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/__init__.py +1 -4
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/dubber.py +3 -1
- videopython-0.31.0/src/videopython/ai/transforms.py +193 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/faces.py +4 -5
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/video_analysis.py +3 -1
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/__init__.py +7 -38
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/description.py +18 -18
- videopython-0.31.0/src/videopython/base/effects.py +765 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/exceptions.py +0 -12
- videopython-0.31.0/src/videopython/base/operation.py +269 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/text/overlay.py +68 -103
- videopython-0.31.0/src/videopython/base/transforms.py +612 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/video.py +1 -512
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/editing/__init__.py +0 -3
- videopython-0.31.0/src/videopython/editing/video_edit.py +539 -0
- videopython-0.30.0/src/videopython/ai/registry.py +0 -33
- videopython-0.30.0/src/videopython/ai/transforms.py +0 -533
- videopython-0.30.0/src/videopython/base/effects.py +0 -1049
- videopython-0.30.0/src/videopython/base/registry.py +0 -808
- videopython-0.30.0/src/videopython/base/transforms.py +0 -919
- videopython-0.30.0/src/videopython/base/transitions.py +0 -200
- videopython-0.30.0/src/videopython/editing/multicam.py +0 -398
- videopython-0.30.0/src/videopython/editing/video_edit.py +0 -1384
- {videopython-0.30.0 → videopython-0.31.0}/.gitignore +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/LICENSE +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/__init__.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/scene.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/streaming.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.30.0 → videopython-0.31.0}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.30.0
|
|
3
|
+
Version: 0.31.0
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -85,22 +85,31 @@ Python `>=3.10, <3.14`. AI features run locally - no cloud API keys required, bu
|
|
|
85
85
|
|
|
86
86
|
## Quick Start
|
|
87
87
|
|
|
88
|
-
###
|
|
88
|
+
### Imperative editing
|
|
89
|
+
|
|
90
|
+
Every editing primitive is an `Operation` subclass — a Pydantic model
|
|
91
|
+
whose fields ARE the JSON wire format. Apply one to a `Video`:
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from videopython.base import Video, CutSeconds, Resize, Fade
|
|
95
|
+
|
|
96
|
+
video = Video.from_path("raw.mp4")
|
|
97
|
+
video = CutSeconds(start=10, end=25).apply(video)
|
|
98
|
+
video = Resize(width=1080, height=1920).apply(video)
|
|
99
|
+
video = Fade(mode="in", duration=0.5).apply(video)
|
|
100
|
+
video.save("output.mp4")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Concatenate clips with `+` (must share fps + dimensions):
|
|
89
104
|
|
|
90
105
|
```python
|
|
91
|
-
|
|
92
|
-
from videopython.base import FadeTransition
|
|
93
|
-
|
|
94
|
-
intro = Video.from_path("intro.mp4").resize(1080, 1920)
|
|
95
|
-
clip = Video.from_path("raw.mp4").cut(10, 25).resize(1080, 1920).resample_fps(30)
|
|
96
|
-
final = intro.transition_to(clip, FadeTransition(effect_time_seconds=0.5))
|
|
97
|
-
final = final.add_audio_from_file("music.mp3")
|
|
98
|
-
final.save("output.mp4")
|
|
106
|
+
combined = video_a + video_b
|
|
99
107
|
```
|
|
100
108
|
|
|
101
109
|
### JSON editing plans
|
|
102
110
|
|
|
103
|
-
Define multi-segment edits as JSON
|
|
111
|
+
Define multi-segment edits as JSON — the format LLM-driven workflows
|
|
112
|
+
generate against. `VideoEdit.json_schema()` returns the schema:
|
|
104
113
|
|
|
105
114
|
```python
|
|
106
115
|
from videopython.editing import VideoEdit
|
|
@@ -110,68 +119,89 @@ plan = {
|
|
|
110
119
|
"source": "raw.mp4",
|
|
111
120
|
"start": 10.0,
|
|
112
121
|
"end": 20.0,
|
|
113
|
-
"
|
|
114
|
-
{"op": "resize", "
|
|
115
|
-
{"op": "
|
|
122
|
+
"operations": [
|
|
123
|
+
{"op": "resize", "width": 1080, "height": 1920},
|
|
124
|
+
{"op": "color_adjust", "saturation": 1.15, "contrast": 1.05},
|
|
125
|
+
{"op": "fade", "mode": "in", "duration": 0.5,
|
|
126
|
+
"window": {"stop": 0.5}},
|
|
116
127
|
],
|
|
117
128
|
}],
|
|
118
|
-
"post_effects": [
|
|
119
|
-
{"op": "fade", "args": {"mode": "in", "duration": 0.5}, "apply": {"start": 0.0, "stop": 0.5}},
|
|
120
|
-
],
|
|
121
129
|
}
|
|
122
130
|
|
|
123
131
|
edit = VideoEdit.from_dict(plan)
|
|
124
|
-
edit.validate()
|
|
125
|
-
|
|
126
|
-
final.save("output.mp4")
|
|
132
|
+
edit.validate() # dry-run via metadata, no frames loaded
|
|
133
|
+
edit.run_to_file("output.mp4") # stream to disk, ~constant memory
|
|
127
134
|
```
|
|
128
135
|
|
|
136
|
+
`run_to_file()` pipes ffmpeg decode → per-frame effects → ffmpeg encode,
|
|
137
|
+
so memory stays bounded even for hour-long sources. Use `edit.run()`
|
|
138
|
+
instead if you want the result back in memory as a `Video`.
|
|
139
|
+
|
|
129
140
|
### AI generation
|
|
130
141
|
|
|
131
142
|
```python
|
|
132
143
|
from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
|
|
144
|
+
from videopython.base import Resize
|
|
133
145
|
|
|
134
146
|
image = TextToImage().generate_image("A cinematic mountain sunrise")
|
|
135
|
-
video = ImageToVideo().generate_video(image=image)
|
|
147
|
+
video = ImageToVideo().generate_video(image=image)
|
|
136
148
|
audio = TextToSpeech().generate_audio("Welcome to videopython.")
|
|
149
|
+
|
|
150
|
+
video = Resize(width=1080, height=1920).apply(video)
|
|
137
151
|
video.add_audio(audio).save("ai_video.mp4")
|
|
138
152
|
```
|
|
139
153
|
|
|
140
154
|
## LLM & AI Agent Integration
|
|
141
155
|
|
|
142
|
-
|
|
156
|
+
The library is built for LLM-driven editing. Two surfaces matter:
|
|
143
157
|
|
|
144
|
-
**
|
|
158
|
+
**1. Plan schema for tool / structured-output calls.**
|
|
159
|
+
`VideoEdit.json_schema()` returns a JSON Schema covering segments,
|
|
160
|
+
`post_operations`, and a discriminated union over every registered
|
|
161
|
+
`Operation`. Drop it into any LLM API:
|
|
145
162
|
|
|
146
163
|
```python
|
|
147
164
|
from videopython.editing import VideoEdit
|
|
148
165
|
|
|
149
166
|
schema = VideoEdit.json_schema()
|
|
150
|
-
#
|
|
151
|
-
#
|
|
167
|
+
# Anthropic: tools=[{"name": "edit", "input_schema": schema}]
|
|
168
|
+
# OpenAI: tools=[{"type": "function",
|
|
169
|
+
# "function": {"name": "edit", "parameters": schema}}]
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Validate the LLM's output without touching the filesystem, then run it:
|
|
152
173
|
|
|
174
|
+
```python
|
|
153
175
|
edit = VideoEdit.from_dict(plan)
|
|
154
|
-
edit.validate()
|
|
155
|
-
|
|
156
|
-
final.save("output.mp4")
|
|
176
|
+
edit.validate() # catches bad ops, time ranges, fps mismatches
|
|
177
|
+
edit.run_to_file("output.mp4")
|
|
157
178
|
```
|
|
158
179
|
|
|
159
|
-
**Operation discovery
|
|
180
|
+
**2. Operation discovery for agent loops.**
|
|
181
|
+
Every registered op exposes its own Pydantic schema, so an agent can
|
|
182
|
+
introspect what's available without hardcoded lists:
|
|
160
183
|
|
|
161
184
|
```python
|
|
162
|
-
from videopython.base import
|
|
185
|
+
from videopython.base import Operation, OpCategory
|
|
163
186
|
|
|
164
|
-
|
|
165
|
-
|
|
187
|
+
for op_id, cls in Operation.registry().items():
|
|
188
|
+
print(f"{op_id}: {(cls.__doc__ or '').splitlines()[0]}")
|
|
166
189
|
|
|
167
|
-
|
|
168
|
-
print(spec.description) # LLM-friendly docstring
|
|
169
|
-
print(spec.to_json_schema()) # {"brightness": {"type": "number", "minimum": -1, "maximum": 1}, ...}
|
|
190
|
+
schema = Operation.get("color_adjust").model_json_schema() # per-op schema
|
|
170
191
|
```
|
|
171
192
|
|
|
172
|
-
|
|
193
|
+
Field constraints (`minimum`, `maximum`, `enum`, `exclusiveMinimum`,
|
|
194
|
+
nullability) flow through to the schema, so LLMs that support
|
|
195
|
+
constrained generation produce valid parameters on the first try.
|
|
196
|
+
|
|
197
|
+
For ops that need side-channel data (e.g. `silence_removal` and
|
|
198
|
+
`add_subtitles` need a `Transcription`), pass it via `context`:
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
edit.run(context={"transcription": my_transcription})
|
|
202
|
+
```
|
|
173
203
|
|
|
174
|
-
Docs: [Editing Plans](https://videopython.com/api/editing/) | [
|
|
204
|
+
Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [LLM Integration Guide](https://videopython.com/guides/llm-integration/)
|
|
175
205
|
|
|
176
206
|
## Features
|
|
177
207
|
|
|
@@ -180,16 +210,15 @@ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operation Registr
|
|
|
180
210
|
| Area | Highlights |
|
|
181
211
|
|---|---|
|
|
182
212
|
| **Video I/O** | `Video`, `VideoMetadata`, `FrameIterator` - load, save, inspect |
|
|
183
|
-
| **
|
|
184
|
-
| **
|
|
185
|
-
| **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change,
|
|
186
|
-
| **Transitions** | `FadeTransition`, `BlurTransition`, `InstantTransition` |
|
|
213
|
+
| **Operation foundation** | `Operation`, `Effect`, `TimeRange`, `OpCategory` - Pydantic base + auto-registry + discriminated-union schema |
|
|
214
|
+
| **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with JSON Schema generation, dry-run validation, and streaming `run_to_file` |
|
|
215
|
+
| **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, reverse, freeze frame, silence removal |
|
|
187
216
|
| **Effects** | Blur, zoom, color grading, vignette, Ken Burns, image overlay, fade, text overlay, volume adjust |
|
|
188
217
|
| **Audio** | Load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
|
|
189
218
|
| **Text** | Transcription data classes, `TranscriptionOverlay` for subtitle rendering |
|
|
190
219
|
| **Scene detection** | Histogram-based scene boundaries (`detect`, `detect_streaming`, `detect_parallel`) |
|
|
191
220
|
|
|
192
|
-
API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [
|
|
221
|
+
API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [Transforms](https://videopython.com/api/transforms/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
|
|
193
222
|
|
|
194
223
|
### `videopython.ai` - local AI features (install with `[ai]`)
|
|
195
224
|
|
|
@@ -199,7 +228,7 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
|
|
|
199
228
|
| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
|
|
200
229
|
| **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
|
|
201
230
|
| **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
|
|
202
|
-
| **Transforms** | `FaceTrackingCrop
|
|
231
|
+
| **Transforms** | `FaceTrackingCrop` |
|
|
203
232
|
| **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
|
|
204
233
|
|
|
205
234
|
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
@@ -36,22 +36,31 @@ Python `>=3.10, <3.14`. AI features run locally - no cloud API keys required, bu
|
|
|
36
36
|
|
|
37
37
|
## Quick Start
|
|
38
38
|
|
|
39
|
-
###
|
|
39
|
+
### Imperative editing
|
|
40
|
+
|
|
41
|
+
Every editing primitive is an `Operation` subclass — a Pydantic model
|
|
42
|
+
whose fields ARE the JSON wire format. Apply one to a `Video`:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from videopython.base import Video, CutSeconds, Resize, Fade
|
|
46
|
+
|
|
47
|
+
video = Video.from_path("raw.mp4")
|
|
48
|
+
video = CutSeconds(start=10, end=25).apply(video)
|
|
49
|
+
video = Resize(width=1080, height=1920).apply(video)
|
|
50
|
+
video = Fade(mode="in", duration=0.5).apply(video)
|
|
51
|
+
video.save("output.mp4")
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Concatenate clips with `+` (must share fps + dimensions):
|
|
40
55
|
|
|
41
56
|
```python
|
|
42
|
-
|
|
43
|
-
from videopython.base import FadeTransition
|
|
44
|
-
|
|
45
|
-
intro = Video.from_path("intro.mp4").resize(1080, 1920)
|
|
46
|
-
clip = Video.from_path("raw.mp4").cut(10, 25).resize(1080, 1920).resample_fps(30)
|
|
47
|
-
final = intro.transition_to(clip, FadeTransition(effect_time_seconds=0.5))
|
|
48
|
-
final = final.add_audio_from_file("music.mp3")
|
|
49
|
-
final.save("output.mp4")
|
|
57
|
+
combined = video_a + video_b
|
|
50
58
|
```
|
|
51
59
|
|
|
52
60
|
### JSON editing plans
|
|
53
61
|
|
|
54
|
-
Define multi-segment edits as JSON
|
|
62
|
+
Define multi-segment edits as JSON — the format LLM-driven workflows
|
|
63
|
+
generate against. `VideoEdit.json_schema()` returns the schema:
|
|
55
64
|
|
|
56
65
|
```python
|
|
57
66
|
from videopython.editing import VideoEdit
|
|
@@ -61,68 +70,89 @@ plan = {
|
|
|
61
70
|
"source": "raw.mp4",
|
|
62
71
|
"start": 10.0,
|
|
63
72
|
"end": 20.0,
|
|
64
|
-
"
|
|
65
|
-
{"op": "resize", "
|
|
66
|
-
{"op": "
|
|
73
|
+
"operations": [
|
|
74
|
+
{"op": "resize", "width": 1080, "height": 1920},
|
|
75
|
+
{"op": "color_adjust", "saturation": 1.15, "contrast": 1.05},
|
|
76
|
+
{"op": "fade", "mode": "in", "duration": 0.5,
|
|
77
|
+
"window": {"stop": 0.5}},
|
|
67
78
|
],
|
|
68
79
|
}],
|
|
69
|
-
"post_effects": [
|
|
70
|
-
{"op": "fade", "args": {"mode": "in", "duration": 0.5}, "apply": {"start": 0.0, "stop": 0.5}},
|
|
71
|
-
],
|
|
72
80
|
}
|
|
73
81
|
|
|
74
82
|
edit = VideoEdit.from_dict(plan)
|
|
75
|
-
edit.validate()
|
|
76
|
-
|
|
77
|
-
final.save("output.mp4")
|
|
83
|
+
edit.validate() # dry-run via metadata, no frames loaded
|
|
84
|
+
edit.run_to_file("output.mp4") # stream to disk, ~constant memory
|
|
78
85
|
```
|
|
79
86
|
|
|
87
|
+
`run_to_file()` pipes ffmpeg decode → per-frame effects → ffmpeg encode,
|
|
88
|
+
so memory stays bounded even for hour-long sources. Use `edit.run()`
|
|
89
|
+
instead if you want the result back in memory as a `Video`.
|
|
90
|
+
|
|
80
91
|
### AI generation
|
|
81
92
|
|
|
82
93
|
```python
|
|
83
94
|
from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
|
|
95
|
+
from videopython.base import Resize
|
|
84
96
|
|
|
85
97
|
image = TextToImage().generate_image("A cinematic mountain sunrise")
|
|
86
|
-
video = ImageToVideo().generate_video(image=image)
|
|
98
|
+
video = ImageToVideo().generate_video(image=image)
|
|
87
99
|
audio = TextToSpeech().generate_audio("Welcome to videopython.")
|
|
100
|
+
|
|
101
|
+
video = Resize(width=1080, height=1920).apply(video)
|
|
88
102
|
video.add_audio(audio).save("ai_video.mp4")
|
|
89
103
|
```
|
|
90
104
|
|
|
91
105
|
## LLM & AI Agent Integration
|
|
92
106
|
|
|
93
|
-
|
|
107
|
+
The library is built for LLM-driven editing. Two surfaces matter:
|
|
94
108
|
|
|
95
|
-
**
|
|
109
|
+
**1. Plan schema for tool / structured-output calls.**
|
|
110
|
+
`VideoEdit.json_schema()` returns a JSON Schema covering segments,
|
|
111
|
+
`post_operations`, and a discriminated union over every registered
|
|
112
|
+
`Operation`. Drop it into any LLM API:
|
|
96
113
|
|
|
97
114
|
```python
|
|
98
115
|
from videopython.editing import VideoEdit
|
|
99
116
|
|
|
100
117
|
schema = VideoEdit.json_schema()
|
|
101
|
-
#
|
|
102
|
-
#
|
|
118
|
+
# Anthropic: tools=[{"name": "edit", "input_schema": schema}]
|
|
119
|
+
# OpenAI: tools=[{"type": "function",
|
|
120
|
+
# "function": {"name": "edit", "parameters": schema}}]
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Validate the LLM's output without touching the filesystem, then run it:
|
|
103
124
|
|
|
125
|
+
```python
|
|
104
126
|
edit = VideoEdit.from_dict(plan)
|
|
105
|
-
edit.validate()
|
|
106
|
-
|
|
107
|
-
final.save("output.mp4")
|
|
127
|
+
edit.validate() # catches bad ops, time ranges, fps mismatches
|
|
128
|
+
edit.run_to_file("output.mp4")
|
|
108
129
|
```
|
|
109
130
|
|
|
110
|
-
**Operation discovery
|
|
131
|
+
**2. Operation discovery for agent loops.**
|
|
132
|
+
Every registered op exposes its own Pydantic schema, so an agent can
|
|
133
|
+
introspect what's available without hardcoded lists:
|
|
111
134
|
|
|
112
135
|
```python
|
|
113
|
-
from videopython.base import
|
|
136
|
+
from videopython.base import Operation, OpCategory
|
|
114
137
|
|
|
115
|
-
|
|
116
|
-
|
|
138
|
+
for op_id, cls in Operation.registry().items():
|
|
139
|
+
print(f"{op_id}: {(cls.__doc__ or '').splitlines()[0]}")
|
|
117
140
|
|
|
118
|
-
|
|
119
|
-
print(spec.description) # LLM-friendly docstring
|
|
120
|
-
print(spec.to_json_schema()) # {"brightness": {"type": "number", "minimum": -1, "maximum": 1}, ...}
|
|
141
|
+
schema = Operation.get("color_adjust").model_json_schema() # per-op schema
|
|
121
142
|
```
|
|
122
143
|
|
|
123
|
-
|
|
144
|
+
Field constraints (`minimum`, `maximum`, `enum`, `exclusiveMinimum`,
|
|
145
|
+
nullability) flow through to the schema, so LLMs that support
|
|
146
|
+
constrained generation produce valid parameters on the first try.
|
|
147
|
+
|
|
148
|
+
For ops that need side-channel data (e.g. `silence_removal` and
|
|
149
|
+
`add_subtitles` need a `Transcription`), pass it via `context`:
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
edit.run(context={"transcription": my_transcription})
|
|
153
|
+
```
|
|
124
154
|
|
|
125
|
-
Docs: [Editing Plans](https://videopython.com/api/editing/) | [
|
|
155
|
+
Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [LLM Integration Guide](https://videopython.com/guides/llm-integration/)
|
|
126
156
|
|
|
127
157
|
## Features
|
|
128
158
|
|
|
@@ -131,16 +161,15 @@ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operation Registr
|
|
|
131
161
|
| Area | Highlights |
|
|
132
162
|
|---|---|
|
|
133
163
|
| **Video I/O** | `Video`, `VideoMetadata`, `FrameIterator` - load, save, inspect |
|
|
134
|
-
| **
|
|
135
|
-
| **
|
|
136
|
-
| **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change,
|
|
137
|
-
| **Transitions** | `FadeTransition`, `BlurTransition`, `InstantTransition` |
|
|
164
|
+
| **Operation foundation** | `Operation`, `Effect`, `TimeRange`, `OpCategory` - Pydantic base + auto-registry + discriminated-union schema |
|
|
165
|
+
| **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with JSON Schema generation, dry-run validation, and streaming `run_to_file` |
|
|
166
|
+
| **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, reverse, freeze frame, silence removal |
|
|
138
167
|
| **Effects** | Blur, zoom, color grading, vignette, Ken Burns, image overlay, fade, text overlay, volume adjust |
|
|
139
168
|
| **Audio** | Load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
|
|
140
169
|
| **Text** | Transcription data classes, `TranscriptionOverlay` for subtitle rendering |
|
|
141
170
|
| **Scene detection** | Histogram-based scene boundaries (`detect`, `detect_streaming`, `detect_parallel`) |
|
|
142
171
|
|
|
143
|
-
API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [
|
|
172
|
+
API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [Transforms](https://videopython.com/api/transforms/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
|
|
144
173
|
|
|
145
174
|
### `videopython.ai` - local AI features (install with `[ai]`)
|
|
146
175
|
|
|
@@ -150,7 +179,7 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
|
|
|
150
179
|
| **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
|
|
151
180
|
| **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
|
|
152
181
|
| **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
|
|
153
|
-
| **Transforms** | `FaceTrackingCrop
|
|
182
|
+
| **Transforms** | `FaceTrackingCrop` |
|
|
154
183
|
| **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
|
|
155
184
|
|
|
156
185
|
API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.30.0"
|
|
3
|
+
version = "0.31.0"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -136,6 +136,7 @@ Documentation = "https://videopython.com"
|
|
|
136
136
|
|
|
137
137
|
[tool.mypy]
|
|
138
138
|
mypy_path = "src/stubs"
|
|
139
|
+
plugins = ["pydantic.mypy"]
|
|
139
140
|
|
|
140
141
|
[[tool.mypy.overrides]]
|
|
141
142
|
module = [
|
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
from videopython.ai import registry as _ai_registry # noqa: F401
|
|
2
|
-
|
|
3
1
|
from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
|
|
4
|
-
from .transforms import FaceTrackingCrop
|
|
2
|
+
from .transforms import FaceTrackingCrop
|
|
5
3
|
from .understanding import (
|
|
6
4
|
AudioClassifier,
|
|
7
5
|
AudioToText,
|
|
@@ -26,7 +24,6 @@ __all__ = [
|
|
|
26
24
|
"SemanticSceneDetector",
|
|
27
25
|
# Transforms (AI-powered)
|
|
28
26
|
"FaceTrackingCrop",
|
|
29
|
-
"SplitScreenComposite",
|
|
30
27
|
# Video analysis
|
|
31
28
|
"VideoAnalysis",
|
|
32
29
|
"VideoAnalysisConfig",
|
|
@@ -292,7 +292,9 @@ class VideoDubber:
|
|
|
292
292
|
video_duration = video.total_seconds
|
|
293
293
|
|
|
294
294
|
if video_duration > speech_duration:
|
|
295
|
-
|
|
295
|
+
from videopython.base.transforms import CutSeconds
|
|
296
|
+
|
|
297
|
+
output_video = CutSeconds(start=0, end=speech_duration).apply(video)
|
|
296
298
|
else:
|
|
297
299
|
output_video = video
|
|
298
300
|
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""AI-powered video transforms that require face detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import ClassVar, Literal
|
|
7
|
+
|
|
8
|
+
import cv2
|
|
9
|
+
import numpy as np
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
|
|
13
|
+
from videopython.ai.understanding.faces import FaceTracker
|
|
14
|
+
from videopython.base.operation import OpCategory, Operation
|
|
15
|
+
from videopython.base.video import Video
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _make_even(value: int) -> int:
|
|
21
|
+
"""Round down to nearest even number for H.264 compatibility."""
|
|
22
|
+
return value - (value % 2)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
__all__ = [
|
|
26
|
+
"FaceTrackingCrop",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class FaceTrackingCrop(Operation):
|
|
31
|
+
"""Crops video to follow detected faces.
|
|
32
|
+
|
|
33
|
+
Useful for creating vertical (9:16) content from horizontal (16:9) video
|
|
34
|
+
by tracking the speaker's face and keeping it framed.
|
|
35
|
+
|
|
36
|
+
Supports GPU acceleration for faster processing with optional frame sampling
|
|
37
|
+
and simple cinematographic framing rules (headroom / thirds) plus optional
|
|
38
|
+
movement speed clamping.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
op: Literal["face_crop"] = "face_crop"
|
|
42
|
+
category: ClassVar[OpCategory] = OpCategory.TRANSFORM
|
|
43
|
+
|
|
44
|
+
target_aspect: tuple[int, int] = Field((9, 16), description="Output aspect ratio as (width, height).")
|
|
45
|
+
face_selection: Literal["largest", "centered", "index"] = Field(
|
|
46
|
+
"largest", description="Strategy for selecting which face to track."
|
|
47
|
+
)
|
|
48
|
+
face_index: int = Field(0, ge=0, description='Index of face to track when using ``face_selection="index"``.')
|
|
49
|
+
padding: float = Field(0.3, ge=0, description="Extra space around face (0.3 = 30% padding on each side).")
|
|
50
|
+
vertical_offset: float = Field(
|
|
51
|
+
-0.1, description='Legacy vertical position offset used by ``framing_rule="offset"``.'
|
|
52
|
+
)
|
|
53
|
+
framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = Field(
|
|
54
|
+
"offset",
|
|
55
|
+
description=(
|
|
56
|
+
'Subject framing strategy. "offset": legacy ``vertical_offset`` behavior; '
|
|
57
|
+
'"center": keep face centered; "headroom": extra room above the face; '
|
|
58
|
+
'"thirds": face near the upper-third line; "dynamic": currently same as "headroom".'
|
|
59
|
+
),
|
|
60
|
+
)
|
|
61
|
+
headroom: float = Field(0.15, description="Headroom amount for framing rules that use it.")
|
|
62
|
+
smoothing: float = Field(0.8, ge=0, le=1, description="Position smoothing factor (0-1, higher = smoother).")
|
|
63
|
+
max_speed: float | None = Field(None, gt=0, description="Optional max camera movement per frame (normalized).")
|
|
64
|
+
fallback: Literal["center", "last_position", "full_frame"] = Field(
|
|
65
|
+
"last_position", description="Behavior when no face detected."
|
|
66
|
+
)
|
|
67
|
+
detection_interval: int = Field(3, ge=1, description="Frames between face detections.")
|
|
68
|
+
backend: Literal["cpu", "gpu", "auto"] = Field("auto", description='Detection backend - "cpu", "gpu", or "auto".')
|
|
69
|
+
sample_rate: int = Field(1, ge=1, description="For GPU backend, detect every Nth frame and interpolate.")
|
|
70
|
+
|
|
71
|
+
def _apply_framing_offset(self, face_cx: float, face_cy: float, face_h: float) -> tuple[float, float]:
|
|
72
|
+
if self.framing_rule == "offset":
|
|
73
|
+
return (face_cx, face_cy + self.vertical_offset)
|
|
74
|
+
if self.framing_rule == "center":
|
|
75
|
+
return (face_cx, face_cy)
|
|
76
|
+
if self.framing_rule == "headroom":
|
|
77
|
+
return (face_cx, face_cy - self.headroom)
|
|
78
|
+
if self.framing_rule == "thirds":
|
|
79
|
+
return (face_cx, face_cy - (1 / 3 - 0.5))
|
|
80
|
+
# "dynamic" — placeholder until motion/look-direction framing is implemented.
|
|
81
|
+
return (face_cx, face_cy - self.headroom)
|
|
82
|
+
|
|
83
|
+
def _clamp_speed(self, current: tuple[float, float], target: tuple[float, float]) -> tuple[float, float]:
|
|
84
|
+
if self.max_speed is None:
|
|
85
|
+
return target
|
|
86
|
+
dx = target[0] - current[0]
|
|
87
|
+
dy = target[1] - current[1]
|
|
88
|
+
distance = (dx**2 + dy**2) ** 0.5
|
|
89
|
+
if distance <= self.max_speed or distance == 0:
|
|
90
|
+
return target
|
|
91
|
+
scale = self.max_speed / distance
|
|
92
|
+
return (current[0] + dx * scale, current[1] + dy * scale)
|
|
93
|
+
|
|
94
|
+
def _calculate_crop_region(
|
|
95
|
+
self,
|
|
96
|
+
face_cx: float,
|
|
97
|
+
face_cy: float,
|
|
98
|
+
face_w: float,
|
|
99
|
+
face_h: float,
|
|
100
|
+
frame_w: int,
|
|
101
|
+
frame_h: int,
|
|
102
|
+
center_position: tuple[float, float] | None = None,
|
|
103
|
+
) -> tuple[int, int, int, int]:
|
|
104
|
+
target_ratio = self.target_aspect[0] / self.target_aspect[1]
|
|
105
|
+
frame_ratio = frame_w / frame_h
|
|
106
|
+
|
|
107
|
+
if target_ratio < frame_ratio:
|
|
108
|
+
crop_h = _make_even(frame_h)
|
|
109
|
+
crop_w = _make_even(int(crop_h * target_ratio))
|
|
110
|
+
else:
|
|
111
|
+
crop_w = _make_even(frame_w)
|
|
112
|
+
crop_h = _make_even(int(crop_w / target_ratio))
|
|
113
|
+
|
|
114
|
+
min_face_dim = max(face_w * frame_w, face_h * frame_h)
|
|
115
|
+
min_crop_dim = min_face_dim * (1 + 2 * self.padding)
|
|
116
|
+
if crop_w < min_crop_dim * target_ratio:
|
|
117
|
+
crop_w = _make_even(min(int(min_crop_dim * target_ratio), frame_w))
|
|
118
|
+
crop_h = _make_even(min(int(crop_w / target_ratio), frame_h))
|
|
119
|
+
|
|
120
|
+
if center_position is None:
|
|
121
|
+
center_position = self._apply_framing_offset(face_cx, face_cy, face_h)
|
|
122
|
+
|
|
123
|
+
center_x = center_position[0] * frame_w
|
|
124
|
+
center_y = center_position[1] * frame_h
|
|
125
|
+
x = int(center_x - crop_w / 2)
|
|
126
|
+
y = int(center_y - crop_h / 2)
|
|
127
|
+
x = max(0, min(x, frame_w - crop_w))
|
|
128
|
+
y = max(0, min(y, frame_h - crop_h))
|
|
129
|
+
return (x, y, crop_w, crop_h)
|
|
130
|
+
|
|
131
|
+
def apply(self, video: Video) -> Video:
    """Crop every frame of ``video`` to the target aspect, following a face.

    A ``FaceTracker`` is built from this transform's settings and queried
    once per frame. When a face is reported, the crop centre moves towards
    the framed face position (rate-limited by ``_clamp_speed``) and the crop
    comes from ``_calculate_crop_region``. When no face is reported, the crop
    is chosen by ``self.fallback``: ``"center"`` uses the centred crop,
    ``"last_position"`` reuses the most recent face crop (the centred crop
    until one exists), and any other value uses the whole frame. Crops whose
    size differs from the output size are resized to it.

    Args:
        video: Video whose ``frames`` are replaced in place.

    Returns:
        The same ``video`` object, with ``frames`` set to the cropped frames
        as a ``uint8`` array.
    """
    face_tracker = FaceTracker(
        selection_strategy=self.face_selection,
        face_index=self.face_index,
        smoothing=self.smoothing,
        detection_interval=self.detection_interval,
        backend=self.backend,
        sample_rate=self.sample_rate,
    )

    src_h, src_w = video.frame_shape[:2]
    aspect = self.target_aspect[0] / self.target_aspect[1]

    # Output size: largest even-sized rectangle of the target aspect.
    if aspect < src_w / src_h:
        out_h = _make_even(src_h)
        out_w = _make_even(int(out_h * aspect))
    else:
        out_w = _make_even(src_w)
        out_h = _make_even(int(out_w / aspect))

    centered_crop = ((src_w - out_w) // 2, (src_h - out_h) // 2, out_w, out_h)
    full_frame_crop = (0, 0, src_w, src_h)
    previous_crop = centered_crop
    position = (0.5, 0.5)

    framing_label = "legacy-offset" if self.framing_rule == "offset" else self.framing_rule
    logger.info(
        "Face tracking crop: %dx%d -> %dx%d (%d:%d, framing=%s)",
        src_w,
        src_h,
        out_w,
        out_h,
        self.target_aspect[0],
        self.target_aspect[1],
        framing_label,
    )

    cropped_frames = []
    for index, frame in enumerate(tqdm(video.frames, desc="Face tracking crop")):
        detection = face_tracker.detect_and_track(frame, index)

        if detection:
            cx, cy, fw, fh = detection
            wanted = self._apply_framing_offset(cx, cy, fh)
            position = self._clamp_speed(position, wanted)
            region = self._calculate_crop_region(cx, cy, fw, fh, src_w, src_h, center_position=position)
            previous_crop = region
        elif self.fallback == "center":
            region = centered_crop
        elif self.fallback == "last_position":
            region = previous_crop
        else:  # full_frame
            region = full_frame_crop

        left, top, region_w, region_h = region
        piece = frame[top : top + region_h, left : left + region_w]
        # Crops can differ from the output size (e.g. full-frame fallback).
        if (piece.shape[0], piece.shape[1]) != (out_h, out_w):
            piece = cv2.resize(piece, (out_w, out_h), interpolation=cv2.INTER_AREA)
        cropped_frames.append(piece)

    video.frames = np.array(cropped_frames, dtype=np.uint8)
    return video
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
"""Face detection and per-shot tracking for the understanding layer.
|
|
2
2
|
|
|
3
3
|
Lifted from ``ai/transforms.py`` so analysis code (``VideoAnalyzer``) and
|
|
4
|
-
transforms (``FaceTrackingCrop``
|
|
5
|
-
|
|
4
|
+
transforms (``FaceTrackingCrop``) can share a single source. M6 lip-sync
|
|
5
|
+
also consumes this directly.
|
|
6
6
|
|
|
7
7
|
Tracking is IoU-only — no embedding re-id. Tracks do not survive across
|
|
8
8
|
shot/scene boundaries; a shot here means a ``SceneBoundary`` produced by
|
|
@@ -167,9 +167,8 @@ class FaceTracker:
|
|
|
167
167
|
Two surfaces:
|
|
168
168
|
|
|
169
169
|
- ``detect_and_track(frame, frame_index)`` / ``track_video(frames)`` —
|
|
170
|
-
legacy single-subject API used by ``FaceTrackingCrop
|
|
171
|
-
``
|
|
172
|
-
``(cx, cy, w, h)`` tuple.
|
|
170
|
+
legacy single-subject API used by ``FaceTrackingCrop``. Returns a
|
|
171
|
+
smoothed ``(cx, cy, w, h)`` tuple.
|
|
173
172
|
- ``track_shot(frames, frame_indices)`` — new per-shot multi-track API
|
|
174
173
|
returning ``list[FaceTrack]``. Used by the analysis pipeline (M5)
|
|
175
174
|
and lip-sync (M6) to bind detections to subjects across the
|