videopython 0.30.0__tar.gz → 0.31.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {videopython-0.30.0 → videopython-0.31.0}/PKG-INFO +72 -43
  2. {videopython-0.30.0 → videopython-0.31.0}/README.md +71 -42
  3. {videopython-0.30.0 → videopython-0.31.0}/pyproject.toml +2 -1
  4. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/__init__.py +1 -4
  5. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/dubber.py +3 -1
  6. videopython-0.31.0/src/videopython/ai/transforms.py +193 -0
  7. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/faces.py +4 -5
  8. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/video_analysis.py +3 -1
  9. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/__init__.py +7 -38
  10. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/description.py +18 -18
  11. videopython-0.31.0/src/videopython/base/effects.py +765 -0
  12. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/exceptions.py +0 -12
  13. videopython-0.31.0/src/videopython/base/operation.py +269 -0
  14. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/text/overlay.py +68 -103
  15. videopython-0.31.0/src/videopython/base/transforms.py +612 -0
  16. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/video.py +1 -512
  17. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/editing/__init__.py +0 -3
  18. videopython-0.31.0/src/videopython/editing/video_edit.py +539 -0
  19. videopython-0.30.0/src/videopython/ai/registry.py +0 -33
  20. videopython-0.30.0/src/videopython/ai/transforms.py +0 -533
  21. videopython-0.30.0/src/videopython/base/effects.py +0 -1049
  22. videopython-0.30.0/src/videopython/base/registry.py +0 -808
  23. videopython-0.30.0/src/videopython/base/transforms.py +0 -919
  24. videopython-0.30.0/src/videopython/base/transitions.py +0 -200
  25. videopython-0.30.0/src/videopython/editing/multicam.py +0 -398
  26. videopython-0.30.0/src/videopython/editing/video_edit.py +0 -1384
  27. {videopython-0.30.0 → videopython-0.31.0}/.gitignore +0 -0
  28. {videopython-0.30.0 → videopython-0.31.0}/LICENSE +0 -0
  29. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/__init__.py +0 -0
  30. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/_device.py +0 -0
  31. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/__init__.py +0 -0
  32. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/models.py +0 -0
  33. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
  34. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/quality.py +0 -0
  35. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/remux.py +0 -0
  36. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/dubbing/timing.py +0 -0
  37. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/__init__.py +0 -0
  38. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/audio.py +0 -0
  39. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/image.py +0 -0
  40. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/qwen3.py +0 -0
  41. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/translation.py +0 -0
  42. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/generation/video.py +0 -0
  43. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/__init__.py +0 -0
  44. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/audio.py +0 -0
  45. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/image.py +0 -0
  46. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/separation.py +0 -0
  47. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/ai/understanding/temporal.py +0 -0
  48. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/audio/__init__.py +0 -0
  49. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/audio/analysis.py +0 -0
  50. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/audio/audio.py +0 -0
  51. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/scene.py +0 -0
  52. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/streaming.py +0 -0
  53. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/text/__init__.py +0 -0
  54. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/base/text/transcription.py +0 -0
  55. {videopython-0.30.0 → videopython-0.31.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.30.0
3
+ Version: 0.31.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -85,22 +85,31 @@ Python `>=3.10, <3.14`. AI features run locally - no cloud API keys required, bu
85
85
 
86
86
  ## Quick Start
87
87
 
88
- ### Video editing
88
+ ### Imperative editing
89
+
90
+ Every editing primitive is an `Operation` subclass — a Pydantic model
91
+ whose fields ARE the JSON wire format. Apply one to a `Video`:
92
+
93
+ ```python
94
+ from videopython.base import Video, CutSeconds, Resize, Fade
95
+
96
+ video = Video.from_path("raw.mp4")
97
+ video = CutSeconds(start=10, end=25).apply(video)
98
+ video = Resize(width=1080, height=1920).apply(video)
99
+ video = Fade(mode="in", duration=0.5).apply(video)
100
+ video.save("output.mp4")
101
+ ```
102
+
103
+ Concatenate clips with `+` (clips must share fps and dimensions):
89
104
 
90
105
  ```python
91
- from videopython import Video
92
- from videopython.base import FadeTransition
93
-
94
- intro = Video.from_path("intro.mp4").resize(1080, 1920)
95
- clip = Video.from_path("raw.mp4").cut(10, 25).resize(1080, 1920).resample_fps(30)
96
- final = intro.transition_to(clip, FadeTransition(effect_time_seconds=0.5))
97
- final = final.add_audio_from_file("music.mp3")
98
- final.save("output.mp4")
106
+ combined = video_a + video_b
99
107
  ```
100
108
 
101
109
  ### JSON editing plans
102
110
 
103
- Define multi-segment edits as JSON - useful for LLM-driven workflows. `VideoEdit.json_schema()` returns a schema for plan generation/validation.
111
+ Define multi-segment edits as JSON — the format LLM-driven workflows
112
+ generate against. `VideoEdit.json_schema()` returns the schema:
104
113
 
105
114
  ```python
106
115
  from videopython.editing import VideoEdit
@@ -110,68 +119,89 @@ plan = {
110
119
  "source": "raw.mp4",
111
120
  "start": 10.0,
112
121
  "end": 20.0,
113
- "transforms": [
114
- {"op": "resize", "args": {"height": 1280}},
115
- {"op": "speed_change", "args": {"speed": 1.25}},
122
+ "operations": [
123
+ {"op": "resize", "width": 1080, "height": 1920},
124
+ {"op": "color_adjust", "saturation": 1.15, "contrast": 1.05},
125
+ {"op": "fade", "mode": "in", "duration": 0.5,
126
+ "window": {"stop": 0.5}},
116
127
  ],
117
128
  }],
118
- "post_effects": [
119
- {"op": "fade", "args": {"mode": "in", "duration": 0.5}, "apply": {"start": 0.0, "stop": 0.5}},
120
- ],
121
129
  }
122
130
 
123
131
  edit = VideoEdit.from_dict(plan)
124
- edit.validate() # dry-run via metadata (no frame loading)
125
- final = edit.run()
126
- final.save("output.mp4")
132
+ edit.validate() # dry-run via metadata, no frames loaded
133
+ edit.run_to_file("output.mp4") # stream to disk, ~constant memory
127
134
  ```
128
135
 
136
+ `run_to_file()` pipes ffmpeg decode → per-frame effects → ffmpeg encode,
137
+ so memory stays bounded even for hour-long sources. Use `edit.run()`
138
+ instead if you want the result back in memory as a `Video`.
139
+
129
140
  ### AI generation
130
141
 
131
142
  ```python
132
143
  from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
144
+ from videopython.base import Resize
133
145
 
134
146
  image = TextToImage().generate_image("A cinematic mountain sunrise")
135
- video = ImageToVideo().generate_video(image=image).resize(1080, 1920)
147
+ video = ImageToVideo().generate_video(image=image)
136
148
  audio = TextToSpeech().generate_audio("Welcome to videopython.")
149
+
150
+ video = Resize(width=1080, height=1920).apply(video)
137
151
  video.add_audio(audio).save("ai_video.mp4")
138
152
  ```
139
153
 
140
154
  ## LLM & AI Agent Integration
141
155
 
142
- videopython is designed to be controlled by LLMs. Every video operation exposes a machine-readable spec with descriptions, parameter types, and value constraints - all available as JSON Schema at runtime.
156
+ The library is built for LLM-driven editing. Two surfaces matter:
143
157
 
144
- **Schema generation** - `VideoEdit.json_schema()` returns a complete JSON Schema describing valid edit plans. Pass it directly as a tool schema or structured-output format to any LLM API:
158
+ **1. Plan schema for tool / structured-output calls.**
159
+ `VideoEdit.json_schema()` returns a JSON Schema covering segments,
160
+ `post_operations`, and a discriminated union over every registered
161
+ `Operation`. Drop it into any LLM API:
145
162
 
146
163
  ```python
147
164
  from videopython.editing import VideoEdit
148
165
 
149
166
  schema = VideoEdit.json_schema()
150
- # Pass `schema` to your LLM as a function/tool definition or response format.
151
- # The LLM generates a plan dict, then:
167
+ # Anthropic: tools=[{"name": "edit", "input_schema": schema}]
168
+ # OpenAI: tools=[{"type": "function",
169
+ # "function": {"name": "edit", "parameters": schema}}]
170
+ ```
171
+
172
+ Validate the LLM's output without touching the filesystem, then run it:
152
173
 
174
+ ```python
153
175
  edit = VideoEdit.from_dict(plan)
154
- edit.validate() # dry-run: checks sources, time ranges, params - no frames loaded
155
- final = edit.run()
156
- final.save("output.mp4")
176
+ edit.validate() # catches bad ops, time ranges, fps mismatches
177
+ edit.run_to_file("output.mp4")
157
178
  ```
158
179
 
159
- **Operation discovery** - the registry lets an LLM (or your code) inspect all available operations, their parameters, and constraints:
180
+ **2. Operation discovery for agent loops.**
181
+ Every registered op exposes its own Pydantic schema, so an agent can
182
+ introspect what's available without hardcoded lists:
160
183
 
161
184
  ```python
162
- from videopython.base import get_operation_specs, get_specs_by_category, OperationCategory
185
+ from videopython.base import Operation, OpCategory
163
186
 
164
- all_ops = get_operation_specs() # all registered operations
165
- transforms = get_specs_by_category(OperationCategory.TRANSFORMATION) # just transforms
187
+ for op_id, cls in Operation.registry().items():
188
+ print(f"{op_id}: {(cls.__doc__ or '').splitlines()[0]}")
166
189
 
167
- spec = all_ops["color_adjust"]
168
- print(spec.description) # LLM-friendly docstring
169
- print(spec.to_json_schema()) # {"brightness": {"type": "number", "minimum": -1, "maximum": 1}, ...}
190
+ schema = Operation.get("color_adjust").model_json_schema() # per-op schema
170
191
  ```
171
192
 
172
- Every operation has LLM-optimized descriptions and rich constraints (`minimum`, `maximum`, `enum`, `exclusive_minimum`, etc.) so models generate valid parameters on the first try.
193
+ Field constraints (`minimum`, `maximum`, `enum`, `exclusiveMinimum`,
194
+ nullability) flow through to the schema, so LLMs that support
195
+ constrained generation produce valid parameters on the first try.
196
+
197
+ For ops that need side-channel data (e.g. `silence_removal` and
198
+ `add_subtitles` need a `Transcription`), pass it via `context`:
199
+
200
+ ```python
201
+ edit.run(context={"transcription": my_transcription})
202
+ ```
173
203
 
174
- Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operation Registry](https://videopython.com/api/registry/)
204
+ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [LLM Integration Guide](https://videopython.com/guides/llm-integration/)
175
205
 
176
206
  ## Features
177
207
 
@@ -180,16 +210,15 @@ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operation Registr
180
210
  | Area | Highlights |
181
211
  |---|---|
182
212
  | **Video I/O** | `Video`, `VideoMetadata`, `FrameIterator` - load, save, inspect |
183
- | **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with full JSON Schema generation, dry-run validation, and operation registry |
184
- | **Multicam editing** | `MultiCamEdit`, `CutPoint` - switch between synchronized camera angles with transitions, replace audio with external track |
185
- | **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, picture-in-picture, reverse, freeze frame, silence removal |
186
- | **Transitions** | `FadeTransition`, `BlurTransition`, `InstantTransition` |
213
+ | **Operation foundation** | `Operation`, `Effect`, `TimeRange`, `OpCategory` - Pydantic base + auto-registry + discriminated-union schema |
214
+ | **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with JSON Schema generation, dry-run validation, and streaming `run_to_file` |
215
+ | **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, reverse, freeze frame, silence removal |
187
216
  | **Effects** | Blur, zoom, color grading, vignette, Ken Burns, image overlay, fade, text overlay, volume adjust |
188
217
  | **Audio** | Load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
189
218
  | **Text** | Transcription data classes, `TranscriptionOverlay` for subtitle rendering |
190
219
  | **Scene detection** | Histogram-based scene boundaries (`detect`, `detect_streaming`, `detect_parallel`) |
191
220
 
192
- API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Transforms](https://videopython.com/api/transforms/) | [Transitions](https://videopython.com/api/transitions/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
221
+ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [Transforms](https://videopython.com/api/transforms/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
193
222
 
194
223
  ### `videopython.ai` - local AI features (install with `[ai]`)
195
224
 
@@ -199,7 +228,7 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
199
228
  | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
200
229
  | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
201
230
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
202
- | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
231
+ | **Transforms** | `FaceTrackingCrop` |
203
232
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
204
233
 
205
234
  API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
@@ -36,22 +36,31 @@ Python `>=3.10, <3.14`. AI features run locally - no cloud API keys required, bu
36
36
 
37
37
  ## Quick Start
38
38
 
39
- ### Video editing
39
+ ### Imperative editing
40
+
41
+ Every editing primitive is an `Operation` subclass — a Pydantic model
42
+ whose fields ARE the JSON wire format. Apply one to a `Video`:
43
+
44
+ ```python
45
+ from videopython.base import Video, CutSeconds, Resize, Fade
46
+
47
+ video = Video.from_path("raw.mp4")
48
+ video = CutSeconds(start=10, end=25).apply(video)
49
+ video = Resize(width=1080, height=1920).apply(video)
50
+ video = Fade(mode="in", duration=0.5).apply(video)
51
+ video.save("output.mp4")
52
+ ```
53
+
54
+ Concatenate clips with `+` (clips must share fps and dimensions):
40
55
 
41
56
  ```python
42
- from videopython import Video
43
- from videopython.base import FadeTransition
44
-
45
- intro = Video.from_path("intro.mp4").resize(1080, 1920)
46
- clip = Video.from_path("raw.mp4").cut(10, 25).resize(1080, 1920).resample_fps(30)
47
- final = intro.transition_to(clip, FadeTransition(effect_time_seconds=0.5))
48
- final = final.add_audio_from_file("music.mp3")
49
- final.save("output.mp4")
57
+ combined = video_a + video_b
50
58
  ```
51
59
 
52
60
  ### JSON editing plans
53
61
 
54
- Define multi-segment edits as JSON - useful for LLM-driven workflows. `VideoEdit.json_schema()` returns a schema for plan generation/validation.
62
+ Define multi-segment edits as JSON — the format LLM-driven workflows
63
+ generate against. `VideoEdit.json_schema()` returns the schema:
55
64
 
56
65
  ```python
57
66
  from videopython.editing import VideoEdit
@@ -61,68 +70,89 @@ plan = {
61
70
  "source": "raw.mp4",
62
71
  "start": 10.0,
63
72
  "end": 20.0,
64
- "transforms": [
65
- {"op": "resize", "args": {"height": 1280}},
66
- {"op": "speed_change", "args": {"speed": 1.25}},
73
+ "operations": [
74
+ {"op": "resize", "width": 1080, "height": 1920},
75
+ {"op": "color_adjust", "saturation": 1.15, "contrast": 1.05},
76
+ {"op": "fade", "mode": "in", "duration": 0.5,
77
+ "window": {"stop": 0.5}},
67
78
  ],
68
79
  }],
69
- "post_effects": [
70
- {"op": "fade", "args": {"mode": "in", "duration": 0.5}, "apply": {"start": 0.0, "stop": 0.5}},
71
- ],
72
80
  }
73
81
 
74
82
  edit = VideoEdit.from_dict(plan)
75
- edit.validate() # dry-run via metadata (no frame loading)
76
- final = edit.run()
77
- final.save("output.mp4")
83
+ edit.validate() # dry-run via metadata, no frames loaded
84
+ edit.run_to_file("output.mp4") # stream to disk, ~constant memory
78
85
  ```
79
86
 
87
+ `run_to_file()` pipes ffmpeg decode → per-frame effects → ffmpeg encode,
88
+ so memory stays bounded even for hour-long sources. Use `edit.run()`
89
+ instead if you want the result back in memory as a `Video`.
90
+
80
91
  ### AI generation
81
92
 
82
93
  ```python
83
94
  from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
95
+ from videopython.base import Resize
84
96
 
85
97
  image = TextToImage().generate_image("A cinematic mountain sunrise")
86
- video = ImageToVideo().generate_video(image=image).resize(1080, 1920)
98
+ video = ImageToVideo().generate_video(image=image)
87
99
  audio = TextToSpeech().generate_audio("Welcome to videopython.")
100
+
101
+ video = Resize(width=1080, height=1920).apply(video)
88
102
  video.add_audio(audio).save("ai_video.mp4")
89
103
  ```
90
104
 
91
105
  ## LLM & AI Agent Integration
92
106
 
93
- videopython is designed to be controlled by LLMs. Every video operation exposes a machine-readable spec with descriptions, parameter types, and value constraints - all available as JSON Schema at runtime.
107
+ The library is built for LLM-driven editing. Two surfaces matter:
94
108
 
95
- **Schema generation** - `VideoEdit.json_schema()` returns a complete JSON Schema describing valid edit plans. Pass it directly as a tool schema or structured-output format to any LLM API:
109
+ **1. Plan schema for tool / structured-output calls.**
110
+ `VideoEdit.json_schema()` returns a JSON Schema covering segments,
111
+ `post_operations`, and a discriminated union over every registered
112
+ `Operation`. Drop it into any LLM API:
96
113
 
97
114
  ```python
98
115
  from videopython.editing import VideoEdit
99
116
 
100
117
  schema = VideoEdit.json_schema()
101
- # Pass `schema` to your LLM as a function/tool definition or response format.
102
- # The LLM generates a plan dict, then:
118
+ # Anthropic: tools=[{"name": "edit", "input_schema": schema}]
119
+ # OpenAI: tools=[{"type": "function",
120
+ # "function": {"name": "edit", "parameters": schema}}]
121
+ ```
122
+
123
+ Validate the LLM's output without touching the filesystem, then run it:
103
124
 
125
+ ```python
104
126
  edit = VideoEdit.from_dict(plan)
105
- edit.validate() # dry-run: checks sources, time ranges, params - no frames loaded
106
- final = edit.run()
107
- final.save("output.mp4")
127
+ edit.validate() # catches bad ops, time ranges, fps mismatches
128
+ edit.run_to_file("output.mp4")
108
129
  ```
109
130
 
110
- **Operation discovery** - the registry lets an LLM (or your code) inspect all available operations, their parameters, and constraints:
131
+ **2. Operation discovery for agent loops.**
132
+ Every registered op exposes its own Pydantic schema, so an agent can
133
+ introspect what's available without hardcoded lists:
111
134
 
112
135
  ```python
113
- from videopython.base import get_operation_specs, get_specs_by_category, OperationCategory
136
+ from videopython.base import Operation, OpCategory
114
137
 
115
- all_ops = get_operation_specs() # all registered operations
116
- transforms = get_specs_by_category(OperationCategory.TRANSFORMATION) # just transforms
138
+ for op_id, cls in Operation.registry().items():
139
+ print(f"{op_id}: {(cls.__doc__ or '').splitlines()[0]}")
117
140
 
118
- spec = all_ops["color_adjust"]
119
- print(spec.description) # LLM-friendly docstring
120
- print(spec.to_json_schema()) # {"brightness": {"type": "number", "minimum": -1, "maximum": 1}, ...}
141
+ schema = Operation.get("color_adjust").model_json_schema() # per-op schema
121
142
  ```
122
143
 
123
- Every operation has LLM-optimized descriptions and rich constraints (`minimum`, `maximum`, `enum`, `exclusive_minimum`, etc.) so models generate valid parameters on the first try.
144
+ Field constraints (`minimum`, `maximum`, `enum`, `exclusiveMinimum`,
145
+ nullability) flow through to the schema, so LLMs that support
146
+ constrained generation produce valid parameters on the first try.
147
+
148
+ For ops that need side-channel data (e.g. `silence_removal` and
149
+ `add_subtitles` need a `Transcription`), pass it via `context`:
150
+
151
+ ```python
152
+ edit.run(context={"transcription": my_transcription})
153
+ ```
124
154
 
125
- Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operation Registry](https://videopython.com/api/registry/)
155
+ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [LLM Integration Guide](https://videopython.com/guides/llm-integration/)
126
156
 
127
157
  ## Features
128
158
 
@@ -131,16 +161,15 @@ Docs: [Editing Plans](https://videopython.com/api/editing/) | [Operation Registr
131
161
  | Area | Highlights |
132
162
  |---|---|
133
163
  | **Video I/O** | `Video`, `VideoMetadata`, `FrameIterator` - load, save, inspect |
134
- | **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with full JSON Schema generation, dry-run validation, and operation registry |
135
- | **Multicam editing** | `MultiCamEdit`, `CutPoint` - switch between synchronized camera angles with transitions, replace audio with external track |
136
- | **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, picture-in-picture, reverse, freeze frame, silence removal |
137
- | **Transitions** | `FadeTransition`, `BlurTransition`, `InstantTransition` |
164
+ | **Operation foundation** | `Operation`, `Effect`, `TimeRange`, `OpCategory` - Pydantic base + auto-registry + discriminated-union schema |
165
+ | **Editing plans** | `VideoEdit`, `SegmentConfig` - JSON/LLM-friendly multi-segment plans with JSON Schema generation, dry-run validation, and streaming `run_to_file` |
166
+ | **Transforms** | Cut (time/frame), resize, crop, FPS resampling, speed change, reverse, freeze frame, silence removal |
138
167
  | **Effects** | Blur, zoom, color grading, vignette, Ken Burns, image overlay, fade, text overlay, volume adjust |
139
168
  | **Audio** | Load/save, overlay, concat, normalize, time-stretch, silence detection, segment classification |
140
169
  | **Text** | Transcription data classes, `TranscriptionOverlay` for subtitle rendering |
141
170
  | **Scene detection** | Histogram-based scene boundaries (`detect`, `detect_streaming`, `detect_parallel`) |
142
171
 
143
- API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Transforms](https://videopython.com/api/transforms/) | [Transitions](https://videopython.com/api/transitions/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
172
+ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopython.com/api/core/video/) | [Audio](https://videopython.com/api/core/audio/) | [Editing Plans](https://videopython.com/api/editing/) | [Operations](https://videopython.com/api/operations/) | [Transforms](https://videopython.com/api/transforms/) | [Effects](https://videopython.com/api/effects/) | [Text](https://videopython.com/api/text/)
144
173
 
145
174
  ### `videopython.ai` - local AI features (install with `[ai]`)
146
175
 
@@ -150,7 +179,7 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
150
179
  | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
151
180
  | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
152
181
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
153
- | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
182
+ | **Transforms** | `FaceTrackingCrop` |
154
183
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
155
184
 
156
185
  API docs: [Generation](https://videopython.com/api/ai/generation/) | [Understanding](https://videopython.com/api/ai/understanding/) | [Transforms](https://videopython.com/api/ai/transforms/) | [Dubbing](https://videopython.com/api/ai/dubbing/)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.30.0"
3
+ version = "0.31.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -136,6 +136,7 @@ Documentation = "https://videopython.com"
136
136
 
137
137
  [tool.mypy]
138
138
  mypy_path = "src/stubs"
139
+ plugins = ["pydantic.mypy"]
139
140
 
140
141
  [[tool.mypy.overrides]]
141
142
  module = [
@@ -1,7 +1,5 @@
1
- from videopython.ai import registry as _ai_registry # noqa: F401
2
-
3
1
  from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
4
- from .transforms import FaceTrackingCrop, SplitScreenComposite
2
+ from .transforms import FaceTrackingCrop
5
3
  from .understanding import (
6
4
  AudioClassifier,
7
5
  AudioToText,
@@ -26,7 +24,6 @@ __all__ = [
26
24
  "SemanticSceneDetector",
27
25
  # Transforms (AI-powered)
28
26
  "FaceTrackingCrop",
29
- "SplitScreenComposite",
30
27
  # Video analysis
31
28
  "VideoAnalysis",
32
29
  "VideoAnalysisConfig",
@@ -292,7 +292,9 @@ class VideoDubber:
292
292
  video_duration = video.total_seconds
293
293
 
294
294
  if video_duration > speech_duration:
295
- output_video = video.cut(0, speech_duration)
295
+ from videopython.base.transforms import CutSeconds
296
+
297
+ output_video = CutSeconds(start=0, end=speech_duration).apply(video)
296
298
  else:
297
299
  output_video = video
298
300
 
@@ -0,0 +1,193 @@
1
+ """AI-powered video transforms that require face detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import ClassVar, Literal
7
+
8
+ import cv2
9
+ import numpy as np
10
+ from pydantic import Field
11
+ from tqdm import tqdm
12
+
13
+ from videopython.ai.understanding.faces import FaceTracker
14
+ from videopython.base.operation import OpCategory, Operation
15
+ from videopython.base.video import Video
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def _make_even(value: int) -> int:
21
+ """Round down to nearest even number for H.264 compatibility."""
22
+ return value - (value % 2)
23
+
24
+
25
+ __all__ = [
26
+ "FaceTrackingCrop",
27
+ ]
28
+
29
+
30
+ class FaceTrackingCrop(Operation):
31
+ """Crops video to follow detected faces.
32
+
33
+ Useful for creating vertical (9:16) content from horizontal (16:9) video
34
+ by tracking the speaker's face and keeping it framed.
35
+
36
+ Supports GPU acceleration for faster processing with optional frame sampling
37
+ and simple cinematographic framing rules (headroom / thirds) plus optional
38
+ movement speed clamping.
39
+ """
40
+
41
+ op: Literal["face_crop"] = "face_crop"
42
+ category: ClassVar[OpCategory] = OpCategory.TRANSFORM
43
+
44
+ target_aspect: tuple[int, int] = Field((9, 16), description="Output aspect ratio as (width, height).")
45
+ face_selection: Literal["largest", "centered", "index"] = Field(
46
+ "largest", description="Strategy for selecting which face to track."
47
+ )
48
+ face_index: int = Field(0, ge=0, description='Index of face to track when using ``face_selection="index"``.')
49
+ padding: float = Field(0.3, ge=0, description="Extra space around face (0.3 = 30% padding on each side).")
50
+ vertical_offset: float = Field(
51
+ -0.1, description='Legacy vertical position offset used by ``framing_rule="offset"``.'
52
+ )
53
+ framing_rule: Literal["offset", "center", "headroom", "thirds", "dynamic"] = Field(
54
+ "offset",
55
+ description=(
56
+ 'Subject framing strategy. "offset": legacy ``vertical_offset`` behavior; '
57
+ '"center": keep face centered; "headroom": extra room above the face; '
58
+ '"thirds": face near the upper-third line; "dynamic": currently same as "headroom".'
59
+ ),
60
+ )
61
+ headroom: float = Field(0.15, description="Headroom amount for framing rules that use it.")
62
+ smoothing: float = Field(0.8, ge=0, le=1, description="Position smoothing factor (0-1, higher = smoother).")
63
+ max_speed: float | None = Field(None, gt=0, description="Optional max camera movement per frame (normalized).")
64
+ fallback: Literal["center", "last_position", "full_frame"] = Field(
65
+ "last_position", description="Behavior when no face detected."
66
+ )
67
+ detection_interval: int = Field(3, ge=1, description="Frames between face detections.")
68
+ backend: Literal["cpu", "gpu", "auto"] = Field("auto", description='Detection backend - "cpu", "gpu", or "auto".')
69
+ sample_rate: int = Field(1, ge=1, description="For GPU backend, detect every Nth frame and interpolate.")
70
+
71
+ def _apply_framing_offset(self, face_cx: float, face_cy: float, face_h: float) -> tuple[float, float]:
72
+ if self.framing_rule == "offset":
73
+ return (face_cx, face_cy + self.vertical_offset)
74
+ if self.framing_rule == "center":
75
+ return (face_cx, face_cy)
76
+ if self.framing_rule == "headroom":
77
+ return (face_cx, face_cy - self.headroom)
78
+ if self.framing_rule == "thirds":
79
+ return (face_cx, face_cy - (1 / 3 - 0.5))
80
+ # "dynamic" — placeholder until motion/look-direction framing is implemented.
81
+ return (face_cx, face_cy - self.headroom)
82
+
83
+ def _clamp_speed(self, current: tuple[float, float], target: tuple[float, float]) -> tuple[float, float]:
84
+ if self.max_speed is None:
85
+ return target
86
+ dx = target[0] - current[0]
87
+ dy = target[1] - current[1]
88
+ distance = (dx**2 + dy**2) ** 0.5
89
+ if distance <= self.max_speed or distance == 0:
90
+ return target
91
+ scale = self.max_speed / distance
92
+ return (current[0] + dx * scale, current[1] + dy * scale)
93
+
94
+ def _calculate_crop_region(
95
+ self,
96
+ face_cx: float,
97
+ face_cy: float,
98
+ face_w: float,
99
+ face_h: float,
100
+ frame_w: int,
101
+ frame_h: int,
102
+ center_position: tuple[float, float] | None = None,
103
+ ) -> tuple[int, int, int, int]:
104
+ target_ratio = self.target_aspect[0] / self.target_aspect[1]
105
+ frame_ratio = frame_w / frame_h
106
+
107
+ if target_ratio < frame_ratio:
108
+ crop_h = _make_even(frame_h)
109
+ crop_w = _make_even(int(crop_h * target_ratio))
110
+ else:
111
+ crop_w = _make_even(frame_w)
112
+ crop_h = _make_even(int(crop_w / target_ratio))
113
+
114
+ min_face_dim = max(face_w * frame_w, face_h * frame_h)
115
+ min_crop_dim = min_face_dim * (1 + 2 * self.padding)
116
+ if crop_w < min_crop_dim * target_ratio:
117
+ crop_w = _make_even(min(int(min_crop_dim * target_ratio), frame_w))
118
+ crop_h = _make_even(min(int(crop_w / target_ratio), frame_h))
119
+
120
+ if center_position is None:
121
+ center_position = self._apply_framing_offset(face_cx, face_cy, face_h)
122
+
123
+ center_x = center_position[0] * frame_w
124
+ center_y = center_position[1] * frame_h
125
+ x = int(center_x - crop_w / 2)
126
+ y = int(center_y - crop_h / 2)
127
+ x = max(0, min(x, frame_w - crop_w))
128
+ y = max(0, min(y, frame_h - crop_h))
129
+ return (x, y, crop_w, crop_h)
130
+
131
+ def apply(self, video: Video) -> Video:
132
+ tracker = FaceTracker(
133
+ selection_strategy=self.face_selection,
134
+ face_index=self.face_index,
135
+ smoothing=self.smoothing,
136
+ detection_interval=self.detection_interval,
137
+ backend=self.backend,
138
+ sample_rate=self.sample_rate,
139
+ )
140
+
141
+ h, w = video.frame_shape[:2]
142
+ target_ratio = self.target_aspect[0] / self.target_aspect[1]
143
+ if target_ratio < w / h:
144
+ out_h = _make_even(h)
145
+ out_w = _make_even(int(out_h * target_ratio))
146
+ else:
147
+ out_w = _make_even(w)
148
+ out_h = _make_even(int(out_w / target_ratio))
149
+
150
+ default_x = (w - out_w) // 2
151
+ default_y = (h - out_h) // 2
152
+ last_crop = (default_x, default_y, out_w, out_h)
153
+ current_position = (0.5, 0.5)
154
+
155
+ framing_label = self.framing_rule if self.framing_rule != "offset" else "legacy-offset"
156
+ logger.info(
157
+ "Face tracking crop: %dx%d -> %dx%d (%d:%d, framing=%s)",
158
+ w,
159
+ h,
160
+ out_w,
161
+ out_h,
162
+ self.target_aspect[0],
163
+ self.target_aspect[1],
164
+ framing_label,
165
+ )
166
+
167
+ new_frames = []
168
+ for i in tqdm(range(len(video.frames)), desc="Face tracking crop"):
169
+ frame = video.frames[i]
170
+ face_info = tracker.detect_and_track(frame, i)
171
+
172
+ if face_info:
173
+ cx, cy, fw, fh = face_info
174
+ target_position = self._apply_framing_offset(cx, cy, fh)
175
+ current_position = self._clamp_speed(current_position, target_position)
176
+ crop = self._calculate_crop_region(cx, cy, fw, fh, w, h, center_position=current_position)
177
+ last_crop = crop
178
+ else:
179
+ if self.fallback == "center":
180
+ crop = (default_x, default_y, out_w, out_h)
181
+ elif self.fallback == "last_position":
182
+ crop = last_crop
183
+ else: # full_frame
184
+ crop = (0, 0, w, h)
185
+
186
+ x, y, cw, ch = crop
187
+ cropped = frame[y : y + ch, x : x + cw]
188
+ if cropped.shape[1] != out_w or cropped.shape[0] != out_h:
189
+ cropped = cv2.resize(cropped, (out_w, out_h), interpolation=cv2.INTER_AREA)
190
+ new_frames.append(cropped)
191
+
192
+ video.frames = np.array(new_frames, dtype=np.uint8)
193
+ return video
@@ -1,8 +1,8 @@
1
1
  """Face detection and per-shot tracking for the understanding layer.
2
2
 
3
3
  Lifted from ``ai/transforms.py`` so analysis code (``VideoAnalyzer``) and
4
- transforms (``FaceTrackingCrop`` / ``SplitScreenComposite``) can share a
5
- single source. M6 lip-sync also consumes this directly.
4
+ transforms (``FaceTrackingCrop``) can share a single source. M6 lip-sync
5
+ also consumes this directly.
6
6
 
7
7
  Tracking is IoU-only — no embedding re-id. Tracks do not survive across
8
8
  shot/scene boundaries; a shot here means a ``SceneBoundary`` produced by
@@ -167,9 +167,8 @@ class FaceTracker:
167
167
  Two surfaces:
168
168
 
169
169
  - ``detect_and_track(frame, frame_index)`` / ``track_video(frames)`` —
170
- legacy single-subject API used by ``FaceTrackingCrop`` /
171
- ``SplitScreenComposite``. Returns a smoothed
172
- ``(cx, cy, w, h)`` tuple.
170
+ legacy single-subject API used by ``FaceTrackingCrop``. Returns a
171
+ smoothed ``(cx, cy, w, h)`` tuple.
173
172
  - ``track_shot(frames, frame_indices)`` — new per-shot multi-track API
174
173
  returning ``list[FaceTrack]``. Used by the analysis pipeline (M5)
175
174
  and lip-sync (M6) to bind detections to subjects across the