videopython 0.33.1__tar.gz → 0.33.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- videopython-0.33.3/PKG-INFO +133 -0
- videopython-0.33.3/README.md +84 -0
- {videopython-0.33.1 → videopython-0.33.3}/pyproject.toml +6 -1
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/generation/audio.py +1 -1
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/generation/translation.py +1 -1
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/understanding/audio.py +2 -2
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/understanding/faces.py +11 -16
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/understanding/image.py +2 -2
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/audio/audio.py +4 -4
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/_ffmpeg.py +5 -5
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/_video_io.py +1 -1
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/description.py +21 -20
- videopython-0.33.3/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
- videopython-0.33.3/src/videopython/base/fonts/LICENSE_DEJAVU +99 -0
- videopython-0.33.3/src/videopython/base/fonts/__init__.py +58 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/image_text.py +22 -22
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/transcription.py +10 -8
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/video.py +2 -2
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/editing/__init__.py +20 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/editing/effects.py +651 -8
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/editing/operation.py +4 -5
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/editing/streaming.py +8 -2
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/editing/transcription_overlay.py +4 -1
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/editing/transforms.py +2 -2
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/editing/video_edit.py +2 -2
- videopython-0.33.1/PKG-INFO +0 -258
- videopython-0.33.1/README.md +0 -209
- {videopython-0.33.1 → videopython-0.33.3}/.gitignore +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/LICENSE +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/__init__.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/_device.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/config.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/expressiveness.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/loudness.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/dubbing/voice_sample.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/video_analysis/__init__.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/video_analysis/analyzer.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/video_analysis/models.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/video_analysis/sampling.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/ai/video_analysis/stages.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/audio/__init__.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/audio/analysis.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/__init__.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/_dimensions.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.33.1 → videopython-0.33.3}/src/videopython/py.typed +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: videopython
|
|
3
|
+
Version: 0.33.3
|
|
4
|
+
Summary: Minimal video generation and processing library.
|
|
5
|
+
Project-URL: Homepage, https://videopython.com
|
|
6
|
+
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
7
|
+
Project-URL: Documentation, https://videopython.com
|
|
8
|
+
Author-email: Bartosz Wójtowicz <bartoszwojtowicz@outlook.com>, Bartosz Rudnikowicz <bartoszrudnikowicz840@gmail.com>, Piotr Pukisz <piotr.pukisz@gmail.com>
|
|
9
|
+
License: Apache-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai,editing,generation,movie,opencv,python,shorts,video,videopython
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Requires-Python: <3.14,>=3.10
|
|
20
|
+
Requires-Dist: numpy>=1.25.2
|
|
21
|
+
Requires-Dist: opencv-python-headless>=4.9.0.80
|
|
22
|
+
Requires-Dist: pillow>=12.1.1
|
|
23
|
+
Requires-Dist: pydantic>=2.8.0
|
|
24
|
+
Requires-Dist: tqdm>=4.66.3
|
|
25
|
+
Provides-Extra: ai
|
|
26
|
+
Requires-Dist: accelerate>=0.29.2; extra == 'ai'
|
|
27
|
+
Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
|
|
28
|
+
Requires-Dist: demucs>=4.0.0; extra == 'ai'
|
|
29
|
+
Requires-Dist: diffusers>=0.30.0; extra == 'ai'
|
|
30
|
+
Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
|
|
31
|
+
Requires-Dist: imagehash>=4.3; extra == 'ai'
|
|
32
|
+
Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
|
|
33
|
+
Requires-Dist: numba>=0.61.0; extra == 'ai'
|
|
34
|
+
Requires-Dist: ollama>=0.4.5; extra == 'ai'
|
|
35
|
+
Requires-Dist: openai-whisper>=20240930; extra == 'ai'
|
|
36
|
+
Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
|
|
37
|
+
Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
|
|
38
|
+
Requires-Dist: qwen-vl-utils>=0.0.10; extra == 'ai'
|
|
39
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
|
|
40
|
+
Requires-Dist: scipy>=1.10.0; extra == 'ai'
|
|
41
|
+
Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
|
|
42
|
+
Requires-Dist: silero-vad>=5.1; extra == 'ai'
|
|
43
|
+
Requires-Dist: torch>=2.8.0; extra == 'ai'
|
|
44
|
+
Requires-Dist: torchaudio>=2.8.0; extra == 'ai'
|
|
45
|
+
Requires-Dist: transformers>=5.2.0; extra == 'ai'
|
|
46
|
+
Requires-Dist: transnetv2-pytorch>=1.0.5; extra == 'ai'
|
|
47
|
+
Requires-Dist: ultralytics>=8.0.0; extra == 'ai'
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
# videopython
|
|
51
|
+
|
|
52
|
+
[](https://pypi.org/project/videopython/)
|
|
53
|
+
[](https://pypi.org/project/videopython/)
|
|
54
|
+
[](LICENSE)
|
|
55
|
+
|
|
56
|
+
Minimal, LLM-friendly Python library for programmatic video editing, processing, and AI video workflows.
|
|
57
|
+
|
|
58
|
+
Full documentation: [videopython.com](https://videopython.com)
|
|
59
|
+
|
|
60
|
+
> **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
|
|
61
|
+
|
|
62
|
+
## Installation
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
# Install FFmpeg first (macOS: brew install ffmpeg | Debian: apt-get install ffmpeg)
|
|
66
|
+
pip install videopython # core video/audio editing
|
|
67
|
+
pip install "videopython[ai]" # + local AI features (GPU recommended)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Python `>=3.10, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.
|
|
71
|
+
|
|
72
|
+
## Quick Start
|
|
73
|
+
|
|
74
|
+
### JSON editing plans
|
|
75
|
+
|
|
76
|
+
A `VideoEdit` is a multi-segment plan, defined as a dict (or JSON), validated and executed against the source files:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from videopython.editing import VideoEdit
|
|
80
|
+
|
|
81
|
+
edit = VideoEdit.from_dict({
|
|
82
|
+
"segments": [{
|
|
83
|
+
"source": "raw.mp4",
|
|
84
|
+
"start": 10.0,
|
|
85
|
+
"end": 20.0,
|
|
86
|
+
"operations": [
|
|
87
|
+
{"op": "resize", "width": 1080, "height": 1920},
|
|
88
|
+
{"op": "color_adjust", "saturation": 1.15, "contrast": 1.05},
|
|
89
|
+
{"op": "fade", "mode": "in", "duration": 0.5},
|
|
90
|
+
],
|
|
91
|
+
}],
|
|
92
|
+
})
|
|
93
|
+
edit.validate() # dry-run via metadata, no frames loaded
|
|
94
|
+
edit.run_to_file("output.mp4") # streams ffmpeg decode → effects → encode
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
`run_to_file()` streams ffmpeg decode → per-frame effects → encode, so memory stays bounded even for hour-long sources. Use `edit.run()` to get a `Video` back in memory instead.
|
|
98
|
+
|
|
99
|
+
### AI generation
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
|
|
103
|
+
|
|
104
|
+
image = TextToImage().generate_image("A cinematic mountain sunrise")
|
|
105
|
+
video = ImageToVideo().generate_video(image=image)
|
|
106
|
+
audio = TextToSpeech().generate_audio("Welcome to videopython.")
|
|
107
|
+
video.add_audio(audio).save("ai_video.mp4")
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## LLM & AI Agent Integration
|
|
111
|
+
|
|
112
|
+
Every operation is a Pydantic model whose fields ARE the JSON wire format. `VideoEdit.json_schema()` returns a JSON Schema with a discriminated union over every registered `Operation` — pass it straight to Anthropic tool use, OpenAI function calling, or any structured-output API. Then `edit.validate()` dry-runs the plan via metadata before any frames are loaded, so a failed LLM output can be fed back as an error and retried cheaply.
|
|
113
|
+
|
|
114
|
+
See the [LLM Integration Guide](https://videopython.com/guides/llm-integration/) for end-to-end examples, validation error loops, and operation discovery patterns.
|
|
115
|
+
|
|
116
|
+
## Features
|
|
117
|
+
|
|
118
|
+
- **`videopython.base`** — `Video`, `VideoMetadata`, `FrameIterator`, `ImageText`, `Transcription`, and shared result types (`BoundingBox`, `FaceTrack`, `SceneBoundary`, ...). No AI dependencies.
|
|
119
|
+
- **`videopython.audio`** — `Audio` with overlay, concat, normalize, time-stretch, silence detection, segment classification.
|
|
120
|
+
- **`videopython.editing`** — `Operation`/`Effect` foundation, `VideoEdit` plan runner with JSON Schema + streaming execution. Transforms (cut, resize, crop, fps, speed, reverse, freeze, silence removal) and effects (blur, zoom, color grading, vignette, Ken Burns, fade, overlays, animated subtitles).
|
|
121
|
+
- **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `SemanticSceneDetector`), `FaceTrackingCrop` transform, and the full-pipeline `VideoAnalyzer`.
|
|
122
|
+
- **`videopython.ai.dubbing`** — `VideoDubber` for voice-cloned revoicing with timing sync.
|
|
123
|
+
|
|
124
|
+
## Examples
|
|
125
|
+
|
|
126
|
+
- [Social Media Clip](https://videopython.com/examples/social-clip/)
|
|
127
|
+
- [AI-Generated Video](https://videopython.com/examples/ai-video/)
|
|
128
|
+
- [Auto-Subtitles](https://videopython.com/examples/auto-subtitles/)
|
|
129
|
+
- [Processing Large Videos](https://videopython.com/examples/large-videos/)
|
|
130
|
+
|
|
131
|
+
## Development
|
|
132
|
+
|
|
133
|
+
See [`DEVELOPMENT.md`](DEVELOPMENT.md) for local setup, testing, and contribution workflow.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# videopython
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/videopython/)
|
|
4
|
+
[](https://pypi.org/project/videopython/)
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
|
|
7
|
+
Minimal, LLM-friendly Python library for programmatic video editing, processing, and AI video workflows.
|
|
8
|
+
|
|
9
|
+
Full documentation: [videopython.com](https://videopython.com)
|
|
10
|
+
|
|
11
|
+
> **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Install FFmpeg first (macOS: brew install ffmpeg | Debian: apt-get install ffmpeg)
|
|
17
|
+
pip install videopython # core video/audio editing
|
|
18
|
+
pip install "videopython[ai]" # + local AI features (GPU recommended)
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Python `>=3.10, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
### JSON editing plans
|
|
26
|
+
|
|
27
|
+
A `VideoEdit` is a multi-segment plan, defined as a dict (or JSON), validated and executed against the source files:
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from videopython.editing import VideoEdit
|
|
31
|
+
|
|
32
|
+
edit = VideoEdit.from_dict({
|
|
33
|
+
"segments": [{
|
|
34
|
+
"source": "raw.mp4",
|
|
35
|
+
"start": 10.0,
|
|
36
|
+
"end": 20.0,
|
|
37
|
+
"operations": [
|
|
38
|
+
{"op": "resize", "width": 1080, "height": 1920},
|
|
39
|
+
{"op": "color_adjust", "saturation": 1.15, "contrast": 1.05},
|
|
40
|
+
{"op": "fade", "mode": "in", "duration": 0.5},
|
|
41
|
+
],
|
|
42
|
+
}],
|
|
43
|
+
})
|
|
44
|
+
edit.validate() # dry-run via metadata, no frames loaded
|
|
45
|
+
edit.run_to_file("output.mp4") # streams ffmpeg decode → effects → encode
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
`run_to_file()` streams ffmpeg decode → per-frame effects → encode, so memory stays bounded even for hour-long sources. Use `edit.run()` to get a `Video` back in memory instead.
|
|
49
|
+
|
|
50
|
+
### AI generation
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from videopython.ai import TextToImage, ImageToVideo, TextToSpeech
|
|
54
|
+
|
|
55
|
+
image = TextToImage().generate_image("A cinematic mountain sunrise")
|
|
56
|
+
video = ImageToVideo().generate_video(image=image)
|
|
57
|
+
audio = TextToSpeech().generate_audio("Welcome to videopython.")
|
|
58
|
+
video.add_audio(audio).save("ai_video.mp4")
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## LLM & AI Agent Integration
|
|
62
|
+
|
|
63
|
+
Every operation is a Pydantic model whose fields ARE the JSON wire format. `VideoEdit.json_schema()` returns a JSON Schema with a discriminated union over every registered `Operation` — pass it straight to Anthropic tool use, OpenAI function calling, or any structured-output API. Then `edit.validate()` dry-runs the plan via metadata before any frames are loaded, so a failed LLM output can be fed back as an error and retried cheaply.
|
|
64
|
+
|
|
65
|
+
See the [LLM Integration Guide](https://videopython.com/guides/llm-integration/) for end-to-end examples, validation error loops, and operation discovery patterns.
|
|
66
|
+
|
|
67
|
+
## Features
|
|
68
|
+
|
|
69
|
+
- **`videopython.base`** — `Video`, `VideoMetadata`, `FrameIterator`, `ImageText`, `Transcription`, and shared result types (`BoundingBox`, `FaceTrack`, `SceneBoundary`, ...). No AI dependencies.
|
|
70
|
+
- **`videopython.audio`** — `Audio` with overlay, concat, normalize, time-stretch, silence detection, segment classification.
|
|
71
|
+
- **`videopython.editing`** — `Operation`/`Effect` foundation, `VideoEdit` plan runner with JSON Schema + streaming execution. Transforms (cut, resize, crop, fps, speed, reverse, freeze, silence removal) and effects (blur, zoom, color grading, vignette, Ken Burns, fade, overlays, animated subtitles).
|
|
72
|
+
- **`videopython.ai`** *(install with `[ai]`)* — generation (`TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic`), understanding (`AudioToText`, `AudioClassifier`, `SceneVLM`, `FaceTracker`, `SemanticSceneDetector`), `FaceTrackingCrop` transform, and the full-pipeline `VideoAnalyzer`.
|
|
73
|
+
- **`videopython.ai.dubbing`** — `VideoDubber` for voice-cloned revoicing with timing sync.
|
|
74
|
+
|
|
75
|
+
## Examples
|
|
76
|
+
|
|
77
|
+
- [Social Media Clip](https://videopython.com/examples/social-clip/)
|
|
78
|
+
- [AI-Generated Video](https://videopython.com/examples/ai-video/)
|
|
79
|
+
- [Auto-Subtitles](https://videopython.com/examples/auto-subtitles/)
|
|
80
|
+
- [Processing Large Videos](https://videopython.com/examples/large-videos/)
|
|
81
|
+
|
|
82
|
+
## Development
|
|
83
|
+
|
|
84
|
+
See [`DEVELOPMENT.md`](DEVELOPMENT.md) for local setup, testing, and contribution workflow.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.33.1"
|
|
3
|
+
version = "0.33.3"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -137,6 +137,9 @@ Documentation = "https://videopython.com"
|
|
|
137
137
|
[tool.mypy]
|
|
138
138
|
mypy_path = "src/stubs"
|
|
139
139
|
plugins = ["pydantic.mypy"]
|
|
140
|
+
warn_unused_ignores = true
|
|
141
|
+
warn_redundant_casts = true
|
|
142
|
+
disallow_any_generics = true
|
|
140
143
|
|
|
141
144
|
[[tool.mypy.overrides]]
|
|
142
145
|
module = [
|
|
@@ -183,9 +186,11 @@ build-backend = "hatchling.build"
|
|
|
183
186
|
|
|
184
187
|
[tool.hatch.build.targets.wheel]
|
|
185
188
|
packages = ["src/videopython"]
|
|
189
|
+
artifacts = ["src/videopython/base/fonts/*.ttf", "src/videopython/base/fonts/LICENSE_DEJAVU"]
|
|
186
190
|
|
|
187
191
|
[tool.hatch.build.targets.sdist]
|
|
188
192
|
include = ["src/videopython", "src/videopython/py.typed"]
|
|
193
|
+
artifacts = ["src/videopython/base/fonts/*.ttf", "src/videopython/base/fonts/LICENSE_DEJAVU"]
|
|
189
194
|
|
|
190
195
|
[tool.pytest.ini_options]
|
|
191
196
|
pythonpath = ["src/"]
|
|
@@ -33,7 +33,7 @@ class TextToSpeech:
|
|
|
33
33
|
self._model: Any = None
|
|
34
34
|
|
|
35
35
|
def _init_local(self) -> None:
|
|
36
|
-
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
|
36
|
+
from chatterbox.mtl_tts import ChatterboxMultilingualTTS
|
|
37
37
|
|
|
38
38
|
requested_device = self.device
|
|
39
39
|
device = select_device(self.device, mps_allowed=False)
|
|
@@ -170,7 +170,7 @@ class MarianTranslator:
|
|
|
170
170
|
return f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
|
|
171
171
|
|
|
172
172
|
def _init_local(self, source_lang: str, target_lang: str) -> None:
|
|
173
|
-
from transformers import MarianMTModel, MarianTokenizer
|
|
173
|
+
from transformers import MarianMTModel, MarianTokenizer
|
|
174
174
|
|
|
175
175
|
model_name = self._get_local_model_name(source_lang, target_lang)
|
|
176
176
|
|
|
@@ -188,7 +188,7 @@ class AudioToText:
|
|
|
188
188
|
def _init_diarization(self) -> None:
|
|
189
189
|
"""Initialize pyannote speaker diarization pipeline."""
|
|
190
190
|
import torch
|
|
191
|
-
from pyannote.audio import Pipeline
|
|
191
|
+
from pyannote.audio import Pipeline
|
|
192
192
|
|
|
193
193
|
self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
|
|
194
194
|
self._diarization_pipeline.to(torch.device(self.device))
|
|
@@ -214,7 +214,7 @@ class AudioToText:
|
|
|
214
214
|
self._vad_model = None
|
|
215
215
|
release_device_memory(self.device)
|
|
216
216
|
|
|
217
|
-
def _process_transcription_result(self, transcription_result: dict) -> Transcription:
|
|
217
|
+
def _process_transcription_result(self, transcription_result: dict[str, Any]) -> Transcription:
|
|
218
218
|
"""Process raw transcription result into a Transcription object."""
|
|
219
219
|
transcription_segments = []
|
|
220
220
|
for segment in transcription_result["segments"]:
|
|
@@ -237,7 +237,7 @@ class FaceTracker:
|
|
|
237
237
|
|
|
238
238
|
def _select_face(
|
|
239
239
|
self,
|
|
240
|
-
faces: list,
|
|
240
|
+
faces: list[DetectedFace],
|
|
241
241
|
frame_width: int,
|
|
242
242
|
frame_height: int,
|
|
243
243
|
) -> tuple[float, float, float, float] | None:
|
|
@@ -251,29 +251,24 @@ class FaceTracker:
|
|
|
251
251
|
Returns:
|
|
252
252
|
Tuple of (center_x, center_y, width, height) in normalized coords, or None.
|
|
253
253
|
"""
|
|
254
|
-
if not
|
|
254
|
+
faces_with_box = [(f, f.bounding_box) for f in faces if f.bounding_box is not None]
|
|
255
|
+
if not faces_with_box:
|
|
255
256
|
return None
|
|
256
257
|
|
|
257
258
|
if self.selection_strategy == "largest":
|
|
258
|
-
|
|
259
|
+
_, bbox = faces_with_box[0]
|
|
259
260
|
elif self.selection_strategy == "centered":
|
|
260
261
|
frame_center = (0.5, 0.5)
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
key=lambda
|
|
264
|
-
(f.bounding_box.center[0] - frame_center[0]) ** 2
|
|
265
|
-
+ (f.bounding_box.center[1] - frame_center[1]) ** 2
|
|
266
|
-
),
|
|
262
|
+
_, bbox = min(
|
|
263
|
+
faces_with_box,
|
|
264
|
+
key=lambda fb: ((fb[1].center[0] - frame_center[0]) ** 2 + (fb[1].center[1] - frame_center[1]) ** 2),
|
|
267
265
|
)
|
|
268
266
|
elif self.selection_strategy == "index":
|
|
269
|
-
if self.face_index < len(
|
|
270
|
-
|
|
271
|
-
else:
|
|
272
|
-
face = faces[0]
|
|
267
|
+
idx = self.face_index if self.face_index < len(faces_with_box) else 0
|
|
268
|
+
_, bbox = faces_with_box[idx]
|
|
273
269
|
else:
|
|
274
|
-
|
|
270
|
+
_, bbox = faces_with_box[0]
|
|
275
271
|
|
|
276
|
-
bbox = face.bounding_box
|
|
277
272
|
return (bbox.center[0], bbox.center[1], bbox.width, bbox.height)
|
|
278
273
|
|
|
279
274
|
def detect_and_track(
|
|
@@ -407,7 +402,7 @@ class FaceTracker:
|
|
|
407
402
|
|
|
408
403
|
sampled_frames = [frames[i] for i in sample_indices]
|
|
409
404
|
|
|
410
|
-
sampled_detections: list[list] = []
|
|
405
|
+
sampled_detections: list[list[DetectedFace]] = []
|
|
411
406
|
for batch_start in range(0, len(sampled_frames), self.batch_size):
|
|
412
407
|
batch_end = min(batch_start + self.batch_size, len(sampled_frames))
|
|
413
408
|
batch = sampled_frames[batch_start:batch_end]
|
|
@@ -151,7 +151,7 @@ class SceneVLM:
|
|
|
151
151
|
def _init_local(self) -> None:
|
|
152
152
|
"""Initialize local Qwen3.5 model."""
|
|
153
153
|
import torch
|
|
154
|
-
from transformers import AutoModelForImageTextToText, AutoProcessor
|
|
154
|
+
from transformers import AutoModelForImageTextToText, AutoProcessor
|
|
155
155
|
|
|
156
156
|
t0 = time.perf_counter()
|
|
157
157
|
requested_device = self.device
|
|
@@ -275,7 +275,7 @@ class SceneVLM:
|
|
|
275
275
|
def _generate_from_message_batch(self, messages_batch: list[list[dict[str, Any]]]) -> list[str]:
|
|
276
276
|
"""Run batch generation for one or more multimodal chat messages."""
|
|
277
277
|
import torch
|
|
278
|
-
from qwen_vl_utils import process_vision_info
|
|
278
|
+
from qwen_vl_utils import process_vision_info
|
|
279
279
|
|
|
280
280
|
if self._model is None:
|
|
281
281
|
self._init_local()
|
|
@@ -5,7 +5,7 @@ import subprocess
|
|
|
5
5
|
import wave
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING
|
|
8
|
+
from typing import TYPE_CHECKING, Any
|
|
9
9
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
@@ -69,7 +69,7 @@ class Audio:
|
|
|
69
69
|
return bool(np.all(np.abs(self.data) < 1e-7))
|
|
70
70
|
|
|
71
71
|
@staticmethod
|
|
72
|
-
def _get_ffmpeg_info(file_path: Path) -> dict:
|
|
72
|
+
def _get_ffmpeg_info(file_path: Path) -> dict[str, Any]:
|
|
73
73
|
"""Get audio metadata using ffprobe"""
|
|
74
74
|
try:
|
|
75
75
|
info = _ffmpeg.probe(file_path)
|
|
@@ -483,7 +483,7 @@ class Audio:
|
|
|
483
483
|
if first.metadata.channels == 1:
|
|
484
484
|
output = np.zeros(total_samples, dtype=np.float32)
|
|
485
485
|
else:
|
|
486
|
-
output = np.zeros((total_samples, 2), dtype=np.float32)
|
|
486
|
+
output = np.zeros((total_samples, 2), dtype=np.float32)
|
|
487
487
|
|
|
488
488
|
# Copy non-crossfaded portions
|
|
489
489
|
crossfade_start = len(first.data) - crossfade_samples
|
|
@@ -761,7 +761,7 @@ class Audio:
|
|
|
761
761
|
if base.metadata.channels == 1:
|
|
762
762
|
output = np.zeros(total_length, dtype=np.float32)
|
|
763
763
|
else:
|
|
764
|
-
output = np.zeros((total_length, 2), dtype=np.float32)
|
|
764
|
+
output = np.zeros((total_length, 2), dtype=np.float32)
|
|
765
765
|
|
|
766
766
|
# Copy base audio
|
|
767
767
|
output[: len(base.data)] = base.data
|
|
@@ -13,7 +13,7 @@ import json
|
|
|
13
13
|
import subprocess
|
|
14
14
|
from contextlib import contextmanager
|
|
15
15
|
from pathlib import Path
|
|
16
|
-
from typing import Iterator, Sequence
|
|
16
|
+
from typing import Any, Iterator, Sequence
|
|
17
17
|
|
|
18
18
|
from videopython.base.exceptions import FFmpegProbeError, FFmpegRunError
|
|
19
19
|
|
|
@@ -44,7 +44,7 @@ def run(cmd: Sequence[str], *, stdin: bytes | None = None) -> bytes:
|
|
|
44
44
|
return result.stdout
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict:
|
|
47
|
+
def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict[str, Any]:
|
|
48
48
|
"""Run ffprobe and return the parsed JSON payload.
|
|
49
49
|
|
|
50
50
|
Args:
|
|
@@ -76,7 +76,7 @@ def probe(path: str | Path, *, extra_args: Sequence[str] | None = None) -> dict:
|
|
|
76
76
|
raise FFmpegProbeError(f"Error parsing ffprobe output: {e}") from e
|
|
77
77
|
|
|
78
78
|
|
|
79
|
-
def _terminate(proc: subprocess.Popen, *, timeout: float = 5) -> None:
|
|
79
|
+
def _terminate(proc: subprocess.Popen[bytes], *, timeout: float = 5) -> None:
|
|
80
80
|
"""Terminate a still-running process, escalating to kill after ``timeout``."""
|
|
81
81
|
if proc.poll() is None:
|
|
82
82
|
proc.terminate()
|
|
@@ -88,7 +88,7 @@ def _terminate(proc: subprocess.Popen, *, timeout: float = 5) -> None:
|
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
@contextmanager
|
|
91
|
-
def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subprocess.Popen]:
|
|
91
|
+
def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subprocess.Popen[bytes]]:
|
|
92
92
|
"""Context manager wrapping an ffmpeg decode process.
|
|
93
93
|
|
|
94
94
|
Yields a Popen with ``stdout=PIPE`` and ``stderr=DEVNULL``. Callers
|
|
@@ -116,7 +116,7 @@ def popen_decode(cmd: Sequence[str], *, bufsize: int = -1) -> Iterator[subproces
|
|
|
116
116
|
|
|
117
117
|
|
|
118
118
|
@contextmanager
|
|
119
|
-
def popen_encode(cmd: Sequence[str]) -> Iterator[subprocess.Popen]:
|
|
119
|
+
def popen_encode(cmd: Sequence[str]) -> Iterator[subprocess.Popen[bytes]]:
|
|
120
120
|
"""Context manager wrapping an ffmpeg encode process via stdin pipe.
|
|
121
121
|
|
|
122
122
|
Yields a Popen with ``stdin=PIPE``, ``stdout=DEVNULL``, and
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any
|
|
4
5
|
|
|
5
6
|
from pydantic import BaseModel, ConfigDict, Field
|
|
6
7
|
|
|
@@ -49,7 +50,7 @@ class SceneBoundary:
|
|
|
49
50
|
"""Number of frames in this scene."""
|
|
50
51
|
return self.end_frame - self.start_frame
|
|
51
52
|
|
|
52
|
-
def to_dict(self) -> dict:
|
|
53
|
+
def to_dict(self) -> dict[str, Any]:
|
|
53
54
|
"""Convert to dictionary for JSON serialization."""
|
|
54
55
|
return {
|
|
55
56
|
"start": self.start,
|
|
@@ -59,7 +60,7 @@ class SceneBoundary:
|
|
|
59
60
|
}
|
|
60
61
|
|
|
61
62
|
@classmethod
|
|
62
|
-
def from_dict(cls, data: dict) -> "SceneBoundary":
|
|
63
|
+
def from_dict(cls, data: dict[str, Any]) -> "SceneBoundary":
|
|
63
64
|
"""Create SceneBoundary from dictionary."""
|
|
64
65
|
return cls(
|
|
65
66
|
start=data["start"],
|
|
@@ -95,12 +96,12 @@ class BoundingBox(BaseModel):
|
|
|
95
96
|
"""Area of the bounding box (normalized)."""
|
|
96
97
|
return self.width * self.height
|
|
97
98
|
|
|
98
|
-
def to_dict(self) -> dict:
|
|
99
|
+
def to_dict(self) -> dict[str, Any]:
|
|
99
100
|
"""Backwards-compat alias for ``model_dump()``."""
|
|
100
101
|
return self.model_dump()
|
|
101
102
|
|
|
102
103
|
@classmethod
|
|
103
|
-
def from_dict(cls, data: dict) -> BoundingBox:
|
|
104
|
+
def from_dict(cls, data: dict[str, Any]) -> BoundingBox:
|
|
104
105
|
"""Backwards-compat alias for ``model_validate(data)``."""
|
|
105
106
|
return cls.model_validate(data)
|
|
106
107
|
|
|
@@ -119,7 +120,7 @@ class DetectedObject:
|
|
|
119
120
|
confidence: float
|
|
120
121
|
bounding_box: BoundingBox | None = None
|
|
121
122
|
|
|
122
|
-
def to_dict(self) -> dict:
|
|
123
|
+
def to_dict(self) -> dict[str, Any]:
|
|
123
124
|
"""Convert to dictionary for JSON serialization."""
|
|
124
125
|
return {
|
|
125
126
|
"label": self.label,
|
|
@@ -128,7 +129,7 @@ class DetectedObject:
|
|
|
128
129
|
}
|
|
129
130
|
|
|
130
131
|
@classmethod
|
|
131
|
-
def from_dict(cls, data: dict) -> DetectedObject:
|
|
132
|
+
def from_dict(cls, data: dict[str, Any]) -> DetectedObject:
|
|
132
133
|
"""Create DetectedObject from dictionary."""
|
|
133
134
|
return cls(
|
|
134
135
|
label=data["label"],
|
|
@@ -160,7 +161,7 @@ class DetectedFace:
|
|
|
160
161
|
"""Area of the face bounding box (normalized), or None if no bounding box."""
|
|
161
162
|
return self.bounding_box.area if self.bounding_box else None
|
|
162
163
|
|
|
163
|
-
def to_dict(self) -> dict:
|
|
164
|
+
def to_dict(self) -> dict[str, Any]:
|
|
164
165
|
"""Convert to dictionary for JSON serialization."""
|
|
165
166
|
return {
|
|
166
167
|
"bounding_box": self.bounding_box.to_dict() if self.bounding_box else None,
|
|
@@ -168,7 +169,7 @@ class DetectedFace:
|
|
|
168
169
|
}
|
|
169
170
|
|
|
170
171
|
@classmethod
|
|
171
|
-
def from_dict(cls, data: dict) -> DetectedFace:
|
|
172
|
+
def from_dict(cls, data: dict[str, Any]) -> DetectedFace:
|
|
172
173
|
"""Create DetectedFace from dictionary."""
|
|
173
174
|
return cls(
|
|
174
175
|
bounding_box=BoundingBox.from_dict(data["bounding_box"]) if data.get("bounding_box") else None,
|
|
@@ -190,7 +191,7 @@ class DetectedText:
|
|
|
190
191
|
confidence: float
|
|
191
192
|
bounding_box: BoundingBox | None = None
|
|
192
193
|
|
|
193
|
-
def to_dict(self) -> dict:
|
|
194
|
+
def to_dict(self) -> dict[str, Any]:
|
|
194
195
|
"""Convert to dictionary for JSON serialization."""
|
|
195
196
|
return {
|
|
196
197
|
"text": self.text,
|
|
@@ -199,7 +200,7 @@ class DetectedText:
|
|
|
199
200
|
}
|
|
200
201
|
|
|
201
202
|
@classmethod
|
|
202
|
-
def from_dict(cls, data: dict) -> "DetectedText":
|
|
203
|
+
def from_dict(cls, data: dict[str, Any]) -> "DetectedText":
|
|
203
204
|
"""Create DetectedText from dictionary."""
|
|
204
205
|
return cls(
|
|
205
206
|
text=data["text"],
|
|
@@ -229,7 +230,7 @@ class AudioEvent:
|
|
|
229
230
|
"""Duration of the audio event in seconds."""
|
|
230
231
|
return self.end - self.start
|
|
231
232
|
|
|
232
|
-
def to_dict(self) -> dict:
|
|
233
|
+
def to_dict(self) -> dict[str, Any]:
|
|
233
234
|
"""Convert to dictionary for JSON serialization."""
|
|
234
235
|
return {
|
|
235
236
|
"start": self.start,
|
|
@@ -239,7 +240,7 @@ class AudioEvent:
|
|
|
239
240
|
}
|
|
240
241
|
|
|
241
242
|
@classmethod
|
|
242
|
-
def from_dict(cls, data: dict) -> AudioEvent:
|
|
243
|
+
def from_dict(cls, data: dict[str, Any]) -> AudioEvent:
|
|
243
244
|
"""Create AudioEvent from dictionary."""
|
|
244
245
|
return cls(
|
|
245
246
|
start=data["start"],
|
|
@@ -261,7 +262,7 @@ class AudioClassification:
|
|
|
261
262
|
events: list[AudioEvent]
|
|
262
263
|
clip_predictions: dict[str, float] = field(default_factory=dict)
|
|
263
264
|
|
|
264
|
-
def to_dict(self) -> dict:
|
|
265
|
+
def to_dict(self) -> dict[str, Any]:
|
|
265
266
|
"""Convert to dictionary for JSON serialization."""
|
|
266
267
|
return {
|
|
267
268
|
"events": [event.to_dict() for event in self.events],
|
|
@@ -269,7 +270,7 @@ class AudioClassification:
|
|
|
269
270
|
}
|
|
270
271
|
|
|
271
272
|
@classmethod
|
|
272
|
-
def from_dict(cls, data: dict) -> "AudioClassification":
|
|
273
|
+
def from_dict(cls, data: dict[str, Any]) -> "AudioClassification":
|
|
273
274
|
"""Create AudioClassification from dictionary."""
|
|
274
275
|
return cls(
|
|
275
276
|
events=[AudioEvent.from_dict(event) for event in data.get("events", [])],
|
|
@@ -306,7 +307,7 @@ class MotionInfo:
|
|
|
306
307
|
"""Check if this frame has significant motion."""
|
|
307
308
|
return self.motion_type != "static"
|
|
308
309
|
|
|
309
|
-
def to_dict(self) -> dict:
|
|
310
|
+
def to_dict(self) -> dict[str, Any]:
|
|
310
311
|
"""Convert to dictionary for JSON serialization."""
|
|
311
312
|
return {
|
|
312
313
|
"motion_type": self.motion_type,
|
|
@@ -315,7 +316,7 @@ class MotionInfo:
|
|
|
315
316
|
}
|
|
316
317
|
|
|
317
318
|
@classmethod
|
|
318
|
-
def from_dict(cls, data: dict) -> MotionInfo:
|
|
319
|
+
def from_dict(cls, data: dict[str, Any]) -> MotionInfo:
|
|
319
320
|
"""Create MotionInfo from dictionary."""
|
|
320
321
|
return cls(
|
|
321
322
|
motion_type=data["motion_type"],
|
|
@@ -344,7 +345,7 @@ class SceneDescription:
|
|
|
344
345
|
subjects: list[str] = field(default_factory=list)
|
|
345
346
|
shot_type: str | None = None
|
|
346
347
|
|
|
347
|
-
def to_dict(self) -> dict:
|
|
348
|
+
def to_dict(self) -> dict[str, Any]:
|
|
348
349
|
return {
|
|
349
350
|
"caption": self.caption,
|
|
350
351
|
"subjects": list(self.subjects),
|
|
@@ -352,7 +353,7 @@ class SceneDescription:
|
|
|
352
353
|
}
|
|
353
354
|
|
|
354
355
|
@classmethod
|
|
355
|
-
def from_dict(cls, data: dict) -> "SceneDescription":
|
|
356
|
+
def from_dict(cls, data: dict[str, Any]) -> "SceneDescription":
|
|
356
357
|
return cls(
|
|
357
358
|
caption=str(data["caption"]),
|
|
358
359
|
subjects=[str(s) for s in data.get("subjects", [])],
|
|
@@ -386,7 +387,7 @@ class FaceTrack:
|
|
|
386
387
|
"""Number of frames in this track."""
|
|
387
388
|
return len(self.frame_indices)
|
|
388
389
|
|
|
389
|
-
def to_dict(self) -> dict:
|
|
390
|
+
def to_dict(self) -> dict[str, Any]:
|
|
390
391
|
return {
|
|
391
392
|
"track_id": self.track_id,
|
|
392
393
|
"frame_indices": list(self.frame_indices),
|
|
@@ -395,7 +396,7 @@ class FaceTrack:
|
|
|
395
396
|
}
|
|
396
397
|
|
|
397
398
|
@classmethod
|
|
398
|
-
def from_dict(cls, data: dict) -> "FaceTrack":
|
|
399
|
+
def from_dict(cls, data: dict[str, Any]) -> "FaceTrack":
|
|
399
400
|
return cls(
|
|
400
401
|
track_id=int(data["track_id"]),
|
|
401
402
|
frame_indices=[int(i) for i in data.get("frame_indices", [])],
|
|
Binary file
|