videopython 0.28.3__tar.gz → 0.29.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. {videopython-0.28.3 → videopython-0.29.1}/PKG-INFO +7 -4
  2. {videopython-0.28.3 → videopython-0.29.1}/README.md +4 -2
  3. {videopython-0.28.3 → videopython-0.29.1}/pyproject.toml +13 -4
  4. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/__init__.py +3 -5
  5. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/dubbing/cache.py +17 -1
  6. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/dubbing/dubber.py +8 -0
  7. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/dubbing/pipeline.py +5 -0
  8. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/transforms.py +2 -478
  9. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/understanding/__init__.py +3 -3
  10. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/understanding/audio.py +97 -11
  11. videopython-0.29.1/src/videopython/ai/understanding/faces.py +592 -0
  12. videopython-0.29.1/src/videopython/ai/understanding/image.py +397 -0
  13. videopython-0.29.1/src/videopython/ai/understanding/temporal.py +218 -0
  14. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/video_analysis.py +217 -37
  15. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/__init__.py +4 -2
  16. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/description.py +82 -52
  17. videopython-0.28.3/src/videopython/ai/understanding/image.py +0 -215
  18. videopython-0.28.3/src/videopython/ai/understanding/temporal.py +0 -464
  19. {videopython-0.28.3 → videopython-0.29.1}/.gitignore +0 -0
  20. {videopython-0.28.3 → videopython-0.29.1}/LICENSE +0 -0
  21. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/__init__.py +0 -0
  22. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/_device.py +0 -0
  23. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/dubbing/__init__.py +0 -0
  24. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/dubbing/models.py +0 -0
  25. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/dubbing/quality.py +0 -0
  26. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/dubbing/remux.py +0 -0
  27. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/dubbing/timing.py +0 -0
  28. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/generation/__init__.py +0 -0
  29. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/generation/audio.py +0 -0
  30. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/generation/image.py +0 -0
  31. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/generation/qwen3.py +0 -0
  32. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/generation/translation.py +0 -0
  33. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/generation/video.py +0 -0
  34. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/registry.py +0 -0
  35. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/swapping/__init__.py +0 -0
  36. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/swapping/inpainter.py +0 -0
  37. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/swapping/models.py +0 -0
  38. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/swapping/segmenter.py +0 -0
  39. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/swapping/swapper.py +0 -0
  40. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/ai/understanding/separation.py +0 -0
  41. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/audio/__init__.py +0 -0
  42. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/audio/analysis.py +0 -0
  43. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/audio/audio.py +0 -0
  44. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/combine.py +0 -0
  45. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/effects.py +0 -0
  46. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/exceptions.py +0 -0
  47. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/progress.py +0 -0
  48. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/registry.py +0 -0
  49. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/scene.py +0 -0
  50. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/streaming.py +0 -0
  51. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/text/__init__.py +0 -0
  52. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/text/overlay.py +0 -0
  53. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/text/transcription.py +0 -0
  54. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/transforms.py +0 -0
  55. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/transitions.py +0 -0
  56. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/utils.py +0 -0
  57. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/base/video.py +0 -0
  58. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/editing/__init__.py +0 -0
  59. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/editing/multicam.py +0 -0
  60. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/editing/premiere_xml.py +0 -0
  61. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/editing/video_edit.py +0 -0
  62. {videopython-0.28.3 → videopython-0.29.1}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.28.3
3
+ Version: 0.29.1
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -27,14 +27,15 @@ Requires-Dist: accelerate>=0.29.2; extra == 'ai'
27
27
  Requires-Dist: chatterbox-tts>=0.1.7; extra == 'ai'
28
28
  Requires-Dist: demucs>=4.0.0; extra == 'ai'
29
29
  Requires-Dist: diffusers>=0.30.0; extra == 'ai'
30
- Requires-Dist: easyocr>=1.7.0; extra == 'ai'
31
30
  Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
31
+ Requires-Dist: imagehash>=4.3; extra == 'ai'
32
32
  Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
33
33
  Requires-Dist: numba>=0.61.0; extra == 'ai'
34
34
  Requires-Dist: ollama>=0.4.5; extra == 'ai'
35
35
  Requires-Dist: openai-whisper>=20240930; extra == 'ai'
36
36
  Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
37
37
  Requires-Dist: pyloudnorm>=0.1.1; extra == 'ai'
38
+ Requires-Dist: qwen-vl-utils>=0.0.10; extra == 'ai'
38
39
  Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
39
40
  Requires-Dist: scipy>=1.10.0; extra == 'ai'
40
41
  Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
@@ -56,6 +57,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,
56
57
 
57
58
  Full documentation: [videopython.com](https://videopython.com)
58
59
 
60
+ > **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
61
+
59
62
  ## Installation
60
63
 
61
64
  ### 1. Install FFmpeg
@@ -193,10 +196,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
193
196
  | Area | Highlights |
194
197
  |---|---|
195
198
  | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
196
- | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `ActionRecognizer` |
199
+ | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
197
200
  | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
198
201
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
199
- | **Transforms** | `FaceTracker`, `FaceTrackingCrop`, `SplitScreenComposite` |
202
+ | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
200
203
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
201
204
  | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
202
205
 
@@ -8,6 +8,8 @@ Minimal, LLM-friendly Python library for programmatic video editing, processing,
8
8
 
9
9
  Full documentation: [videopython.com](https://videopython.com)
10
10
 
11
+ > **Disclaimer:** This project started as a hand-written hobby project, but most of the code is now produced by LLM agents. Humans still drive direction, approve changes, and own design decisions.
12
+
11
13
  ## Installation
12
14
 
13
15
  ### 1. Install FFmpeg
@@ -145,10 +147,10 @@ API docs: [Core](https://videopython.com/api/index/) | [Video](https://videopyth
145
147
  | Area | Highlights |
146
148
  |---|---|
147
149
  | **Generation** | `TextToVideo`, `ImageToVideo`, `TextToImage`, `TextToSpeech`, `TextToMusic` |
148
- | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (visual scene description), `ActionRecognizer` |
150
+ | **Understanding** | `AudioToText` (transcription), `AudioClassifier`, `SceneVLM` (structured visual scene description), `FaceTracker` (per-shot face tracks) |
149
151
  | **Scene detection** | `SemanticSceneDetector` (neural scene boundaries) |
150
152
  | **Video analysis** | `VideoAnalyzer` - full-pipeline analysis combining multiple AI capabilities |
151
- | **Transforms** | `FaceTracker`, `FaceTrackingCrop`, `SplitScreenComposite` |
153
+ | **Transforms** | `FaceTrackingCrop`, `SplitScreenComposite` |
152
154
  | **Dubbing** | `VideoDubber` - voice cloning and revoicing with timing sync |
153
155
  | **Object swapping** | `ObjectSwapper` - detect, segment, and inpaint objects in video |
154
156
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.28.3"
3
+ version = "0.29.1"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -70,7 +70,6 @@ ai = [
70
70
  "scikit-learn>=1.3.0",
71
71
  # Detection backends
72
72
  "ultralytics>=8.0.0",
73
- "easyocr>=1.7.0",
74
73
  # Audio classification (AST via transformers - no separate dep needed)
75
74
  # Scene detection
76
75
  "transnetv2-pytorch>=1.0.5",
@@ -84,6 +83,11 @@ ai = [
84
83
  "llama-cpp-python>=0.3.0",
85
84
  # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
86
85
  "pyloudnorm>=0.1.1",
86
+ # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
87
+ # for AutoModelForImageTextToText with image/video chat templates.
88
+ "qwen-vl-utils>=0.0.10",
89
+ # Perceptual hashing for SceneVLM frame dedup (M5)
90
+ "imagehash>=4.3",
87
91
  ]
88
92
 
89
93
  # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -105,7 +109,6 @@ ai = [
105
109
  "scikit-learn>=1.3.0",
106
110
  # Detection backends
107
111
  "ultralytics>=8.0.0",
108
- "easyocr>=1.7.0",
109
112
  # Audio classification (AST via transformers - no separate dep needed)
110
113
  # Scene detection
111
114
  "transnetv2-pytorch>=1.0.5",
@@ -119,6 +122,11 @@ ai = [
119
122
  "llama-cpp-python>=0.3.0",
120
123
  # Loudness measurement (BS.1770) for dub-vs-source loudness matching (M3)
121
124
  "pyloudnorm>=0.1.1",
125
+ # Vision-language preprocessing for Qwen3.5 (M5) - documented prerequisite
126
+ # for AutoModelForImageTextToText with image/video chat templates.
127
+ "qwen-vl-utils>=0.0.10",
128
+ # Perceptual hashing for SceneVLM frame dedup (M5)
129
+ "imagehash>=4.3",
122
130
  ]
123
131
 
124
132
  [project.urls]
@@ -135,7 +143,6 @@ module = [
135
143
  "diffusers", "diffusers.*",
136
144
  "ollama", "ollama.*",
137
145
  "ultralytics", "ultralytics.*",
138
- "easyocr", "easyocr.*",
139
146
  "transformers", "transformers.*",
140
147
  "transnetv2_pytorch", "transnetv2_pytorch.*",
141
148
  "chatterbox", "chatterbox.*",
@@ -146,6 +153,8 @@ module = [
146
153
  "cv2", "cv2.*",
147
154
  "llama_cpp", "llama_cpp.*",
148
155
  "pyloudnorm", "pyloudnorm.*",
156
+ "qwen_vl_utils", "qwen_vl_utils.*",
157
+ "imagehash", "imagehash.*",
149
158
  ]
150
159
  ignore_missing_imports = true
151
160
 
@@ -2,11 +2,11 @@ from videopython.ai import registry as _ai_registry # noqa: F401
2
2
 
3
3
  from .generation import ImageToVideo, TextToImage, TextToMusic, TextToSpeech, TextToVideo
4
4
  from .swapping import ObjectSwapper
5
- from .transforms import FaceTracker, FaceTrackingCrop, SplitScreenComposite
5
+ from .transforms import FaceTrackingCrop, SplitScreenComposite
6
6
  from .understanding import (
7
- ActionRecognizer,
8
7
  AudioClassifier,
9
8
  AudioToText,
9
+ FaceTracker,
10
10
  SceneVLM,
11
11
  SemanticSceneDetector,
12
12
  )
@@ -22,12 +22,10 @@ __all__ = [
22
22
  # Understanding
23
23
  "AudioToText",
24
24
  "AudioClassifier",
25
+ "FaceTracker",
25
26
  "SceneVLM",
26
- # Temporal
27
- "ActionRecognizer",
28
27
  "SemanticSceneDetector",
29
28
  # Transforms (AI-powered)
30
- "FaceTracker",
31
29
  "FaceTrackingCrop",
32
30
  "SplitScreenComposite",
33
31
  # Swapping
@@ -27,6 +27,8 @@ from dataclasses import dataclass
27
27
  from pathlib import Path
28
28
  from typing import TYPE_CHECKING, Any
29
29
 
30
+ from videopython.ai.understanding.audio import _normalize_vocabulary
31
+
30
32
  if TYPE_CHECKING:
31
33
  from videopython.base.audio import Audio
32
34
  from videopython.base.text.transcription import Transcription
@@ -37,7 +39,12 @@ logger = logging.getLogger(__name__)
37
39
  # Cache schema version. Bump on incompatible changes to any artifact's
38
40
  # on-disk format (e.g. TranscriptionSegment field changes that break
39
41
  # from_dict). Mismatched cache entries are treated as a miss.
40
- SCHEMA_VERSION = 1
42
+ #
43
+ # v2 (0.29.1): vocabulary added to transcription_kwargs_hash for M1
44
+ # vocabulary biasing. Pre-v2 transcription artifacts miss on first lookup
45
+ # and re-run; translation/TTS artifacts are unaffected (hashed
46
+ # independently and survive).
47
+ SCHEMA_VERSION = 2
41
48
 
42
49
  # Reserved for M4.3 per-speaker voice library. M3.2 does not write here;
43
50
  # documented so future code knows the path is taken.
@@ -126,13 +133,22 @@ class DubCache:
126
133
  condition_on_previous_text: bool,
127
134
  no_speech_threshold: float,
128
135
  logprob_threshold: float | None,
136
+ vocabulary: list[str] | None = None,
129
137
  ) -> str:
138
+ """Hash captures the kwargs that affect Whisper's output.
139
+
140
+ ``vocabulary`` is normalized (case-insensitive dedup, casing
141
+ preserved) before hashing so trivial reordering/casing
142
+ differences don't thrash the cache. Defaults to ``None`` so
143
+ pre-M1 callers keep hashing the same value as before.
144
+ """
130
145
  return _stable_hash(
131
146
  whisper_model,
132
147
  enable_diarization,
133
148
  condition_on_previous_text,
134
149
  no_speech_threshold,
135
150
  logprob_threshold,
151
+ *_normalize_vocabulary(vocabulary),
136
152
  )
137
153
 
138
154
  @staticmethod
@@ -37,6 +37,11 @@ class VideoDubber:
37
37
  gate; raise to drop more low-confidence windows.
38
38
  logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
39
39
  log-probability gate.
40
+ vocabulary: Forwarded to ``AudioToText``. Optional list of brand
41
+ names, product names, or proper nouns to bias Whisper's first-
42
+ window decoder via ``initial_prompt``. Recovers near-mishears
43
+ (e.g. Klarna → "carna") on brand-monitoring inputs without new
44
+ model deps.
40
45
  strict_quality: When True, the pipeline raises
41
46
  :class:`GarbageTranscriptError` before Demucs/translation/TTS run
42
47
  if the transcript-quality heuristic returns ``"reject"``. When
@@ -67,6 +72,7 @@ class VideoDubber:
67
72
  condition_on_previous_text: bool = False,
68
73
  no_speech_threshold: float = 0.6,
69
74
  logprob_threshold: float | None = -1.0,
75
+ vocabulary: list[str] | None = None,
70
76
  strict_quality: bool = False,
71
77
  translator: TranslatorChoice = "auto",
72
78
  cache_dir: str | Path | None = None,
@@ -77,6 +83,7 @@ class VideoDubber:
77
83
  self.condition_on_previous_text = condition_on_previous_text
78
84
  self.no_speech_threshold = no_speech_threshold
79
85
  self.logprob_threshold = logprob_threshold
86
+ self.vocabulary = vocabulary
80
87
  self.strict_quality = strict_quality
81
88
  self.translator = translator
82
89
  self.cache_dir = cache_dir
@@ -101,6 +108,7 @@ class VideoDubber:
101
108
  condition_on_previous_text=self.condition_on_previous_text,
102
109
  no_speech_threshold=self.no_speech_threshold,
103
110
  logprob_threshold=self.logprob_threshold,
111
+ vocabulary=self.vocabulary,
104
112
  strict_quality=self.strict_quality,
105
113
  translator=self.translator,
106
114
  cache_dir=self.cache_dir,
@@ -170,6 +170,7 @@ class LocalDubbingPipeline:
170
170
  condition_on_previous_text: bool = False,
171
171
  no_speech_threshold: float = 0.6,
172
172
  logprob_threshold: float | None = -1.0,
173
+ vocabulary: list[str] | None = None,
173
174
  strict_quality: bool = False,
174
175
  translator: TranslatorChoice = "auto",
175
176
  cache_dir: str | Path | None = None,
@@ -180,6 +181,7 @@ class LocalDubbingPipeline:
180
181
  self.condition_on_previous_text = condition_on_previous_text
181
182
  self.no_speech_threshold = no_speech_threshold
182
183
  self.logprob_threshold = logprob_threshold
184
+ self.vocabulary = vocabulary
183
185
  self.strict_quality = strict_quality
184
186
  self.translator = translator
185
187
  self.cache_dir = Path(cache_dir) if cache_dir is not None else None
@@ -256,6 +258,7 @@ class LocalDubbingPipeline:
256
258
  "condition_on_previous_text": self.condition_on_previous_text,
257
259
  "no_speech_threshold": self.no_speech_threshold,
258
260
  "logprob_threshold": self.logprob_threshold,
261
+ "vocabulary": self.vocabulary,
259
262
  },
260
263
  )
261
264
  return transcription
@@ -406,6 +409,7 @@ class LocalDubbingPipeline:
406
409
  condition_on_previous_text=self.condition_on_previous_text,
407
410
  no_speech_threshold=self.no_speech_threshold,
408
411
  logprob_threshold=self.logprob_threshold,
412
+ vocabulary=self.vocabulary,
409
413
  )
410
414
  return src_hash, kwargs_hash
411
415
 
@@ -420,6 +424,7 @@ class LocalDubbingPipeline:
420
424
  condition_on_previous_text=self.condition_on_previous_text,
421
425
  no_speech_threshold=self.no_speech_threshold,
422
426
  logprob_threshold=self.logprob_threshold,
427
+ vocabulary=self.vocabulary,
423
428
  )
424
429
 
425
430
  def _init_translator(self, source_lang: str, target_lang: str) -> None: