videopython 0.34.1__tar.gz → 0.35.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.34.1 → videopython-0.35.1}/PKG-INFO +4 -4
- {videopython-0.34.1 → videopython-0.35.1}/README.md +1 -1
- {videopython-0.34.1 → videopython-0.35.1}/pyproject.toml +4 -4
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/qwen3.py +124 -19
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/__init__.py +2 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/effects.py +217 -1
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/operation.py +12 -1
- {videopython-0.34.1 → videopython-0.35.1}/.gitignore +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/LICENSE +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/_device.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/config.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/expressiveness.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/loudness.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/voice_sample.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/analyzer.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/models.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/sampling.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/stages.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/audio/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/audio/analysis.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/audio/audio.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/_dimensions.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/_ffmpeg.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/_video_io.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/description.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/fonts/__init__.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/image_text.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/transcription.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/video.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/streaming.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/transcription_overlay.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/transforms.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.34.1 → videopython-0.35.1}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.34.1
|
|
3
|
+
Version: 0.35.1
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -12,15 +12,15 @@ Keywords: ai,editing,generation,movie,opencv,python,shorts,video,videopython
|
|
|
12
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
13
|
Classifier: Operating System :: OS Independent
|
|
14
14
|
Classifier: Programming Language :: Python :: 3
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
-
Requires-Python: <3.14,>=3.10
|
|
18
|
+
Requires-Python: <3.14,>=3.11
|
|
20
19
|
Requires-Dist: numpy>=1.25.2
|
|
21
20
|
Requires-Dist: opencv-python-headless>=4.9.0.80
|
|
22
21
|
Requires-Dist: pillow>=12.1.1
|
|
23
22
|
Requires-Dist: pydantic>=2.8.0
|
|
23
|
+
Requires-Dist: resvg-py>=0.3.2
|
|
24
24
|
Requires-Dist: tqdm>=4.66.3
|
|
25
25
|
Provides-Extra: ai
|
|
26
26
|
Requires-Dist: accelerate>=0.29.2; extra == 'ai'
|
|
@@ -67,7 +67,7 @@ pip install videopython # core video/audio editing
|
|
|
67
67
|
pip install "videopython[ai]" # + local AI features (GPU recommended)
|
|
68
68
|
```
|
|
69
69
|
|
|
70
|
-
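Python `>=3.10, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.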
|
|
70
|
+
Python `>=3.11, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.
|
|
71
71
|
|
|
72
72
|
## Quick Start
|
|
73
73
|
|
|
@@ -18,7 +18,7 @@ pip install videopython # core video/audio editing
|
|
|
18
18
|
pip install "videopython[ai]" # + local AI features (GPU recommended)
|
|
19
19
|
```
|
|
20
20
|
|
|
21
|
-
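Python `>=3.10, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.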
|
|
21
|
+
Python `>=3.11, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.
|
|
22
22
|
|
|
23
23
|
## Quick Start
|
|
24
24
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.34.1"
|
|
3
|
+
version = "0.35.1"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -9,7 +9,7 @@ authors = [
|
|
|
9
9
|
]
|
|
10
10
|
license = { text = "Apache-2.0" }
|
|
11
11
|
readme = "README.md"
|
|
12
|
-
requires-python = ">=3.10, <3.14"
|
|
12
|
+
requires-python = ">=3.11, <3.14"
|
|
13
13
|
keywords = [
|
|
14
14
|
"python",
|
|
15
15
|
"videopython",
|
|
@@ -24,7 +24,6 @@ keywords = [
|
|
|
24
24
|
classifiers = [
|
|
25
25
|
"License :: OSI Approved :: Apache Software License",
|
|
26
26
|
"Programming Language :: Python :: 3",
|
|
27
|
-
"Programming Language :: Python :: 3.10",
|
|
28
27
|
"Programming Language :: Python :: 3.11",
|
|
29
28
|
"Programming Language :: Python :: 3.12",
|
|
30
29
|
"Programming Language :: Python :: 3.13",
|
|
@@ -35,6 +34,7 @@ dependencies = [
|
|
|
35
34
|
"numpy>=1.25.2",
|
|
36
35
|
"opencv-python-headless>=4.9.0.80",
|
|
37
36
|
"pillow>=12.1.1",
|
|
37
|
+
"resvg-py>=0.3.2",
|
|
38
38
|
"tqdm>=4.66.3",
|
|
39
39
|
"pydantic>=2.8.0",
|
|
40
40
|
]
|
|
@@ -203,7 +203,7 @@ markers = [
|
|
|
203
203
|
|
|
204
204
|
[tool.ruff]
|
|
205
205
|
line-length = 120
|
|
206
|
-
target-version = "py310"
|
|
206
|
+
target-version = "py311"
|
|
207
207
|
|
|
208
208
|
[tool.ruff.lint]
|
|
209
209
|
select = [
|
|
@@ -92,6 +92,56 @@ _SPEECH_CHARS_DEFAULT = 12.0
|
|
|
92
92
|
_LOW_LOGPROB_HINT_THRESHOLD = -1.0
|
|
93
93
|
|
|
94
94
|
|
|
95
|
+
# Conservative chars-per-token used to size chunks without invoking the
|
|
96
|
+
# tokenizer. Morphologically rich languages land around 1.5-2.0
|
|
97
|
+
# chars/token; ASCII is ~3-4. We use the low end so chunks stay safe for
|
|
98
|
+
# any source language.
|
|
99
|
+
_CHARS_PER_TOKEN = 2.0
|
|
100
|
+
# Token reserve for the system prompt + user-prompt envelope ("Input
|
|
101
|
+
# segments:" / "Translations (...)" wrappers). Empirical upper bound.
|
|
102
|
+
_PROMPT_OVERHEAD_TOKENS = 300
|
|
103
|
+
# Per-segment JSON wrapper cost (keys, braces, commas, index). Added on
|
|
104
|
+
# top of len(seg.text) when sizing chunks.
|
|
105
|
+
_SEGMENT_ENVELOPE_CHARS = 40
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _chunk_segment_indices(
    segments: list[TranscriptionSegment],
    n_ctx: int,
    max_tokens: int,
) -> list[list[int]]:
    """Partition segment positions into batches sized for a single Qwen call.

    llama.cpp requires ``prompt_tokens + max_tokens <= n_ctx`` per call. The
    prompt size is estimated from character counts via ``_CHARS_PER_TOKEN``
    after reserving ``_PROMPT_OVERHEAD_TOKENS`` for the prompt envelope; each
    segment is charged ``len(seg.text) + _SEGMENT_ENVELOPE_CHARS`` characters.

    Args:
        segments: Segments to batch; only ``.text`` is read.
        n_ctx: Context window of the model, in tokens.
        max_tokens: Generation cap reserved out of ``n_ctx``.

    Returns:
        Batches of 0-based positions into ``segments``, in input order. A
        segment that alone exceeds the budget still gets its own batch, so the
        overflow surfaces from the model call instead of being dropped here.
    """
    token_room = n_ctx - max_tokens - _PROMPT_OVERHEAD_TOKENS
    if token_room <= 0:
        # No prompt budget at all: fall back to one segment per call.
        return [[position] for position, _ in enumerate(segments)]

    char_limit = int(token_room * _CHARS_PER_TOKEN)

    batches: list[list[int]] = []
    batch: list[int] = []
    batch_chars = 0
    for position, segment in enumerate(segments):
        cost = len(segment.text) + _SEGMENT_ENVELOPE_CHARS
        # Close the running batch only if it already holds something; an
        # oversized lone segment is allowed through on its own.
        if batch and batch_chars + cost > char_limit:
            batches.append(batch)
            batch, batch_chars = [], 0
        batch.append(position)
        batch_chars += cost

    if batch:
        batches.append(batch)
    return batches
|
|
143
|
+
|
|
144
|
+
|
|
95
145
|
def _target_chars_for(duration_seconds: float, target_lang: str) -> int:
|
|
96
146
|
"""Character-count budget for a segment of ``duration_seconds`` in ``target_lang``."""
|
|
97
147
|
rate = _SPEECH_CHARS_PER_SEC.get(target_lang, _SPEECH_CHARS_DEFAULT)
|
|
@@ -170,9 +220,12 @@ class Qwen3Translator:
|
|
|
170
220
|
``DEFAULT_REPO_ID``; override for eval harnesses.
|
|
171
221
|
filename: GGUF filename within ``repo_id``. Defaults to
|
|
172
222
|
``DEFAULT_FILENAME``.
|
|
173
|
-
n_ctx: llama.cpp context window.
|
|
174
|
-
|
|
175
|
-
|
|
223
|
+
n_ctx: llama.cpp context window. ``translate_segments`` splits the
|
|
224
|
+
input across multiple calls when it doesn't fit, so 8192 stays
|
|
225
|
+
safe even for very long sources; raise to reduce the number of
|
|
226
|
+
calls (and gain cross-segment context per call) at the cost of
|
|
227
|
+
VRAM. Hard cap is the model's training context (262K for
|
|
228
|
+
Qwen3-4B-Instruct-2507).
|
|
176
229
|
max_tokens: Generation cap per call. 4× the input character count
|
|
177
230
|
is a safe upper bound for translation output.
|
|
178
231
|
temperature: Decoding temperature. 0.1 keeps output structurally
|
|
@@ -257,6 +310,52 @@ class Qwen3Translator:
|
|
|
257
310
|
raw = response["choices"][0]["text"]
|
|
258
311
|
return _parse_jsonl_response(raw)
|
|
259
312
|
|
|
313
|
+
def _qwen_translate_chunked(
|
|
314
|
+
self,
|
|
315
|
+
segments: list[TranscriptionSegment],
|
|
316
|
+
target_lang: str,
|
|
317
|
+
source_lang: str,
|
|
318
|
+
progress_callback: Callable[[float], None] | None = None,
|
|
319
|
+
progress_start: float = 0.0,
|
|
320
|
+
progress_end: float = 1.0,
|
|
321
|
+
) -> dict[int, str]:
|
|
322
|
+
"""Translate ``segments`` across one or more Qwen calls.
|
|
323
|
+
|
|
324
|
+
Returns a dict keyed by position in ``segments``. Splitting into
|
|
325
|
+
chunks keeps each call under llama.cpp's ``n_ctx`` cap — without
|
|
326
|
+
chunking, a long source with hundreds of dense segments easily
|
|
327
|
+
blows past the default 8192 token window.
|
|
328
|
+
|
|
329
|
+
Progress is reported as a linear ramp from ``progress_start`` to
|
|
330
|
+
``progress_end``, one tick per chunk completed.
|
|
331
|
+
"""
|
|
332
|
+
results: dict[int, str] = {}
|
|
333
|
+
if not segments:
|
|
334
|
+
if progress_callback is not None:
|
|
335
|
+
progress_callback(progress_end)
|
|
336
|
+
return results
|
|
337
|
+
|
|
338
|
+
chunks = _chunk_segment_indices(segments, self.n_ctx, self.max_tokens)
|
|
339
|
+
if len(chunks) > 1:
|
|
340
|
+
logger.info(
|
|
341
|
+
"Qwen3Translator: splitting %d segments into %d chunks (n_ctx=%d)",
|
|
342
|
+
len(segments),
|
|
343
|
+
len(chunks),
|
|
344
|
+
self.n_ctx,
|
|
345
|
+
)
|
|
346
|
+
for chunk_num, chunk_positions in enumerate(chunks):
|
|
347
|
+
chunk_segments = [segments[p] for p in chunk_positions]
|
|
348
|
+
chunk_result = self._qwen_translate(chunk_segments, target_lang, source_lang)
|
|
349
|
+
# chunk_result keys are 0..len(chunk_positions)-1; map back to
|
|
350
|
+
# positions in the caller-provided ``segments`` list.
|
|
351
|
+
for local_idx, text in chunk_result.items():
|
|
352
|
+
if 0 <= local_idx < len(chunk_positions):
|
|
353
|
+
results[chunk_positions[local_idx]] = text
|
|
354
|
+
if progress_callback is not None:
|
|
355
|
+
fraction = (chunk_num + 1) / len(chunks)
|
|
356
|
+
progress_callback(progress_start + (progress_end - progress_start) * fraction)
|
|
357
|
+
return results
|
|
358
|
+
|
|
260
359
|
def translate_segments(
|
|
261
360
|
self,
|
|
262
361
|
segments: list[TranscriptionSegment],
|
|
@@ -266,10 +365,10 @@ class Qwen3Translator:
|
|
|
266
365
|
) -> list[TranslatedSegment]:
|
|
267
366
|
"""Translate segments via Qwen with parse-retry + optional Marian fallback.
|
|
268
367
|
|
|
269
|
-
The progress_callback
|
|
270
|
-
Qwen
|
|
271
|
-
end.
|
|
272
|
-
|
|
368
|
+
The progress_callback ramps from 0 to 0.5 across the first-pass
|
|
369
|
+
Qwen chunks, hits 0.9 after the optional retry/fallback, and 1.0
|
|
370
|
+
at the end. Input larger than the model's context window is split
|
|
371
|
+
across multiple Qwen calls (see ``_qwen_translate_chunked``).
|
|
273
372
|
"""
|
|
274
373
|
effective_source = source_lang or "en"
|
|
275
374
|
self._failures_last_call = []
|
|
@@ -277,13 +376,15 @@ class Qwen3Translator:
|
|
|
277
376
|
translatable_indices = [i for i, seg in enumerate(segments) if _is_translatable_text(seg.text)]
|
|
278
377
|
translatable_segments = [segments[i] for i in translatable_indices]
|
|
279
378
|
|
|
280
|
-
# First attempt.
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
379
|
+
# First attempt — chunked to fit n_ctx.
|
|
380
|
+
qwen_results = self._qwen_translate_chunked(
|
|
381
|
+
translatable_segments,
|
|
382
|
+
target_lang,
|
|
383
|
+
effective_source,
|
|
384
|
+
progress_callback=progress_callback,
|
|
385
|
+
progress_start=0.0,
|
|
386
|
+
progress_end=0.5,
|
|
387
|
+
)
|
|
287
388
|
|
|
288
389
|
# Identify segments Qwen failed (unparseable or missing index).
|
|
289
390
|
# Indices in qwen_results / translatable_segments are 0-based positions
|
|
@@ -299,11 +400,15 @@ class Qwen3Translator:
|
|
|
299
400
|
len(retry_segments),
|
|
300
401
|
len(translatable_segments),
|
|
301
402
|
)
|
|
302
|
-
retry_results = self.
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
403
|
+
retry_results = self._qwen_translate_chunked(
|
|
404
|
+
retry_segments,
|
|
405
|
+
target_lang,
|
|
406
|
+
effective_source,
|
|
407
|
+
)
|
|
408
|
+
# retry_results keys are positions in retry_segments; map back to
|
|
409
|
+
# translatable_segments.
|
|
410
|
+
for retry_local, translation in retry_results.items():
|
|
411
|
+
qwen_results[missing_local_indices[retry_local]] = translation
|
|
307
412
|
if progress_callback is not None:
|
|
308
413
|
progress_callback(0.9)
|
|
309
414
|
|
|
@@ -8,6 +8,7 @@ from .effects import (
|
|
|
8
8
|
Flash,
|
|
9
9
|
FullImageOverlay,
|
|
10
10
|
Glitch,
|
|
11
|
+
ImageOverlay,
|
|
11
12
|
Kaleidoscope,
|
|
12
13
|
KenBurns,
|
|
13
14
|
MirrorFlip,
|
|
@@ -56,6 +57,7 @@ __all__ = [
|
|
|
56
57
|
"SilenceRemoval",
|
|
57
58
|
# Effects
|
|
58
59
|
"FullImageOverlay",
|
|
60
|
+
"ImageOverlay",
|
|
59
61
|
"Blur",
|
|
60
62
|
"Zoom",
|
|
61
63
|
"ColorGrading",
|
|
@@ -14,6 +14,7 @@ audio after ``_apply`` returns.
|
|
|
14
14
|
from __future__ import annotations
|
|
15
15
|
|
|
16
16
|
import logging
|
|
17
|
+
from io import BytesIO
|
|
17
18
|
from pathlib import Path
|
|
18
19
|
from typing import TYPE_CHECKING, Any, ClassVar, Literal
|
|
19
20
|
|
|
@@ -29,13 +30,14 @@ from videopython.editing.operation import Effect
|
|
|
29
30
|
|
|
30
31
|
if TYPE_CHECKING:
|
|
31
32
|
from videopython.audio import Audio
|
|
32
|
-
from videopython.base.video import Video
|
|
33
|
+
from videopython.base.video import Video, VideoMetadata
|
|
33
34
|
|
|
34
35
|
logger = logging.getLogger(__name__)
|
|
35
36
|
|
|
36
37
|
__all__ = [
|
|
37
38
|
"Effect",
|
|
38
39
|
"FullImageOverlay",
|
|
40
|
+
"ImageOverlay",
|
|
39
41
|
"Blur",
|
|
40
42
|
"Zoom",
|
|
41
43
|
"ColorGrading",
|
|
@@ -771,6 +773,220 @@ class TextOverlay(Effect):
|
|
|
771
773
|
return video
|
|
772
774
|
|
|
773
775
|
|
|
776
|
+
class ImageOverlay(Effect):
    """Composites a scaled image at an anchored position on every frame in the window.

    A resolution-independent watermark / logo / brand mark. Unlike
    :class:`FullImageOverlay` (full-frame only, raises on size mismatch), the
    image is scaled to a fraction of the frame *width* and placed at an
    anchored normalized position, so one config works across 1080p / 4k /
    vertical / square. Loaded just-in-time from ``source`` so the op stays
    JSON-serialisable. Off-frame or oversized placement clips to a partial
    paste or a no-op -- the same contract as :class:`TextOverlay`, never an
    error; only an unreadable ``source`` is rejected (in ``predict_metadata``).

    ``source`` may be a raster image (PNG/JPEG/WebP) or an SVG (detected by the
    ``.svg`` extension). An SVG is rasterised by ``resvg`` *at the exact target
    pixel width* -- crisp at any frame size, not a blurry upscale of a
    fixed-size bitmap -- with a transparent background and no remote-resource
    fetching (the local path only; no SSRF). SVGs containing text depend on the
    fonts available at render time.
    """

    op: Literal["image_overlay"] = "image_overlay"
    streamable: ClassVar[bool] = True

    source: Path = Field(
        description=(
            "Path to an image file: a raster RGB/RGBA image (PNG/JPEG/WebP) or "
            "an SVG (`.svg`, rasterised at the target resolution). Loaded at "
            "apply time; kept JSON-serialisable as a path."
        ),
    )
    scale: float = Field(
        0.15,
        gt=0,
        le=1,
        description=(
            "Overlay width as a fraction of frame width (0-1). Height follows "
            "the image's aspect ratio. Resolution-independent."
        ),
    )
    opacity: float = Field(
        1.0,
        ge=0,
        le=1,
        description="Multiplies the image's own alpha. 0 = fully transparent, 1 = use the image alpha unchanged.",
    )
    position: tuple[float, float] = Field(
        (0.95, 0.95),
        description=(
            "Where to place the overlay as normalized (x, y) coordinates. "
            "(0, 0) = top-left corner, (1, 1) = bottom-right corner."
        ),
    )
    anchor: Literal["center", "top_left", "top_center", "bottom_center", "bottom_left", "bottom_right"] = Field(
        "bottom_right",
        description="Which point of the overlay box sits at the position coordinate.",
    )

    # Decoded raster source (RGBA), loaded once on first use.
    _overlay_rgba: np.ndarray | None = PrivateAttr(default=None)
    # Rasterised SVG results keyed by target pixel width.
    _svg_cache: dict[int, np.ndarray] = PrivateAttr(default_factory=dict)
    # Streaming state prepared by ``streaming_init`` and read by ``process_frame``.
    _stream_noop: bool = PrivateAttr(default=False)
    _stream_alpha: np.ndarray | None = PrivateAttr(default=None)
    _stream_rgb: np.ndarray | None = PrivateAttr(default=None)
    _stream_dst: tuple[int, int, int, int] = PrivateAttr(default=(0, 0, 0, 0))

    @model_validator(mode="after")
    def _validate_position(self) -> ImageOverlay:
        """Reject a ``position`` whose x or y lies outside ``[0, 1]``."""
        if not (0.0 <= self.position[0] <= 1.0 and 0.0 <= self.position[1] <= 1.0):
            raise ValueError("position values must be in range [0, 1]")
        return self

    def _is_svg(self) -> bool:
        """Return True when ``source`` has a ``.svg`` extension (case-insensitive)."""
        return self.source.suffix.lower() == ".svg"

    def predict_metadata(self, meta: VideoMetadata, **_context: Any) -> VideoMetadata:
        """Reject only a missing/unreadable ``source`` (see :meth:`Operation.predict_metadata`).

        An unreadable source is the one failure ``run()`` cannot survive -- it
        would raise mid-stream after expensive frame decode -- so it is caught
        at ``validate()`` time, symmetric with ``TranscriptionOverlay``.
        Geometry (oversized / off-frame) is deliberately *not* checked here: it
        clips to a valid no-op like :class:`TextOverlay`, so rejecting it would
        break that contract and the parity with the op this is modeled on. Both
        checks are cheap (a header ``verify()`` / a 1px SVG parse, no full
        decode), so ``validate()`` stays frame-free.

        Returns:
            ``meta`` unchanged.

        Raises:
            ValueError: If ``source`` cannot be opened/verified as an image.
        """
        try:
            if self._is_svg():
                import resvg_py

                resvg_py.svg_to_bytes(svg_path=str(self.source), width=1)
            else:
                with Image.open(self.source) as im:
                    im.verify()
        # Pillow's ``verify()`` reports some corrupt files (e.g. broken PNG
        # chunks) as ``SyntaxError`` rather than ``OSError``; treat that as an
        # unreadable source too so it is rejected here, not mid-render.
        except (OSError, ValueError, SyntaxError) as exc:
            raise ValueError(f"image_overlay source {str(self.source)!r} is not a readable image: {exc}") from exc
        return meta

    def _rasterize_svg(self, target_w: int) -> np.ndarray:
        """Rasterise the SVG ``source`` at ``target_w`` pixels wide, cached per width."""
        cached = self._svg_cache.get(target_w)
        if cached is not None:
            return cached
        # Lazy import: only when an SVG source is actually used. resvg renders
        # at the exact target width (height proportional to the viewBox) with a
        # transparent background and never fetches remote resources.
        import resvg_py

        png = resvg_py.svg_to_bytes(svg_path=str(self.source), width=target_w)
        with Image.open(BytesIO(bytes(png))) as rendered:
            arr = np.array(rendered.convert("RGBA"), dtype=np.uint8)
        self._svg_cache[target_w] = arr
        return arr

    def _load_overlay(self) -> np.ndarray:
        """Decode the raster ``source`` to an RGBA uint8 array, cached after the first call."""
        if self._overlay_rgba is not None:
            return self._overlay_rgba
        # ``convert`` produces an independent image, so the file handle opened
        # by ``Image.open`` can be released as soon as the array is built.
        with Image.open(self.source) as img:
            self._overlay_rgba = np.array(img.convert("RGBA"), dtype=np.uint8)
        return self._overlay_rgba

    def _compute_position(self, frame_width: int, frame_height: int, img_w: int, img_h: int) -> tuple[int, int]:
        """Return the top-left pixel ``(x, y)`` of the overlay box for the configured anchor.

        The result may be negative or extend past the frame; clipping is done
        by the caller.
        """
        # Copied verbatim from TextOverlay: ImageOverlay's anchor Literal is
        # deliberately the same set, so the geometry is shared by construction.
        px = int(self.position[0] * frame_width)
        py = int(self.position[1] * frame_height)

        if self.anchor == "center":
            return px - img_w // 2, py - img_h // 2
        if self.anchor == "top_left":
            return px, py
        if self.anchor == "top_center":
            return px - img_w // 2, py
        if self.anchor == "bottom_center":
            return px - img_w // 2, py - img_h
        if self.anchor == "bottom_left":
            return px, py - img_h
        # bottom_right
        return px - img_w, py - img_h

    def _resized_overlay(self, frame_w: int) -> np.ndarray:
        """Return the overlay as RGBA, ``scale * frame_w`` pixels wide (at least 1)."""
        target_w = max(1, round(self.scale * frame_w))
        if self._is_svg():
            # Rasterise the vector at the target size (crisp) rather than
            # upscaling a fixed bitmap. resvg derives height from the viewBox.
            return self._rasterize_svg(target_w)
        overlay = self._load_overlay()
        src_h, src_w = overlay.shape[:2]
        target_h = max(1, round(target_w * src_h / src_w))
        if (target_w, target_h) == (src_w, src_h):
            return overlay
        resized = Image.fromarray(overlay).resize((target_w, target_h), Image.LANCZOS)
        return np.array(resized, dtype=np.uint8)

    def _blend_params(
        self, frame_w: int, frame_h: int
    ) -> tuple[np.ndarray, np.ndarray, tuple[int, int, int, int]] | None:
        """Placement + blend inputs shared by the eager and streaming paths.

        Single source of truth so the two paths cannot drift -- the
        eager/stream parity-hole class of bug fixed in 0.34.1. Returns ``None``
        when the overlay lands fully off-frame (the effect is a no-op).

        Returns:
            ``(alpha, rgb, (dst_y, dst_x, paste_h, paste_w))`` where ``alpha``
            is the clipped overlay alpha scaled to 0-1 and multiplied by
            ``opacity`` and ``rgb`` is the clipped overlay colour, both
            float32; or ``None`` if nothing is visible.
        """
        overlay = self._resized_overlay(frame_w)
        oh, ow = overlay.shape[:2]
        x, y = self._compute_position(frame_w, frame_h, ow, oh)

        # Clip the overlay box against the frame: a negative origin skips the
        # hidden part of the source, an overhang shrinks the pasted size.
        src_x = max(0, -x)
        src_y = max(0, -y)
        dst_x = max(0, x)
        dst_y = max(0, y)
        paste_w = min(ow - src_x, frame_w - dst_x)
        paste_h = min(oh - src_y, frame_h - dst_y)

        if paste_w <= 0 or paste_h <= 0:
            return None

        region = overlay[src_y : src_y + paste_h, src_x : src_x + paste_w]
        alpha = (region[:, :, 3:4].astype(np.float32) / 255.0) * self.opacity
        rgb = region[:, :, :3].astype(np.float32)
        return alpha, rgb, (dst_y, dst_x, paste_h, paste_w)

    def _composite_region(
        self,
        frame: np.ndarray,
        alpha: np.ndarray,
        rgb: np.ndarray,
        dst: tuple[int, int, int, int],
    ) -> None:
        """Alpha-blend ``rgb`` into ``frame`` in place at ``dst = (y, x, h, w)``.

        Used by both ``process_frame`` and ``_apply`` so the blend arithmetic
        lives in exactly one place.
        """
        dy, dx, ph, pw = dst
        region = frame[dy : dy + ph, dx : dx + pw]
        blended = (rgb * alpha + region.astype(np.float32) * (1.0 - alpha)).astype(np.uint8)
        frame[dy : dy + ph, dx : dx + pw] = blended

    def streaming_init(self, total_frames: int, fps: float, width: int, height: int) -> None:
        """Precompute the blend inputs for a ``width`` x ``height`` stream."""
        params = self._blend_params(width, height)
        if params is None:
            self._stream_noop = True
            return
        self._stream_noop = False
        self._stream_alpha, self._stream_rgb, self._stream_dst = params

    def process_frame(self, frame: np.ndarray, frame_index: int) -> np.ndarray:
        """Blend the overlay into ``frame`` in place and return it.

        Relies on the state prepared by ``streaming_init``.
        """
        if self._stream_noop:
            return frame
        assert self._stream_alpha is not None and self._stream_rgb is not None
        self._composite_region(frame, self._stream_alpha, self._stream_rgb, self._stream_dst)
        return frame

    def _apply(self, video: Video) -> Video:
        """Blend the overlay into every frame of ``video`` in place and return it."""
        frame_h, frame_w = video.frame_shape[:2]
        params = self._blend_params(frame_w, frame_h)
        if params is None:
            return video
        alpha, rgb, dst = params

        logger.info("Applying image overlay...")
        for frame in tqdm(video.frames, desc="Image overlay"):
            self._composite_region(frame, alpha, rgb, dst)
        return video
|
|
988
|
+
|
|
989
|
+
|
|
774
990
|
class Shake(Effect):
|
|
775
991
|
"""Per-frame camera shake: jitters every frame by a random or rhythmic offset.
|
|
776
992
|
|
|
@@ -175,7 +175,18 @@ class Operation(BaseModel):
|
|
|
175
175
|
raise NotImplementedError(f"{type(self).__name__}.apply not implemented")
|
|
176
176
|
|
|
177
177
|
def predict_metadata(self, meta: VideoMetadata) -> VideoMetadata:
    """Return the metadata this operation would produce for input ``meta``.

    The base implementation is the identity.

    This hook runs in the dry-run of ``VideoEdit.validate()``, before any
    frames are decoded, and doubles as the fail-fast gate. Its contract is:
    **reject exactly the plans that would otherwise crash or do
    unrecoverable / expensive work in** :meth:`apply` **/** ``run()``.
    Whatever ``run()`` can absorb by graceful degradation must NOT be
    rejected. For example, ``TranscriptionOverlay`` rejects un-fittable
    subtitles (they used to crash mid-render), whereas
    ``TextOverlay``/``ImageOverlay`` accept off-frame geometry because it
    clips to a valid no-op. Checks must stay metadata-cheap -- no frame
    decode.
    """
    return meta
|
|
180
191
|
|
|
181
192
|
def to_ffmpeg_filter(self, ctx: FilterCtx) -> str | None:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|