videopython 0.34.1__tar.gz → 0.35.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. {videopython-0.34.1 → videopython-0.35.1}/PKG-INFO +4 -4
  2. {videopython-0.34.1 → videopython-0.35.1}/README.md +1 -1
  3. {videopython-0.34.1 → videopython-0.35.1}/pyproject.toml +4 -4
  4. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/qwen3.py +124 -19
  5. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/__init__.py +2 -0
  6. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/effects.py +217 -1
  7. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/operation.py +12 -1
  8. {videopython-0.34.1 → videopython-0.35.1}/.gitignore +0 -0
  9. {videopython-0.34.1 → videopython-0.35.1}/LICENSE +0 -0
  10. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/__init__.py +0 -0
  11. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/__init__.py +0 -0
  12. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/_device.py +0 -0
  13. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/__init__.py +0 -0
  14. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/config.py +0 -0
  15. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/dubber.py +0 -0
  16. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/expressiveness.py +0 -0
  17. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/loudness.py +0 -0
  18. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/models.py +0 -0
  19. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/pipeline.py +0 -0
  20. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/quality.py +0 -0
  21. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/remux.py +0 -0
  22. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/timing.py +0 -0
  23. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/dubbing/voice_sample.py +0 -0
  24. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/__init__.py +0 -0
  25. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/audio.py +0 -0
  26. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/image.py +0 -0
  27. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/translation.py +0 -0
  28. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/generation/video.py +0 -0
  29. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/transforms.py +0 -0
  30. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/__init__.py +0 -0
  31. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/audio.py +0 -0
  32. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/faces.py +0 -0
  33. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/image.py +0 -0
  34. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/separation.py +0 -0
  35. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/understanding/temporal.py +0 -0
  36. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/__init__.py +0 -0
  37. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/analyzer.py +0 -0
  38. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/models.py +0 -0
  39. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/sampling.py +0 -0
  40. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/ai/video_analysis/stages.py +0 -0
  41. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/audio/__init__.py +0 -0
  42. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/audio/analysis.py +0 -0
  43. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/audio/audio.py +0 -0
  44. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/__init__.py +0 -0
  45. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/_dimensions.py +0 -0
  46. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/_ffmpeg.py +0 -0
  47. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/_video_io.py +0 -0
  48. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/description.py +0 -0
  49. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/exceptions.py +0 -0
  50. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
  51. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
  52. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/fonts/__init__.py +0 -0
  53. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/image_text.py +0 -0
  54. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/transcription.py +0 -0
  55. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/base/video.py +0 -0
  56. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/streaming.py +0 -0
  57. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/transcription_overlay.py +0 -0
  58. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/transforms.py +0 -0
  59. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/editing/video_edit.py +0 -0
  60. {videopython-0.34.1 → videopython-0.35.1}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.34.1
3
+ Version: 0.35.1
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -12,15 +12,15 @@ Keywords: ai,editing,generation,movie,opencv,python,shorts,video,videopython
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Operating System :: OS Independent
14
14
  Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.10
16
15
  Classifier: Programming Language :: Python :: 3.11
17
16
  Classifier: Programming Language :: Python :: 3.12
18
17
  Classifier: Programming Language :: Python :: 3.13
19
- Requires-Python: <3.14,>=3.10
18
+ Requires-Python: <3.14,>=3.11
20
19
  Requires-Dist: numpy>=1.25.2
21
20
  Requires-Dist: opencv-python-headless>=4.9.0.80
22
21
  Requires-Dist: pillow>=12.1.1
23
22
  Requires-Dist: pydantic>=2.8.0
23
+ Requires-Dist: resvg-py>=0.3.2
24
24
  Requires-Dist: tqdm>=4.66.3
25
25
  Provides-Extra: ai
26
26
  Requires-Dist: accelerate>=0.29.2; extra == 'ai'
@@ -67,7 +67,7 @@ pip install videopython # core video/audio editing
67
67
  pip install "videopython[ai]" # + local AI features (GPU recommended)
68
68
  ```
69
69
 
70
- Python `>=3.10, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.
70
+ Python `>=3.11, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.
71
71
 
72
72
  ## Quick Start
73
73
 
@@ -18,7 +18,7 @@ pip install videopython # core video/audio editing
18
18
  pip install "videopython[ai]" # + local AI features (GPU recommended)
19
19
  ```
20
20
 
21
- Python `>=3.10, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.
21
+ Python `>=3.11, <3.14`. AI features run locally — no cloud API keys required, but model weights are downloaded on first use.
22
22
 
23
23
  ## Quick Start
24
24
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.34.1"
3
+ version = "0.35.1"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -9,7 +9,7 @@ authors = [
9
9
  ]
10
10
  license = { text = "Apache-2.0" }
11
11
  readme = "README.md"
12
- requires-python = ">=3.10, <3.14"
12
+ requires-python = ">=3.11, <3.14"
13
13
  keywords = [
14
14
  "python",
15
15
  "videopython",
@@ -24,7 +24,6 @@ keywords = [
24
24
  classifiers = [
25
25
  "License :: OSI Approved :: Apache Software License",
26
26
  "Programming Language :: Python :: 3",
27
- "Programming Language :: Python :: 3.10",
28
27
  "Programming Language :: Python :: 3.11",
29
28
  "Programming Language :: Python :: 3.12",
30
29
  "Programming Language :: Python :: 3.13",
@@ -35,6 +34,7 @@ dependencies = [
35
34
  "numpy>=1.25.2",
36
35
  "opencv-python-headless>=4.9.0.80",
37
36
  "pillow>=12.1.1",
37
+ "resvg-py>=0.3.2",
38
38
  "tqdm>=4.66.3",
39
39
  "pydantic>=2.8.0",
40
40
  ]
@@ -203,7 +203,7 @@ markers = [
203
203
 
204
204
  [tool.ruff]
205
205
  line-length = 120
206
- target-version = "py310"
206
+ target-version = "py311"
207
207
 
208
208
  [tool.ruff.lint]
209
209
  select = [
@@ -92,6 +92,56 @@ _SPEECH_CHARS_DEFAULT = 12.0
92
92
  _LOW_LOGPROB_HINT_THRESHOLD = -1.0
93
93
 
94
94
 
95
# Conservative chars-per-token used to size chunks without invoking the
# tokenizer. Morphologically rich languages land around 1.5-2.0
# chars/token; ASCII is ~3-4. We use the low end so chunks stay safe for
# any source language.
_CHARS_PER_TOKEN = 2.0
# Token reserve for the system prompt + user-prompt envelope ("Input
# segments:" / "Translations (...)" wrappers). Empirical upper bound.
_PROMPT_OVERHEAD_TOKENS = 300
# Per-segment JSON wrapper cost (keys, braces, commas, index). Added on
# top of len(seg.text) when sizing chunks.
_SEGMENT_ENVELOPE_CHARS = 40


def _chunk_segment_indices(
    segments: list[TranscriptionSegment],
    n_ctx: int,
    max_tokens: int,
) -> list[list[int]]:
    """Group positions in ``segments`` into batches that fit one Qwen call.

    Two limits apply to every batch:

    * ``prompt_tokens + max_tokens <= n_ctx`` -- the context-window limit.
    * The reply is capped at ``max_tokens``. The reply carries one translated
      line per input segment, so it is assumed to be roughly as large as the
      input; a batch whose input estimate exceeds ``max_tokens`` would come
      back truncated even though the prompt itself fits.

    The per-batch budget is therefore the smaller of the two. Token counts are
    approximated from character length via ``_CHARS_PER_TOKEN``.

    A segment whose own serialized form exceeds the budget still gets a chunk
    of its own rather than being dropped, so the failure surfaces on that one
    segment instead of being silently swallowed.

    Args:
        segments: Segments to translate; only ``.text`` is read.
        n_ctx: Context window (tokens) of the model call.
        max_tokens: Generation cap (tokens) of the model call.

    Returns:
        Lists of positions into ``segments``, in order, covering every
        position exactly once. Empty when ``segments`` is empty.
    """
    prompt_token_budget = n_ctx - max_tokens - _PROMPT_OVERHEAD_TOKENS
    # The output cap bounds the chunk as well: sizing only against the prompt
    # side lets a large n_ctx pack more text than max_tokens can answer.
    token_budget = min(prompt_token_budget, max_tokens)
    if token_budget <= 0:
        # No usable budget: fall back to one segment per call.
        return [[i] for i in range(len(segments))]
    char_budget = int(token_budget * _CHARS_PER_TOKEN)

    chunks: list[list[int]] = []
    current: list[int] = []
    current_chars = 0
    for i, seg in enumerate(segments):
        seg_chars = len(seg.text) + _SEGMENT_ENVELOPE_CHARS
        # Flush before adding, but never emit an empty chunk: an oversized
        # segment lands alone in its own chunk.
        if current and current_chars + seg_chars > char_budget:
            chunks.append(current)
            current = []
            current_chars = 0
        current.append(i)
        current_chars += seg_chars
    if current:
        chunks.append(current)
    return chunks
143
+
144
+
95
145
  def _target_chars_for(duration_seconds: float, target_lang: str) -> int:
96
146
  """Character-count budget for a segment of ``duration_seconds`` in ``target_lang``."""
97
147
  rate = _SPEECH_CHARS_PER_SEC.get(target_lang, _SPEECH_CHARS_DEFAULT)
@@ -170,9 +220,12 @@ class Qwen3Translator:
170
220
  ``DEFAULT_REPO_ID``; override for eval harnesses.
171
221
  filename: GGUF filename within ``repo_id``. Defaults to
172
222
  ``DEFAULT_FILENAME``.
173
- n_ctx: llama.cpp context window. 8192 is plenty for a 15-min source;
174
- raise for very long sources. Hard cap is the model's training
175
- context (262K for Qwen3-4B-Instruct-2507).
223
+ n_ctx: llama.cpp context window. ``translate_segments`` splits the
224
+ input across multiple calls when it doesn't fit, so 8192 stays
225
+ safe even for very long sources; raise to reduce the number of
226
+ calls (and gain cross-segment context per call) at the cost of
227
+ VRAM. Hard cap is the model's training context (262K for
228
+ Qwen3-4B-Instruct-2507).
176
229
  max_tokens: Generation cap per call. 4× the input character count
177
230
  is a safe upper bound for translation output.
178
231
  temperature: Decoding temperature. 0.1 keeps output structurally
@@ -257,6 +310,52 @@ class Qwen3Translator:
257
310
  raw = response["choices"][0]["text"]
258
311
  return _parse_jsonl_response(raw)
259
312
 
313
def _qwen_translate_chunked(
    self,
    segments: list[TranscriptionSegment],
    target_lang: str,
    source_lang: str,
    progress_callback: Callable[[float], None] | None = None,
    progress_start: float = 0.0,
    progress_end: float = 1.0,
) -> dict[int, str]:
    """Translate ``segments`` across one or more Qwen calls.

    The input is split with ``_chunk_segment_indices`` using ``self.n_ctx``
    and ``self.max_tokens``; each chunk is sent through
    ``self._qwen_translate`` and its chunk-local result indices are mapped
    back to positions in ``segments``.

    Args:
        segments: Segments to translate.
        target_lang: Target language, forwarded to ``_qwen_translate``.
        source_lang: Source language, forwarded to ``_qwen_translate``.
        progress_callback: Optional callable invoked once per finished
            chunk with a value ramping linearly from ``progress_start`` to
            ``progress_end``. For empty input it is called once with
            ``progress_end``.
        progress_start: Progress value representing "no chunk done yet".
        progress_end: Progress value reported after the last chunk.

    Returns:
        Dict keyed by position in ``segments``. Positions the model did not
        return are simply absent; the caller decides how to retry them.
    """
    results: dict[int, str] = {}
    if not segments:
        if progress_callback is not None:
            progress_callback(progress_end)
        return results

    chunks = _chunk_segment_indices(segments, self.n_ctx, self.max_tokens)
    total_chunks = len(chunks)
    if total_chunks > 1:
        logger.info(
            "Qwen3Translator: splitting %d segments into %d chunks (n_ctx=%d)",
            len(segments),
            total_chunks,
            self.n_ctx,
        )
    for chunk_num, chunk_positions in enumerate(chunks, start=1):
        chunk_segments = [segments[p] for p in chunk_positions]
        chunk_result = self._qwen_translate(chunk_segments, target_lang, source_lang)
        # chunk_result keys are 0..len(chunk_positions)-1; map back to
        # positions in the caller-provided ``segments`` list.
        discarded = 0
        for local_idx, text in chunk_result.items():
            if 0 <= local_idx < len(chunk_positions):
                results[chunk_positions[local_idx]] = text
            else:
                discarded += 1
        if discarded:
            # An index outside the chunk cannot be attributed to any input
            # segment. Surface it instead of dropping it silently.
            logger.warning(
                "Qwen3Translator: discarded %d out-of-range result index(es) in chunk %d/%d",
                discarded,
                chunk_num,
                total_chunks,
            )
        if progress_callback is not None:
            fraction = chunk_num / total_chunks
            progress_callback(progress_start + (progress_end - progress_start) * fraction)
    return results
358
+
260
359
  def translate_segments(
261
360
  self,
262
361
  segments: list[TranscriptionSegment],
@@ -266,10 +365,10 @@ class Qwen3Translator:
266
365
  ) -> list[TranslatedSegment]:
267
366
  """Translate segments via Qwen with parse-retry + optional Marian fallback.
268
367
 
269
- The progress_callback fires three times: 0.5 after the first
270
- Qwen call, 0.9 after the optional retry/fallback, 1.0 at the
271
- end. M2.1 phase 2 confirmed smaller batches don't help on CPU,
272
- so finer-grained progress isn't possible without fake ticks.
368
+ The progress_callback ramps from 0 to 0.5 across the first-pass
369
+ Qwen chunks, hits 0.9 after the optional retry/fallback, and 1.0
370
+ at the end. Input larger than the model's context window is split
371
+ across multiple Qwen calls (see ``_qwen_translate_chunked``).
273
372
  """
274
373
  effective_source = source_lang or "en"
275
374
  self._failures_last_call = []
@@ -277,13 +376,15 @@ class Qwen3Translator:
277
376
  translatable_indices = [i for i, seg in enumerate(segments) if _is_translatable_text(seg.text)]
278
377
  translatable_segments = [segments[i] for i in translatable_indices]
279
378
 
280
- # First attempt.
281
- if translatable_segments:
282
- qwen_results = self._qwen_translate(translatable_segments, target_lang, effective_source)
283
- else:
284
- qwen_results = {}
285
- if progress_callback is not None:
286
- progress_callback(0.5)
379
+ # First attempt — chunked to fit n_ctx.
380
+ qwen_results = self._qwen_translate_chunked(
381
+ translatable_segments,
382
+ target_lang,
383
+ effective_source,
384
+ progress_callback=progress_callback,
385
+ progress_start=0.0,
386
+ progress_end=0.5,
387
+ )
287
388
 
288
389
  # Identify segments Qwen failed (unparseable or missing index).
289
390
  # Indices in qwen_results / translatable_segments are 0-based positions
@@ -299,11 +400,15 @@ class Qwen3Translator:
299
400
  len(retry_segments),
300
401
  len(translatable_segments),
301
402
  )
302
- retry_results = self._qwen_translate(retry_segments, target_lang, effective_source)
303
- # retry_results uses 0..len(retry_segments)-1 as keys; map back.
304
- for retry_local, original_local in enumerate(missing_local_indices):
305
- if retry_local in retry_results:
306
- qwen_results[original_local] = retry_results[retry_local]
403
+ retry_results = self._qwen_translate_chunked(
404
+ retry_segments,
405
+ target_lang,
406
+ effective_source,
407
+ )
408
+ # retry_results keys are positions in retry_segments; map back to
409
+ # translatable_segments.
410
+ for retry_local, translation in retry_results.items():
411
+ qwen_results[missing_local_indices[retry_local]] = translation
307
412
  if progress_callback is not None:
308
413
  progress_callback(0.9)
309
414
 
@@ -8,6 +8,7 @@ from .effects import (
8
8
  Flash,
9
9
  FullImageOverlay,
10
10
  Glitch,
11
+ ImageOverlay,
11
12
  Kaleidoscope,
12
13
  KenBurns,
13
14
  MirrorFlip,
@@ -56,6 +57,7 @@ __all__ = [
56
57
  "SilenceRemoval",
57
58
  # Effects
58
59
  "FullImageOverlay",
60
+ "ImageOverlay",
59
61
  "Blur",
60
62
  "Zoom",
61
63
  "ColorGrading",
@@ -14,6 +14,7 @@ audio after ``_apply`` returns.
14
14
  from __future__ import annotations
15
15
 
16
16
  import logging
17
+ from io import BytesIO
17
18
  from pathlib import Path
18
19
  from typing import TYPE_CHECKING, Any, ClassVar, Literal
19
20
 
@@ -29,13 +30,14 @@ from videopython.editing.operation import Effect
29
30
 
30
31
  if TYPE_CHECKING:
31
32
  from videopython.audio import Audio
32
- from videopython.base.video import Video
33
+ from videopython.base.video import Video, VideoMetadata
33
34
 
34
35
  logger = logging.getLogger(__name__)
35
36
 
36
37
  __all__ = [
37
38
  "Effect",
38
39
  "FullImageOverlay",
40
+ "ImageOverlay",
39
41
  "Blur",
40
42
  "Zoom",
41
43
  "ColorGrading",
@@ -771,6 +773,220 @@ class TextOverlay(Effect):
771
773
  return video
772
774
 
773
775
 
776
class ImageOverlay(Effect):
    """Composites a scaled image at an anchored position on every frame in the window.

    A resolution-independent watermark / logo / brand mark. Unlike
    :class:`FullImageOverlay` (full-frame only, raises on size mismatch), the
    image is scaled to a fraction of the frame *width* and placed at an
    anchored normalized position, so one config works across 1080p / 4k /
    vertical / square. Loaded just-in-time from ``source`` so the op stays
    JSON-serialisable. Off-frame or oversized placement clips to a partial
    paste or a no-op -- the same contract as :class:`TextOverlay`, never an
    error; only an unreadable ``source`` is rejected (in ``predict_metadata``).

    ``source`` may be a raster image (PNG/JPEG/WebP) or an SVG (detected by the
    ``.svg`` extension). An SVG is rasterised by ``resvg`` *at the exact target
    pixel width* -- crisp at any frame size, not a blurry upscale of a
    fixed-size bitmap -- with a transparent background and no remote-resource
    fetching (the local path only; no SSRF). SVGs containing text depend on the
    fonts available at render time.
    """

    op: Literal["image_overlay"] = "image_overlay"
    streamable: ClassVar[bool] = True

    source: Path = Field(
        description=(
            "Path to an image file: a raster RGB/RGBA image (PNG/JPEG/WebP) or "
            "an SVG (`.svg`, rasterised at the target resolution). Loaded at "
            "apply time; kept JSON-serialisable as a path."
        ),
    )
    scale: float = Field(
        0.15,
        gt=0,
        le=1,
        description=(
            "Overlay width as a fraction of frame width (0-1). Height follows "
            "the image's aspect ratio. Resolution-independent."
        ),
    )
    opacity: float = Field(
        1.0,
        ge=0,
        le=1,
        description="Multiplies the image's own alpha. 0 = fully transparent, 1 = use the image alpha unchanged.",
    )
    position: tuple[float, float] = Field(
        (0.95, 0.95),
        description=(
            "Where to place the overlay as normalized (x, y) coordinates. "
            "(0, 0) = top-left corner, (1, 1) = bottom-right corner."
        ),
    )
    anchor: Literal["center", "top_left", "top_center", "bottom_center", "bottom_left", "bottom_right"] = Field(
        "bottom_right",
        description="Which point of the overlay box sits at the position coordinate.",
    )

    # Decoded raster source (RGBA, uint8), cached after the first load.
    _overlay_rgba: np.ndarray | None = PrivateAttr(default=None)
    # Rasterised SVG per target pixel width.
    _svg_cache: dict[int, np.ndarray] = PrivateAttr(default_factory=dict)
    # Streaming state, filled by ``streaming_init``.
    _stream_noop: bool = PrivateAttr(default=False)
    _stream_alpha: np.ndarray | None = PrivateAttr(default=None)
    _stream_rgb: np.ndarray | None = PrivateAttr(default=None)
    _stream_dst: tuple[int, int, int, int] = PrivateAttr(default=(0, 0, 0, 0))

    @model_validator(mode="after")
    def _validate_position(self) -> ImageOverlay:
        """Reject ``position`` components outside ``[0, 1]``."""
        if not (0.0 <= self.position[0] <= 1.0 and 0.0 <= self.position[1] <= 1.0):
            raise ValueError("position values must be in range [0, 1]")
        return self

    def _is_svg(self) -> bool:
        """Return True when ``source`` has a ``.svg`` extension (case-insensitive)."""
        return self.source.suffix.lower() == ".svg"

    def predict_metadata(self, meta: VideoMetadata, **_context: Any) -> VideoMetadata:
        """Reject only a missing/unreadable ``source`` (see :meth:`Operation.predict_metadata`).

        An unreadable source is the one failure ``run()`` cannot survive -- it
        would raise mid-stream after expensive frame decode -- so it is caught
        at ``validate()`` time, symmetric with ``TranscriptionOverlay``.
        Geometry (oversized / off-frame) is deliberately *not* checked here: it
        clips to a valid no-op like :class:`TextOverlay`, so rejecting it would
        break that contract and the parity with the op this is modeled on. Both
        checks are cheap (a header ``verify()`` / a 1px SVG parse, no full
        decode), so ``validate()`` stays frame-free.

        Returns:
            ``meta`` unchanged -- the overlay does not alter video metadata.

        Raises:
            ValueError: ``source`` cannot be opened or verified as an image.
        """
        try:
            if self._is_svg():
                import resvg_py

                resvg_py.svg_to_bytes(svg_path=str(self.source), width=1)
            else:
                with Image.open(self.source) as im:
                    im.verify()
        except (OSError, ValueError, SyntaxError) as exc:
            # SyntaxError: Pillow's PNG verify() reports corrupt chunks with
            # it, and a broken file must be rejected here like any other.
            raise ValueError(f"image_overlay source {str(self.source)!r} is not a readable image: {exc}") from exc
        return meta

    def _rasterize_svg(self, target_w: int) -> np.ndarray:
        """Return the SVG rendered at ``target_w`` pixels wide as an RGBA uint8 array (cached per width)."""
        cached = self._svg_cache.get(target_w)
        if cached is not None:
            return cached
        # Lazy import: only when an SVG source is actually used. resvg renders
        # at the exact target width (height proportional to the viewBox) with a
        # transparent background and never fetches remote resources.
        import resvg_py

        png = resvg_py.svg_to_bytes(svg_path=str(self.source), width=target_w)
        with Image.open(BytesIO(bytes(png))) as img:
            arr = np.array(img.convert("RGBA"), dtype=np.uint8)
        self._svg_cache[target_w] = arr
        return arr

    def _load_overlay(self) -> np.ndarray:
        """Return the raster source decoded to an RGBA uint8 array (cached after first load)."""
        if self._overlay_rgba is None:
            # Context manager so the underlying file handle is released as
            # soon as the pixels are copied out.
            with Image.open(self.source) as img:
                self._overlay_rgba = np.array(img.convert("RGBA"), dtype=np.uint8)
        return self._overlay_rgba

    def _compute_position(self, frame_width: int, frame_height: int, img_w: int, img_h: int) -> tuple[int, int]:
        """Return the overlay's top-left pixel ``(x, y)`` for the configured anchor; may be negative."""
        # Per the original author's note this mirrors TextOverlay, whose
        # anchor set is deliberately the same, so the geometry stays shared.
        px = int(self.position[0] * frame_width)
        py = int(self.position[1] * frame_height)

        if self.anchor == "center":
            return px - img_w // 2, py - img_h // 2
        if self.anchor == "top_left":
            return px, py
        if self.anchor == "top_center":
            return px - img_w // 2, py
        if self.anchor == "bottom_center":
            return px - img_w // 2, py - img_h
        if self.anchor == "bottom_left":
            return px, py - img_h
        # bottom_right
        return px - img_w, py - img_h

    def _resized_overlay(self, frame_w: int) -> np.ndarray:
        """Return the overlay as RGBA uint8, scaled to ``scale * frame_w`` pixels wide (at least 1)."""
        target_w = max(1, round(self.scale * frame_w))
        if self._is_svg():
            # Rasterise the vector at the target size (crisp) rather than
            # upscaling a fixed bitmap. resvg derives height from the viewBox.
            return self._rasterize_svg(target_w)
        overlay = self._load_overlay()
        src_h, src_w = overlay.shape[:2]
        target_h = max(1, round(target_w * src_h / src_w))
        if (target_w, target_h) == (src_w, src_h):
            return overlay
        resized = Image.fromarray(overlay).resize((target_w, target_h), Image.LANCZOS)
        return np.array(resized, dtype=np.uint8)

    def _blend_params(
        self, frame_w: int, frame_h: int
    ) -> tuple[np.ndarray, np.ndarray, tuple[int, int, int, int]] | None:
        """Placement + blend inputs shared by the eager and streaming paths.

        Single source of truth so the two paths cannot drift -- the
        eager/stream parity-hole class of bug fixed in 0.34.1.

        Returns:
            ``(alpha, rgb, (dst_y, dst_x, paste_h, paste_w))`` where ``alpha``
            is float32 in ``[0, 1]`` already multiplied by ``opacity`` and
            ``rgb`` is float32, both cropped to the visible part of the
            overlay; ``None`` when the overlay lands fully off-frame (the
            effect is a no-op).
        """
        overlay = self._resized_overlay(frame_w)
        oh, ow = overlay.shape[:2]
        x, y = self._compute_position(frame_w, frame_h, ow, oh)

        # Clip the paste rectangle against the frame: negative x/y skip that
        # many source pixels, and the size is limited by what remains.
        src_x = max(0, -x)
        src_y = max(0, -y)
        dst_x = max(0, x)
        dst_y = max(0, y)
        paste_w = min(ow - src_x, frame_w - dst_x)
        paste_h = min(oh - src_y, frame_h - dst_y)

        if paste_w <= 0 or paste_h <= 0:
            return None

        region = overlay[src_y : src_y + paste_h, src_x : src_x + paste_w]
        alpha = (region[:, :, 3:4].astype(np.float32) / 255.0) * self.opacity
        rgb = region[:, :, :3].astype(np.float32)
        return alpha, rgb, (dst_y, dst_x, paste_h, paste_w)

    @staticmethod
    def _blend_into(
        frame: np.ndarray,
        alpha: np.ndarray,
        rgb: np.ndarray,
        dst: tuple[int, int, int, int],
    ) -> np.ndarray:
        """Alpha-blend ``rgb`` over the ``dst`` rectangle of ``frame`` in place and return ``frame``.

        Used by both ``_apply`` and ``process_frame`` so the per-pixel math
        exists exactly once.
        """
        dy, dx, ph, pw = dst
        region = frame[dy : dy + ph, dx : dx + pw]
        blended = (rgb * alpha + region.astype(np.float32) * (1.0 - alpha)).astype(np.uint8)
        frame[dy : dy + ph, dx : dx + pw] = blended
        return frame

    def streaming_init(self, total_frames: int, fps: float, width: int, height: int) -> None:
        """Precompute the blend inputs for a ``width`` x ``height`` stream (``total_frames``/``fps`` are unused)."""
        params = self._blend_params(width, height)
        if params is None:
            self._stream_noop = True
            return
        self._stream_noop = False
        self._stream_alpha, self._stream_rgb, self._stream_dst = params

    def process_frame(self, frame: np.ndarray, frame_index: int) -> np.ndarray:
        """Blend the overlay into ``frame`` in place and return it (``frame_index`` is unused).

        Raises:
            RuntimeError: ``streaming_init`` has not prepared the blend inputs.
        """
        if self._stream_noop:
            return frame
        if self._stream_alpha is None or self._stream_rgb is None:
            # Explicit check rather than ``assert`` so it survives ``python -O``.
            raise RuntimeError("ImageOverlay.streaming_init must be called before process_frame")
        return self._blend_into(frame, self._stream_alpha, self._stream_rgb, self._stream_dst)

    def _apply(self, video: Video) -> Video:
        """Blend the overlay into every frame of ``video`` in place and return ``video``."""
        frame_h, frame_w = video.frame_shape[:2]
        params = self._blend_params(frame_w, frame_h)
        if params is None:
            return video
        alpha, rgb, dst = params

        logger.info("Applying image overlay...")
        for frame in tqdm(video.frames, desc="Image overlay"):
            self._blend_into(frame, alpha, rgb, dst)
        return video
988
+
989
+
774
990
  class Shake(Effect):
775
991
  """Per-frame camera shake: jitters every frame by a random or rhythmic offset.
776
992
 
@@ -175,7 +175,18 @@ class Operation(BaseModel):
175
175
  raise NotImplementedError(f"{type(self).__name__}.apply not implemented")
176
176
 
177
177
  def predict_metadata(self, meta: VideoMetadata) -> VideoMetadata:
178
- """Predict output metadata from input metadata. Default: identity."""
178
+ """Predict output metadata from input metadata. Default: identity.
179
+
180
+ Run during ``VideoEdit.validate()``'s dry-run, before any frames are
181
+ decoded. Beyond predicting shape, this is the fail-fast gate, and it
182
+ has one contract: **reject exactly the plans that would otherwise crash
183
+ or do unrecoverable / expensive work in** :meth:`apply` **/** ``run()``;
184
+ anything ``run()`` can absorb by graceful degradation is NOT rejected.
185
+ ``TranscriptionOverlay`` rejects un-fittable subtitles (they used to
186
+ crash mid-render); ``TextOverlay``/``ImageOverlay`` do not reject
187
+ off-frame geometry (it clips to a valid no-op). Keep the check
188
+ metadata-cheap -- no frame decode.
189
+ """
179
190
  return meta
180
191
 
181
192
  def to_ffmpeg_filter(self, ctx: FilterCtx) -> str | None:
File without changes
File without changes