videopython 0.26.8__tar.gz → 0.26.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.26.8 → videopython-0.26.10}/PKG-INFO +1 -1
  2. {videopython-0.26.8 → videopython-0.26.10}/pyproject.toml +1 -1
  3. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/pipeline.py +19 -4
  4. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/video_analysis.py +123 -78
  5. {videopython-0.26.8 → videopython-0.26.10}/.gitignore +0 -0
  6. {videopython-0.26.8 → videopython-0.26.10}/LICENSE +0 -0
  7. {videopython-0.26.8 → videopython-0.26.10}/README.md +0 -0
  8. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/__init__.py +0 -0
  9. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/__init__.py +0 -0
  10. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/_device.py +0 -0
  11. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/__init__.py +0 -0
  12. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/dubber.py +0 -0
  13. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/models.py +0 -0
  14. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/remux.py +0 -0
  15. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/timing.py +0 -0
  16. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/__init__.py +0 -0
  17. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/audio.py +0 -0
  18. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/image.py +0 -0
  19. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/translation.py +0 -0
  20. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/video.py +0 -0
  21. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/registry.py +0 -0
  22. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/__init__.py +0 -0
  23. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/inpainter.py +0 -0
  24. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/models.py +0 -0
  25. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/segmenter.py +0 -0
  26. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/swapper.py +0 -0
  27. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/transforms.py +0 -0
  28. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/__init__.py +0 -0
  29. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/audio.py +0 -0
  30. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/image.py +0 -0
  31. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/separation.py +0 -0
  32. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/temporal.py +0 -0
  33. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/description.py +0 -0
  39. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/video.py +0 -0
  52. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.26.8 → videopython-0.26.10}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.26.8
3
+ Version: 0.26.10
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.26.8"
3
+ version = "0.26.10"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -350,10 +350,25 @@ class LocalDubbingPipeline:
350
350
  speaker = segment.speaker or "speaker_0"
351
351
  cached_path = speaker_wav_paths.get(speaker) if voice_clone else None
352
352
 
353
- if cached_path is not None:
354
- dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=cached_path)
355
- else:
356
- dubbed_audio = self._tts.generate_audio(segment.translated_text)
353
+ try:
354
+ if cached_path is not None:
355
+ dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=cached_path)
356
+ else:
357
+ dubbed_audio = self._tts.generate_audio(segment.translated_text)
358
+ except Exception as e:
359
+ # Chatterbox occasionally crashes on short translated text
360
+ # (alignment_stream_analyzer indexing on tensors with <=5
361
+ # speech tokens). One bad segment shouldn't lose a long
362
+ # multi-hour run — log and skip so the rest proceeds.
363
+ logger.warning(
364
+ "TTS failed for segment %d/%d (speaker=%s, text=%r): %s — skipping",
365
+ i + 1,
366
+ len(translated_segments),
367
+ speaker,
368
+ segment.translated_text,
369
+ e,
370
+ )
371
+ continue
357
372
 
358
373
  dubbed_segments.append(dubbed_audio)
359
374
  target_durations.append(segment.duration)
@@ -7,12 +7,14 @@ import math
7
7
  import re
8
8
  import subprocess
9
9
  import time
10
+ from collections.abc import Callable, Iterator
10
11
  from concurrent.futures import ThreadPoolExecutor
12
+ from contextlib import contextmanager, nullcontext
11
13
  from dataclasses import dataclass, field
12
14
  from datetime import datetime, timezone
13
15
  from importlib import metadata as importlib_metadata
14
16
  from pathlib import Path
15
- from typing import Any
17
+ from typing import Any, TypeVar
16
18
 
17
19
  import numpy as np
18
20
  from PIL import Image
@@ -144,17 +146,28 @@ class VideoAnalysisSource:
144
146
 
145
147
  @dataclass
146
148
  class AnalysisRunInfo:
147
- """Runtime/provenance metadata for a full analysis run."""
149
+ """Runtime/provenance metadata for a full analysis run.
150
+
151
+ ``stage_durations_seconds`` is populated by the analyzer with per-stage
152
+ wall-clock times (whisper, scene_detection, scene_analysis, scene_vlm,
153
+ audio_classification, and -- when both run together --
154
+ whisper_and_scene_detection_parallel). Consumers can persist or aggregate
155
+ these to track pipeline performance over time.
156
+ """
148
157
 
149
158
  created_at: str
150
159
  mode: str
151
160
  library_version: str | None = None
161
+ stage_durations_seconds: dict[str, float] = field(default_factory=dict)
162
+ total_duration_seconds: float | None = None
152
163
 
153
164
  def to_dict(self) -> dict[str, Any]:
154
165
  return {
155
166
  "created_at": self.created_at,
156
167
  "mode": self.mode,
157
168
  "library_version": self.library_version,
169
+ "stage_durations_seconds": dict(self.stage_durations_seconds),
170
+ "total_duration_seconds": self.total_duration_seconds,
158
171
  }
159
172
 
160
173
  @classmethod
@@ -163,6 +176,8 @@ class AnalysisRunInfo:
163
176
  created_at=data["created_at"],
164
177
  mode=data["mode"],
165
178
  library_version=data.get("library_version"),
179
+ stage_durations_seconds={str(k): float(v) for k, v in data["stage_durations_seconds"].items()},
180
+ total_duration_seconds=data["total_duration_seconds"],
166
181
  )
167
182
 
168
183
 
@@ -413,17 +428,17 @@ class VideoAnalyzer:
413
428
  # which corrupts Whisper's model weights if they're initialized at the
414
429
  # same time.
415
430
  if run_whisper and run_scene_det:
416
- transcription, detected = self._run_whisper_and_scene_detection(source_path=source_path, video=video)
431
+ transcription, detected = self._run_whisper_and_scene_detection(
432
+ source_path=source_path, video=video, run_info=run_info
433
+ )
417
434
  else:
418
435
  if run_whisper:
419
- t0 = time.perf_counter()
420
- transcription = self._run_whisper(source_path=source_path, video=video)
421
- logger.info("Whisper transcription completed in %.2fs", time.perf_counter() - t0)
436
+ with _record_stage(run_info, "whisper"):
437
+ transcription = self._run_whisper(source_path=source_path, video=video)
422
438
 
423
439
  if run_scene_det:
424
- t0 = time.perf_counter()
425
- detected = self._run_scene_detection(source_path=source_path, video=video)
426
- logger.info("Scene detection completed in %.2fs", time.perf_counter() - t0)
440
+ with _record_stage(run_info, "scene_detection"):
441
+ detected = self._run_scene_detection(source_path=source_path, video=video)
427
442
 
428
443
  if run_scene_det:
429
444
  self._reset_transnetv2_torch_state()
@@ -442,19 +457,20 @@ class VideoAnalyzer:
442
457
  if not scenes:
443
458
  scenes = self._default_scene_boundaries(metadata)
444
459
 
445
- t0 = time.perf_counter()
446
- scene_section = self._analyze_scenes(
447
- source_path=source_path,
448
- video=video,
449
- metadata=metadata,
450
- scenes=scenes,
451
- preloaded_scene_vlm=None,
452
- )
453
- logger.info("Scene analysis completed in %.2fs", time.perf_counter() - t0)
460
+ with _record_stage(run_info, "scene_analysis"):
461
+ scene_section = self._analyze_scenes(
462
+ source_path=source_path,
463
+ video=video,
464
+ metadata=metadata,
465
+ scenes=scenes,
466
+ preloaded_scene_vlm=None,
467
+ run_info=run_info,
468
+ )
454
469
 
455
470
  audio_section = AudioAnalysisSection(transcription=transcription) if transcription is not None else None
456
471
 
457
- logger.info("Total analysis completed in %.2fs", time.perf_counter() - t_analysis_start)
472
+ run_info.total_duration_seconds = time.perf_counter() - t_analysis_start
473
+ logger.info("Total analysis completed in %.2fs", run_info.total_duration_seconds)
458
474
  return VideoAnalysis(
459
475
  source=source,
460
476
  config=self.config,
@@ -485,17 +501,23 @@ class VideoAnalyzer:
485
501
  return None
486
502
 
487
503
  def _run_whisper_and_scene_detection(
488
- self, *, source_path: Path | None, video: Video | None
504
+ self, *, source_path: Path | None, video: Video | None, run_info: AnalysisRunInfo
489
505
  ) -> tuple[Transcription | None, list[SceneBoundary] | None]:
490
- with ThreadPoolExecutor(max_workers=2) as pool:
491
- t0 = time.perf_counter()
492
- whisper_future = pool.submit(self._run_whisper, source_path=source_path, video=video)
493
- scene_future = pool.submit(self._run_scene_detection, source_path=source_path, video=video)
494
-
495
- transcription = whisper_future.result()
496
- detected = scene_future.result()
497
- elapsed = time.perf_counter() - t0
498
- logger.info("Whisper + scene detection (parallel) completed in %.2fs", elapsed)
506
+ with _record_stage(run_info, "whisper_and_scene_detection_parallel"):
507
+ with ThreadPoolExecutor(max_workers=2) as pool:
508
+ whisper_future = pool.submit(
509
+ _run_with_stage, run_info, "whisper", self._run_whisper, source_path=source_path, video=video
510
+ )
511
+ scene_future = pool.submit(
512
+ _run_with_stage,
513
+ run_info,
514
+ "scene_detection",
515
+ self._run_scene_detection,
516
+ source_path=source_path,
517
+ video=video,
518
+ )
519
+ transcription = whisper_future.result()
520
+ detected = scene_future.result()
499
521
 
500
522
  return transcription, detected
501
523
 
@@ -536,6 +558,7 @@ class VideoAnalyzer:
536
558
  video: Video | None,
537
559
  metadata: VideoMetadata,
538
560
  scenes: list[SceneBoundary],
561
+ run_info: AnalysisRunInfo,
539
562
  preloaded_scene_vlm: SceneVLM | None = None,
540
563
  ) -> SceneAnalysisSection:
541
564
  enabled = self.config.enabled_analyzers
@@ -571,60 +594,61 @@ class VideoAnalyzer:
571
594
  # -- Batched SceneVLM: collect all timestamps, extract frames once, run one forward pass --
572
595
  captions: list[str | None] = [None] * len(scenes)
573
596
  if scene_vlm is not None:
574
- try:
575
- captions = self._run_scene_vlm_batched(
576
- scene_vlm=scene_vlm,
577
- source_path=source_path,
578
- video=video,
579
- metadata=metadata,
580
- scenes=scenes,
581
- )
582
- except Exception:
583
- logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
584
-
585
- samples: list[SceneAnalysisSample] = []
586
- t_audio_total = 0.0
587
- for index, scene in enumerate(scenes):
588
- sample = SceneAnalysisSample(
589
- scene_index=index,
590
- start_second=float(scene.start),
591
- end_second=float(scene.end),
592
- start_frame=int(scene.start_frame),
593
- end_frame=int(scene.end_frame),
594
- caption=captions[index],
595
- )
596
-
597
- if audio_classifier is not None:
598
- t0 = time.perf_counter()
597
+ with _record_stage(run_info, "scene_vlm"):
599
598
  try:
600
- scene_clip: Video | None = None
601
- if path_audio is None:
602
- try:
603
- scene_clip = self._load_scene_video_clip(
604
- source_path=source_path,
605
- video=video,
606
- start_second=scene.start,
607
- end_second=scene.end,
608
- )
609
- except Exception:
610
- scene_clip = None
611
- sample.audio_classification = self._run_scene_audio_classification(
612
- audio_classifier=audio_classifier,
613
- path_audio=path_audio,
614
- scene_clip=scene_clip,
615
- scene_start=scene.start,
616
- scene_end=scene.end,
599
+ captions = self._run_scene_vlm_batched(
600
+ scene_vlm=scene_vlm,
601
+ source_path=source_path,
602
+ video=video,
603
+ metadata=metadata,
604
+ scenes=scenes,
617
605
  )
618
606
  except Exception:
619
- logger.warning(
620
- "AudioClassifier failed for scene %d (%.1f-%.1fs)", index, scene.start, scene.end, exc_info=True
621
- )
622
- t_audio_total += time.perf_counter() - t0
607
+ logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
623
608
 
624
- samples.append(sample)
609
+ samples: list[SceneAnalysisSample] = []
610
+ audio_ctx = _record_stage(run_info, "audio_classification") if audio_classifier is not None else nullcontext()
611
+ with audio_ctx:
612
+ for index, scene in enumerate(scenes):
613
+ sample = SceneAnalysisSample(
614
+ scene_index=index,
615
+ start_second=float(scene.start),
616
+ end_second=float(scene.end),
617
+ start_frame=int(scene.start_frame),
618
+ end_frame=int(scene.end_frame),
619
+ caption=captions[index],
620
+ )
625
621
 
626
- if audio_classifier is not None:
627
- logger.info("AudioClassifier inference total: %.2fs across %d scenes", t_audio_total, len(scenes))
622
+ if audio_classifier is not None:
623
+ try:
624
+ scene_clip: Video | None = None
625
+ if path_audio is None:
626
+ try:
627
+ scene_clip = self._load_scene_video_clip(
628
+ source_path=source_path,
629
+ video=video,
630
+ start_second=scene.start,
631
+ end_second=scene.end,
632
+ )
633
+ except Exception:
634
+ scene_clip = None
635
+ sample.audio_classification = self._run_scene_audio_classification(
636
+ audio_classifier=audio_classifier,
637
+ path_audio=path_audio,
638
+ scene_clip=scene_clip,
639
+ scene_start=scene.start,
640
+ scene_end=scene.end,
641
+ )
642
+ except Exception:
643
+ logger.warning(
644
+ "AudioClassifier failed for scene %d (%.1f-%.1fs)",
645
+ index,
646
+ scene.start,
647
+ scene.end,
648
+ exc_info=True,
649
+ )
650
+
651
+ samples.append(sample)
628
652
 
629
653
  return SceneAnalysisSection(samples=samples)
630
654
 
@@ -893,6 +917,27 @@ def _utc_now_iso() -> str:
893
917
  return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
894
918
 
895
919
 
920
+ @contextmanager
921
+ def _record_stage(run_info: AnalysisRunInfo, stage: str) -> Iterator[None]:
922
+ """Time a block, write the elapsed seconds into ``run_info``, and log it."""
923
+ t0 = time.perf_counter()
924
+ try:
925
+ yield
926
+ finally:
927
+ elapsed = time.perf_counter() - t0
928
+ run_info.stage_durations_seconds[stage] = elapsed
929
+ logger.info("%s completed in %.2fs", stage, elapsed)
930
+
931
+
932
+ _T = TypeVar("_T")
933
+
934
+
935
+ def _run_with_stage(run_info: AnalysisRunInfo, stage: str, fn: Callable[..., _T], /, **kwargs: Any) -> _T:
936
+ """Call ``fn(**kwargs)`` inside ``_record_stage``. Use with ``ThreadPoolExecutor.submit``."""
937
+ with _record_stage(run_info, stage):
938
+ return fn(**kwargs)
939
+
940
+
896
941
  def _library_version() -> str | None:
897
942
  try:
898
943
  return importlib_metadata.version("videopython")
File without changes
File without changes
File without changes