videopython 0.26.8__tar.gz → 0.26.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.26.8 → videopython-0.26.10}/PKG-INFO +1 -1
- {videopython-0.26.8 → videopython-0.26.10}/pyproject.toml +1 -1
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/pipeline.py +19 -4
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/video_analysis.py +123 -78
- {videopython-0.26.8 → videopython-0.26.10}/.gitignore +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/LICENSE +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/README.md +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/_device.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/registry.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/combine.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/description.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/effects.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/progress.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/registry.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/scene.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/streaming.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/transforms.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/transitions.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/utils.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/base/video.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.26.8 → videopython-0.26.10}/src/videopython/py.typed +0 -0
|
@@ -350,10 +350,25 @@ class LocalDubbingPipeline:
|
|
|
350
350
|
speaker = segment.speaker or "speaker_0"
|
|
351
351
|
cached_path = speaker_wav_paths.get(speaker) if voice_clone else None
|
|
352
352
|
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
353
|
+
try:
|
|
354
|
+
if cached_path is not None:
|
|
355
|
+
dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=cached_path)
|
|
356
|
+
else:
|
|
357
|
+
dubbed_audio = self._tts.generate_audio(segment.translated_text)
|
|
358
|
+
except Exception as e:
|
|
359
|
+
# Chatterbox occasionally crashes on short translated text
|
|
360
|
+
# (alignment_stream_analyzer indexing on tensors with <=5
|
|
361
|
+
# speech tokens). One bad segment shouldn't lose a long
|
|
362
|
+
# multi-hour run — log and skip so the rest proceeds.
|
|
363
|
+
logger.warning(
|
|
364
|
+
"TTS failed for segment %d/%d (speaker=%s, text=%r): %s — skipping",
|
|
365
|
+
i + 1,
|
|
366
|
+
len(translated_segments),
|
|
367
|
+
speaker,
|
|
368
|
+
segment.translated_text,
|
|
369
|
+
e,
|
|
370
|
+
)
|
|
371
|
+
continue
|
|
357
372
|
|
|
358
373
|
dubbed_segments.append(dubbed_audio)
|
|
359
374
|
target_durations.append(segment.duration)
|
|
@@ -7,12 +7,14 @@ import math
|
|
|
7
7
|
import re
|
|
8
8
|
import subprocess
|
|
9
9
|
import time
|
|
10
|
+
from collections.abc import Callable, Iterator
|
|
10
11
|
from concurrent.futures import ThreadPoolExecutor
|
|
12
|
+
from contextlib import contextmanager, nullcontext
|
|
11
13
|
from dataclasses import dataclass, field
|
|
12
14
|
from datetime import datetime, timezone
|
|
13
15
|
from importlib import metadata as importlib_metadata
|
|
14
16
|
from pathlib import Path
|
|
15
|
-
from typing import Any
|
|
17
|
+
from typing import Any, TypeVar
|
|
16
18
|
|
|
17
19
|
import numpy as np
|
|
18
20
|
from PIL import Image
|
|
@@ -144,17 +146,28 @@ class VideoAnalysisSource:
|
|
|
144
146
|
|
|
145
147
|
@dataclass
|
|
146
148
|
class AnalysisRunInfo:
|
|
147
|
-
"""Runtime/provenance metadata for a full analysis run.
|
|
149
|
+
"""Runtime/provenance metadata for a full analysis run.
|
|
150
|
+
|
|
151
|
+
``stage_durations_seconds`` is populated by the analyzer with per-stage
|
|
152
|
+
wall-clock times (whisper, scene_detection, scene_analysis, scene_vlm,
|
|
153
|
+
audio_classification, and -- when both run together --
|
|
154
|
+
whisper_and_scene_detection_parallel). Consumers can persist or aggregate
|
|
155
|
+
these to track pipeline performance over time.
|
|
156
|
+
"""
|
|
148
157
|
|
|
149
158
|
created_at: str
|
|
150
159
|
mode: str
|
|
151
160
|
library_version: str | None = None
|
|
161
|
+
stage_durations_seconds: dict[str, float] = field(default_factory=dict)
|
|
162
|
+
total_duration_seconds: float | None = None
|
|
152
163
|
|
|
153
164
|
def to_dict(self) -> dict[str, Any]:
|
|
154
165
|
return {
|
|
155
166
|
"created_at": self.created_at,
|
|
156
167
|
"mode": self.mode,
|
|
157
168
|
"library_version": self.library_version,
|
|
169
|
+
"stage_durations_seconds": dict(self.stage_durations_seconds),
|
|
170
|
+
"total_duration_seconds": self.total_duration_seconds,
|
|
158
171
|
}
|
|
159
172
|
|
|
160
173
|
@classmethod
|
|
@@ -163,6 +176,8 @@ class AnalysisRunInfo:
|
|
|
163
176
|
created_at=data["created_at"],
|
|
164
177
|
mode=data["mode"],
|
|
165
178
|
library_version=data.get("library_version"),
|
|
179
|
+
stage_durations_seconds={str(k): float(v) for k, v in data["stage_durations_seconds"].items()},
|
|
180
|
+
total_duration_seconds=data["total_duration_seconds"],
|
|
166
181
|
)
|
|
167
182
|
|
|
168
183
|
|
|
@@ -413,17 +428,17 @@ class VideoAnalyzer:
|
|
|
413
428
|
# which corrupts Whisper's model weights if they're initialized at the
|
|
414
429
|
# same time.
|
|
415
430
|
if run_whisper and run_scene_det:
|
|
416
|
-
transcription, detected = self._run_whisper_and_scene_detection(
|
|
431
|
+
transcription, detected = self._run_whisper_and_scene_detection(
|
|
432
|
+
source_path=source_path, video=video, run_info=run_info
|
|
433
|
+
)
|
|
417
434
|
else:
|
|
418
435
|
if run_whisper:
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
logger.info("Whisper transcription completed in %.2fs", time.perf_counter() - t0)
|
|
436
|
+
with _record_stage(run_info, "whisper"):
|
|
437
|
+
transcription = self._run_whisper(source_path=source_path, video=video)
|
|
422
438
|
|
|
423
439
|
if run_scene_det:
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
logger.info("Scene detection completed in %.2fs", time.perf_counter() - t0)
|
|
440
|
+
with _record_stage(run_info, "scene_detection"):
|
|
441
|
+
detected = self._run_scene_detection(source_path=source_path, video=video)
|
|
427
442
|
|
|
428
443
|
if run_scene_det:
|
|
429
444
|
self._reset_transnetv2_torch_state()
|
|
@@ -442,19 +457,20 @@ class VideoAnalyzer:
|
|
|
442
457
|
if not scenes:
|
|
443
458
|
scenes = self._default_scene_boundaries(metadata)
|
|
444
459
|
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
460
|
+
with _record_stage(run_info, "scene_analysis"):
|
|
461
|
+
scene_section = self._analyze_scenes(
|
|
462
|
+
source_path=source_path,
|
|
463
|
+
video=video,
|
|
464
|
+
metadata=metadata,
|
|
465
|
+
scenes=scenes,
|
|
466
|
+
preloaded_scene_vlm=None,
|
|
467
|
+
run_info=run_info,
|
|
468
|
+
)
|
|
454
469
|
|
|
455
470
|
audio_section = AudioAnalysisSection(transcription=transcription) if transcription is not None else None
|
|
456
471
|
|
|
457
|
-
|
|
472
|
+
run_info.total_duration_seconds = time.perf_counter() - t_analysis_start
|
|
473
|
+
logger.info("Total analysis completed in %.2fs", run_info.total_duration_seconds)
|
|
458
474
|
return VideoAnalysis(
|
|
459
475
|
source=source,
|
|
460
476
|
config=self.config,
|
|
@@ -485,17 +501,23 @@ class VideoAnalyzer:
|
|
|
485
501
|
return None
|
|
486
502
|
|
|
487
503
|
def _run_whisper_and_scene_detection(
|
|
488
|
-
self, *, source_path: Path | None, video: Video | None
|
|
504
|
+
self, *, source_path: Path | None, video: Video | None, run_info: AnalysisRunInfo
|
|
489
505
|
) -> tuple[Transcription | None, list[SceneBoundary] | None]:
|
|
490
|
-
with
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
506
|
+
with _record_stage(run_info, "whisper_and_scene_detection_parallel"):
|
|
507
|
+
with ThreadPoolExecutor(max_workers=2) as pool:
|
|
508
|
+
whisper_future = pool.submit(
|
|
509
|
+
_run_with_stage, run_info, "whisper", self._run_whisper, source_path=source_path, video=video
|
|
510
|
+
)
|
|
511
|
+
scene_future = pool.submit(
|
|
512
|
+
_run_with_stage,
|
|
513
|
+
run_info,
|
|
514
|
+
"scene_detection",
|
|
515
|
+
self._run_scene_detection,
|
|
516
|
+
source_path=source_path,
|
|
517
|
+
video=video,
|
|
518
|
+
)
|
|
519
|
+
transcription = whisper_future.result()
|
|
520
|
+
detected = scene_future.result()
|
|
499
521
|
|
|
500
522
|
return transcription, detected
|
|
501
523
|
|
|
@@ -536,6 +558,7 @@ class VideoAnalyzer:
|
|
|
536
558
|
video: Video | None,
|
|
537
559
|
metadata: VideoMetadata,
|
|
538
560
|
scenes: list[SceneBoundary],
|
|
561
|
+
run_info: AnalysisRunInfo,
|
|
539
562
|
preloaded_scene_vlm: SceneVLM | None = None,
|
|
540
563
|
) -> SceneAnalysisSection:
|
|
541
564
|
enabled = self.config.enabled_analyzers
|
|
@@ -571,60 +594,61 @@ class VideoAnalyzer:
|
|
|
571
594
|
# -- Batched SceneVLM: collect all timestamps, extract frames once, run one forward pass --
|
|
572
595
|
captions: list[str | None] = [None] * len(scenes)
|
|
573
596
|
if scene_vlm is not None:
|
|
574
|
-
|
|
575
|
-
captions = self._run_scene_vlm_batched(
|
|
576
|
-
scene_vlm=scene_vlm,
|
|
577
|
-
source_path=source_path,
|
|
578
|
-
video=video,
|
|
579
|
-
metadata=metadata,
|
|
580
|
-
scenes=scenes,
|
|
581
|
-
)
|
|
582
|
-
except Exception:
|
|
583
|
-
logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
|
|
584
|
-
|
|
585
|
-
samples: list[SceneAnalysisSample] = []
|
|
586
|
-
t_audio_total = 0.0
|
|
587
|
-
for index, scene in enumerate(scenes):
|
|
588
|
-
sample = SceneAnalysisSample(
|
|
589
|
-
scene_index=index,
|
|
590
|
-
start_second=float(scene.start),
|
|
591
|
-
end_second=float(scene.end),
|
|
592
|
-
start_frame=int(scene.start_frame),
|
|
593
|
-
end_frame=int(scene.end_frame),
|
|
594
|
-
caption=captions[index],
|
|
595
|
-
)
|
|
596
|
-
|
|
597
|
-
if audio_classifier is not None:
|
|
598
|
-
t0 = time.perf_counter()
|
|
597
|
+
with _record_stage(run_info, "scene_vlm"):
|
|
599
598
|
try:
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
start_second=scene.start,
|
|
607
|
-
end_second=scene.end,
|
|
608
|
-
)
|
|
609
|
-
except Exception:
|
|
610
|
-
scene_clip = None
|
|
611
|
-
sample.audio_classification = self._run_scene_audio_classification(
|
|
612
|
-
audio_classifier=audio_classifier,
|
|
613
|
-
path_audio=path_audio,
|
|
614
|
-
scene_clip=scene_clip,
|
|
615
|
-
scene_start=scene.start,
|
|
616
|
-
scene_end=scene.end,
|
|
599
|
+
captions = self._run_scene_vlm_batched(
|
|
600
|
+
scene_vlm=scene_vlm,
|
|
601
|
+
source_path=source_path,
|
|
602
|
+
video=video,
|
|
603
|
+
metadata=metadata,
|
|
604
|
+
scenes=scenes,
|
|
617
605
|
)
|
|
618
606
|
except Exception:
|
|
619
|
-
logger.warning(
|
|
620
|
-
"AudioClassifier failed for scene %d (%.1f-%.1fs)", index, scene.start, scene.end, exc_info=True
|
|
621
|
-
)
|
|
622
|
-
t_audio_total += time.perf_counter() - t0
|
|
607
|
+
logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
|
|
623
608
|
|
|
624
|
-
|
|
609
|
+
samples: list[SceneAnalysisSample] = []
|
|
610
|
+
audio_ctx = _record_stage(run_info, "audio_classification") if audio_classifier is not None else nullcontext()
|
|
611
|
+
with audio_ctx:
|
|
612
|
+
for index, scene in enumerate(scenes):
|
|
613
|
+
sample = SceneAnalysisSample(
|
|
614
|
+
scene_index=index,
|
|
615
|
+
start_second=float(scene.start),
|
|
616
|
+
end_second=float(scene.end),
|
|
617
|
+
start_frame=int(scene.start_frame),
|
|
618
|
+
end_frame=int(scene.end_frame),
|
|
619
|
+
caption=captions[index],
|
|
620
|
+
)
|
|
625
621
|
|
|
626
|
-
|
|
627
|
-
|
|
622
|
+
if audio_classifier is not None:
|
|
623
|
+
try:
|
|
624
|
+
scene_clip: Video | None = None
|
|
625
|
+
if path_audio is None:
|
|
626
|
+
try:
|
|
627
|
+
scene_clip = self._load_scene_video_clip(
|
|
628
|
+
source_path=source_path,
|
|
629
|
+
video=video,
|
|
630
|
+
start_second=scene.start,
|
|
631
|
+
end_second=scene.end,
|
|
632
|
+
)
|
|
633
|
+
except Exception:
|
|
634
|
+
scene_clip = None
|
|
635
|
+
sample.audio_classification = self._run_scene_audio_classification(
|
|
636
|
+
audio_classifier=audio_classifier,
|
|
637
|
+
path_audio=path_audio,
|
|
638
|
+
scene_clip=scene_clip,
|
|
639
|
+
scene_start=scene.start,
|
|
640
|
+
scene_end=scene.end,
|
|
641
|
+
)
|
|
642
|
+
except Exception:
|
|
643
|
+
logger.warning(
|
|
644
|
+
"AudioClassifier failed for scene %d (%.1f-%.1fs)",
|
|
645
|
+
index,
|
|
646
|
+
scene.start,
|
|
647
|
+
scene.end,
|
|
648
|
+
exc_info=True,
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
samples.append(sample)
|
|
628
652
|
|
|
629
653
|
return SceneAnalysisSection(samples=samples)
|
|
630
654
|
|
|
@@ -893,6 +917,27 @@ def _utc_now_iso() -> str:
|
|
|
893
917
|
return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
|
|
894
918
|
|
|
895
919
|
|
|
920
|
+
@contextmanager
|
|
921
|
+
def _record_stage(run_info: AnalysisRunInfo, stage: str) -> Iterator[None]:
|
|
922
|
+
"""Time a block, write the elapsed seconds into ``run_info``, and log it."""
|
|
923
|
+
t0 = time.perf_counter()
|
|
924
|
+
try:
|
|
925
|
+
yield
|
|
926
|
+
finally:
|
|
927
|
+
elapsed = time.perf_counter() - t0
|
|
928
|
+
run_info.stage_durations_seconds[stage] = elapsed
|
|
929
|
+
logger.info("%s completed in %.2fs", stage, elapsed)
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
_T = TypeVar("_T")
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
def _run_with_stage(run_info: AnalysisRunInfo, stage: str, fn: Callable[..., _T], /, **kwargs: Any) -> _T:
|
|
936
|
+
"""Call ``fn(**kwargs)`` inside ``_record_stage``. Use with ``ThreadPoolExecutor.submit``."""
|
|
937
|
+
with _record_stage(run_info, stage):
|
|
938
|
+
return fn(**kwargs)
|
|
939
|
+
|
|
940
|
+
|
|
896
941
|
def _library_version() -> str | None:
|
|
897
942
|
try:
|
|
898
943
|
return importlib_metadata.version("videopython")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|