videopython 0.26.9.tar.gz → 0.27.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. {videopython-0.26.9 → videopython-0.27.0}/PKG-INFO +2 -1
  2. {videopython-0.26.9 → videopython-0.27.0}/pyproject.toml +6 -1
  3. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/audio.py +91 -6
  4. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/video_analysis.py +123 -78
  5. {videopython-0.26.9 → videopython-0.27.0}/.gitignore +0 -0
  6. {videopython-0.26.9 → videopython-0.27.0}/LICENSE +0 -0
  7. {videopython-0.26.9 → videopython-0.27.0}/README.md +0 -0
  8. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/__init__.py +0 -0
  9. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/__init__.py +0 -0
  10. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/_device.py +0 -0
  11. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/__init__.py +0 -0
  12. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/dubber.py +0 -0
  13. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/models.py +0 -0
  14. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
  15. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/remux.py +0 -0
  16. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/dubbing/timing.py +0 -0
  17. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/__init__.py +0 -0
  18. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/audio.py +0 -0
  19. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/image.py +0 -0
  20. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/translation.py +0 -0
  21. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/generation/video.py +0 -0
  22. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/registry.py +0 -0
  23. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/__init__.py +0 -0
  24. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/inpainter.py +0 -0
  25. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/models.py +0 -0
  26. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/segmenter.py +0 -0
  27. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/swapping/swapper.py +0 -0
  28. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/transforms.py +0 -0
  29. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/__init__.py +0 -0
  30. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/image.py +0 -0
  31. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/separation.py +0 -0
  32. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/temporal.py +0 -0
  33. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/__init__.py +0 -0
  34. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/audio/__init__.py +0 -0
  35. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/audio/analysis.py +0 -0
  36. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/audio/audio.py +0 -0
  37. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/combine.py +0 -0
  38. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/description.py +0 -0
  39. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/effects.py +0 -0
  40. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/exceptions.py +0 -0
  41. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/progress.py +0 -0
  42. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/registry.py +0 -0
  43. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/scene.py +0 -0
  44. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/streaming.py +0 -0
  45. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/text/__init__.py +0 -0
  46. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/text/overlay.py +0 -0
  47. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/text/transcription.py +0 -0
  48. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/transforms.py +0 -0
  49. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/transitions.py +0 -0
  50. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/utils.py +0 -0
  51. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/base/video.py +0 -0
  52. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/editing/__init__.py +0 -0
  53. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/editing/multicam.py +0 -0
  54. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/editing/premiere_xml.py +0 -0
  55. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/editing/video_edit.py +0 -0
  56. {videopython-0.26.9 → videopython-0.27.0}/src/videopython/py.typed +0 -0
{videopython-0.26.9 → videopython-0.27.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.26.9
+Version: 0.27.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -36,6 +36,7 @@ Requires-Dist: pyannote-audio>=4.0.0; extra == 'ai'
 Requires-Dist: scikit-learn>=1.3.0; extra == 'ai'
 Requires-Dist: scipy>=1.10.0; extra == 'ai'
 Requires-Dist: sentencepiece>=0.1.99; extra == 'ai'
+Requires-Dist: silero-vad>=5.1; extra == 'ai'
 Requires-Dist: torch>=2.8.0; extra == 'ai'
 Requires-Dist: torchaudio>=2.8.0; extra == 'ai'
 Requires-Dist: transformers>=5.2.0; extra == 'ai'
{videopython-0.26.9 → videopython-0.27.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.26.9"
+version = "0.27.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -62,6 +62,8 @@ ai = [
     "transformers>=5.2.0",
     "openai-whisper>=20240930",
     "pyannote-audio>=4.0.0",
+    # Voice activity detection (used by AudioToText to gate Whisper language detection)
+    "silero-vad>=5.1",
     "numba>=0.61.0",
     "ollama>=0.4.5",
     "scipy>=1.10.0",
@@ -91,6 +93,8 @@ ai = [
     "transformers>=5.2.0",
     "openai-whisper>=20240930",
     "pyannote-audio>=4.0.0",
+    # Voice activity detection (used by AudioToText to gate Whisper language detection)
+    "silero-vad>=5.1",
     "numba>=0.61.0",
     "ollama>=0.4.5",
     "scipy>=1.10.0",
@@ -130,6 +134,7 @@ module = [
     "demucs", "demucs.*",
     "huggingface_hub", "huggingface_hub.*",
     "pyannote", "pyannote.*",
+    "silero_vad", "silero_vad.*",
     "cv2", "cv2.*",
 ]
 ignore_missing_imports = true
{videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/understanding/audio.py

@@ -24,10 +24,12 @@ class AudioToText:
         self,
         model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small",
         enable_diarization: bool = False,
+        enable_vad: bool = True,
         device: str | None = None,
     ):
         self.model_name = model_name
         self.enable_diarization = enable_diarization
+        self.enable_vad = enable_vad
         self.device = select_device(device, mps_allowed=False)
         log_device_initialization(
             "AudioToText",
@@ -36,6 +38,7 @@ class AudioToText:
         )
         self._model: Any = None
         self._diarization_pipeline: Any = None
+        self._vad_model: Any = None

     def _init_local(self) -> None:
         """Initialize local Whisper model."""
@@ -51,13 +54,25 @@ class AudioToText:
         self._diarization_pipeline = Pipeline.from_pretrained(self.PYANNOTE_DIARIZATION_MODEL)
         self._diarization_pipeline.to(torch.device(self.device))

+    def _init_vad(self) -> None:
+        """Initialize Silero VAD model.
+
+        The model is ~2 MB and CPU-fast (~5-15s for a 90 min movie); we keep
+        it on CPU regardless of ``self.device`` since dispatch overhead would
+        outweigh inference cost.
+        """
+        from silero_vad import load_silero_vad
+
+        self._vad_model = load_silero_vad()
+
     def unload(self) -> None:
-        """Release the Whisper and diarization models so the next call re-initializes.
+        """Release the Whisper, diarization, and VAD models so the next call re-initializes.

         Used by low-memory dubbing to free VRAM between pipeline stages.
         """
         self._model = None
         self._diarization_pipeline = None
+        self._vad_model = None
         release_device_memory(self.device)

     def _process_transcription_result(self, transcription_result: dict) -> Transcription:
@@ -172,7 +187,60 @@ class AudioToText:
             all_words = self._assign_speakers_to_words(all_words, diarization_result)
         return Transcription(words=all_words, language=transcription.language)

-    def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
+    def _run_vad(self, audio_mono: Audio) -> list[tuple[float, float]]:
+        """Return voiced spans in seconds using Silero VAD.
+
+        Audio must already be mono at ``whisper.audio.SAMPLE_RATE`` (16 kHz),
+        which is one of Silero's two supported rates.
+        """
+        import numpy as np
+        import torch
+
+        if self._vad_model is None:
+            self._init_vad()
+
+        from silero_vad import get_speech_timestamps
+
+        waveform = torch.from_numpy(audio_mono.data.astype(np.float32))
+        timestamps = get_speech_timestamps(
+            waveform,
+            self._vad_model,
+            sampling_rate=audio_mono.metadata.sample_rate,
+            return_seconds=True,
+        )
+        return [(float(ts["start"]), float(ts["end"])) for ts in timestamps]
+
+    def _detect_language(self, audio_mono: Audio, voiced_spans: list[tuple[float, float]]) -> str:
+        """Run Whisper language detection on a 30s window of voiced audio.
+
+        Whisper's auto-detection only inspects the first 30s of input. When
+        the file opens with silence/music/credits, that window contains no
+        speech and detection picks the closest-looking thing (typically
+        English). Concatenating voiced spans up to 30s and running
+        ``model.detect_language()`` on the resulting mel fixes this.
+        """
+        import numpy as np
+        import torch
+        import whisper
+
+        sample_rate = audio_mono.metadata.sample_rate
+        chunks: list[np.ndarray] = []
+        remaining = whisper.audio.N_SAMPLES
+        for start, end in voiced_spans:
+            if remaining <= 0:
+                break
+            chunk = audio_mono.data[int(start * sample_rate) : int(end * sample_rate)][:remaining]
+            chunks.append(chunk)
+            remaining -= len(chunk)
+
+        voiced_audio = np.concatenate(chunks).astype(np.float32) if chunks else np.zeros(0, dtype=np.float32)
+        padded = whisper.audio.pad_or_trim(torch.from_numpy(voiced_audio))
+        mel = whisper.audio.log_mel_spectrogram(padded, n_mels=self._model.dims.n_mels).to(self._model.device)
+
+        _, probs = self._model.detect_language(mel)
+        return max(probs, key=probs.get)
+
+    def _transcribe_with_diarization(self, audio_mono: Audio, language: str | None) -> Transcription:
         """Transcribe with word timestamps and assign speakers via pyannote."""
         import numpy as np
         import torch
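
Taken together, the two new helpers amount to the following standalone recipe. This is a sketch against the public silero-vad and openai-whisper APIs used above; the file path and the "small" model size are placeholder assumptions, not part of the diff.

    import numpy as np
    import torch
    import whisper
    from silero_vad import get_speech_timestamps, load_silero_vad

    model = whisper.load_model("small")
    audio = whisper.load_audio("movie.mp4")  # mono float32 at 16 kHz

    # 1. Locate voiced spans with Silero VAD (tiny model, runs on CPU).
    spans = get_speech_timestamps(
        torch.from_numpy(audio), load_silero_vad(),
        sampling_rate=whisper.audio.SAMPLE_RATE, return_seconds=True,
    )

    # 2. Concatenate up to 30s (N_SAMPLES at 16 kHz) of voiced audio.
    sr = whisper.audio.SAMPLE_RATE
    chunks, remaining = [], whisper.audio.N_SAMPLES
    for ts in spans:
        if remaining <= 0:
            break
        chunk = audio[int(ts["start"] * sr) : int(ts["end"] * sr)][:remaining]
        chunks.append(chunk)
        remaining -= len(chunk)
    voiced = np.concatenate(chunks) if chunks else np.zeros(0, dtype=np.float32)

    # 3. Detect language on the voiced window only, then pin it for decoding.
    mel = whisper.audio.log_mel_spectrogram(
        whisper.audio.pad_or_trim(torch.from_numpy(voiced)), n_mels=model.dims.n_mels
    ).to(model.device)
    _, probs = model.detect_language(mel)
    language = max(probs, key=probs.get)
    result = model.transcribe(audio, word_timestamps=True, language=language)
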
@@ -181,7 +249,7 @@ class AudioToText:
             self._init_diarization()

         audio_data = audio_mono.data
-        transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True)
+        transcription_result = self._model.transcribe(audio=audio_data, word_timestamps=True, language=language)

         waveform = torch.from_numpy(audio_data.astype(np.float32)).unsqueeze(0)
         diarization_result = self._diarization_pipeline(
@@ -200,7 +268,17 @@ class AudioToText:
         return Transcription(words=all_words, language=transcription.language)

     def _transcribe_local(self, audio: Audio) -> Transcription:
-        """Transcribe using local Whisper model."""
+        """Transcribe using local Whisper model.
+
+        When ``enable_vad`` is True (default), Silero VAD locates voiced
+        regions and a 30s voiced window is used for Whisper language
+        detection -- avoiding the well-known failure where Whisper locks
+        onto the wrong language because the first 30s of input is silence
+        or music. The detected language is then passed into
+        ``transcribe()`` so chunked decoding stays consistent. If VAD
+        finds no speech, an empty Transcription is returned without
+        invoking Whisper.
+        """
         import whisper

         if self._model is None:
@@ -208,10 +286,17 @@ class AudioToText:
             self._init_local()

         audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)

+        language: str | None = None
+        if self.enable_vad:
+            voiced_spans = self._run_vad(audio_mono)
+            if not voiced_spans:
+                return Transcription(segments=[])
+            language = self._detect_language(audio_mono, voiced_spans)
+
         if self.enable_diarization:
-            return self._transcribe_with_diarization(audio_mono)
+            return self._transcribe_with_diarization(audio_mono, language)

-        transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True)
+        transcription_result = self._model.transcribe(audio=audio_mono.data, word_timestamps=True, language=language)
         return self._process_transcription_result(transcription_result)

     def transcribe(self, media: Audio | Video) -> Transcription:
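
From the caller's side the change is a single constructor flag. A usage sketch; the import path follows the file layout above, and `Video.from_path` is a hypothetical loader name, not confirmed by this diff:

    from videopython.ai.understanding.audio import AudioToText
    from videopython.base.video import Video

    video = Video.from_path("interview.mp4")  # hypothetical loader name

    transcriber = AudioToText(model_name="small")  # enable_vad defaults to True
    transcription = transcriber.transcribe(video)
    print(transcription.language, len(transcription.words))

    # enable_vad=False restores the pre-0.27.0 behavior: Whisper auto-detects
    # the language from the first 30 seconds, speech or not.
    transcriber.unload()  # now also releases the cached Silero VAD model
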
{videopython-0.26.9 → videopython-0.27.0}/src/videopython/ai/video_analysis.py

@@ -7,12 +7,14 @@ import math
 import re
 import subprocess
 import time
+from collections.abc import Callable, Iterator
 from concurrent.futures import ThreadPoolExecutor
+from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from importlib import metadata as importlib_metadata
 from pathlib import Path
-from typing import Any
+from typing import Any, TypeVar

 import numpy as np
 from PIL import Image
@@ -144,17 +146,28 @@ class VideoAnalysisSource:

 @dataclass
 class AnalysisRunInfo:
-    """Runtime/provenance metadata for a full analysis run."""
+    """Runtime/provenance metadata for a full analysis run.
+
+    ``stage_durations_seconds`` is populated by the analyzer with per-stage
+    wall-clock times (whisper, scene_detection, scene_analysis, scene_vlm,
+    audio_classification, and -- when both run together --
+    whisper_and_scene_detection_parallel). Consumers can persist or aggregate
+    these to track pipeline performance over time.
+    """

     created_at: str
     mode: str
     library_version: str | None = None
+    stage_durations_seconds: dict[str, float] = field(default_factory=dict)
+    total_duration_seconds: float | None = None

     def to_dict(self) -> dict[str, Any]:
         return {
             "created_at": self.created_at,
             "mode": self.mode,
             "library_version": self.library_version,
+            "stage_durations_seconds": dict(self.stage_durations_seconds),
+            "total_duration_seconds": self.total_duration_seconds,
         }

     @classmethod
@@ -163,6 +176,8 @@ class AnalysisRunInfo:
             created_at=data["created_at"],
             mode=data["mode"],
             library_version=data.get("library_version"),
+            stage_durations_seconds={str(k): float(v) for k, v in data["stage_durations_seconds"].items()},
+            total_duration_seconds=data["total_duration_seconds"],
         )

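
A small sketch of the new fields round-tripping through serialization; the classmethod name `from_dict` is inferred from the hunk above and the numbers are invented:

    info = AnalysisRunInfo(created_at="2026-02-01T12:00:00Z", mode="full")
    info.stage_durations_seconds["whisper"] = 41.3
    info.stage_durations_seconds["scene_detection"] = 12.7
    info.total_duration_seconds = 61.9

    restored = AnalysisRunInfo.from_dict(info.to_dict())  # classmethod name assumed
    assert restored.stage_durations_seconds == info.stage_durations_seconds
    assert restored.total_duration_seconds == info.total_duration_seconds

Note that the deserializer indexes the two new keys directly rather than using .get(), so payloads written by 0.26.9 would raise KeyError here.
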
@@ -413,17 +428,17 @@ class VideoAnalyzer:
         # which corrupts Whisper's model weights if they're initialized at the
         # same time.
         if run_whisper and run_scene_det:
-            transcription, detected = self._run_whisper_and_scene_detection(source_path=source_path, video=video)
+            transcription, detected = self._run_whisper_and_scene_detection(
+                source_path=source_path, video=video, run_info=run_info
+            )
         else:
             if run_whisper:
-                t0 = time.perf_counter()
-                transcription = self._run_whisper(source_path=source_path, video=video)
-                logger.info("Whisper transcription completed in %.2fs", time.perf_counter() - t0)
+                with _record_stage(run_info, "whisper"):
+                    transcription = self._run_whisper(source_path=source_path, video=video)

             if run_scene_det:
-                t0 = time.perf_counter()
-                detected = self._run_scene_detection(source_path=source_path, video=video)
-                logger.info("Scene detection completed in %.2fs", time.perf_counter() - t0)
+                with _record_stage(run_info, "scene_detection"):
+                    detected = self._run_scene_detection(source_path=source_path, video=video)

         if run_scene_det:
             self._reset_transnetv2_torch_state()
@@ -442,19 +457,20 @@ class VideoAnalyzer:
         if not scenes:
             scenes = self._default_scene_boundaries(metadata)

-        t0 = time.perf_counter()
-        scene_section = self._analyze_scenes(
-            source_path=source_path,
-            video=video,
-            metadata=metadata,
-            scenes=scenes,
-            preloaded_scene_vlm=None,
-        )
-        logger.info("Scene analysis completed in %.2fs", time.perf_counter() - t0)
+        with _record_stage(run_info, "scene_analysis"):
+            scene_section = self._analyze_scenes(
+                source_path=source_path,
+                video=video,
+                metadata=metadata,
+                scenes=scenes,
+                preloaded_scene_vlm=None,
+                run_info=run_info,
+            )

         audio_section = AudioAnalysisSection(transcription=transcription) if transcription is not None else None

-        logger.info("Total analysis completed in %.2fs", time.perf_counter() - t_analysis_start)
+        run_info.total_duration_seconds = time.perf_counter() - t_analysis_start
+        logger.info("Total analysis completed in %.2fs", run_info.total_duration_seconds)
         return VideoAnalysis(
             source=source,
             config=self.config,
@@ -485,17 +501,23 @@ class VideoAnalyzer:
         return None

     def _run_whisper_and_scene_detection(
-        self, *, source_path: Path | None, video: Video | None
+        self, *, source_path: Path | None, video: Video | None, run_info: AnalysisRunInfo
     ) -> tuple[Transcription | None, list[SceneBoundary] | None]:
-        with ThreadPoolExecutor(max_workers=2) as pool:
-            t0 = time.perf_counter()
-            whisper_future = pool.submit(self._run_whisper, source_path=source_path, video=video)
-            scene_future = pool.submit(self._run_scene_detection, source_path=source_path, video=video)
-
-            transcription = whisper_future.result()
-            detected = scene_future.result()
-            elapsed = time.perf_counter() - t0
-            logger.info("Whisper + scene detection (parallel) completed in %.2fs", elapsed)
+        with _record_stage(run_info, "whisper_and_scene_detection_parallel"):
+            with ThreadPoolExecutor(max_workers=2) as pool:
+                whisper_future = pool.submit(
+                    _run_with_stage, run_info, "whisper", self._run_whisper, source_path=source_path, video=video
+                )
+                scene_future = pool.submit(
+                    _run_with_stage,
+                    run_info,
+                    "scene_detection",
+                    self._run_scene_detection,
+                    source_path=source_path,
+                    video=video,
+                )
+                transcription = whisper_future.result()
+                detected = scene_future.result()

         return transcription, detected
@@ -536,6 +558,7 @@ class VideoAnalyzer:
         video: Video | None,
         metadata: VideoMetadata,
         scenes: list[SceneBoundary],
+        run_info: AnalysisRunInfo,
         preloaded_scene_vlm: SceneVLM | None = None,
     ) -> SceneAnalysisSection:
         enabled = self.config.enabled_analyzers
@@ -571,60 +594,61 @@ class VideoAnalyzer:
         # -- Batched SceneVLM: collect all timestamps, extract frames once, run one forward pass --
         captions: list[str | None] = [None] * len(scenes)
         if scene_vlm is not None:
-            try:
-                captions = self._run_scene_vlm_batched(
-                    scene_vlm=scene_vlm,
-                    source_path=source_path,
-                    video=video,
-                    metadata=metadata,
-                    scenes=scenes,
-                )
-            except Exception:
-                logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)
-
-        samples: list[SceneAnalysisSample] = []
-        t_audio_total = 0.0
-        for index, scene in enumerate(scenes):
-            sample = SceneAnalysisSample(
-                scene_index=index,
-                start_second=float(scene.start),
-                end_second=float(scene.end),
-                start_frame=int(scene.start_frame),
-                end_frame=int(scene.end_frame),
-                caption=captions[index],
-            )
-
-            if audio_classifier is not None:
-                t0 = time.perf_counter()
+            with _record_stage(run_info, "scene_vlm"):
                 try:
-                    scene_clip: Video | None = None
-                    if path_audio is None:
-                        try:
-                            scene_clip = self._load_scene_video_clip(
-                                source_path=source_path,
-                                video=video,
-                                start_second=scene.start,
-                                end_second=scene.end,
-                            )
-                        except Exception:
-                            scene_clip = None
-                    sample.audio_classification = self._run_scene_audio_classification(
-                        audio_classifier=audio_classifier,
-                        path_audio=path_audio,
-                        scene_clip=scene_clip,
-                        scene_start=scene.start,
-                        scene_end=scene.end,
+                    captions = self._run_scene_vlm_batched(
+                        scene_vlm=scene_vlm,
+                        source_path=source_path,
+                        video=video,
+                        metadata=metadata,
+                        scenes=scenes,
                     )
                 except Exception:
-                    logger.warning(
-                        "AudioClassifier failed for scene %d (%.1f-%.1fs)", index, scene.start, scene.end, exc_info=True
-                    )
-                t_audio_total += time.perf_counter() - t0
+                    logger.warning("Batched SceneVLM failed, skipping visual understanding", exc_info=True)

-            samples.append(sample)
+        samples: list[SceneAnalysisSample] = []
+        audio_ctx = _record_stage(run_info, "audio_classification") if audio_classifier is not None else nullcontext()
+        with audio_ctx:
+            for index, scene in enumerate(scenes):
+                sample = SceneAnalysisSample(
+                    scene_index=index,
+                    start_second=float(scene.start),
+                    end_second=float(scene.end),
+                    start_frame=int(scene.start_frame),
+                    end_frame=int(scene.end_frame),
+                    caption=captions[index],
+                )

-        if audio_classifier is not None:
-            logger.info("AudioClassifier inference total: %.2fs across %d scenes", t_audio_total, len(scenes))
+                if audio_classifier is not None:
+                    try:
+                        scene_clip: Video | None = None
+                        if path_audio is None:
+                            try:
+                                scene_clip = self._load_scene_video_clip(
+                                    source_path=source_path,
+                                    video=video,
+                                    start_second=scene.start,
+                                    end_second=scene.end,
+                                )
+                            except Exception:
+                                scene_clip = None
+                        sample.audio_classification = self._run_scene_audio_classification(
+                            audio_classifier=audio_classifier,
+                            path_audio=path_audio,
+                            scene_clip=scene_clip,
+                            scene_start=scene.start,
+                            scene_end=scene.end,
+                        )
+                    except Exception:
+                        logger.warning(
+                            "AudioClassifier failed for scene %d (%.1f-%.1fs)",
+                            index,
+                            scene.start,
+                            scene.end,
+                            exc_info=True,
+                        )
+
+                samples.append(sample)

         return SceneAnalysisSection(samples=samples)
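
The `audio_ctx` line above leans on a standard-library idiom: `contextlib.nullcontext()` is a no-op context manager, so the `with` block can stay unconditional and the stage is recorded only when an audio classifier is configured. In isolation (illustrative timer, not the module's `_record_stage`):

    import time
    from contextlib import contextmanager, nullcontext

    durations: dict[str, float] = {}

    @contextmanager
    def timed(stage: str):
        start = time.perf_counter()
        try:
            yield
        finally:
            durations[stage] = time.perf_counter() - start

    classifier = None  # pretend no audio classifier is configured
    with timed("audio_classification") if classifier is not None else nullcontext():
        pass  # the per-scene loop runs here either way

    print(durations)  # {} -- nothing recorded without a classifier
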
@@ -893,6 +917,27 @@ def _utc_now_iso() -> str:
     return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


+@contextmanager
+def _record_stage(run_info: AnalysisRunInfo, stage: str) -> Iterator[None]:
+    """Time a block, write the elapsed seconds into ``run_info``, and log it."""
+    t0 = time.perf_counter()
+    try:
+        yield
+    finally:
+        elapsed = time.perf_counter() - t0
+        run_info.stage_durations_seconds[stage] = elapsed
+        logger.info("%s completed in %.2fs", stage, elapsed)
+
+
+_T = TypeVar("_T")
+
+
+def _run_with_stage(run_info: AnalysisRunInfo, stage: str, fn: Callable[..., _T], /, **kwargs: Any) -> _T:
+    """Call ``fn(**kwargs)`` inside ``_record_stage``. Use with ``ThreadPoolExecutor.submit``."""
+    with _record_stage(run_info, stage):
+        return fn(**kwargs)
+
+
 def _library_version() -> str | None:
     try:
         return importlib_metadata.version("videopython")
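
These two helpers compose: in `_run_whisper_and_scene_detection`, the outer `_record_stage` captures combined wall time while each submitted task times itself through `_run_with_stage`. A self-contained sketch of the same pattern (the helper names mirror the ones above; everything else is illustrative):

    import time
    from collections.abc import Callable, Iterator
    from concurrent.futures import ThreadPoolExecutor
    from contextlib import contextmanager

    durations: dict[str, float] = {}

    @contextmanager
    def record_stage(stage: str) -> Iterator[None]:
        t0 = time.perf_counter()
        try:
            yield
        finally:
            durations[stage] = time.perf_counter() - t0

    def run_with_stage(stage: str, fn: Callable[..., object], /, **kwargs: object) -> object:
        # submit()-friendly wrapper: each worker thread times its own stage.
        with record_stage(stage):
            return fn(**kwargs)

    def work(seconds: float) -> None:
        time.sleep(seconds)

    with record_stage("parallel_total"):
        with ThreadPoolExecutor(max_workers=2) as pool:
            a = pool.submit(run_with_stage, "whisper", work, seconds=0.2)
            b = pool.submit(run_with_stage, "scene_detection", work, seconds=0.3)
            a.result()
            b.result()

    print(durations)  # parallel_total ~= max of the two stages, not their sum
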