videopython 0.33.3__tar.gz → 0.33.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.33.3 → videopython-0.33.4}/PKG-INFO +1 -1
- {videopython-0.33.3 → videopython-0.33.4}/pyproject.toml +1 -1
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/transcription.py +93 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/editing/transcription_overlay.py +23 -0
- {videopython-0.33.3 → videopython-0.33.4}/.gitignore +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/LICENSE +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/README.md +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/_device.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/config.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/expressiveness.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/loudness.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/dubbing/voice_sample.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/video_analysis/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/video_analysis/analyzer.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/video_analysis/models.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/video_analysis/sampling.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/ai/video_analysis/stages.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/audio/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/audio/analysis.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/audio/audio.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/_dimensions.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/_ffmpeg.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/_video_io.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/description.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/fonts/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/image_text.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/video.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/editing/effects.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/editing/operation.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/editing/streaming.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/editing/transforms.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.33.3 → videopython-0.33.4}/src/videopython/py.typed +0 -0
|
@@ -6,6 +6,13 @@ from typing import Any
|
|
|
6
6
|
|
|
7
7
|
__all__ = ["Transcription", "TranscriptionSegment", "TranscriptionWord"]
|
|
8
8
|
|
|
9
|
+
# Sentence-ending punctuation used by ``Transcription.capitalize_sentences``.
|
|
10
|
+
_SENTENCE_TERMINATORS = (".", "!", "?", "…")
|
|
11
|
+
|
|
12
|
+
# Trailing characters stripped before checking for a sentence terminator
|
|
13
|
+
# (closing quotes/brackets and plain spaces), so ``end."`` still ends a sentence.
|
|
14
|
+
_TRAILING_WRAPPERS = "\"')]}»”’ "
|
|
15
|
+
|
|
9
16
|
|
|
10
17
|
@dataclass
|
|
11
18
|
class TranscriptionWord:
|
|
@@ -279,6 +286,92 @@ class Transcription:
|
|
|
279
286
|
|
|
280
287
|
return Transcription(segments=standardized_segments, language=self.language)
|
|
281
288
|
|
|
289
|
+
def capitalize_sentences(self) -> Transcription:
    """Return a new Transcription with sentence-start capitalization.

    The first letter of the first spoken word and of every word that
    follows sentence-ending punctuation (``.``, ``!``, ``?``, ``…``) is
    upper-cased. Remaining characters are left untouched, so acronyms and
    proper nouns from the source transcription are preserved. Timing,
    speaker, confidence metadata, and language are carried through
    unchanged.

    Sentence state is tracked across segment boundaries: a segment that
    ends mid-sentence does not force a capital at the start of the next.

    Segments that carry no word-level entries keep their ``text`` (it is
    capitalized with the same rule, piece by piece) instead of being
    rebuilt from an empty word list, which would blank the text.

    Abbreviation detection is intentionally not attempted: a token like
    ``"U.S."`` is treated as a sentence end. This heuristic is adequate
    for burned-in subtitles and avoids a brittle abbreviation list.

    Returns:
        A new ``Transcription``; ``self`` and its segments are not mutated.
    """
    start_of_sentence = True

    def capitalize(token: str) -> str:
        """Capitalize ``token`` if it opens a sentence and update the state."""
        nonlocal start_of_sentence
        if start_of_sentence:
            # Skip leading quotes/brackets/digits and find the first letter.
            idx = next((i for i, ch in enumerate(token) if ch.isalpha()), None)
            if idx is not None:
                token = token[:idx] + token[idx].upper() + token[idx + 1 :]
                # Only a token that actually contained a letter consumes
                # the sentence start; letter-free tokens leave it pending.
                start_of_sentence = False
        # Ignore closing quotes/brackets so ``end."`` still ends a sentence.
        if token.rstrip(_TRAILING_WRAPPERS).endswith(_SENTENCE_TERMINATORS):
            start_of_sentence = True
        return token

    capitalized_segments: list[TranscriptionSegment] = []
    for segment in self.segments:
        new_words = [
            TranscriptionWord(
                start=word.start,
                end=word.end,
                word=capitalize(word.word),
                speaker=word.speaker,
            )
            for word in segment.words
        ]

        if new_words:
            text = " ".join(w.word for w in new_words)
        else:
            # No word-level data: keep the segment text rather than
            # replacing it with "". Splitting and re-joining on a single
            # space preserves the original spacing exactly.
            text = " ".join(capitalize(piece) for piece in segment.text.split(" "))

        capitalized_segments.append(
            TranscriptionSegment(
                start=segment.start,
                end=segment.end,
                text=text,
                words=new_words,
                speaker=segment.speaker,
                avg_logprob=segment.avg_logprob,
                no_speech_prob=segment.no_speech_prob,
                compression_ratio=segment.compression_ratio,
            )
        )

    return Transcription(segments=capitalized_segments, language=self.language)
|
|
332
|
+
|
|
333
|
+
def chunk_segments(self, max_words: int) -> Transcription:
    """Return a new Transcription whose segments hold at most ``max_words`` words.

    Every source segment is cut into consecutive runs of up to
    ``max_words`` words. Each run becomes its own segment, timed from its
    first word's start to its last word's end. Runs never span two source
    segments, so the pauses between source segments are kept. Speaker and
    confidence fields are copied from the parent segment, and the language
    is carried over. A segment with no words is passed through as-is.

    Args:
        max_words: Upper bound on the number of words in one output segment.

    Raises:
        ValueError: If ``max_words`` is not positive.
    """
    if max_words <= 0:
        raise ValueError("max_words must be positive")

    def build_cue(parent, run):
        """Create one output segment from a run of words of ``parent``."""
        return TranscriptionSegment(
            start=run[0].start,
            end=run[-1].end,
            text=" ".join(entry.word for entry in run),
            words=list(run),
            speaker=parent.speaker,
            avg_logprob=parent.avg_logprob,
            no_speech_prob=parent.no_speech_prob,
            compression_ratio=parent.compression_ratio,
        )

    cues: list[TranscriptionSegment] = []
    for parent in self.segments:
        parent_words = parent.words
        if parent_words:
            cues.extend(
                build_cue(parent, parent_words[offset : offset + max_words])
                for offset in range(0, len(parent_words), max_words)
            )
        else:
            # Nothing to split; keep the original segment object.
            cues.append(parent)

    return Transcription(segments=cues, language=self.language)
|
|
374
|
+
|
|
282
375
|
def slice(self, start: float, end: float) -> Transcription | None:
|
|
283
376
|
"""Return a new Transcription containing only words within the time range.
|
|
284
377
|
|
|
@@ -78,6 +78,24 @@ class TranscriptionOverlay(Effect):
|
|
|
78
78
|
highlight_bold_font: str | None = Field(
|
|
79
79
|
None, description="Path to a bold .ttf font for the highlighted word, or None to use the regular font."
|
|
80
80
|
)
|
|
81
|
+
max_words_per_cue: int | None = Field(
|
|
82
|
+
5,
|
|
83
|
+
ge=1,
|
|
84
|
+
description=(
|
|
85
|
+
"Maximum words shown on screen at once. Each transcription segment is re-chunked into "
|
|
86
|
+
"cues of at most this many words, without bridging the silence gaps between segments, so "
|
|
87
|
+
"subtitles stay readable and don't linger over pauses. None preserves the source "
|
|
88
|
+
"transcription's segmentation."
|
|
89
|
+
),
|
|
90
|
+
)
|
|
91
|
+
capitalize: bool = Field(
|
|
92
|
+
True,
|
|
93
|
+
description=(
|
|
94
|
+
"Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
|
|
95
|
+
"Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
|
|
96
|
+
"exactly as transcribed."
|
|
97
|
+
),
|
|
98
|
+
)
|
|
81
99
|
|
|
82
100
|
_overlay_cache: dict[tuple[str, int | None], np.ndarray] = PrivateAttr(default_factory=dict)
|
|
83
101
|
|
|
@@ -140,6 +158,11 @@ class TranscriptionOverlay(Effect):
|
|
|
140
158
|
"Pass it via VideoEdit.run(context={'transcription': ...}) or directly to apply()."
|
|
141
159
|
)
|
|
142
160
|
|
|
161
|
+
if self.max_words_per_cue is not None:
|
|
162
|
+
transcription = transcription.chunk_segments(self.max_words_per_cue)
|
|
163
|
+
if self.capitalize:
|
|
164
|
+
transcription = transcription.capitalize_sentences()
|
|
165
|
+
|
|
143
166
|
logger.info("Applying transcription overlay...")
|
|
144
167
|
new_frames = []
|
|
145
168
|
for frame_index, frame in enumerate(tqdm(video.frames, desc="Transcription overlay")):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|