videopython 0.33.3__tar.gz → 0.33.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.33.3 → videopython-0.33.5}/PKG-INFO +1 -1
- {videopython-0.33.3 → videopython-0.33.5}/pyproject.toml +1 -1
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/transcription.py +135 -70
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/transcription_overlay.py +23 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/video_edit.py +116 -4
- {videopython-0.33.3 → videopython-0.33.5}/.gitignore +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/LICENSE +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/README.md +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/_device.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/config.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/expressiveness.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/loudness.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/dubbing/voice_sample.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/video_analysis/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/video_analysis/analyzer.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/video_analysis/models.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/video_analysis/sampling.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/ai/video_analysis/stages.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/audio/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/audio/analysis.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/audio/audio.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/_dimensions.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/_ffmpeg.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/_video_io.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/description.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/fonts/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/image_text.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/video.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/effects.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/operation.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/streaming.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/transforms.py +0 -0
- {videopython-0.33.3 → videopython-0.33.5}/src/videopython/py.typed +0 -0
|
@@ -1,11 +1,18 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from dataclasses import dataclass
|
|
3
|
+
from dataclasses import dataclass, replace
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
7
7
|
__all__ = ["Transcription", "TranscriptionSegment", "TranscriptionWord"]
|
|
8
8
|
|
|
9
|
+
# Sentence-ending punctuation used by ``Transcription.capitalize_sentences``.
|
|
10
|
+
_SENTENCE_TERMINATORS = (".", "!", "?", "…")
|
|
11
|
+
|
|
12
|
+
# Trailing characters stripped before checking for a sentence terminator
|
|
13
|
+
# (closing quotes/brackets and whitespace), so ``end."`` still ends a sentence.
|
|
14
|
+
_TRAILING_WRAPPERS = "\"')]}»”’ "
|
|
15
|
+
|
|
9
16
|
|
|
10
17
|
@dataclass
|
|
11
18
|
class TranscriptionWord:
|
|
@@ -72,6 +79,38 @@ class TranscriptionSegment:
|
|
|
72
79
|
compression_ratio=data.get("compression_ratio"),
|
|
73
80
|
)
|
|
74
81
|
|
|
82
|
+
@classmethod
|
|
83
|
+
def from_words(
|
|
84
|
+
cls,
|
|
85
|
+
words: list[TranscriptionWord],
|
|
86
|
+
*,
|
|
87
|
+
speaker: str | None = None,
|
|
88
|
+
avg_logprob: float | None = None,
|
|
89
|
+
no_speech_prob: float | None = None,
|
|
90
|
+
compression_ratio: float | None = None,
|
|
91
|
+
) -> TranscriptionSegment:
|
|
92
|
+
"""Build a segment spanning ``words``, deriving start/end/text from them.
|
|
93
|
+
|
|
94
|
+
``words`` must be non-empty: ``start``/``end`` come from the first/last
|
|
95
|
+
word and ``text`` is the words joined by single spaces. Speaker and the
|
|
96
|
+
confidence fields are passed through so callers re-segmenting *within* a
|
|
97
|
+
known source segment can preserve them; callers regrouping words across
|
|
98
|
+
segments (where these are ambiguous) simply omit them, leaving ``None``.
|
|
99
|
+
The ``words`` list is copied, so the result never aliases the caller's.
|
|
100
|
+
"""
|
|
101
|
+
if not words:
|
|
102
|
+
raise ValueError("from_words requires a non-empty word list")
|
|
103
|
+
return cls(
|
|
104
|
+
start=words[0].start,
|
|
105
|
+
end=words[-1].end,
|
|
106
|
+
text=" ".join(w.word for w in words),
|
|
107
|
+
words=list(words),
|
|
108
|
+
speaker=speaker,
|
|
109
|
+
avg_logprob=avg_logprob,
|
|
110
|
+
no_speech_prob=no_speech_prob,
|
|
111
|
+
compression_ratio=compression_ratio,
|
|
112
|
+
)
|
|
113
|
+
|
|
75
114
|
|
|
76
115
|
class Transcription:
|
|
77
116
|
def __init__(
|
|
@@ -117,39 +156,19 @@ class Transcription:
|
|
|
117
156
|
return []
|
|
118
157
|
|
|
119
158
|
current_speaker = words[0].speaker
|
|
120
|
-
current_words = []
|
|
121
|
-
segment_start = words[0].start
|
|
159
|
+
current_words: list[TranscriptionWord] = []
|
|
122
160
|
segments = []
|
|
123
161
|
|
|
124
162
|
for word in words:
|
|
125
163
|
if current_speaker == word.speaker:
|
|
126
164
|
current_words.append(word)
|
|
127
165
|
else:
|
|
128
|
-
|
|
129
|
-
segments.append(
|
|
130
|
-
TranscriptionSegment(
|
|
131
|
-
start=segment_start,
|
|
132
|
-
end=current_words[-1].end,
|
|
133
|
-
text=segment_text.strip(),
|
|
134
|
-
words=current_words.copy(),
|
|
135
|
-
speaker=current_speaker,
|
|
136
|
-
)
|
|
137
|
-
)
|
|
166
|
+
segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
|
|
138
167
|
current_speaker = word.speaker
|
|
139
168
|
current_words = [word]
|
|
140
|
-
segment_start = word.start
|
|
141
169
|
|
|
142
170
|
if current_words:
|
|
143
|
-
|
|
144
|
-
segments.append(
|
|
145
|
-
TranscriptionSegment(
|
|
146
|
-
start=segment_start,
|
|
147
|
-
end=current_words[-1].end,
|
|
148
|
-
text=segment_text.strip(),
|
|
149
|
-
words=current_words.copy(),
|
|
150
|
-
speaker=current_speaker,
|
|
151
|
-
)
|
|
152
|
-
)
|
|
171
|
+
segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
|
|
153
172
|
|
|
154
173
|
return segments
|
|
155
174
|
|
|
@@ -183,22 +202,14 @@ class Transcription:
|
|
|
183
202
|
offset_segments = []
|
|
184
203
|
|
|
185
204
|
for segment in self.segments:
|
|
186
|
-
offset_words = [
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
)
|
|
193
|
-
|
|
205
|
+
offset_words = [
|
|
206
|
+
TranscriptionWord(start=w.start + time, end=w.end + time, word=w.word, speaker=w.speaker)
|
|
207
|
+
for w in segment.words
|
|
208
|
+
]
|
|
209
|
+
# ``replace`` carries text, speaker, and confidence fields through a
|
|
210
|
+
# pure timing shift unchanged -- only timestamps move.
|
|
194
211
|
offset_segments.append(
|
|
195
|
-
|
|
196
|
-
start=segment.start + time,
|
|
197
|
-
end=segment.end + time,
|
|
198
|
-
text=segment.text,
|
|
199
|
-
words=offset_words,
|
|
200
|
-
speaker=segment.speaker,
|
|
201
|
-
)
|
|
212
|
+
replace(segment, start=segment.start + time, end=segment.end + time, words=offset_words)
|
|
202
213
|
)
|
|
203
214
|
|
|
204
215
|
return Transcription(segments=offset_segments, language=self.language)
|
|
@@ -238,16 +249,9 @@ class Transcription:
|
|
|
238
249
|
def _flush(words: list[TranscriptionWord]) -> None:
|
|
239
250
|
if not words:
|
|
240
251
|
return
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
start=words[0].start,
|
|
245
|
-
end=words[-1].end,
|
|
246
|
-
text=segment_text,
|
|
247
|
-
words=words.copy(),
|
|
248
|
-
speaker=words[0].speaker,
|
|
249
|
-
)
|
|
250
|
-
)
|
|
252
|
+
# Words here are regrouped across original segments, so the source
|
|
253
|
+
# segments' confidence fields no longer apply -- left as None.
|
|
254
|
+
standardized_segments.append(TranscriptionSegment.from_words(words, speaker=words[0].speaker))
|
|
251
255
|
|
|
252
256
|
if time is not None:
|
|
253
257
|
current_words: list[TranscriptionWord] = []
|
|
@@ -279,6 +283,84 @@ class Transcription:
|
|
|
279
283
|
|
|
280
284
|
return Transcription(segments=standardized_segments, language=self.language)
|
|
281
285
|
|
|
286
|
+
def capitalize_sentences(self) -> Transcription:
|
|
287
|
+
"""Return a new Transcription with sentence-start capitalization.
|
|
288
|
+
|
|
289
|
+
The first letter of the first spoken word and of every word that
|
|
290
|
+
follows sentence-ending punctuation (``.``, ``!``, ``?``, ``…``) is
|
|
291
|
+
upper-cased. Remaining characters are left untouched, so acronyms and
|
|
292
|
+
proper nouns from the source transcription are preserved. Timing,
|
|
293
|
+
speaker, and language are carried through unchanged.
|
|
294
|
+
|
|
295
|
+
Abbreviation detection is intentionally not attempted: a token like
|
|
296
|
+
``"U.S."`` is treated as a sentence end. This heuristic is adequate
|
|
297
|
+
for burned-in subtitles and avoids a brittle abbreviation list.
|
|
298
|
+
"""
|
|
299
|
+
capitalized_segments: list[TranscriptionSegment] = []
|
|
300
|
+
start_of_sentence = True
|
|
301
|
+
|
|
302
|
+
for segment in self.segments:
|
|
303
|
+
new_words: list[TranscriptionWord] = []
|
|
304
|
+
for word in segment.words:
|
|
305
|
+
token = word.word
|
|
306
|
+
if start_of_sentence:
|
|
307
|
+
idx = next((i for i, ch in enumerate(token) if ch.isalpha()), None)
|
|
308
|
+
if idx is not None:
|
|
309
|
+
token = token[:idx] + token[idx].upper() + token[idx + 1 :]
|
|
310
|
+
start_of_sentence = False
|
|
311
|
+
if token.rstrip(_TRAILING_WRAPPERS).endswith(_SENTENCE_TERMINATORS):
|
|
312
|
+
start_of_sentence = True
|
|
313
|
+
new_words.append(TranscriptionWord(start=word.start, end=word.end, word=token, speaker=word.speaker))
|
|
314
|
+
|
|
315
|
+
# Casing-only rewrite: segment boundaries, speaker, and confidence
|
|
316
|
+
# are unchanged; only the tokens (and joined text) differ.
|
|
317
|
+
capitalized_segments.append(replace(segment, text=" ".join(w.word for w in new_words), words=new_words))
|
|
318
|
+
|
|
319
|
+
return Transcription(segments=capitalized_segments, language=self.language)
|
|
320
|
+
|
|
321
|
+
def chunk_segments(self, max_words: int) -> Transcription:
|
|
322
|
+
"""Return a new Transcription splitting each segment into smaller cues.
|
|
323
|
+
|
|
324
|
+
Each segment is split into consecutive groups of at most ``max_words``
|
|
325
|
+
words, using that group's own first/last word timings. Unlike
|
|
326
|
+
:meth:`standardize_segments`, words are never merged across the
|
|
327
|
+
original segments, so silence gaps between segments are preserved and
|
|
328
|
+
subtitles do not linger over pauses. Speaker, confidence, and language
|
|
329
|
+
metadata are carried through unchanged.
|
|
330
|
+
|
|
331
|
+
Args:
|
|
332
|
+
max_words: Maximum number of words per output segment.
|
|
333
|
+
|
|
334
|
+
Raises:
|
|
335
|
+
ValueError: If ``max_words`` is not positive.
|
|
336
|
+
"""
|
|
337
|
+
if max_words <= 0:
|
|
338
|
+
raise ValueError("max_words must be positive")
|
|
339
|
+
|
|
340
|
+
chunked_segments: list[TranscriptionSegment] = []
|
|
341
|
+
for segment in self.segments:
|
|
342
|
+
words = segment.words
|
|
343
|
+
if not words:
|
|
344
|
+
# Nothing to split; emit a fresh copy so the result never
|
|
345
|
+
# aliases the source segment.
|
|
346
|
+
chunked_segments.append(replace(segment, words=list(segment.words)))
|
|
347
|
+
continue
|
|
348
|
+
for i in range(0, len(words), max_words):
|
|
349
|
+
group = words[i : i + max_words]
|
|
350
|
+
# Splitting *within* one source segment -- its confidence
|
|
351
|
+
# fields still apply, so carry them through.
|
|
352
|
+
chunked_segments.append(
|
|
353
|
+
TranscriptionSegment.from_words(
|
|
354
|
+
group,
|
|
355
|
+
speaker=segment.speaker,
|
|
356
|
+
avg_logprob=segment.avg_logprob,
|
|
357
|
+
no_speech_prob=segment.no_speech_prob,
|
|
358
|
+
compression_ratio=segment.compression_ratio,
|
|
359
|
+
)
|
|
360
|
+
)
|
|
361
|
+
|
|
362
|
+
return Transcription(segments=chunked_segments, language=self.language)
|
|
363
|
+
|
|
282
364
|
def slice(self, start: float, end: float) -> Transcription | None:
|
|
283
365
|
"""Return a new Transcription containing only words within the time range.
|
|
284
366
|
|
|
@@ -316,34 +398,17 @@ class Transcription:
|
|
|
316
398
|
if word.speaker == current_speaker:
|
|
317
399
|
current_words.append(word)
|
|
318
400
|
else:
|
|
319
|
-
# Finish current segment
|
|
401
|
+
# Finish current segment (speaker is ambiguous across the
|
|
402
|
+
# original segments these words came from -- confidence omitted)
|
|
320
403
|
if current_words:
|
|
321
|
-
|
|
322
|
-
sliced_segments.append(
|
|
323
|
-
TranscriptionSegment(
|
|
324
|
-
start=current_words[0].start,
|
|
325
|
-
end=current_words[-1].end,
|
|
326
|
-
text=segment_text,
|
|
327
|
-
words=current_words.copy(),
|
|
328
|
-
speaker=current_speaker,
|
|
329
|
-
)
|
|
330
|
-
)
|
|
404
|
+
sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
|
|
331
405
|
# Start new segment
|
|
332
406
|
current_speaker = word.speaker
|
|
333
407
|
current_words = [word]
|
|
334
408
|
|
|
335
409
|
# Add final segment
|
|
336
410
|
if current_words:
|
|
337
|
-
|
|
338
|
-
sliced_segments.append(
|
|
339
|
-
TranscriptionSegment(
|
|
340
|
-
start=current_words[0].start,
|
|
341
|
-
end=current_words[-1].end,
|
|
342
|
-
text=segment_text,
|
|
343
|
-
words=current_words.copy(),
|
|
344
|
-
speaker=current_speaker,
|
|
345
|
-
)
|
|
346
|
-
)
|
|
411
|
+
sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
|
|
347
412
|
|
|
348
413
|
return Transcription(segments=sliced_segments, language=self.language)
|
|
349
414
|
|
|
@@ -78,6 +78,24 @@ class TranscriptionOverlay(Effect):
|
|
|
78
78
|
highlight_bold_font: str | None = Field(
|
|
79
79
|
None, description="Path to a bold .ttf font for the highlighted word, or None to use the regular font."
|
|
80
80
|
)
|
|
81
|
+
max_words_per_cue: int | None = Field(
|
|
82
|
+
5,
|
|
83
|
+
ge=1,
|
|
84
|
+
description=(
|
|
85
|
+
"Maximum words shown on screen at once. Each transcription segment is re-chunked into "
|
|
86
|
+
"cues of at most this many words, without bridging the silence gaps between segments, so "
|
|
87
|
+
"subtitles stay readable and don't linger over pauses. None preserves the source "
|
|
88
|
+
"transcription's segmentation."
|
|
89
|
+
),
|
|
90
|
+
)
|
|
91
|
+
capitalize: bool = Field(
|
|
92
|
+
True,
|
|
93
|
+
description=(
|
|
94
|
+
"Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
|
|
95
|
+
"Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
|
|
96
|
+
"exactly as transcribed."
|
|
97
|
+
),
|
|
98
|
+
)
|
|
81
99
|
|
|
82
100
|
_overlay_cache: dict[tuple[str, int | None], np.ndarray] = PrivateAttr(default_factory=dict)
|
|
83
101
|
|
|
@@ -140,6 +158,11 @@ class TranscriptionOverlay(Effect):
|
|
|
140
158
|
"Pass it via VideoEdit.run(context={'transcription': ...}) or directly to apply()."
|
|
141
159
|
)
|
|
142
160
|
|
|
161
|
+
if self.max_words_per_cue is not None:
|
|
162
|
+
transcription = transcription.chunk_segments(self.max_words_per_cue)
|
|
163
|
+
if self.capitalize:
|
|
164
|
+
transcription = transcription.capitalize_sentences()
|
|
165
|
+
|
|
143
166
|
logger.info("Applying transcription overlay...")
|
|
144
167
|
new_frames = []
|
|
145
168
|
for frame_index, frame in enumerate(tqdm(video.frames, desc="Transcription overlay")):
|
|
@@ -24,7 +24,7 @@ import subprocess
|
|
|
24
24
|
import tempfile
|
|
25
25
|
import warnings
|
|
26
26
|
from pathlib import Path
|
|
27
|
-
from typing import Annotated, Any
|
|
27
|
+
from typing import Annotated, Any, Protocol, runtime_checkable
|
|
28
28
|
|
|
29
29
|
from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, SerializeAsAny, model_validator
|
|
30
30
|
|
|
@@ -65,6 +65,72 @@ def _resolve_operation(value: Any) -> Operation:
|
|
|
65
65
|
OperationInput = Annotated[SerializeAsAny[Operation], BeforeValidator(_resolve_operation)]
|
|
66
66
|
|
|
67
67
|
|
|
68
|
+
@runtime_checkable
|
|
69
|
+
class SegmentRebaseable(Protocol):
|
|
70
|
+
"""A runtime-context value carrying a source-absolute timeline.
|
|
71
|
+
|
|
72
|
+
Any context entry implementing both ``slice(start, end)`` and
|
|
73
|
+
``offset(delta)`` -- e.g. :class:`videopython.base.transcription.Transcription`
|
|
74
|
+
-- is automatically re-based onto each segment's 0-based local timeline by
|
|
75
|
+
the runner, with no per-type wiring. Keying off structure rather than a
|
|
76
|
+
concrete class keeps the context mechanism generic for future time-based
|
|
77
|
+
context (beat maps, scene markers, ...) and avoids a layering dependency
|
|
78
|
+
from the editing layer onto every such type.
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
def slice(self, start: float, end: float) -> SegmentRebaseable | None: ...
|
|
82
|
+
|
|
83
|
+
def offset(self, delta: float) -> SegmentRebaseable: ...
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _rebaseable_keys(context: dict[str, Any] | None) -> set[str]:
|
|
87
|
+
"""Context keys whose value carries a re-baseable source-absolute timeline."""
|
|
88
|
+
if not context:
|
|
89
|
+
return set()
|
|
90
|
+
return {k for k, v in context.items() if isinstance(v, SegmentRebaseable)}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _segment_context(
|
|
94
|
+
context: dict[str, Any] | None,
|
|
95
|
+
start: float,
|
|
96
|
+
end: float,
|
|
97
|
+
) -> dict[str, Any] | None:
|
|
98
|
+
"""Re-base time-based context entries onto a cut segment's local timeline.
|
|
99
|
+
|
|
100
|
+
A cut segment is decoded 0-based -- its first frame is ``t=0`` -- but
|
|
101
|
+
context values may carry source-absolute timestamps. Every value
|
|
102
|
+
implementing :class:`SegmentRebaseable` (e.g. a ``Transcription``) is
|
|
103
|
+
sliced to ``[start, end)`` and shifted by ``-start`` so segment operations
|
|
104
|
+
(``add_subtitles``, ``silence_removal``) see segment-local time. Without
|
|
105
|
+
this, subtitles on a segment cut from the middle of a video render blank.
|
|
106
|
+
Values that don't implement the protocol pass through untouched.
|
|
107
|
+
|
|
108
|
+
Slicing always runs (even for ``start == 0``) so out-of-range entries do
|
|
109
|
+
not bleed in. When ``slice`` yields nothing the key is dropped rather than
|
|
110
|
+
passed empty, so the consuming operation raises its own clear "requires
|
|
111
|
+
..." error instead of silently doing nothing.
|
|
112
|
+
|
|
113
|
+
Scope: per-segment only. ``post_operations`` run on the assembled,
|
|
114
|
+
concatenated timeline; re-basing time-based context across a multi-segment
|
|
115
|
+
concat is unsupported and rejected up front by
|
|
116
|
+
:meth:`VideoEdit._assert_post_ops_supported` (single-segment plans are
|
|
117
|
+
unaffected).
|
|
118
|
+
"""
|
|
119
|
+
if not context:
|
|
120
|
+
return context
|
|
121
|
+
rebaseable = {k: v for k, v in context.items() if isinstance(v, SegmentRebaseable)}
|
|
122
|
+
if not rebaseable:
|
|
123
|
+
return context
|
|
124
|
+
rebased = dict(context)
|
|
125
|
+
for key, value in rebaseable.items():
|
|
126
|
+
sliced = value.slice(start, end)
|
|
127
|
+
if sliced is None:
|
|
128
|
+
del rebased[key]
|
|
129
|
+
else:
|
|
130
|
+
rebased[key] = sliced.offset(-start)
|
|
131
|
+
return rebased
|
|
132
|
+
|
|
133
|
+
|
|
68
134
|
def _apply_with_context(op: Operation, video: Video, context: dict[str, Any] | None) -> Video:
|
|
69
135
|
"""Apply ``op`` to ``video``, threading ``op.requires`` keys from ``context``."""
|
|
70
136
|
if op.requires and context:
|
|
@@ -139,9 +205,14 @@ class SegmentConfig(BaseModel):
|
|
|
139
205
|
)
|
|
140
206
|
|
|
141
207
|
def process(self, video: Video, context: dict[str, Any] | None = None) -> Video:
|
|
142
|
-
"""Apply every operation in this segment to ``video`` in order."""
|
|
208
|
+
"""Apply every operation in this segment to ``video`` in order.
|
|
209
|
+
|
|
210
|
+
Time-based context (e.g. ``transcription``) is re-based onto this
|
|
211
|
+
segment's 0-based local timeline before any operation sees it.
|
|
212
|
+
"""
|
|
213
|
+
seg_context = _segment_context(context, self.start, self.end)
|
|
143
214
|
for op in self.operations:
|
|
144
|
-
video = _apply_with_context(op, video, context)
|
|
215
|
+
video = _apply_with_context(op, video, seg_context)
|
|
145
216
|
return video
|
|
146
217
|
|
|
147
218
|
|
|
@@ -288,11 +359,38 @@ class VideoEdit(BaseModel):
|
|
|
288
359
|
metas.append(source_metadata[key])
|
|
289
360
|
return self._validate(metas, context)
|
|
290
361
|
|
|
362
|
+
def _assert_post_ops_supported(self, context: dict[str, Any] | None) -> None:
|
|
363
|
+
"""Reject post_operations needing time-based context on a multi-segment plan.
|
|
364
|
+
|
|
365
|
+
``post_operations`` run on the assembled, concatenated timeline. A
|
|
366
|
+
source-absolute context value (e.g. a ``Transcription``) cannot be
|
|
367
|
+
re-based across a multi-segment concat, and passing the raw value would
|
|
368
|
+
silently mis-time the op (subtitles/silence-removal against the wrong
|
|
369
|
+
timeline). Fail fast with an actionable message instead of producing a
|
|
370
|
+
wrong render. Single-segment plans are unaffected -- their concatenated
|
|
371
|
+
timeline is just the one segment's, handled by ``_segment_context``.
|
|
372
|
+
"""
|
|
373
|
+
if len(self.segments) <= 1 or not self.post_operations:
|
|
374
|
+
return
|
|
375
|
+
rebaseable = _rebaseable_keys(context)
|
|
376
|
+
if not rebaseable:
|
|
377
|
+
return
|
|
378
|
+
for op in self.post_operations:
|
|
379
|
+
clash = sorted(set(op.requires) & rebaseable)
|
|
380
|
+
if clash:
|
|
381
|
+
raise ValueError(
|
|
382
|
+
f"post_operation '{op.op}' requires time-based context {clash}, but the plan "
|
|
383
|
+
f"has {len(self.segments)} segments. post_operations run on the concatenated "
|
|
384
|
+
"timeline and time-based context is not re-based across a multi-segment concat. "
|
|
385
|
+
f"Move '{op.op}' into a segment, or use a single-segment plan."
|
|
386
|
+
)
|
|
387
|
+
|
|
291
388
|
def _validate(
|
|
292
389
|
self,
|
|
293
390
|
source_metas: list[VideoMetadata],
|
|
294
391
|
context: dict[str, Any] | None,
|
|
295
392
|
) -> VideoMetadata:
|
|
393
|
+
self._assert_post_ops_supported(context)
|
|
296
394
|
cut_metas: list[VideoMetadata] = []
|
|
297
395
|
for i, (seg, meta) in enumerate(zip(self.segments, source_metas)):
|
|
298
396
|
if seg.end > meta.total_seconds + 1e-3:
|
|
@@ -325,10 +423,11 @@ class VideoEdit(BaseModel):
|
|
|
325
423
|
meta: VideoMetadata,
|
|
326
424
|
context: dict[str, Any] | None,
|
|
327
425
|
) -> VideoMetadata:
|
|
426
|
+
seg_context = _segment_context(context, segment.start, segment.end)
|
|
328
427
|
for op in segment.operations:
|
|
329
428
|
_validate_effect_window(op, meta.total_seconds)
|
|
330
429
|
try:
|
|
331
|
-
meta = _predict_with_context(op, meta, context)
|
|
430
|
+
meta = _predict_with_context(op, meta, seg_context)
|
|
332
431
|
except (ValueError, TypeError) as e:
|
|
333
432
|
raise ValueError(f"Segment {index}: metadata prediction failed for '{op.op}': {e}") from e
|
|
334
433
|
return meta
|
|
@@ -367,6 +466,7 @@ class VideoEdit(BaseModel):
|
|
|
367
466
|
|
|
368
467
|
def run(self, context: dict[str, Any] | None = None) -> Video:
|
|
369
468
|
"""Execute the plan in memory and return the final ``Video``."""
|
|
469
|
+
self._assert_post_ops_supported(context)
|
|
370
470
|
target_fps, target_w, target_h = self._matching_targets_from_disk()
|
|
371
471
|
videos = [
|
|
372
472
|
segment.process(segment.load(fps=target_fps, width=target_w, height=target_h), context)
|
|
@@ -393,6 +493,7 @@ class VideoEdit(BaseModel):
|
|
|
393
493
|
isn't streamable. Memory usage is O(1) w.r.t. video length for fully
|
|
394
494
|
streamable pipelines.
|
|
395
495
|
"""
|
|
496
|
+
self._assert_post_ops_supported(context)
|
|
396
497
|
output_path = Path(output_path).with_suffix(f".{format}")
|
|
397
498
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
398
499
|
|
|
@@ -412,6 +513,11 @@ class VideoEdit(BaseModel):
|
|
|
412
513
|
plan = plans[0]
|
|
413
514
|
total_frames = round((plan.end_second - plan.start_second) * plan.output_fps)
|
|
414
515
|
for op in self.post_operations:
|
|
516
|
+
if op.requires:
|
|
517
|
+
# Same reason as the per-segment guard: no runtime context
|
|
518
|
+
# in the streaming path. (Multi-segment + requires already
|
|
519
|
+
# raised by _assert_post_ops_supported.)
|
|
520
|
+
return self._run_to_file_eager(output_path, format, preset, crf, context)
|
|
415
521
|
if not isinstance(op, Effect) or not op.streamable:
|
|
416
522
|
return self._run_to_file_eager(output_path, format, preset, crf, context)
|
|
417
523
|
start_f, end_f = _effect_frame_range(op, plan.output_fps, total_frames)
|
|
@@ -477,6 +583,12 @@ class VideoEdit(BaseModel):
|
|
|
477
583
|
|
|
478
584
|
effect_schedule: list[EffectScheduleEntry] = []
|
|
479
585
|
for op in segment.operations:
|
|
586
|
+
if op.requires:
|
|
587
|
+
# Streaming schedules effects by frame range with no runtime
|
|
588
|
+
# context, so it can't supply -- let alone re-base onto the
|
|
589
|
+
# segment's local timeline -- anything an op `requires`. Defer
|
|
590
|
+
# to the eager path, where _segment_context handles re-basing.
|
|
591
|
+
return None
|
|
480
592
|
if isinstance(op, Effect):
|
|
481
593
|
if not op.streamable:
|
|
482
594
|
return None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|