videopython 0.27.2__tar.gz → 0.28.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.27.2 → videopython-0.28.0}/PKG-INFO +1 -1
- {videopython-0.27.2 → videopython-0.28.0}/pyproject.toml +1 -1
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/__init__.py +4 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/dubber.py +10 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/models.py +97 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/pipeline.py +159 -26
- videopython-0.28.0/src/videopython/ai/dubbing/quality.py +178 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/translation.py +22 -4
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/audio.py +3 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/text/transcription.py +9 -0
- {videopython-0.27.2 → videopython-0.28.0}/.gitignore +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/LICENSE +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/README.md +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/registry.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/combine.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/description.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/effects.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/progress.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/registry.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/scene.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/streaming.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/transforms.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/transitions.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/utils.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/base/video.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.27.2 → videopython-0.28.0}/src/videopython/py.typed +0 -0
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from videopython.ai.dubbing.dubber import VideoDubber
|
|
4
4
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TranslatedSegment
|
|
5
5
|
from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
|
|
6
|
+
from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
|
|
6
7
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
7
8
|
|
|
8
9
|
__all__ = [
|
|
@@ -13,4 +14,7 @@ __all__ = [
|
|
|
13
14
|
"SeparatedAudio",
|
|
14
15
|
"LocalDubbingPipeline",
|
|
15
16
|
"TimingSynchronizer",
|
|
17
|
+
"GarbageTranscriptError",
|
|
18
|
+
"TranscriptQuality",
|
|
19
|
+
"assess_transcript",
|
|
16
20
|
]
|
|
@@ -37,6 +37,13 @@ class VideoDubber:
|
|
|
37
37
|
gate; raise to drop more low-confidence windows.
|
|
38
38
|
logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
|
|
39
39
|
log-probability gate.
|
|
40
|
+
strict_quality: When True, the pipeline raises
|
|
41
|
+
:class:`GarbageTranscriptError` before Demucs/translation/TTS run
|
|
42
|
+
if the transcript-quality heuristic returns ``"reject"``. When
|
|
43
|
+
False (default), low-quality transcripts are logged at WARNING
|
|
44
|
+
but processing continues. Either way the
|
|
45
|
+
:class:`TranscriptQuality` is exposed on ``DubbingResult`` for
|
|
46
|
+
inspection.
|
|
40
47
|
"""
|
|
41
48
|
|
|
42
49
|
def __init__(
|
|
@@ -47,6 +54,7 @@ class VideoDubber:
|
|
|
47
54
|
condition_on_previous_text: bool = False,
|
|
48
55
|
no_speech_threshold: float = 0.6,
|
|
49
56
|
logprob_threshold: float | None = -1.0,
|
|
57
|
+
strict_quality: bool = False,
|
|
50
58
|
):
|
|
51
59
|
self.device = device
|
|
52
60
|
self.low_memory = low_memory
|
|
@@ -54,6 +62,7 @@ class VideoDubber:
|
|
|
54
62
|
self.condition_on_previous_text = condition_on_previous_text
|
|
55
63
|
self.no_speech_threshold = no_speech_threshold
|
|
56
64
|
self.logprob_threshold = logprob_threshold
|
|
65
|
+
self.strict_quality = strict_quality
|
|
57
66
|
self._local_pipeline: Any = None
|
|
58
67
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
59
68
|
logger.info(
|
|
@@ -73,6 +82,7 @@ class VideoDubber:
|
|
|
73
82
|
condition_on_previous_text=self.condition_on_previous_text,
|
|
74
83
|
no_speech_threshold=self.no_speech_threshold,
|
|
75
84
|
logprob_threshold=self.logprob_threshold,
|
|
85
|
+
strict_quality=self.strict_quality,
|
|
76
86
|
)
|
|
77
87
|
|
|
78
88
|
def dub(
|
|
@@ -3,10 +3,21 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
|
+
from typing import TYPE_CHECKING, Any
|
|
6
7
|
|
|
7
8
|
from videopython.base.audio import Audio
|
|
8
9
|
from videopython.base.text.transcription import Transcription, TranscriptionSegment
|
|
9
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from videopython.ai.dubbing.quality import TranscriptQuality
|
|
13
|
+
from videopython.ai.dubbing.timing import TimingAdjustment
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Speed factors within this band of 1.0 are treated as a "clean" timing
|
|
17
|
+
# adjustment (no perceptible compression/stretch). Heuristic threshold for
|
|
18
|
+
# the TimingSummary classification only.
|
|
19
|
+
CLEAN_SPEED_TOLERANCE = 0.01
|
|
20
|
+
|
|
10
21
|
|
|
11
22
|
@dataclass
|
|
12
23
|
class TranslatedSegment:
|
|
@@ -73,6 +84,87 @@ class SeparatedAudio:
|
|
|
73
84
|
return self.music is not None and self.effects is not None
|
|
74
85
|
|
|
75
86
|
|
|
87
|
+
@dataclass
class TimingSummary:
    """Roll-up of how the timing synchronizer treated each dubbed segment.

    Every adjustment is counted in exactly one bucket: truncated, clean
    (speed factor within ``CLEAN_SPEED_TOLERANCE`` of 1.0) or stretched
    (anything else). A high share of truncated segments indicates the
    translation produced text too long for the source duration.
    """

    total_segments: int
    clean_count: int
    stretched_count: int
    truncated_count: int
    mean_speed_factor: float
    max_truncation_seconds: float

    @classmethod
    def from_adjustments(cls, adjustments: list[TimingAdjustment]) -> TimingSummary:
        """Build a summary from per-segment ``TimingAdjustment`` records.

        An empty list yields zero counts, a neutral mean speed factor of 1.0
        and a maximum truncation of 0.0 seconds.
        """
        segment_count = len(adjustments)
        if segment_count == 0:
            return cls(
                total_segments=0,
                clean_count=0,
                stretched_count=0,
                truncated_count=0,
                mean_speed_factor=1.0,
                max_truncation_seconds=0.0,
            )

        n_clean = n_stretched = n_truncated = 0
        speed_total = 0.0
        worst_cut = 0.0
        for item in adjustments:
            speed_total += item.speed_factor

            if not item.was_truncated:
                # Untruncated segments are split by how far the speed factor
                # strays from 1.0.
                if abs(item.speed_factor - 1.0) <= CLEAN_SPEED_TOLERANCE:
                    n_clean += 1
                else:
                    n_stretched += 1
                continue

            # Truncation takes precedence over the clean/stretched split.
            n_truncated += 1
            worst_cut = max(worst_cut, item.original_duration - item.actual_duration)

        return cls(
            total_segments=segment_count,
            clean_count=n_clean,
            stretched_count=n_stretched,
            truncated_count=n_truncated,
            mean_speed_factor=speed_total / segment_count,
            max_truncation_seconds=worst_cut,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the summary as a plain dict suitable for JSON serialization."""
        names = (
            "total_segments",
            "clean_count",
            "stretched_count",
            "truncated_count",
            "mean_speed_factor",
            "max_truncation_seconds",
        )
        return {name: getattr(self, name) for name in names}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
        """Rebuild a TimingSummary from a ``to_dict`` payload.

        Every field is required; a missing key raises ``KeyError``.
        """
        names = (
            "total_segments",
            "clean_count",
            "stretched_count",
            "truncated_count",
            "mean_speed_factor",
            "max_truncation_seconds",
        )
        return cls(**{name: data[name] for name in names})
|
|
166
|
+
|
|
167
|
+
|
|
76
168
|
@dataclass
|
|
77
169
|
class DubbingResult:
|
|
78
170
|
"""Result of a video dubbing operation.
|
|
@@ -85,6 +177,9 @@ class DubbingResult:
|
|
|
85
177
|
target_lang: Target language for dubbing.
|
|
86
178
|
separated_audio: Separated audio components (if preserve_background=True).
|
|
87
179
|
voice_samples: Dictionary mapping speaker IDs to voice sample Audio.
|
|
180
|
+
timing_summary: Aggregate stats over per-segment timing adjustments.
|
|
181
|
+
transcript_quality: Heuristic quality assessment of the transcription
|
|
182
|
+
(None when the pipeline returned early on an empty transcription).
|
|
88
183
|
"""
|
|
89
184
|
|
|
90
185
|
dubbed_audio: Audio
|
|
@@ -94,6 +189,8 @@ class DubbingResult:
|
|
|
94
189
|
target_lang: str
|
|
95
190
|
separated_audio: SeparatedAudio | None = None
|
|
96
191
|
voice_samples: dict[str, Audio] = field(default_factory=dict)
|
|
192
|
+
timing_summary: TimingSummary | None = None
|
|
193
|
+
transcript_quality: TranscriptQuality | None = None
|
|
97
194
|
|
|
98
195
|
@property
|
|
99
196
|
def num_segments(self) -> int:
|
|
@@ -9,7 +9,8 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
|
|
|
9
9
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
|
-
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
|
|
12
|
+
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
|
|
13
|
+
from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
|
|
13
14
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
14
15
|
|
|
15
16
|
if TYPE_CHECKING:
|
|
@@ -46,6 +47,14 @@ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
|
|
|
46
47
|
|
|
47
48
|
logger = logging.getLogger(__name__)
|
|
48
49
|
|
|
50
|
+
# Voice-sample quality gating thresholds. Tuned conservatively to favor
|
|
51
|
+
# accepting real-world dialogue over rejecting it; failures fall back to
|
|
52
|
+
# the longest segment with a WARNING log so we can re-tune from production
|
|
53
|
+
# data instead of guessing.
|
|
54
|
+
PEAK_CLIP_THRESHOLD = 0.99
|
|
55
|
+
MIN_VOCAL_BG_RMS_RATIO = 1.5
|
|
56
|
+
VOICE_SAMPLE_TARGET_DURATION = 6.0
|
|
57
|
+
|
|
49
58
|
|
|
50
59
|
class LocalDubbingPipeline:
|
|
51
60
|
"""Local pipeline for video dubbing.
|
|
@@ -64,6 +73,7 @@ class LocalDubbingPipeline:
|
|
|
64
73
|
condition_on_previous_text: bool = False,
|
|
65
74
|
no_speech_threshold: float = 0.6,
|
|
66
75
|
logprob_threshold: float | None = -1.0,
|
|
76
|
+
strict_quality: bool = False,
|
|
67
77
|
):
|
|
68
78
|
self.device = device
|
|
69
79
|
self.low_memory = low_memory
|
|
@@ -71,6 +81,7 @@ class LocalDubbingPipeline:
|
|
|
71
81
|
self.condition_on_previous_text = condition_on_previous_text
|
|
72
82
|
self.no_speech_threshold = no_speech_threshold
|
|
73
83
|
self.logprob_threshold = logprob_threshold
|
|
84
|
+
self.strict_quality = strict_quality
|
|
74
85
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
75
86
|
logger.info(
|
|
76
87
|
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
|
|
@@ -141,12 +152,25 @@ class LocalDubbingPipeline:
|
|
|
141
152
|
|
|
142
153
|
def _extract_voice_samples(
|
|
143
154
|
self,
|
|
144
|
-
|
|
155
|
+
vocal_audio: Any,
|
|
156
|
+
background_audio: Any | None,
|
|
145
157
|
transcription: Any,
|
|
146
158
|
min_duration: float = 3.0,
|
|
147
159
|
max_duration: float = 10.0,
|
|
148
160
|
) -> dict[str, Any]:
|
|
149
|
-
"""Extract
|
|
161
|
+
"""Extract a per-speaker voice sample with quality gating.
|
|
162
|
+
|
|
163
|
+
Picks the highest-scored segment per speaker after rejecting clipped
|
|
164
|
+
slices (peak >= ``PEAK_CLIP_THRESHOLD``) and slices where Demucs left
|
|
165
|
+
the background louder than the vocals
|
|
166
|
+
(``vocal_rms / bg_rms < MIN_VOCAL_BG_RMS_RATIO``). When the
|
|
167
|
+
background track isn't available (e.g. ``revoice`` after
|
|
168
|
+
``low_memory`` dropped it), the RMS check is skipped silently.
|
|
169
|
+
|
|
170
|
+
Falls back to the longest available segment with a WARNING log when
|
|
171
|
+
every candidate is rejected, so the dub continues with the best
|
|
172
|
+
sample we have rather than silently dropping the speaker.
|
|
173
|
+
"""
|
|
150
174
|
from videopython.base.audio import Audio
|
|
151
175
|
|
|
152
176
|
voice_samples: dict[str, Audio] = {}
|
|
@@ -159,29 +183,106 @@ class LocalDubbingPipeline:
|
|
|
159
183
|
segments_by_speaker[speaker].append(segment)
|
|
160
184
|
|
|
161
185
|
for speaker, segments in segments_by_speaker.items():
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
186
|
+
chosen, fallback_reason = self._pick_voice_segment(
|
|
187
|
+
speaker, segments, vocal_audio, background_audio, min_duration
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
if chosen is None:
|
|
191
|
+
logger.warning("No usable voice-sample segment for speaker %r (no candidates)", speaker)
|
|
192
|
+
continue
|
|
193
|
+
|
|
194
|
+
if fallback_reason is not None:
|
|
195
|
+
logger.warning(
|
|
196
|
+
"Voice-sample quality fallback for speaker %r (%d candidates): %s — using longest segment",
|
|
197
|
+
speaker,
|
|
198
|
+
len(segments),
|
|
199
|
+
fallback_reason,
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
start = chosen.start
|
|
203
|
+
end = min(chosen.end, start + max_duration)
|
|
204
|
+
sliced = vocal_audio.slice(start, end)
|
|
205
|
+
# Audio.slice returns a numpy view into the source. Copy so the
|
|
206
|
+
# short voice sample doesn't keep the full vocals array (~1.3 GB
|
|
207
|
+
# for 2h sources) alive across translate + TTS.
|
|
208
|
+
voice_samples[speaker] = Audio(sliced.data.copy(), sliced.metadata)
|
|
182
209
|
|
|
183
210
|
return voice_samples
|
|
184
211
|
|
|
212
|
+
def _pick_voice_segment(
    self,
    speaker: str,
    segments: list[Any],
    vocal_audio: Any,
    background_audio: Any | None,
    min_duration: float,
) -> tuple[Any | None, str | None]:
    """Score eligible segments and pick the best one for ``speaker``.

    Args:
        speaker: Speaker label. Not used by the selection logic itself; kept
            so the signature stays stable for callers.
        segments: Candidate segments exposing ``start`` and ``end``.
        vocal_audio: Vocal track handed to ``_score_voice_segment``.
        background_audio: Background track handed to
            ``_score_voice_segment``; may be None.
        min_duration: Minimum ``end - start`` for a segment to be scored.

    Returns:
        ``(segment, fallback_reason)``. ``(None, None)`` when ``segments`` is
        empty. ``fallback_reason`` is None when scoring picked a segment
        cleanly; otherwise it summarizes why every candidate was rejected
        (or that none reached ``min_duration``) and ``segment`` is the
        longest segment overall.
    """
    if not segments:
        return None, None

    eligible = [s for s in segments if (s.end - s.start) >= min_duration]

    rejection_reasons: set[str] = set()
    scored: list[tuple[float, Any]] = []
    for segment in eligible:
        score, reason = self._score_voice_segment(segment, vocal_audio, background_audio)
        if score is None:
            rejection_reasons.add(reason or "rejected")
        else:
            scored.append((score, segment))

    if scored:
        # max() returns the first maximal item, which matches the tie-breaking
        # of a stable descending sort without sorting the whole list.
        _, best_segment = max(scored, key=lambda item: item[0])
        return best_segment, None

    # All eligible segments rejected (or none met the min duration).
    # Fall back to the longest segment overall so the speaker still
    # gets a clone reference.
    longest = max(segments, key=lambda s: s.end - s.start)
    if eligible:
        fallback_reason = ", ".join(sorted(rejection_reasons))
    else:
        fallback_reason = f"no segment >= {min_duration:.1f}s"
    return longest, fallback_reason
|
|
253
|
+
|
|
254
|
+
def _score_voice_segment(
    self,
    segment: Any,
    vocal_audio: Any,
    background_audio: Any | None,
) -> tuple[float | None, str | None]:
    """Rate one candidate segment as a voice-clone reference.

    Returns ``(score, reason)``. A rejected segment yields ``(None, reason)``
    with ``reason`` one of ``"empty slice"``, ``"clipped"`` or
    ``"background-dominated"``. An accepted segment yields ``(score, None)``,
    where the score is the vocal RMS minus 0.05 per second of distance from
    ``VOICE_SAMPLE_TARGET_DURATION``.
    """

    def rms(samples: Any) -> float:
        # Root-mean-square level of a sample array.
        return float(np.sqrt(np.mean(samples**2)))

    begin, finish = segment.start, segment.end

    vocals = vocal_audio.slice(begin, finish).data
    if vocals.size == 0:
        return None, "empty slice"

    # A peak at or above the clip threshold disqualifies the slice.
    if float(np.max(np.abs(vocals))) >= PEAK_CLIP_THRESHOLD:
        return None, "clipped"

    vocal_level = rms(vocals)

    # The background comparison only runs when a background track was given
    # and its slice is non-empty and non-silent.
    if background_audio is not None:
        background = background_audio.slice(begin, finish).data
        if background.size > 0:
            background_level = rms(background)
            if background_level > 0 and (vocal_level / background_level) < MIN_VOCAL_BG_RMS_RATIO:
                return None, "background-dominated"

    distance_from_target = abs((finish - begin) - VOICE_SAMPLE_TARGET_DURATION)
    return vocal_level - 0.05 * distance_from_target, None
|
|
285
|
+
|
|
185
286
|
def process(
|
|
186
287
|
self,
|
|
187
288
|
source_audio: Audio,
|
|
@@ -266,6 +367,23 @@ class LocalDubbingPipeline:
|
|
|
266
367
|
target_lang=target_lang,
|
|
267
368
|
)
|
|
268
369
|
|
|
370
|
+
# Cheap heuristic gate before the expensive Demucs/translation/TTS
|
|
371
|
+
# stages. Lets strict_quality callers refuse-and-refund without
|
|
372
|
+
# running the rest of the pipeline; non-strict runs continue but
|
|
373
|
+
# surface the assessment on DubbingResult.
|
|
374
|
+
transcript_quality = assess_transcript(transcription, source_audio.metadata.duration_seconds)
|
|
375
|
+
if transcript_quality.recommendation == "reject" and self.strict_quality:
|
|
376
|
+
raise GarbageTranscriptError(
|
|
377
|
+
f"Refusing to dub: {', '.join(transcript_quality.flags)}",
|
|
378
|
+
transcript_quality,
|
|
379
|
+
)
|
|
380
|
+
if transcript_quality.recommendation in ("warn", "reject"):
|
|
381
|
+
logger.warning(
|
|
382
|
+
"Transcript quality flags raised: %s (recommendation=%s)",
|
|
383
|
+
", ".join(transcript_quality.flags),
|
|
384
|
+
transcript_quality.recommendation,
|
|
385
|
+
)
|
|
386
|
+
|
|
269
387
|
detected_lang = source_lang or transcription.language or "en"
|
|
270
388
|
|
|
271
389
|
separated_audio: SeparatedAudio | None = None
|
|
@@ -303,7 +421,7 @@ class LocalDubbingPipeline:
|
|
|
303
421
|
voice_samples: dict[str, Audio] = {}
|
|
304
422
|
if voice_clone:
|
|
305
423
|
report_progress("Extracting voice samples", 0.25)
|
|
306
|
-
voice_samples = self._extract_voice_samples(vocal_audio, transcription)
|
|
424
|
+
voice_samples = self._extract_voice_samples(vocal_audio, background_audio, transcription)
|
|
307
425
|
|
|
308
426
|
# vocals is no longer needed; voice_samples are independent copies.
|
|
309
427
|
# In low_memory mode this is the only ref keeping the buffer alive
|
|
@@ -314,10 +432,19 @@ class LocalDubbingPipeline:
|
|
|
314
432
|
if self._translator is None:
|
|
315
433
|
self._init_translator()
|
|
316
434
|
|
|
435
|
+
# Translation stage spans 0.35 → 0.50 of overall pipeline progress.
|
|
436
|
+
# MarianMT runs sequentially over 8-segment batches; on a 15-min
|
|
437
|
+
# source that's minutes of silent dwell on 0.35 without per-batch
|
|
438
|
+
# ticks. Map the [0,1] translation fraction onto that 15% window.
|
|
439
|
+
def _on_translation_progress(fraction: float) -> None:
|
|
440
|
+
clamped = max(0.0, min(1.0, fraction))
|
|
441
|
+
report_progress(f"Translating text ({int(clamped * 100)}%)", 0.35 + 0.15 * clamped)
|
|
442
|
+
|
|
317
443
|
translated_segments = self._translator.translate_segments(
|
|
318
444
|
segments=transcription.segments,
|
|
319
445
|
target_lang=target_lang,
|
|
320
446
|
source_lang=detected_lang,
|
|
447
|
+
progress_callback=_on_translation_progress,
|
|
321
448
|
)
|
|
322
449
|
self._maybe_unload("_translator")
|
|
323
450
|
|
|
@@ -393,7 +520,8 @@ class LocalDubbingPipeline:
|
|
|
393
520
|
self._init_synchronizer()
|
|
394
521
|
assert self._synchronizer is not None
|
|
395
522
|
|
|
396
|
-
synchronized_segments,
|
|
523
|
+
synchronized_segments, adjustments = self._synchronizer.synchronize_segments(dubbed_segments, target_durations)
|
|
524
|
+
timing_summary = TimingSummary.from_adjustments(adjustments)
|
|
397
525
|
del dubbed_segments
|
|
398
526
|
|
|
399
527
|
report_progress("Assembling final audio", 0.90)
|
|
@@ -429,6 +557,8 @@ class LocalDubbingPipeline:
|
|
|
429
557
|
target_lang=target_lang,
|
|
430
558
|
separated_audio=separated_audio,
|
|
431
559
|
voice_samples=voice_samples,
|
|
560
|
+
timing_summary=timing_summary,
|
|
561
|
+
transcript_quality=transcript_quality,
|
|
432
562
|
)
|
|
433
563
|
|
|
434
564
|
def revoice(
|
|
@@ -486,7 +616,10 @@ class LocalDubbingPipeline:
|
|
|
486
616
|
voice_sample: Audio | None = None
|
|
487
617
|
|
|
488
618
|
if transcription.segments:
|
|
489
|
-
|
|
619
|
+
# revoice doesn't track the background after the low_memory drop,
|
|
620
|
+
# so quality gating degrades to "no RMS check" here. Clipping is
|
|
621
|
+
# still rejected.
|
|
622
|
+
voice_samples = self._extract_voice_samples(vocal_audio, None, transcription)
|
|
490
623
|
if voice_samples:
|
|
491
624
|
voice_sample = next(iter(voice_samples.values()))
|
|
492
625
|
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Cheap heuristics over a Whisper transcription to flag degenerate output.
|
|
2
|
+
|
|
3
|
+
Surfaces three failure modes seen in production where Demucs/translation/TTS
|
|
4
|
+
would otherwise spend minutes producing a useless dub:
|
|
5
|
+
|
|
6
|
+
- Dominant-phrase cascade — one phrase repeats across most segments. The
|
|
7
|
+
classic Whisper failure on ambient music / outro screens
|
|
8
|
+
("Thank you for watching").
|
|
9
|
+
- Low decoder confidence — median per-segment ``avg_logprob`` is poor.
|
|
10
|
+
- Silent input misread as speech — total speech duration is tiny relative
|
|
11
|
+
to the clip's wall-clock duration (only meaningful on long inputs).
|
|
12
|
+
|
|
13
|
+
Each check raises a flag; a recommendation is derived from how many fired.
|
|
14
|
+
Threshold constants live at module scope so production data can re-tune them
|
|
15
|
+
without touching code structure.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import re
|
|
21
|
+
import statistics
|
|
22
|
+
from collections import Counter
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from videopython.base.text.transcription import Transcription
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Tuned conservatively to favor "warn" over "reject"; first-week production
|
|
31
|
+
# data may move them.
|
|
32
|
+
DOMINANT_PHRASE_FRACTION_THRESHOLD = 0.70
|
|
33
|
+
LOW_LOGPROB_MEDIAN_THRESHOLD = -1.5
|
|
34
|
+
LOW_SPEECH_FRACTION_THRESHOLD = 0.05
|
|
35
|
+
SHORT_CLIP_SECONDS = 30.0 # below this, speech-fraction is too unstable to trust
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
Recommendation = Literal["ok", "warn", "reject"]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ``\w`` matches the underscore, so it has to be listed explicitly to be
# treated as punctuation.
_PUNCT_RE = re.compile(r"(?:[^\w\s]|_)+", re.UNICODE)
_WHITESPACE_RE = re.compile(r"\s+")


def _normalize_phrase(text: str) -> str:
    """Lowercase, strip punctuation (including underscores), collapse whitespace.

    Each run of punctuation becomes a single space before whitespace is
    collapsed, so ``"Thank you, for watching!"`` and
    ``"thank you for watching"`` normalize to the same phrase. Text made up
    only of punctuation and/or whitespace normalizes to ``""``.
    """
    without_punct = _PUNCT_RE.sub(" ", text.lower())
    return _WHITESPACE_RE.sub(" ", without_punct).strip()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class TranscriptQuality:
    """Outcome of the heuristic quality checks on a Whisper transcription.

    Attributes:
        recommendation: ``"ok"`` (continue), ``"warn"`` (continue, log), or
            ``"reject"`` (caller should refuse to dub if strict_quality).
        dominant_phrase: The repeating phrase behind the dominance flag, or
            None when that flag did not fire.
        dominant_phrase_fraction: Character-count share of the most common
            normalized segment phrase. 0.0 when no segments.
        median_avg_logprob: Median ``avg_logprob`` over the segments that
            carry one; None when no segment had a logprob (e.g. SRT-loaded).
        speech_fraction: Sum of segment durations divided by the audio's
            wall-clock duration.
        flags: Human-readable list of which checks fired.
    """

    recommendation: Recommendation
    dominant_phrase: str | None
    dominant_phrase_fraction: float
    median_avg_logprob: float | None
    speech_fraction: float
    flags: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the assessment as a plain dict; ``flags`` is copied."""
        scalar_names = (
            "recommendation",
            "dominant_phrase",
            "dominant_phrase_fraction",
            "median_avg_logprob",
            "speech_fraction",
        )
        payload: dict[str, Any] = {name: getattr(self, name) for name in scalar_names}
        payload["flags"] = list(self.flags)
        return payload

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> TranscriptQuality:
        """Rebuild an assessment from a ``to_dict`` payload.

        Only ``recommendation`` is required. Missing optional phrase/logprob
        entries become None, missing fractions 0.0 and missing flags ``[]``.
        """
        lookup = data.get
        return cls(
            recommendation=data["recommendation"],
            dominant_phrase=lookup("dominant_phrase"),
            dominant_phrase_fraction=lookup("dominant_phrase_fraction", 0.0),
            median_avg_logprob=lookup("median_avg_logprob"),
            speech_fraction=lookup("speech_fraction", 0.0),
            flags=list(lookup("flags", [])),
        )
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class GarbageTranscriptError(RuntimeError):
    """Raised by the dubbing pipeline when ``strict_quality=True`` and the
    transcript heuristic returns ``recommendation="reject"``.

    The triggering :class:`TranscriptQuality` is attached as ``quality`` so
    callers can introspect the flags without re-running the pipeline.
    """

    def __init__(self, message: str, quality: TranscriptQuality):
        super().__init__(message)
        self.quality = quality

    def __reduce__(self) -> tuple[Any, ...]:
        """Support pickle/copy round-trips.

        The default exception reduction rebuilds the instance from
        ``self.args``, which holds only ``message``; that call fails because
        ``quality`` is a required constructor argument. Pass both explicitly
        and carry the instance ``__dict__`` as state so any extra attributes
        survive.
        """
        message = self.args[0] if self.args else ""
        return (type(self), (message, self.quality), self.__dict__)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def assess_transcript(
    transcription: Transcription,
    audio_duration_seconds: float,
) -> TranscriptQuality:
    """Run the three quality checks and return a recommendation.

    Checks:
        - Dominance: the most common normalized phrase holds at least
          ``DOMINANT_PHRASE_FRACTION_THRESHOLD`` of all phrase characters
          and occurs in at least two segments. A phrase seen only once is
          not a repetition, so a single-segment transcript never raises
          this flag.
        - Confidence: the median ``avg_logprob`` (over segments that carry
          one) is below ``LOW_LOGPROB_MEDIAN_THRESHOLD``.
        - Speech fraction: summed segment duration over
          ``audio_duration_seconds`` is below
          ``LOW_SPEECH_FRACTION_THRESHOLD``; only evaluated for audio longer
          than ``SHORT_CLIP_SECONDS``.

    Args:
        transcription: Transcription whose ``segments`` expose ``text``,
            ``start``, ``end`` and ``avg_logprob``.
        audio_duration_seconds: Wall-clock duration of the source audio. A
            non-positive value yields a speech fraction of 0.0.

    Returns:
        A :class:`TranscriptQuality`. ``recommendation`` is ``"reject"`` when
        dominance fires together with at least one other flag, ``"warn"``
        when any flag fires, otherwise ``"ok"``.
    """
    segments = list(transcription.segments)

    # Minimum number of segments that must carry the dominant phrase before
    # it counts as a repetition cascade.
    min_dominant_occurrences = 2

    # Dominant-phrase share by character count, plus how often each phrase
    # occurs. Segments that normalize to an empty phrase are ignored.
    char_counts: Counter[str] = Counter()
    occurrences: Counter[str] = Counter()
    for segment in segments:
        phrase = _normalize_phrase(segment.text)
        if not phrase:
            continue
        char_counts[phrase] += len(phrase)
        occurrences[phrase] += 1

    dominant_phrase: str | None = None
    dominant_fraction = 0.0
    dominant_occurrences = 0
    if char_counts:
        dominant_phrase, dominant_chars = char_counts.most_common(1)[0]
        dominant_fraction = dominant_chars / sum(char_counts.values())
        dominant_occurrences = occurrences[dominant_phrase]

    # Median avg_logprob across segments that carry it.
    logprobs = [s.avg_logprob for s in segments if s.avg_logprob is not None]
    median_logprob = statistics.median(logprobs) if logprobs else None

    # Speech fraction = sum of segment durations / audio duration. Segments
    # with end < start contribute 0.
    speech_seconds = sum(max(0.0, s.end - s.start) for s in segments)
    speech_fraction = speech_seconds / audio_duration_seconds if audio_duration_seconds > 0 else 0.0

    flags: list[str] = []
    dominance_flag = (
        dominant_fraction >= DOMINANT_PHRASE_FRACTION_THRESHOLD
        and dominant_occurrences >= min_dominant_occurrences
    )
    if dominance_flag:
        flags.append(f"dominant phrase {dominant_fraction:.0%}: {dominant_phrase!r}")

    logprob_flag = median_logprob is not None and median_logprob < LOW_LOGPROB_MEDIAN_THRESHOLD
    if logprob_flag:
        flags.append(f"median avg_logprob {median_logprob:.2f} below {LOW_LOGPROB_MEDIAN_THRESHOLD}")

    # Speech-fraction is unstable on short clips; skip it there.
    speech_flag = audio_duration_seconds > SHORT_CLIP_SECONDS and speech_fraction < LOW_SPEECH_FRACTION_THRESHOLD
    if speech_flag:
        flags.append(f"speech fraction {speech_fraction:.1%} below {LOW_SPEECH_FRACTION_THRESHOLD:.0%}")

    # Reject only when dominance + at least one other flag fires; legitimate
    # repetitive content (chants, lyric clips) should warn, not reject.
    recommendation: Recommendation
    if dominance_flag and (logprob_flag or speech_flag):
        recommendation = "reject"
    elif flags:
        recommendation = "warn"
    else:
        recommendation = "ok"

    return TranscriptQuality(
        recommendation=recommendation,
        dominant_phrase=dominant_phrase if dominance_flag else None,
        dominant_phrase_fraction=dominant_fraction,
        median_avg_logprob=median_logprob,
        speech_fraction=speech_fraction,
        flags=flags,
    )
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
from typing import Any
|
|
5
|
+
from typing import Any, Callable
|
|
6
6
|
|
|
7
7
|
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
8
|
from videopython.ai.dubbing.models import TranslatedSegment
|
|
@@ -135,8 +135,15 @@ class TextTranslator:
|
|
|
135
135
|
texts: list[str],
|
|
136
136
|
target_lang: str,
|
|
137
137
|
source_lang: str | None = None,
|
|
138
|
+
progress_callback: Callable[[float], None] | None = None,
|
|
138
139
|
) -> list[str]:
|
|
139
|
-
"""Translate multiple texts to target language."""
|
|
140
|
+
"""Translate multiple texts to target language.
|
|
141
|
+
|
|
142
|
+
``progress_callback`` is called once per batch with a fraction in
|
|
143
|
+
``[0, 1]`` representing translation-stage progress. It does not fire
|
|
144
|
+
on the empty-input or same-language shortcuts (those are O(0) work
|
|
145
|
+
and the caller frames its own progress events around the call).
|
|
146
|
+
"""
|
|
140
147
|
import torch
|
|
141
148
|
|
|
142
149
|
if not texts:
|
|
@@ -150,8 +157,9 @@ class TextTranslator:
|
|
|
150
157
|
|
|
151
158
|
translated: list[str] = []
|
|
152
159
|
batch_size = 8
|
|
160
|
+
total = len(texts)
|
|
153
161
|
|
|
154
|
-
for i in range(0, len(texts), batch_size):
|
|
162
|
+
for i in range(0, total, batch_size):
|
|
155
163
|
batch = texts[i : i + batch_size]
|
|
156
164
|
inputs = self._tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
|
157
165
|
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
|
@@ -162,6 +170,9 @@ class TextTranslator:
|
|
|
162
170
|
for output in outputs:
|
|
163
171
|
translated.append(self._tokenizer.decode(output, skip_special_tokens=True))
|
|
164
172
|
|
|
173
|
+
if progress_callback is not None:
|
|
174
|
+
progress_callback(min(1.0, (i + len(batch)) / total))
|
|
175
|
+
|
|
165
176
|
return translated
|
|
166
177
|
|
|
167
178
|
def translate_segments(
|
|
@@ -169,6 +180,7 @@ class TextTranslator:
|
|
|
169
180
|
segments: list[TranscriptionSegment],
|
|
170
181
|
target_lang: str,
|
|
171
182
|
source_lang: str | None = None,
|
|
183
|
+
progress_callback: Callable[[float], None] | None = None,
|
|
172
184
|
) -> list[TranslatedSegment]:
|
|
173
185
|
"""Translate transcription segments while preserving timing/speaker info.
|
|
174
186
|
|
|
@@ -177,12 +189,18 @@ class TextTranslator:
|
|
|
177
189
|
``translated_text=""`` instead. This avoids MarianMT hallucinating
|
|
178
190
|
full sentences from " .", "...", or single-token Whisper segments,
|
|
179
191
|
which would otherwise be TTS'd into the dubbed track.
|
|
192
|
+
|
|
193
|
+
``progress_callback`` is forwarded to :meth:`translate_batch` so
|
|
194
|
+
callers can render translation-stage progress without knowing the
|
|
195
|
+
batch size.
|
|
180
196
|
"""
|
|
181
197
|
effective_source = source_lang or "en"
|
|
182
198
|
|
|
183
199
|
translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
|
|
184
200
|
translatable_texts = [segments[i].text for i in translatable_indices]
|
|
185
|
-
translated_texts = self.translate_batch(translatable_texts, target_lang, source_lang)
|
|
201
|
+
translated_texts = self.translate_batch(
|
|
202
|
+
translatable_texts, target_lang, source_lang, progress_callback=progress_callback
|
|
203
|
+
)
|
|
186
204
|
|
|
187
205
|
translation_map: dict[int, str] = dict(zip(translatable_indices, translated_texts))
|
|
188
206
|
|
|
@@ -122,6 +122,9 @@ class AudioToText:
|
|
|
122
122
|
end=segment["end"],
|
|
123
123
|
text=segment["text"],
|
|
124
124
|
words=transcription_words,
|
|
125
|
+
avg_logprob=segment.get("avg_logprob"),
|
|
126
|
+
no_speech_prob=segment.get("no_speech_prob"),
|
|
127
|
+
compression_ratio=segment.get("compression_ratio"),
|
|
125
128
|
)
|
|
126
129
|
transcription_segments.append(transcription_segment)
|
|
127
130
|
|
|
@@ -40,6 +40,9 @@ class TranscriptionSegment:
|
|
|
40
40
|
text: str
|
|
41
41
|
words: list[TranscriptionWord]
|
|
42
42
|
speaker: str | None = None
|
|
43
|
+
avg_logprob: float | None = None
|
|
44
|
+
no_speech_prob: float | None = None
|
|
45
|
+
compression_ratio: float | None = None
|
|
43
46
|
|
|
44
47
|
def to_dict(self) -> dict:
|
|
45
48
|
"""Convert to dictionary for JSON serialization."""
|
|
@@ -49,6 +52,9 @@ class TranscriptionSegment:
|
|
|
49
52
|
"text": self.text,
|
|
50
53
|
"words": [w.to_dict() for w in self.words],
|
|
51
54
|
"speaker": self.speaker,
|
|
55
|
+
"avg_logprob": self.avg_logprob,
|
|
56
|
+
"no_speech_prob": self.no_speech_prob,
|
|
57
|
+
"compression_ratio": self.compression_ratio,
|
|
52
58
|
}
|
|
53
59
|
|
|
54
60
|
@classmethod
|
|
@@ -60,6 +66,9 @@ class TranscriptionSegment:
|
|
|
60
66
|
text=data["text"],
|
|
61
67
|
words=[TranscriptionWord.from_dict(w) for w in data["words"]],
|
|
62
68
|
speaker=data.get("speaker"),
|
|
69
|
+
avg_logprob=data.get("avg_logprob"),
|
|
70
|
+
no_speech_prob=data.get("no_speech_prob"),
|
|
71
|
+
compression_ratio=data.get("compression_ratio"),
|
|
63
72
|
)
|
|
64
73
|
|
|
65
74
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|