videopython 0.33.4__tar.gz → 0.34.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.33.4 → videopython-0.34.0}/PKG-INFO +1 -1
- {videopython-0.33.4 → videopython-0.34.0}/pyproject.toml +1 -1
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/transforms.py +24 -8
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/image_text.py +137 -44
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/transcription.py +60 -88
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/__init__.py +3 -1
- videopython-0.34.0/src/videopython/editing/transcription_overlay.py +516 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/video_edit.py +116 -4
- videopython-0.33.4/src/videopython/editing/transcription_overlay.py +0 -186
- {videopython-0.33.4 → videopython-0.34.0}/.gitignore +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/LICENSE +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/README.md +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/config.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/expressiveness.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/loudness.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/voice_sample.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/analyzer.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/models.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/sampling.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/stages.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/audio/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/audio/analysis.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/audio/audio.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/_dimensions.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/_ffmpeg.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/_video_io.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/description.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/fonts/__init__.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/video.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/effects.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/operation.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/streaming.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/transforms.py +0 -0
- {videopython-0.33.4 → videopython-0.34.0}/src/videopython/py.typed +0 -0
|
@@ -12,7 +12,7 @@ from tqdm import tqdm
|
|
|
12
12
|
|
|
13
13
|
from videopython.ai.understanding.faces import FaceTracker
|
|
14
14
|
from videopython.base._dimensions import floor_to_even
|
|
15
|
-
from videopython.base.video import Video
|
|
15
|
+
from videopython.base.video import Video, VideoMetadata
|
|
16
16
|
from videopython.editing.operation import OpCategory, Operation
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
@@ -76,6 +76,28 @@ class FaceTrackingCrop(Operation):
|
|
|
76
76
|
# "dynamic" — placeholder until motion/look-direction framing is implemented.
|
|
77
77
|
return (face_cx, face_cy - self.headroom)
|
|
78
78
|
|
|
79
|
+
def _resolved_output_dims(self, w: int, h: int) -> tuple[int, int]:
|
|
80
|
+
"""Output ``(width, height)`` after the crop + resize.
|
|
81
|
+
|
|
82
|
+
Every frame is resized to this size regardless of the per-frame face
|
|
83
|
+
position, so it is a pure function of the input dimensions and
|
|
84
|
+
``target_aspect``. Single source of truth shared by :meth:`apply` and
|
|
85
|
+
:meth:`predict_metadata` (mirrors ``Resize._resolve_dims`` /
|
|
86
|
+
``Crop._resolve_box``), so the dry-run cannot disagree with the render.
|
|
87
|
+
"""
|
|
88
|
+
target_ratio = self.target_aspect[0] / self.target_aspect[1]
|
|
89
|
+
if target_ratio < w / h:
|
|
90
|
+
out_h = floor_to_even(h)
|
|
91
|
+
out_w = floor_to_even(int(out_h * target_ratio))
|
|
92
|
+
else:
|
|
93
|
+
out_w = floor_to_even(w)
|
|
94
|
+
out_h = floor_to_even(int(out_w / target_ratio))
|
|
95
|
+
return out_w, out_h
|
|
96
|
+
|
|
97
|
+
def predict_metadata(self, meta: VideoMetadata) -> VideoMetadata:
|
|
98
|
+
out_w, out_h = self._resolved_output_dims(meta.width, meta.height)
|
|
99
|
+
return meta.with_dimensions(out_w, out_h)
|
|
100
|
+
|
|
79
101
|
def _clamp_speed(self, current: tuple[float, float], target: tuple[float, float]) -> tuple[float, float]:
|
|
80
102
|
if self.max_speed is None:
|
|
81
103
|
return target
|
|
@@ -135,13 +157,7 @@ class FaceTrackingCrop(Operation):
|
|
|
135
157
|
)
|
|
136
158
|
|
|
137
159
|
h, w = video.frame_shape[:2]
|
|
138
|
-
|
|
139
|
-
if target_ratio < w / h:
|
|
140
|
-
out_h = floor_to_even(h)
|
|
141
|
-
out_w = floor_to_even(int(out_h * target_ratio))
|
|
142
|
-
else:
|
|
143
|
-
out_w = floor_to_even(w)
|
|
144
|
-
out_h = floor_to_even(int(out_w / target_ratio))
|
|
160
|
+
out_w, out_h = self._resolved_output_dims(w, h)
|
|
145
161
|
|
|
146
162
|
default_x = (w - out_w) // 2
|
|
147
163
|
default_y = (h - out_h) // 2
|
|
@@ -9,6 +9,7 @@ generation helpers (``ai/understanding/image.py``).
|
|
|
9
9
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
|
+
from dataclasses import dataclass
|
|
12
13
|
from enum import Enum
|
|
13
14
|
from typing import TypeAlias
|
|
14
15
|
|
|
@@ -18,7 +19,7 @@ from PIL import Image, ImageDraw, ImageFont
|
|
|
18
19
|
from videopython.base.exceptions import OutOfBoundsError
|
|
19
20
|
from videopython.base.fonts import load_font
|
|
20
21
|
|
|
21
|
-
__all__ = ["ImageText", "TextAlign", "AnchorPoint"]
|
|
22
|
+
__all__ = ["ImageText", "TextAlign", "AnchorPoint", "TextBoxRect"]
|
|
22
23
|
|
|
23
24
|
# Type aliases for clarity
|
|
24
25
|
MarginType: TypeAlias = int | tuple[int, int, int, int]
|
|
@@ -79,6 +80,32 @@ class AnchorPoint(str, Enum):
|
|
|
79
80
|
return (cls.BOTTOM_LEFT, cls.BOTTOM_CENTER, cls.BOTTOM_RIGHT)
|
|
80
81
|
|
|
81
82
|
|
|
83
|
+
@dataclass(frozen=True)
|
|
84
|
+
class TextBoxRect:
|
|
85
|
+
"""Resolved geometry of a wrapped text box, without rendering it.
|
|
86
|
+
|
|
87
|
+
Returned by :meth:`ImageText.measure_text_box` — the single source of
|
|
88
|
+
truth for box measure/wrap/anchor/bounds, shared by the renderer
|
|
89
|
+
(:meth:`ImageText.write_text_box`) and dry-run validators so they can
|
|
90
|
+
never disagree on whether text fits.
|
|
91
|
+
|
|
92
|
+
For a non-degenerate box ``(x, y)`` is the anchor-adjusted top-left
|
|
93
|
+
corner and ``width``/``height`` span the wrapped lines. For a degenerate
|
|
94
|
+
box (whitespace-only text → no renderable lines) ``height == 0``,
|
|
95
|
+
``(x, y)`` is the *unadjusted* insertion point, and ``fits`` is ``True``;
|
|
96
|
+
callers short-circuit such boxes (nothing to draw). ``width`` mirrors the
|
|
97
|
+
resolved ``box_width`` and may be a float when an absolute >1 value was
|
|
98
|
+
passed, matching legacy behaviour.
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
x: float
|
|
102
|
+
y: float
|
|
103
|
+
width: int | float
|
|
104
|
+
height: int
|
|
105
|
+
fits: bool
|
|
106
|
+
lines: tuple[str, ...]
|
|
107
|
+
|
|
108
|
+
|
|
82
109
|
class ImageText:
|
|
83
110
|
def __init__(
|
|
84
111
|
self,
|
|
@@ -566,6 +593,97 @@ class ImageText:
|
|
|
566
593
|
lines = [" ".join(line) for line in split_lines]
|
|
567
594
|
return lines
|
|
568
595
|
|
|
596
|
+
def available_region(self, margin: MarginType = 0) -> tuple[int, int, int, int]:
|
|
597
|
+
"""The drawable area inside ``margin`` as ``(left, top, width, height)``.
|
|
598
|
+
|
|
599
|
+
Single source of truth for margin-inset geometry: used by
|
|
600
|
+
:meth:`measure_text_box` and by callers that need to clamp a box
|
|
601
|
+
within the margins without re-deriving the margin math.
|
|
602
|
+
"""
|
|
603
|
+
margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
|
|
604
|
+
available_width = self.image_size[1] - margin_left - margin_right
|
|
605
|
+
available_height = self.image_size[0] - margin_top - margin_bottom
|
|
606
|
+
return margin_left, margin_top, available_width, available_height
|
|
607
|
+
|
|
608
|
+
def measure_text_box(
|
|
609
|
+
self,
|
|
610
|
+
text: str,
|
|
611
|
+
font_filename: str | None,
|
|
612
|
+
xy: PositionType,
|
|
613
|
+
box_width: int | float | None = None,
|
|
614
|
+
font_size: int = 11,
|
|
615
|
+
anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
|
|
616
|
+
margin: MarginType = 0,
|
|
617
|
+
) -> TextBoxRect:
|
|
618
|
+
"""Measure where a wrapped text box would land, without drawing it.
|
|
619
|
+
|
|
620
|
+
Pure: resolves margins/box-width/position, wraps the text, applies the
|
|
621
|
+
anchor, and bounds-checks against the image — the exact math
|
|
622
|
+
:meth:`write_text_box` used to do inline. Highlighting and per-line
|
|
623
|
+
alignment (``place``) do not change the box envelope, so they are not
|
|
624
|
+
parameters here; this intentionally preserves the pre-existing
|
|
625
|
+
behaviour that an enlarged highlighted word is *not* accounted for in
|
|
626
|
+
the fit check.
|
|
627
|
+
|
|
628
|
+
Returns:
|
|
629
|
+
A :class:`TextBoxRect`. ``fits`` is ``False`` when the box would
|
|
630
|
+
fall outside the image bounds (the condition that makes
|
|
631
|
+
:meth:`write_text_box` raise :class:`OutOfBoundsError`).
|
|
632
|
+
|
|
633
|
+
Raises:
|
|
634
|
+
ValueError: If ``text`` is empty, ``font_size`` is not positive,
|
|
635
|
+
or an absolute ``box_width`` is not positive.
|
|
636
|
+
"""
|
|
637
|
+
if not text:
|
|
638
|
+
raise ValueError("Text cannot be empty")
|
|
639
|
+
|
|
640
|
+
if font_size <= 0:
|
|
641
|
+
raise ValueError("Font size must be positive")
|
|
642
|
+
|
|
643
|
+
# Process margins to determine available area (shared with callers
|
|
644
|
+
# that clamp boxes inside the margins -- see ``available_region``).
|
|
645
|
+
margin_left, margin_top, available_width, available_height = self.available_region(margin)
|
|
646
|
+
|
|
647
|
+
# Handle relative box width
|
|
648
|
+
if box_width is None:
|
|
649
|
+
box_width = available_width
|
|
650
|
+
elif isinstance(box_width, float) and 0 < box_width <= 1:
|
|
651
|
+
box_width = int(available_width * box_width)
|
|
652
|
+
elif isinstance(box_width, int) and box_width <= 0:
|
|
653
|
+
raise ValueError("Box width must be positive")
|
|
654
|
+
|
|
655
|
+
# Calculate initial position based on margin and anchor before splitting text
|
|
656
|
+
x_pos, y_pos = self._convert_position(xy, margin_top, margin_left, available_width, available_height)
|
|
657
|
+
|
|
658
|
+
# Split text into lines that fit within box_width
|
|
659
|
+
lines = self._split_lines_by_width(text, font_filename, font_size, int(box_width))
|
|
660
|
+
|
|
661
|
+
# Calculate total height of all lines
|
|
662
|
+
lines_height = sum(self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines)
|
|
663
|
+
if lines_height == 0:
|
|
664
|
+
# No renderable lines (e.g. whitespace-only text); position is the
|
|
665
|
+
# unadjusted insertion point and the box trivially "fits".
|
|
666
|
+
return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=0, fits=True, lines=tuple(lines))
|
|
667
|
+
|
|
668
|
+
# Final position calculation based on anchor point
|
|
669
|
+
if anchor in AnchorPoint.center_anchors():
|
|
670
|
+
x_pos -= box_width // 2
|
|
671
|
+
elif anchor in AnchorPoint.right_anchors():
|
|
672
|
+
x_pos -= box_width
|
|
673
|
+
|
|
674
|
+
if anchor in AnchorPoint.middle_anchors():
|
|
675
|
+
y_pos -= lines_height // 2
|
|
676
|
+
elif anchor in AnchorPoint.bottom_anchors():
|
|
677
|
+
y_pos -= lines_height
|
|
678
|
+
|
|
679
|
+
fits = not (
|
|
680
|
+
x_pos < 0
|
|
681
|
+
or y_pos < 0
|
|
682
|
+
or x_pos + box_width > self.image_size[1]
|
|
683
|
+
or y_pos + lines_height > self.image_size[0]
|
|
684
|
+
)
|
|
685
|
+
return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=lines_height, fits=fits, lines=tuple(lines))
|
|
686
|
+
|
|
569
687
|
def write_text_box(
|
|
570
688
|
self,
|
|
571
689
|
text: str,
|
|
@@ -643,49 +761,24 @@ class ImageText:
|
|
|
643
761
|
if highlight_word_index is not None and highlight_color is None:
|
|
644
762
|
highlight_color = text_color
|
|
645
763
|
|
|
646
|
-
#
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
# Calculate total height of all lines
|
|
666
|
-
lines_height = sum([self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines])
|
|
667
|
-
if lines_height == 0:
|
|
668
|
-
# If we have no valid lines or zero height, return the position
|
|
669
|
-
return (int(x_pos), int(y_pos))
|
|
670
|
-
|
|
671
|
-
# Final position calculation based on anchor point
|
|
672
|
-
if anchor in AnchorPoint.center_anchors():
|
|
673
|
-
x_pos -= box_width // 2
|
|
674
|
-
elif anchor in AnchorPoint.right_anchors():
|
|
675
|
-
x_pos -= box_width
|
|
676
|
-
|
|
677
|
-
if anchor in AnchorPoint.middle_anchors():
|
|
678
|
-
y_pos -= lines_height // 2
|
|
679
|
-
elif anchor in AnchorPoint.bottom_anchors():
|
|
680
|
-
y_pos -= lines_height
|
|
681
|
-
|
|
682
|
-
# Verify box will fit within bounds
|
|
683
|
-
if (
|
|
684
|
-
x_pos < 0
|
|
685
|
-
or y_pos < 0
|
|
686
|
-
or x_pos + box_width > self.image_size[1]
|
|
687
|
-
or y_pos + lines_height > self.image_size[0]
|
|
688
|
-
):
|
|
764
|
+
# Measure (single source of truth for box geometry), then render.
|
|
765
|
+
rect = self.measure_text_box(
|
|
766
|
+
text=text,
|
|
767
|
+
font_filename=font_filename,
|
|
768
|
+
xy=xy,
|
|
769
|
+
box_width=box_width,
|
|
770
|
+
font_size=font_size,
|
|
771
|
+
anchor=anchor,
|
|
772
|
+
margin=margin,
|
|
773
|
+
)
|
|
774
|
+
lines = list(rect.lines)
|
|
775
|
+
if rect.height == 0:
|
|
776
|
+
# No renderable lines (e.g. whitespace-only text); nothing to draw.
|
|
777
|
+
return (int(rect.x), int(rect.y))
|
|
778
|
+
box_width = rect.width
|
|
779
|
+
x_pos, y_pos = rect.x, rect.y
|
|
780
|
+
lines_height = rect.height
|
|
781
|
+
if not rect.fits:
|
|
689
782
|
raise OutOfBoundsError(
|
|
690
783
|
f"Text box with size ({box_width}x{lines_height}) at position ({x_pos}, {y_pos}) is out of bounds!"
|
|
691
784
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from dataclasses import dataclass
|
|
3
|
+
from dataclasses import dataclass, replace
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Any
|
|
6
6
|
|
|
@@ -79,6 +79,38 @@ class TranscriptionSegment:
|
|
|
79
79
|
compression_ratio=data.get("compression_ratio"),
|
|
80
80
|
)
|
|
81
81
|
|
|
82
|
+
@classmethod
|
|
83
|
+
def from_words(
|
|
84
|
+
cls,
|
|
85
|
+
words: list[TranscriptionWord],
|
|
86
|
+
*,
|
|
87
|
+
speaker: str | None = None,
|
|
88
|
+
avg_logprob: float | None = None,
|
|
89
|
+
no_speech_prob: float | None = None,
|
|
90
|
+
compression_ratio: float | None = None,
|
|
91
|
+
) -> TranscriptionSegment:
|
|
92
|
+
"""Build a segment spanning ``words``, deriving start/end/text from them.
|
|
93
|
+
|
|
94
|
+
``words`` must be non-empty: ``start``/``end`` come from the first/last
|
|
95
|
+
word and ``text`` is the words joined by single spaces. Speaker and the
|
|
96
|
+
confidence fields are passed through so callers re-segmenting *within* a
|
|
97
|
+
known source segment can preserve them; callers regrouping words across
|
|
98
|
+
segments (where these are ambiguous) simply omit them, leaving ``None``.
|
|
99
|
+
The ``words`` list is copied, so the result never aliases the caller's.
|
|
100
|
+
"""
|
|
101
|
+
if not words:
|
|
102
|
+
raise ValueError("from_words requires a non-empty word list")
|
|
103
|
+
return cls(
|
|
104
|
+
start=words[0].start,
|
|
105
|
+
end=words[-1].end,
|
|
106
|
+
text=" ".join(w.word for w in words),
|
|
107
|
+
words=list(words),
|
|
108
|
+
speaker=speaker,
|
|
109
|
+
avg_logprob=avg_logprob,
|
|
110
|
+
no_speech_prob=no_speech_prob,
|
|
111
|
+
compression_ratio=compression_ratio,
|
|
112
|
+
)
|
|
113
|
+
|
|
82
114
|
|
|
83
115
|
class Transcription:
|
|
84
116
|
def __init__(
|
|
@@ -124,39 +156,19 @@ class Transcription:
|
|
|
124
156
|
return []
|
|
125
157
|
|
|
126
158
|
current_speaker = words[0].speaker
|
|
127
|
-
current_words = []
|
|
128
|
-
segment_start = words[0].start
|
|
159
|
+
current_words: list[TranscriptionWord] = []
|
|
129
160
|
segments = []
|
|
130
161
|
|
|
131
162
|
for word in words:
|
|
132
163
|
if current_speaker == word.speaker:
|
|
133
164
|
current_words.append(word)
|
|
134
165
|
else:
|
|
135
|
-
|
|
136
|
-
segments.append(
|
|
137
|
-
TranscriptionSegment(
|
|
138
|
-
start=segment_start,
|
|
139
|
-
end=current_words[-1].end,
|
|
140
|
-
text=segment_text.strip(),
|
|
141
|
-
words=current_words.copy(),
|
|
142
|
-
speaker=current_speaker,
|
|
143
|
-
)
|
|
144
|
-
)
|
|
166
|
+
segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
|
|
145
167
|
current_speaker = word.speaker
|
|
146
168
|
current_words = [word]
|
|
147
|
-
segment_start = word.start
|
|
148
169
|
|
|
149
170
|
if current_words:
|
|
150
|
-
|
|
151
|
-
segments.append(
|
|
152
|
-
TranscriptionSegment(
|
|
153
|
-
start=segment_start,
|
|
154
|
-
end=current_words[-1].end,
|
|
155
|
-
text=segment_text.strip(),
|
|
156
|
-
words=current_words.copy(),
|
|
157
|
-
speaker=current_speaker,
|
|
158
|
-
)
|
|
159
|
-
)
|
|
171
|
+
segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
|
|
160
172
|
|
|
161
173
|
return segments
|
|
162
174
|
|
|
@@ -190,22 +202,14 @@ class Transcription:
|
|
|
190
202
|
offset_segments = []
|
|
191
203
|
|
|
192
204
|
for segment in self.segments:
|
|
193
|
-
offset_words = [
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
)
|
|
200
|
-
|
|
205
|
+
offset_words = [
|
|
206
|
+
TranscriptionWord(start=w.start + time, end=w.end + time, word=w.word, speaker=w.speaker)
|
|
207
|
+
for w in segment.words
|
|
208
|
+
]
|
|
209
|
+
# ``replace`` carries text, speaker, and confidence fields through a
|
|
210
|
+
# pure timing shift unchanged -- only timestamps move.
|
|
201
211
|
offset_segments.append(
|
|
202
|
-
|
|
203
|
-
start=segment.start + time,
|
|
204
|
-
end=segment.end + time,
|
|
205
|
-
text=segment.text,
|
|
206
|
-
words=offset_words,
|
|
207
|
-
speaker=segment.speaker,
|
|
208
|
-
)
|
|
212
|
+
replace(segment, start=segment.start + time, end=segment.end + time, words=offset_words)
|
|
209
213
|
)
|
|
210
214
|
|
|
211
215
|
return Transcription(segments=offset_segments, language=self.language)
|
|
@@ -245,16 +249,9 @@ class Transcription:
|
|
|
245
249
|
def _flush(words: list[TranscriptionWord]) -> None:
|
|
246
250
|
if not words:
|
|
247
251
|
return
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
start=words[0].start,
|
|
252
|
-
end=words[-1].end,
|
|
253
|
-
text=segment_text,
|
|
254
|
-
words=words.copy(),
|
|
255
|
-
speaker=words[0].speaker,
|
|
256
|
-
)
|
|
257
|
-
)
|
|
252
|
+
# Words here are regrouped across original segments, so the source
|
|
253
|
+
# segments' confidence fields no longer apply -- left as None.
|
|
254
|
+
standardized_segments.append(TranscriptionSegment.from_words(words, speaker=words[0].speaker))
|
|
258
255
|
|
|
259
256
|
if time is not None:
|
|
260
257
|
current_words: list[TranscriptionWord] = []
|
|
@@ -315,18 +312,9 @@ class Transcription:
|
|
|
315
312
|
start_of_sentence = True
|
|
316
313
|
new_words.append(TranscriptionWord(start=word.start, end=word.end, word=token, speaker=word.speaker))
|
|
317
314
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
end=segment.end,
|
|
322
|
-
text=" ".join(w.word for w in new_words),
|
|
323
|
-
words=new_words,
|
|
324
|
-
speaker=segment.speaker,
|
|
325
|
-
avg_logprob=segment.avg_logprob,
|
|
326
|
-
no_speech_prob=segment.no_speech_prob,
|
|
327
|
-
compression_ratio=segment.compression_ratio,
|
|
328
|
-
)
|
|
329
|
-
)
|
|
315
|
+
# Casing-only rewrite: segment boundaries, speaker, and confidence
|
|
316
|
+
# are unchanged; only the tokens (and joined text) differ.
|
|
317
|
+
capitalized_segments.append(replace(segment, text=" ".join(w.word for w in new_words), words=new_words))
|
|
330
318
|
|
|
331
319
|
return Transcription(segments=capitalized_segments, language=self.language)
|
|
332
320
|
|
|
@@ -353,16 +341,17 @@ class Transcription:
|
|
|
353
341
|
for segment in self.segments:
|
|
354
342
|
words = segment.words
|
|
355
343
|
if not words:
|
|
356
|
-
|
|
344
|
+
# Nothing to split; emit a fresh copy so the result never
|
|
345
|
+
# aliases the source segment.
|
|
346
|
+
chunked_segments.append(replace(segment, words=list(segment.words)))
|
|
357
347
|
continue
|
|
358
348
|
for i in range(0, len(words), max_words):
|
|
359
349
|
group = words[i : i + max_words]
|
|
350
|
+
# Splitting *within* one source segment -- its confidence
|
|
351
|
+
# fields still apply, so carry them through.
|
|
360
352
|
chunked_segments.append(
|
|
361
|
-
TranscriptionSegment(
|
|
362
|
-
|
|
363
|
-
end=group[-1].end,
|
|
364
|
-
text=" ".join(w.word for w in group),
|
|
365
|
-
words=list(group),
|
|
353
|
+
TranscriptionSegment.from_words(
|
|
354
|
+
group,
|
|
366
355
|
speaker=segment.speaker,
|
|
367
356
|
avg_logprob=segment.avg_logprob,
|
|
368
357
|
no_speech_prob=segment.no_speech_prob,
|
|
@@ -409,34 +398,17 @@ class Transcription:
|
|
|
409
398
|
if word.speaker == current_speaker:
|
|
410
399
|
current_words.append(word)
|
|
411
400
|
else:
|
|
412
|
-
# Finish current segment
|
|
401
|
+
# Finish current segment (speaker is ambiguous across the
|
|
402
|
+
# original segments these words came from -- confidence omitted)
|
|
413
403
|
if current_words:
|
|
414
|
-
|
|
415
|
-
sliced_segments.append(
|
|
416
|
-
TranscriptionSegment(
|
|
417
|
-
start=current_words[0].start,
|
|
418
|
-
end=current_words[-1].end,
|
|
419
|
-
text=segment_text,
|
|
420
|
-
words=current_words.copy(),
|
|
421
|
-
speaker=current_speaker,
|
|
422
|
-
)
|
|
423
|
-
)
|
|
404
|
+
sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
|
|
424
405
|
# Start new segment
|
|
425
406
|
current_speaker = word.speaker
|
|
426
407
|
current_words = [word]
|
|
427
408
|
|
|
428
409
|
# Add final segment
|
|
429
410
|
if current_words:
|
|
430
|
-
|
|
431
|
-
sliced_segments.append(
|
|
432
|
-
TranscriptionSegment(
|
|
433
|
-
start=current_words[0].start,
|
|
434
|
-
end=current_words[-1].end,
|
|
435
|
-
text=segment_text,
|
|
436
|
-
words=current_words.copy(),
|
|
437
|
-
speaker=current_speaker,
|
|
438
|
-
)
|
|
439
|
-
)
|
|
411
|
+
sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
|
|
440
412
|
|
|
441
413
|
return Transcription(segments=sliced_segments, language=self.language)
|
|
442
414
|
|
|
@@ -21,7 +21,7 @@ from .effects import (
|
|
|
21
21
|
Zoom,
|
|
22
22
|
)
|
|
23
23
|
from .operation import FilterCtx, OpCategory, Operation, TimeRange
|
|
24
|
-
from .transcription_overlay import TranscriptionOverlay
|
|
24
|
+
from .transcription_overlay import SubtitleRegion, SubtitleStyle, TranscriptionOverlay
|
|
25
25
|
from .transforms import (
|
|
26
26
|
Crop,
|
|
27
27
|
CropMode,
|
|
@@ -65,6 +65,8 @@ __all__ = [
|
|
|
65
65
|
"VolumeAdjust",
|
|
66
66
|
"TextOverlay",
|
|
67
67
|
"TranscriptionOverlay",
|
|
68
|
+
"SubtitleStyle",
|
|
69
|
+
"SubtitleRegion",
|
|
68
70
|
"Shake",
|
|
69
71
|
"PunchIn",
|
|
70
72
|
"Flash",
|