videopython 0.33.4__tar.gz → 0.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {videopython-0.33.4 → videopython-0.34.0}/PKG-INFO +1 -1
  2. {videopython-0.33.4 → videopython-0.34.0}/pyproject.toml +1 -1
  3. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/transforms.py +24 -8
  4. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/image_text.py +137 -44
  5. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/transcription.py +60 -88
  6. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/__init__.py +3 -1
  7. videopython-0.34.0/src/videopython/editing/transcription_overlay.py +516 -0
  8. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/video_edit.py +116 -4
  9. videopython-0.33.4/src/videopython/editing/transcription_overlay.py +0 -186
  10. {videopython-0.33.4 → videopython-0.34.0}/.gitignore +0 -0
  11. {videopython-0.33.4 → videopython-0.34.0}/LICENSE +0 -0
  12. {videopython-0.33.4 → videopython-0.34.0}/README.md +0 -0
  13. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/__init__.py +0 -0
  14. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/__init__.py +0 -0
  15. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/_device.py +0 -0
  16. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/__init__.py +0 -0
  17. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/config.py +0 -0
  18. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/dubber.py +0 -0
  19. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/expressiveness.py +0 -0
  20. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/loudness.py +0 -0
  21. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/models.py +0 -0
  22. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
  23. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/quality.py +0 -0
  24. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/remux.py +0 -0
  25. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/timing.py +0 -0
  26. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/dubbing/voice_sample.py +0 -0
  27. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/__init__.py +0 -0
  28. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/audio.py +0 -0
  29. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/image.py +0 -0
  30. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/qwen3.py +0 -0
  31. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/translation.py +0 -0
  32. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/generation/video.py +0 -0
  33. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/__init__.py +0 -0
  34. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/audio.py +0 -0
  35. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/faces.py +0 -0
  36. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/image.py +0 -0
  37. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/separation.py +0 -0
  38. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/understanding/temporal.py +0 -0
  39. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/__init__.py +0 -0
  40. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/analyzer.py +0 -0
  41. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/models.py +0 -0
  42. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/sampling.py +0 -0
  43. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/video_analysis/stages.py +0 -0
  44. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/audio/__init__.py +0 -0
  45. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/audio/analysis.py +0 -0
  46. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/audio/audio.py +0 -0
  47. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/__init__.py +0 -0
  48. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/_dimensions.py +0 -0
  49. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/_ffmpeg.py +0 -0
  50. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/_video_io.py +0 -0
  51. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/description.py +0 -0
  52. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/exceptions.py +0 -0
  53. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
  54. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
  55. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/fonts/__init__.py +0 -0
  56. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/video.py +0 -0
  57. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/effects.py +0 -0
  58. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/operation.py +0 -0
  59. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/streaming.py +0 -0
  60. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/transforms.py +0 -0
  61. {videopython-0.33.4 → videopython-0.34.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.33.4
3
+ Version: 0.34.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.33.4"
3
+ version = "0.34.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -12,7 +12,7 @@ from tqdm import tqdm
12
12
 
13
13
  from videopython.ai.understanding.faces import FaceTracker
14
14
  from videopython.base._dimensions import floor_to_even
15
- from videopython.base.video import Video
15
+ from videopython.base.video import Video, VideoMetadata
16
16
  from videopython.editing.operation import OpCategory, Operation
17
17
 
18
18
  logger = logging.getLogger(__name__)
@@ -76,6 +76,28 @@ class FaceTrackingCrop(Operation):
76
76
  # "dynamic" — placeholder until motion/look-direction framing is implemented.
77
77
  return (face_cx, face_cy - self.headroom)
78
78
 
79
+ def _resolved_output_dims(self, w: int, h: int) -> tuple[int, int]:
80
+ """Output ``(width, height)`` after the crop + resize.
81
+
82
+ Every frame is resized to this size regardless of the per-frame face
83
+ position, so it is a pure function of the input dimensions and
84
+ ``target_aspect``. Single source of truth shared by :meth:`apply` and
85
+ :meth:`predict_metadata` (mirrors ``Resize._resolve_dims`` /
86
+ ``Crop._resolve_box``), so the dry-run cannot disagree with the render.
87
+ """
88
+ target_ratio = self.target_aspect[0] / self.target_aspect[1]
89
+ if target_ratio < w / h:
90
+ out_h = floor_to_even(h)
91
+ out_w = floor_to_even(int(out_h * target_ratio))
92
+ else:
93
+ out_w = floor_to_even(w)
94
+ out_h = floor_to_even(int(out_w / target_ratio))
95
+ return out_w, out_h
96
+
97
+ def predict_metadata(self, meta: VideoMetadata) -> VideoMetadata:
98
+ out_w, out_h = self._resolved_output_dims(meta.width, meta.height)
99
+ return meta.with_dimensions(out_w, out_h)
100
+
79
101
  def _clamp_speed(self, current: tuple[float, float], target: tuple[float, float]) -> tuple[float, float]:
80
102
  if self.max_speed is None:
81
103
  return target
@@ -135,13 +157,7 @@ class FaceTrackingCrop(Operation):
135
157
  )
136
158
 
137
159
  h, w = video.frame_shape[:2]
138
- target_ratio = self.target_aspect[0] / self.target_aspect[1]
139
- if target_ratio < w / h:
140
- out_h = floor_to_even(h)
141
- out_w = floor_to_even(int(out_h * target_ratio))
142
- else:
143
- out_w = floor_to_even(w)
144
- out_h = floor_to_even(int(out_w / target_ratio))
160
+ out_w, out_h = self._resolved_output_dims(w, h)
145
161
 
146
162
  default_x = (w - out_w) // 2
147
163
  default_y = (h - out_h) // 2
@@ -9,6 +9,7 @@ generation helpers (``ai/understanding/image.py``).
9
9
 
10
10
  from __future__ import annotations
11
11
 
12
+ from dataclasses import dataclass
12
13
  from enum import Enum
13
14
  from typing import TypeAlias
14
15
 
@@ -18,7 +19,7 @@ from PIL import Image, ImageDraw, ImageFont
18
19
  from videopython.base.exceptions import OutOfBoundsError
19
20
  from videopython.base.fonts import load_font
20
21
 
21
- __all__ = ["ImageText", "TextAlign", "AnchorPoint"]
22
+ __all__ = ["ImageText", "TextAlign", "AnchorPoint", "TextBoxRect"]
22
23
 
23
24
  # Type aliases for clarity
24
25
  MarginType: TypeAlias = int | tuple[int, int, int, int]
@@ -79,6 +80,32 @@ class AnchorPoint(str, Enum):
79
80
  return (cls.BOTTOM_LEFT, cls.BOTTOM_CENTER, cls.BOTTOM_RIGHT)
80
81
 
81
82
 
83
+ @dataclass(frozen=True)
84
+ class TextBoxRect:
85
+ """Resolved geometry of a wrapped text box, without rendering it.
86
+
87
+ Returned by :meth:`ImageText.measure_text_box` — the single source of
88
+ truth for box measure/wrap/anchor/bounds, shared by the renderer
89
+ (:meth:`ImageText.write_text_box`) and dry-run validators so they can
90
+ never disagree on whether text fits.
91
+
92
+ For a non-degenerate box ``(x, y)`` is the anchor-adjusted top-left
93
+ corner and ``width``/``height`` span the wrapped lines. For a degenerate
94
+ box (whitespace-only text → no renderable lines) ``height == 0``,
95
+ ``(x, y)`` is the *unadjusted* insertion point, and ``fits`` is ``True``;
96
+ callers short-circuit such boxes (nothing to draw). ``width`` mirrors the
97
+ resolved ``box_width`` and may be a float when an absolute >1 value was
98
+ passed, matching legacy behaviour.
99
+ """
100
+
101
+ x: float
102
+ y: float
103
+ width: int | float
104
+ height: int
105
+ fits: bool
106
+ lines: tuple[str, ...]
107
+
108
+
82
109
  class ImageText:
83
110
  def __init__(
84
111
  self,
@@ -566,6 +593,97 @@ class ImageText:
566
593
  lines = [" ".join(line) for line in split_lines]
567
594
  return lines
568
595
 
596
+ def available_region(self, margin: MarginType = 0) -> tuple[int, int, int, int]:
597
+ """The drawable area inside ``margin`` as ``(left, top, width, height)``.
598
+
599
+ Single source of truth for margin-inset geometry: used by
600
+ :meth:`measure_text_box` and by callers that need to clamp a box
601
+ within the margins without re-deriving the margin math.
602
+ """
603
+ margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
604
+ available_width = self.image_size[1] - margin_left - margin_right
605
+ available_height = self.image_size[0] - margin_top - margin_bottom
606
+ return margin_left, margin_top, available_width, available_height
607
+
608
+ def measure_text_box(
609
+ self,
610
+ text: str,
611
+ font_filename: str | None,
612
+ xy: PositionType,
613
+ box_width: int | float | None = None,
614
+ font_size: int = 11,
615
+ anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
616
+ margin: MarginType = 0,
617
+ ) -> TextBoxRect:
618
+ """Measure where a wrapped text box would land, without drawing it.
619
+
620
+ Pure: resolves margins/box-width/position, wraps the text, applies the
621
+ anchor, and bounds-checks against the image — the exact math
622
+ :meth:`write_text_box` used to do inline. Highlighting and per-line
623
+ alignment (``place``) do not change the box envelope, so they are not
624
+ parameters here; this intentionally preserves the pre-existing
625
+ behaviour that an enlarged highlighted word is *not* accounted for in
626
+ the fit check.
627
+
628
+ Returns:
629
+ A :class:`TextBoxRect`. ``fits`` is ``False`` when the box would
630
+ fall outside the image bounds (the condition that makes
631
+ :meth:`write_text_box` raise :class:`OutOfBoundsError`).
632
+
633
+ Raises:
634
+ ValueError: If ``text`` is empty, ``font_size`` is not positive,
635
+ or an absolute ``box_width`` is not positive.
636
+ """
637
+ if not text:
638
+ raise ValueError("Text cannot be empty")
639
+
640
+ if font_size <= 0:
641
+ raise ValueError("Font size must be positive")
642
+
643
+ # Process margins to determine available area (shared with callers
644
+ # that clamp boxes inside the margins -- see ``available_region``).
645
+ margin_left, margin_top, available_width, available_height = self.available_region(margin)
646
+
647
+ # Handle relative box width
648
+ if box_width is None:
649
+ box_width = available_width
650
+ elif isinstance(box_width, float) and 0 < box_width <= 1:
651
+ box_width = int(available_width * box_width)
652
+ elif isinstance(box_width, int) and box_width <= 0:
653
+ raise ValueError("Box width must be positive")
654
+
655
+ # Calculate initial position based on margin and anchor before splitting text
656
+ x_pos, y_pos = self._convert_position(xy, margin_top, margin_left, available_width, available_height)
657
+
658
+ # Split text into lines that fit within box_width
659
+ lines = self._split_lines_by_width(text, font_filename, font_size, int(box_width))
660
+
661
+ # Calculate total height of all lines
662
+ lines_height = sum(self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines)
663
+ if lines_height == 0:
664
+ # No renderable lines (e.g. whitespace-only text); position is the
665
+ # unadjusted insertion point and the box trivially "fits".
666
+ return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=0, fits=True, lines=tuple(lines))
667
+
668
+ # Final position calculation based on anchor point
669
+ if anchor in AnchorPoint.center_anchors():
670
+ x_pos -= box_width // 2
671
+ elif anchor in AnchorPoint.right_anchors():
672
+ x_pos -= box_width
673
+
674
+ if anchor in AnchorPoint.middle_anchors():
675
+ y_pos -= lines_height // 2
676
+ elif anchor in AnchorPoint.bottom_anchors():
677
+ y_pos -= lines_height
678
+
679
+ fits = not (
680
+ x_pos < 0
681
+ or y_pos < 0
682
+ or x_pos + box_width > self.image_size[1]
683
+ or y_pos + lines_height > self.image_size[0]
684
+ )
685
+ return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=lines_height, fits=fits, lines=tuple(lines))
686
+
569
687
  def write_text_box(
570
688
  self,
571
689
  text: str,
@@ -643,49 +761,24 @@ class ImageText:
643
761
  if highlight_word_index is not None and highlight_color is None:
644
762
  highlight_color = text_color
645
763
 
646
- # Process margins to determine available area
647
- margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
648
- available_width = self.image_size[1] - margin_left - margin_right
649
- available_height = self.image_size[0] - margin_top - margin_bottom
650
-
651
- # Handle relative box width
652
- if box_width is None:
653
- box_width = available_width
654
- elif isinstance(box_width, float) and 0 < box_width <= 1:
655
- box_width = int(available_width * box_width)
656
- elif isinstance(box_width, int) and box_width <= 0:
657
- raise ValueError("Box width must be positive")
658
-
659
- # Calculate initial position based on margin and anchor before splitting text
660
- x_pos, y_pos = self._convert_position(xy, margin_top, margin_left, available_width, available_height)
661
-
662
- # Split text into lines that fit within box_width
663
- lines = self._split_lines_by_width(text, font_filename, font_size, int(box_width))
664
-
665
- # Calculate total height of all lines
666
- lines_height = sum([self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines])
667
- if lines_height == 0:
668
- # If we have no valid lines or zero height, return the position
669
- return (int(x_pos), int(y_pos))
670
-
671
- # Final position calculation based on anchor point
672
- if anchor in AnchorPoint.center_anchors():
673
- x_pos -= box_width // 2
674
- elif anchor in AnchorPoint.right_anchors():
675
- x_pos -= box_width
676
-
677
- if anchor in AnchorPoint.middle_anchors():
678
- y_pos -= lines_height // 2
679
- elif anchor in AnchorPoint.bottom_anchors():
680
- y_pos -= lines_height
681
-
682
- # Verify box will fit within bounds
683
- if (
684
- x_pos < 0
685
- or y_pos < 0
686
- or x_pos + box_width > self.image_size[1]
687
- or y_pos + lines_height > self.image_size[0]
688
- ):
764
+ # Measure (single source of truth for box geometry), then render.
765
+ rect = self.measure_text_box(
766
+ text=text,
767
+ font_filename=font_filename,
768
+ xy=xy,
769
+ box_width=box_width,
770
+ font_size=font_size,
771
+ anchor=anchor,
772
+ margin=margin,
773
+ )
774
+ lines = list(rect.lines)
775
+ if rect.height == 0:
776
+ # No renderable lines (e.g. whitespace-only text); nothing to draw.
777
+ return (int(rect.x), int(rect.y))
778
+ box_width = rect.width
779
+ x_pos, y_pos = rect.x, rect.y
780
+ lines_height = rect.height
781
+ if not rect.fits:
689
782
  raise OutOfBoundsError(
690
783
  f"Text box with size ({box_width}x{lines_height}) at position ({x_pos}, {y_pos}) is out of bounds!"
691
784
  )
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass
3
+ from dataclasses import dataclass, replace
4
4
  from pathlib import Path
5
5
  from typing import Any
6
6
 
@@ -79,6 +79,38 @@ class TranscriptionSegment:
79
79
  compression_ratio=data.get("compression_ratio"),
80
80
  )
81
81
 
82
+ @classmethod
83
+ def from_words(
84
+ cls,
85
+ words: list[TranscriptionWord],
86
+ *,
87
+ speaker: str | None = None,
88
+ avg_logprob: float | None = None,
89
+ no_speech_prob: float | None = None,
90
+ compression_ratio: float | None = None,
91
+ ) -> TranscriptionSegment:
92
+ """Build a segment spanning ``words``, deriving start/end/text from them.
93
+
94
+ ``words`` must be non-empty: ``start``/``end`` come from the first/last
95
+ word and ``text`` is the words joined by single spaces. Speaker and the
96
+ confidence fields are passed through so callers re-segmenting *within* a
97
+ known source segment can preserve them; callers regrouping words across
98
+ segments (where these are ambiguous) simply omit them, leaving ``None``.
99
+ The ``words`` list is copied, so the result never aliases the caller's.
100
+ """
101
+ if not words:
102
+ raise ValueError("from_words requires a non-empty word list")
103
+ return cls(
104
+ start=words[0].start,
105
+ end=words[-1].end,
106
+ text=" ".join(w.word for w in words),
107
+ words=list(words),
108
+ speaker=speaker,
109
+ avg_logprob=avg_logprob,
110
+ no_speech_prob=no_speech_prob,
111
+ compression_ratio=compression_ratio,
112
+ )
113
+
82
114
 
83
115
  class Transcription:
84
116
  def __init__(
@@ -124,39 +156,19 @@ class Transcription:
124
156
  return []
125
157
 
126
158
  current_speaker = words[0].speaker
127
- current_words = []
128
- segment_start = words[0].start
159
+ current_words: list[TranscriptionWord] = []
129
160
  segments = []
130
161
 
131
162
  for word in words:
132
163
  if current_speaker == word.speaker:
133
164
  current_words.append(word)
134
165
  else:
135
- segment_text = " ".join(w.word for w in current_words)
136
- segments.append(
137
- TranscriptionSegment(
138
- start=segment_start,
139
- end=current_words[-1].end,
140
- text=segment_text.strip(),
141
- words=current_words.copy(),
142
- speaker=current_speaker,
143
- )
144
- )
166
+ segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
145
167
  current_speaker = word.speaker
146
168
  current_words = [word]
147
- segment_start = word.start
148
169
 
149
170
  if current_words:
150
- segment_text = " ".join(w.word for w in current_words)
151
- segments.append(
152
- TranscriptionSegment(
153
- start=segment_start,
154
- end=current_words[-1].end,
155
- text=segment_text.strip(),
156
- words=current_words.copy(),
157
- speaker=current_speaker,
158
- )
159
- )
171
+ segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
160
172
 
161
173
  return segments
162
174
 
@@ -190,22 +202,14 @@ class Transcription:
190
202
  offset_segments = []
191
203
 
192
204
  for segment in self.segments:
193
- offset_words = []
194
- for word in segment.words:
195
- offset_words.append(
196
- TranscriptionWord(
197
- start=word.start + time, end=word.end + time, word=word.word, speaker=word.speaker
198
- )
199
- )
200
-
205
+ offset_words = [
206
+ TranscriptionWord(start=w.start + time, end=w.end + time, word=w.word, speaker=w.speaker)
207
+ for w in segment.words
208
+ ]
209
+ # ``replace`` carries text, speaker, and confidence fields through a
210
+ # pure timing shift unchanged -- only timestamps move.
201
211
  offset_segments.append(
202
- TranscriptionSegment(
203
- start=segment.start + time,
204
- end=segment.end + time,
205
- text=segment.text,
206
- words=offset_words,
207
- speaker=segment.speaker,
208
- )
212
+ replace(segment, start=segment.start + time, end=segment.end + time, words=offset_words)
209
213
  )
210
214
 
211
215
  return Transcription(segments=offset_segments, language=self.language)
@@ -245,16 +249,9 @@ class Transcription:
245
249
  def _flush(words: list[TranscriptionWord]) -> None:
246
250
  if not words:
247
251
  return
248
- segment_text = " ".join(w.word for w in words)
249
- standardized_segments.append(
250
- TranscriptionSegment(
251
- start=words[0].start,
252
- end=words[-1].end,
253
- text=segment_text,
254
- words=words.copy(),
255
- speaker=words[0].speaker,
256
- )
257
- )
252
+ # Words here are regrouped across original segments, so the source
253
+ # segments' confidence fields no longer apply -- left as None.
254
+ standardized_segments.append(TranscriptionSegment.from_words(words, speaker=words[0].speaker))
258
255
 
259
256
  if time is not None:
260
257
  current_words: list[TranscriptionWord] = []
@@ -315,18 +312,9 @@ class Transcription:
315
312
  start_of_sentence = True
316
313
  new_words.append(TranscriptionWord(start=word.start, end=word.end, word=token, speaker=word.speaker))
317
314
 
318
- capitalized_segments.append(
319
- TranscriptionSegment(
320
- start=segment.start,
321
- end=segment.end,
322
- text=" ".join(w.word for w in new_words),
323
- words=new_words,
324
- speaker=segment.speaker,
325
- avg_logprob=segment.avg_logprob,
326
- no_speech_prob=segment.no_speech_prob,
327
- compression_ratio=segment.compression_ratio,
328
- )
329
- )
315
+ # Casing-only rewrite: segment boundaries, speaker, and confidence
316
+ # are unchanged; only the tokens (and joined text) differ.
317
+ capitalized_segments.append(replace(segment, text=" ".join(w.word for w in new_words), words=new_words))
330
318
 
331
319
  return Transcription(segments=capitalized_segments, language=self.language)
332
320
 
@@ -353,16 +341,17 @@ class Transcription:
353
341
  for segment in self.segments:
354
342
  words = segment.words
355
343
  if not words:
356
- chunked_segments.append(segment)
344
+ # Nothing to split; emit a fresh copy so the result never
345
+ # aliases the source segment.
346
+ chunked_segments.append(replace(segment, words=list(segment.words)))
357
347
  continue
358
348
  for i in range(0, len(words), max_words):
359
349
  group = words[i : i + max_words]
350
+ # Splitting *within* one source segment -- its confidence
351
+ # fields still apply, so carry them through.
360
352
  chunked_segments.append(
361
- TranscriptionSegment(
362
- start=group[0].start,
363
- end=group[-1].end,
364
- text=" ".join(w.word for w in group),
365
- words=list(group),
353
+ TranscriptionSegment.from_words(
354
+ group,
366
355
  speaker=segment.speaker,
367
356
  avg_logprob=segment.avg_logprob,
368
357
  no_speech_prob=segment.no_speech_prob,
@@ -409,34 +398,17 @@ class Transcription:
409
398
  if word.speaker == current_speaker:
410
399
  current_words.append(word)
411
400
  else:
412
- # Finish current segment
401
+ # Finish current segment (speaker is ambiguous across the
402
+ # original segments these words came from -- confidence omitted)
413
403
  if current_words:
414
- segment_text = " ".join(w.word for w in current_words)
415
- sliced_segments.append(
416
- TranscriptionSegment(
417
- start=current_words[0].start,
418
- end=current_words[-1].end,
419
- text=segment_text,
420
- words=current_words.copy(),
421
- speaker=current_speaker,
422
- )
423
- )
404
+ sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
424
405
  # Start new segment
425
406
  current_speaker = word.speaker
426
407
  current_words = [word]
427
408
 
428
409
  # Add final segment
429
410
  if current_words:
430
- segment_text = " ".join(w.word for w in current_words)
431
- sliced_segments.append(
432
- TranscriptionSegment(
433
- start=current_words[0].start,
434
- end=current_words[-1].end,
435
- text=segment_text,
436
- words=current_words.copy(),
437
- speaker=current_speaker,
438
- )
439
- )
411
+ sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
440
412
 
441
413
  return Transcription(segments=sliced_segments, language=self.language)
442
414
 
@@ -21,7 +21,7 @@ from .effects import (
21
21
  Zoom,
22
22
  )
23
23
  from .operation import FilterCtx, OpCategory, Operation, TimeRange
24
- from .transcription_overlay import TranscriptionOverlay
24
+ from .transcription_overlay import SubtitleRegion, SubtitleStyle, TranscriptionOverlay
25
25
  from .transforms import (
26
26
  Crop,
27
27
  CropMode,
@@ -65,6 +65,8 @@ __all__ = [
65
65
  "VolumeAdjust",
66
66
  "TextOverlay",
67
67
  "TranscriptionOverlay",
68
+ "SubtitleStyle",
69
+ "SubtitleRegion",
68
70
  "Shake",
69
71
  "PunchIn",
70
72
  "Flash",