lattifai 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +2 -3
  3. lattifai/alignment/lattice1_aligner.py +117 -4
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/segmenter.py +3 -2
  6. lattifai/alignment/text_align.py +2 -1
  7. lattifai/alignment/tokenizer.py +56 -29
  8. lattifai/audio2.py +162 -183
  9. lattifai/cli/alignment.py +5 -0
  10. lattifai/cli/caption.py +6 -6
  11. lattifai/cli/transcribe.py +1 -5
  12. lattifai/cli/youtube.py +3 -0
  13. lattifai/client.py +41 -12
  14. lattifai/config/__init__.py +21 -3
  15. lattifai/config/alignment.py +7 -0
  16. lattifai/config/caption.py +13 -243
  17. lattifai/config/client.py +16 -0
  18. lattifai/config/event.py +102 -0
  19. lattifai/config/transcription.py +25 -1
  20. lattifai/data/__init__.py +8 -0
  21. lattifai/data/caption.py +228 -0
  22. lattifai/errors.py +78 -53
  23. lattifai/event/__init__.py +65 -0
  24. lattifai/event/lattifai.py +166 -0
  25. lattifai/mixin.py +22 -17
  26. lattifai/transcription/base.py +2 -1
  27. lattifai/transcription/gemini.py +147 -16
  28. lattifai/transcription/lattifai.py +8 -11
  29. lattifai/types.py +1 -1
  30. lattifai/youtube/client.py +143 -48
  31. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/METADATA +117 -54
  32. lattifai-1.3.0.dist-info/RECORD +57 -0
  33. lattifai/__init__.py +0 -88
  34. lattifai/alignment/sentence_splitter.py +0 -350
  35. lattifai/caption/__init__.py +0 -96
  36. lattifai/caption/caption.py +0 -661
  37. lattifai/caption/formats/__init__.py +0 -199
  38. lattifai/caption/formats/base.py +0 -211
  39. lattifai/caption/formats/gemini.py +0 -722
  40. lattifai/caption/formats/json.py +0 -194
  41. lattifai/caption/formats/lrc.py +0 -309
  42. lattifai/caption/formats/nle/__init__.py +0 -9
  43. lattifai/caption/formats/nle/audition.py +0 -561
  44. lattifai/caption/formats/nle/avid.py +0 -423
  45. lattifai/caption/formats/nle/fcpxml.py +0 -549
  46. lattifai/caption/formats/nle/premiere.py +0 -589
  47. lattifai/caption/formats/pysubs2.py +0 -642
  48. lattifai/caption/formats/sbv.py +0 -147
  49. lattifai/caption/formats/tabular.py +0 -338
  50. lattifai/caption/formats/textgrid.py +0 -193
  51. lattifai/caption/formats/ttml.py +0 -652
  52. lattifai/caption/formats/vtt.py +0 -469
  53. lattifai/caption/parsers/__init__.py +0 -9
  54. lattifai/caption/parsers/text_parser.py +0 -147
  55. lattifai/caption/standardize.py +0 -636
  56. lattifai/caption/supervision.py +0 -34
  57. lattifai/caption/utils.py +0 -474
  58. lattifai-1.2.2.dist-info/RECORD +0 -76
  59. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  60. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +0 -0
  61. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  62. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,57 @@
1
+ lattifai/_init.py,sha256=ZpiYRpP7kPh6XpQK6nX1zKKvtQE3Xr0T2Ed5S4wDiwU,609
2
+ lattifai/audio2.py,sha256=JpMvwB0e09hLoffji8zOSlgrIhqUUq_zq0ubT_QMspc,15317
3
+ lattifai/client.py,sha256=d6WpodzseoSv6aA8jkj4hwiMOz3irSkD15i1hr2JUsQ,18182
4
+ lattifai/errors.py,sha256=aPTMhTaeQjY0KMifnm-Kpo9MQPsoYsEanZB2f0AA6qQ,11911
5
+ lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
6
+ lattifai/mixin.py,sha256=_d-kzu2w5Y4mZTFMEumh30baWzKsQ9n_IS432woi5vQ,26295
7
+ lattifai/types.py,sha256=JK7KVaZhX89BiKPm4okY0DWLHY1S8aj-YiZXoVH1akw,667
8
+ lattifai/utils.py,sha256=5LeunAN0OQ1jWoKMIThpXSEOxFYD2dCRTdsglosodUU,7963
9
+ lattifai/alignment/__init__.py,sha256=aOyC1P5DqESNLpDh6Gu6LyUZAVMba-IKI7Ugz7v9G4w,344
10
+ lattifai/alignment/lattice1_aligner.py,sha256=MvBpPnhfF8NYtH2ANhQszKNRQROUiSYrBz3aN1bdT3U,11120
11
+ lattifai/alignment/lattice1_worker.py,sha256=Z7hxaS-nucNsUmrphbD8tgBBYPkJOgQb-85nFON94_I,13041
12
+ lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
13
+ lattifai/alignment/punctuation.py,sha256=qLcvuXhBzoEa6bznWZiAB5TAxR6eLr_ZV-PnnCY90UA,1218
14
+ lattifai/alignment/segmenter.py,sha256=JTbBYEXn8hkFwy0tITORy7nKoUPiNYLfi3w1DJNeHZ0,6303
15
+ lattifai/alignment/text_align.py,sha256=sF-6Tsf863BhJcii3joeNa6Auv-7l3SiOhh9j8oPGME,14935
16
+ lattifai/alignment/tokenizer.py,sha256=OIpMGHg1rJ7n97zncDMPpXy32uGOSt1yXiNO4sO6eP0,18839
17
+ lattifai/cli/__init__.py,sha256=PdqoCTqRSFSWrqL3FjBTa5VzJy_e6Rq0OzyT7YkyHpc,541
18
+ lattifai/cli/alignment.py,sha256=rqg6wU2vf6RJ058yWVoXft_UJfOCrEpmE-ye5fhTphg,6129
19
+ lattifai/cli/caption.py,sha256=jkMme73sJ16dkVpRh7O6qjbr14SUeBif00vCTBn7ed0,10339
20
+ lattifai/cli/diarization.py,sha256=GTd2vnTm6cJN6Q3mFP-ShY9bZBl1_zKzWFu-4HHcMzk,4075
21
+ lattifai/cli/transcribe.py,sha256=vZIV0TCbZG_IL2F_Mg49cCGSCBinOOFAtROajVTpNWE,7853
22
+ lattifai/cli/youtube.py,sha256=FJwDl48-cuacP1sdPvX19vdszXdT7EoOZgGYzJpoLeM,6360
23
+ lattifai/config/__init__.py,sha256=nJUVk03JRj4rujoEmkCkQ8akZF7kqIj7ci3XphU9uVA,1249
24
+ lattifai/config/alignment.py,sha256=3JUtgHBueIK_lH9PgeBPjuHGL4VvDEYVs9fvylir6bc,5392
25
+ lattifai/config/caption.py,sha256=OMLsW8QKDWM6A3G5V3Gf-9bgB3D1PC5gO8LiiNNeOwM,7195
26
+ lattifai/config/client.py,sha256=qqHKFPV4iEjVHCDOuGx7kj-tYFtgZZAszOQRFsNFbO8,2359
27
+ lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
28
+ lattifai/config/event.py,sha256=P-_2yOzSATZSXz-ctlWeJQGOKCbNLFnWLBvUZ8JclyA,3845
29
+ lattifai/config/media.py,sha256=nxvgC7zeLsthCARPPUbnK2eMJY8R1d-1XgiAsy8kroA,15568
30
+ lattifai/config/transcription.py,sha256=V0WtZ_p-WsBienRbGyd-zLdX6F_XRsDWGlba_qzwet0,4115
31
+ lattifai/data/__init__.py,sha256=hdUhvlUjPgb3_Hd_cJ30f2oXHBMZRGzaSafd64b3vYA,168
32
+ lattifai/data/caption.py,sha256=MVuZiQ47Lr3A1afFqGkqFzpWjPakmsQusQ86t210Y2Y,7800
33
+ lattifai/diarization/__init__.py,sha256=-ZZ_a5hIQgnlHIOehCTtmVmWOWC2H6eOhSs4AcVtRtk,1782
34
+ lattifai/diarization/lattifai.py,sha256=tCnFL6ywITqeKR8YoCsYvyJxNoIwoC6GsnI9zkXNB-Q,3128
35
+ lattifai/event/__init__.py,sha256=PPAWzrkRK8YgWhG6CtIUkb7nH8svd9_zGOhxjz0_dcM,2448
36
+ lattifai/event/lattifai.py,sha256=QJqUxJsIWryVVoud_qE8af6zoJ89ZyPgHDvQp4OzXg0,5826
37
+ lattifai/transcription/__init__.py,sha256=vMHciyCEPKhhfM3KjMCeDqnyxU1oghF8g5o5SvpnT_4,2669
38
+ lattifai/transcription/base.py,sha256=ywRjIGg6emTx1v8PCSPyHcdugR6PvdTl10H64Iu1iqs,4617
39
+ lattifai/transcription/gemini.py,sha256=p6uZlhPQuzzUsj226Jk-INOt7NF5g4TIN6yEn1ZwrBI,18030
40
+ lattifai/transcription/lattifai.py,sha256=DA7QSN-a_yIZq79Nc_f6lf8_VWW4qqhyXfoZ1Um-31M,3451
41
+ lattifai/transcription/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
42
+ lattifai/transcription/prompts/__init__.py,sha256=G9b42COaCYv3sPPNkHsGDLOMBuVGKt4mXGYal_BYtYQ,1351
43
+ lattifai/transcription/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
44
+ lattifai/transcription/prompts/gemini/transcription_gem.txt,sha256=cljzZ--BDgnnKzqVCakr-fTp2Xk38UOsUquvruNX-LU,4600
45
+ lattifai/workflow/__init__.py,sha256=INpQgc9gZ2Fp-aTHcpR3TEHGtEtPzjOB8T7-jLzVM0E,1547
46
+ lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,38
47
+ lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
48
+ lattifai/workflow/file_manager.py,sha256=yc29Vb7JNUMJ9rwM_YjkAHfDInl8HMVAl9A7z7XiIOU,32974
49
+ lattifai/youtube/__init__.py,sha256=_uO3KCx-t6I-JaYFpcYLYpvkbmEOOni3xBqGEbExg68,1587
50
+ lattifai/youtube/client.py,sha256=VU8FC1N7YYpbc4LeJNAsahNAI1R7e3_7Yjmb1rz7tyI,52878
51
+ lattifai/youtube/types.py,sha256=80RgBmvM4tRbxqyNv9GU6hr9vPp_yhKrK0RJ_vG2h4E,472
52
+ lattifai-1.3.0.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
53
+ lattifai-1.3.0.dist-info/METADATA,sha256=WMgLRzKiJv_Zn1aoxPjWofNZRE4tSjTWxa16zWNYVTk,23008
54
+ lattifai-1.3.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
55
+ lattifai-1.3.0.dist-info/entry_points.txt,sha256=MfoqXNjXrhD7VMApHgaHmAECTcGVUMUiR0uqnTg7Ads,502
56
+ lattifai-1.3.0.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
57
+ lattifai-1.3.0.dist-info/RECORD,,
lattifai/__init__.py DELETED
@@ -1,88 +0,0 @@
1
- import os
2
- import warnings
3
- from importlib.metadata import version
4
-
5
- # Suppress SWIG deprecation warnings before any imports
6
- warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*SwigPy.*")
7
-
8
- # Suppress PyTorch transformer nested tensor warning
9
- warnings.filterwarnings("ignore", category=UserWarning, message=".*enable_nested_tensor.*")
10
-
11
- # Disable tokenizers parallelism warning
12
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
13
-
14
- # Re-export I/O classes
15
- from .caption import Caption
16
-
17
- # Re-export client classes
18
- from .client import LattifAI
19
-
20
- # Re-export config classes
21
- from .config import (
22
- AUDIO_FORMATS,
23
- MEDIA_FORMATS,
24
- VIDEO_FORMATS,
25
- AlignmentConfig,
26
- CaptionConfig,
27
- ClientConfig,
28
- DiarizationConfig,
29
- MediaConfig,
30
- )
31
- from .errors import (
32
- AlignmentError,
33
- APIError,
34
- AudioFormatError,
35
- AudioLoadError,
36
- AudioProcessingError,
37
- CaptionParseError,
38
- CaptionProcessingError,
39
- ConfigurationError,
40
- DependencyError,
41
- LatticeDecodingError,
42
- LatticeEncodingError,
43
- LattifAIError,
44
- ModelLoadError,
45
- )
46
- from .logging import get_logger, set_log_level, setup_logger
47
-
48
- try:
49
- __version__ = version("lattifai")
50
- except Exception:
51
- __version__ = "0.1.0" # fallback version
52
-
53
-
54
- __all__ = [
55
- # Client classes
56
- "LattifAI",
57
- # Config classes
58
- "AlignmentConfig",
59
- "ClientConfig",
60
- "CaptionConfig",
61
- "DiarizationConfig",
62
- "MediaConfig",
63
- "AUDIO_FORMATS",
64
- "VIDEO_FORMATS",
65
- "MEDIA_FORMATS",
66
- # Error classes
67
- "LattifAIError",
68
- "AudioProcessingError",
69
- "AudioLoadError",
70
- "AudioFormatError",
71
- "CaptionProcessingError",
72
- "CaptionParseError",
73
- "AlignmentError",
74
- "LatticeEncodingError",
75
- "LatticeDecodingError",
76
- "ModelLoadError",
77
- "DependencyError",
78
- "APIError",
79
- "ConfigurationError",
80
- # Logging
81
- "setup_logger",
82
- "get_logger",
83
- "set_log_level",
84
- # I/O
85
- "Caption",
86
- # Version
87
- "__version__",
88
- ]
@@ -1,350 +0,0 @@
1
- import re
2
- from typing import List, Optional
3
-
4
- from lattifai.alignment.punctuation import END_PUNCTUATION
5
- from lattifai.caption import Supervision
6
- from lattifai.utils import _resolve_model_path
7
-
8
-
9
- class SentenceSplitter:
10
- """Lazy-initialized sentence splitter using wtpsplit."""
11
-
12
- def __init__(self, device: str = "cpu", model_hub: Optional[str] = "modelscope", lazy_init: bool = True):
13
- """Initialize sentence splitter with lazy loading.
14
-
15
- Args:
16
- device: Device to run the model on (cpu, cuda, mps)
17
- model_hub: Model hub to use (None for huggingface, "modelscope" for modelscope)
18
- """
19
- self.device = device
20
- self.model_hub = model_hub
21
- self._splitter = None
22
- if not lazy_init:
23
- self._init_splitter()
24
-
25
- def _init_splitter(self):
26
- """Initialize the sentence splitter model on first use."""
27
- if self._splitter is not None:
28
- return
29
-
30
- import onnxruntime as ort
31
- from wtpsplit import SaT
32
-
33
- providers = []
34
- device = self.device
35
- if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
36
- providers.append("CUDAExecutionProvider")
37
- elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
38
- providers.append("MPSExecutionProvider")
39
-
40
- if self.model_hub == "modelscope":
41
- downloaded_path = _resolve_model_path("LattifAI/OmniTokenizer", model_hub="modelscope")
42
- sat = SaT(
43
- f"{downloaded_path}/sat-3l-sm",
44
- tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
45
- ort_providers=providers + ["CPUExecutionProvider"],
46
- )
47
- else:
48
- sat_path = _resolve_model_path("segment-any-text/sat-3l-sm", model_hub="huggingface")
49
- sat = SaT(
50
- sat_path,
51
- tokenizer_name_or_path="facebookAI/xlm-roberta-base",
52
- hub_prefix="segment-any-text",
53
- ort_providers=providers + ["CPUExecutionProvider"],
54
- )
55
- self._splitter = sat
56
-
57
- @staticmethod
58
- def _distribute_time_info(
59
- input_supervisions: List[Supervision],
60
- split_texts: List[str],
61
- ) -> List[Supervision]:
62
- """Distribute time information from input supervisions to split sentences.
63
-
64
- Args:
65
- input_supervisions: Original supervisions with time information
66
- split_texts: List of split sentence texts
67
-
68
- Returns:
69
- List of Supervision objects with distributed time information.
70
- Custom attributes are inherited from first_sup with conflict markers.
71
- """
72
- if not input_supervisions:
73
- return [Supervision(text=text, id="", recording_id="", start=0, duration=0) for text in split_texts]
74
-
75
- # Build concatenated input text
76
- input_text = " ".join(sup.text for sup in input_supervisions)
77
-
78
- # Pre-compute supervision position mapping for O(1) lookup
79
- # Format: [(start_pos, end_pos, supervision), ...]
80
- sup_ranges = []
81
- char_pos = 0
82
- for sup in input_supervisions:
83
- sup_start = char_pos
84
- sup_end = char_pos + len(sup.text)
85
- sup_ranges.append((sup_start, sup_end, sup))
86
- char_pos = sup_end + 1 # +1 for space separator
87
-
88
- # Process each split text
89
- result = []
90
- search_start = 0
91
- sup_idx = 0 # Track current supervision index to skip processed ones
92
-
93
- for split_text in split_texts:
94
- text_start = input_text.find(split_text, search_start)
95
- if text_start == -1:
96
- raise ValueError(f"Could not find split text '{split_text}' in input supervisions.")
97
-
98
- text_end = text_start + len(split_text)
99
- search_start = text_end
100
-
101
- # Find overlapping supervisions, starting from last used index
102
- first_sup = None
103
- last_sup = None
104
- first_char_idx = None
105
- last_char_idx = None
106
- overlapping_customs = [] # Track all custom dicts for conflict detection
107
-
108
- # Start from sup_idx, which is the first supervision that might overlap
109
- for i in range(sup_idx, len(sup_ranges)):
110
- sup_start, sup_end, sup = sup_ranges[i]
111
-
112
- # Skip if no overlap (before text_start)
113
- if sup_end <= text_start:
114
- sup_idx = i + 1 # Update starting point for next iteration
115
- continue
116
-
117
- # Stop if no overlap (after text_end)
118
- if sup_start >= text_end:
119
- break
120
-
121
- # Found overlap
122
- if first_sup is None:
123
- first_sup = sup
124
- first_char_idx = max(0, text_start - sup_start)
125
-
126
- last_sup = sup
127
- last_char_idx = min(len(sup.text) - 1, text_end - 1 - sup_start)
128
-
129
- # Collect custom dict for conflict detection
130
- if getattr(sup, "custom", None):
131
- overlapping_customs.append(sup.custom)
132
-
133
- if first_sup is None or last_sup is None:
134
- raise ValueError(f"Could not find supervisions for split text: {split_text}")
135
-
136
- # Calculate timing
137
- start_time = first_sup.start + (first_char_idx / len(first_sup.text)) * first_sup.duration
138
- end_time = last_sup.start + ((last_char_idx + 1) / len(last_sup.text)) * last_sup.duration
139
-
140
- # Inherit custom from first_sup, mark conflicts if multiple sources
141
- merged_custom = None
142
- if overlapping_customs:
143
- # Start with first_sup's custom (inherit strategy)
144
- merged_custom = overlapping_customs[0].copy() if overlapping_customs[0] else {}
145
-
146
- # Detect conflicts if multiple overlapping supervisions have different custom values
147
- if len(overlapping_customs) > 1:
148
- has_conflict = False
149
- for other_custom in overlapping_customs[1:]:
150
- if other_custom and other_custom != overlapping_customs[0]:
151
- has_conflict = True
152
- break
153
-
154
- if has_conflict:
155
- # Mark that this supervision spans multiple sources with different customs
156
- merged_custom["_split_from_multiple"] = True
157
- merged_custom["_source_count"] = len(overlapping_customs)
158
-
159
- result.append(
160
- Supervision(
161
- id="",
162
- text=split_text,
163
- start=start_time,
164
- duration=end_time - start_time,
165
- recording_id=first_sup.recording_id,
166
- custom=merged_custom,
167
- )
168
- )
169
-
170
- return result
171
-
172
- @staticmethod
173
- def _resplit_special_sentence_types(sentence: str) -> List[str]:
174
- """
175
- Re-split special sentence types.
176
-
177
- Examples:
178
- '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:']
179
- '[MUSIC] &gt;&gt; SPEAKER:' -> ['[MUSIC]', '&gt;&gt; SPEAKER:']
180
-
181
- Special handling patterns:
182
- 1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
183
- 2. Use speaker marks (&gt;&gt; or other separators) as split points
184
-
185
- Args:
186
- sentence: Input sentence string
187
-
188
- Returns:
189
- List of re-split sentences. If no special marks are found, returns the original sentence in a list
190
- """
191
- # Detect special mark patterns: [SOMETHING] &gt;&gt; SPEAKER:
192
- # or other forms like [SOMETHING] SPEAKER:
193
-
194
- # Pattern 1: [mark] HTML-encoded separator speaker:
195
- pattern1 = r"^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$"
196
- match1 = re.match(pattern1, sentence.strip())
197
- if match1:
198
- special_mark = match1.group(1)
199
- separator = match1.group(2)
200
- speaker_part = match1.group(3)
201
- return [special_mark, f"{separator} {speaker_part}"]
202
-
203
- # Pattern 2: [mark] speaker:
204
- pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
205
- match2 = re.match(pattern2, sentence.strip())
206
- if match2:
207
- special_mark = match2.group(1)
208
- speaker_label = match2.group(2)
209
- remaining = match2.group(3).strip()
210
- if remaining:
211
- return [special_mark, f"{speaker_label} {remaining}"]
212
- else:
213
- return [special_mark, speaker_label]
214
-
215
- # If no special pattern matches, return the original sentence
216
- return [sentence]
217
-
218
- def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
219
- """Split supervisions into sentences using the sentence splitter.
220
-
221
- Careful about speaker changes.
222
-
223
- Args:
224
- supervisions: List of Supervision objects to split
225
- strip_whitespace: Whether to strip whitespace from split sentences
226
-
227
- Returns:
228
- List of Supervision objects with split sentences
229
- """
230
- self._init_splitter()
231
-
232
- texts, speakers = [], []
233
- text_len, sidx = 0, 0
234
-
235
- def flush_segment(end_idx: int, speaker: Optional[str] = None):
236
- """Flush accumulated text from sidx to end_idx with given speaker."""
237
- nonlocal text_len, sidx
238
- if sidx <= end_idx:
239
- if len(speakers) < len(texts) + 1:
240
- speakers.append(speaker)
241
- text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
242
- texts.append(text)
243
- sidx = end_idx + 1
244
- text_len = 0
245
-
246
- for s, supervision in enumerate(supervisions):
247
- text_len += len(supervision.text)
248
- is_last = s == len(supervisions) - 1
249
-
250
- if supervision.speaker:
251
- # Flush previous segment without speaker (if any)
252
- if sidx < s:
253
- flush_segment(s - 1, None)
254
- text_len = len(supervision.text)
255
-
256
- # Check if we should flush this speaker's segment now
257
- next_has_speaker = not is_last and supervisions[s + 1].speaker
258
- if is_last or next_has_speaker:
259
- flush_segment(s, supervision.speaker)
260
- else:
261
- speakers.append(supervision.speaker)
262
-
263
- elif text_len >= 2000 or is_last:
264
- flush_segment(s, None)
265
-
266
- if len(speakers) != len(texts):
267
- raise ValueError(f"len(speakers)={len(speakers)} != len(texts)={len(texts)}")
268
- sentences = self._splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
269
-
270
- # First pass: collect all split texts with their speakers
271
- split_texts_with_speakers = []
272
- remainder = ""
273
- remainder_speaker = None
274
-
275
- for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
276
- # Prepend remainder from previous iteration to the first sentence
277
- if _sentences and remainder:
278
- _sentences[0] = remainder + _sentences[0]
279
- _speaker = remainder_speaker if remainder_speaker else _speaker
280
- remainder = ""
281
- remainder_speaker = None
282
-
283
- if not _sentences:
284
- continue
285
-
286
- # Process and re-split special sentence types
287
- processed_sentences = []
288
- for s, _sentence in enumerate(_sentences):
289
- if remainder:
290
- _sentence = remainder + _sentence
291
- remainder = ""
292
- # Detect and split special sentence types: e.g., '[APPLAUSE] &gt;&gt; MIRA MURATI:' -> ['[APPLAUSE]', '&gt;&gt; MIRA MURATI:'] # noqa: E501
293
- resplit_parts = self._resplit_special_sentence_types(_sentence)
294
- if any(resplit_parts[-1].endswith(sp) for sp in [":", ":"]):
295
- if s < len(_sentences) - 1:
296
- _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
297
- else: # last part
298
- remainder = resplit_parts[-1] + " "
299
- processed_sentences.extend(resplit_parts[:-1])
300
- else:
301
- processed_sentences.extend(resplit_parts)
302
- _sentences = processed_sentences
303
-
304
- if not _sentences:
305
- if remainder:
306
- _sentences, remainder = [remainder.strip()], ""
307
- else:
308
- continue
309
-
310
- if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
311
- split_texts_with_speakers.extend(
312
- (text, _speaker if s == 0 else None) for s, text in enumerate(_sentences)
313
- )
314
- _speaker = None # reset speaker after use
315
- else:
316
- split_texts_with_speakers.extend(
317
- (text, _speaker if s == 0 else None) for s, text in enumerate(_sentences[:-1])
318
- )
319
- remainder = _sentences[-1] + " " + remainder
320
- if k < len(speakers) - 1 and speakers[k + 1] is not None: # next speaker is set
321
- split_texts_with_speakers.append((remainder.strip(), _speaker if len(_sentences) == 1 else None))
322
- remainder = ""
323
- remainder_speaker = None
324
- elif len(_sentences) == 1:
325
- remainder_speaker = _speaker
326
- if k == len(speakers) - 1:
327
- pass # keep _speaker for the last supervision
328
- elif speakers[k + 1] is not None:
329
- raise ValueError(f"Expected speakers[{k + 1}] to be None, got {speakers[k + 1]}")
330
- else:
331
- speakers[k + 1] = _speaker
332
- elif len(_sentences) > 1:
333
- _speaker = None # reset speaker if sentence not ended
334
- remainder_speaker = None
335
- else:
336
- raise ValueError(f"Unexpected state: len(_sentences)={len(_sentences)}")
337
-
338
- if remainder.strip():
339
- split_texts_with_speakers.append((remainder.strip(), remainder_speaker))
340
-
341
- # Second pass: distribute time information
342
- split_texts = [text for text, _ in split_texts_with_speakers]
343
- result_supervisions = self._distribute_time_info(supervisions, split_texts)
344
-
345
- # Third pass: add speaker information
346
- for sup, (_, speaker) in zip(result_supervisions, split_texts_with_speakers):
347
- if speaker:
348
- sup.speaker = speaker
349
-
350
- return result_supervisions
@@ -1,96 +0,0 @@
1
- """Caption processing module for LattifAI.
2
-
3
- This module provides comprehensive caption/subtitle processing capabilities:
4
- - Multi-format reading and writing (SRT, VTT, ASS, TTML, etc.)
5
- - Professional NLE integration (Avid, Final Cut Pro, Premiere Pro, DaVinci Resolve)
6
- - Audio workstation support (Pro Tools, Adobe Audition)
7
- - Advanced features: timecode offset, overlap resolution, word-level timing
8
- """
9
-
10
- from ..config.caption import InputCaptionFormat, OutputCaptionFormat
11
- from .caption import Caption
12
- from .formats.gemini import GeminiReader, GeminiSegment, GeminiWriter
13
- from .formats.nle.audition import (
14
- AuditionCSVConfig,
15
- AuditionCSVWriter,
16
- EdiMarkerConfig,
17
- EdiMarkerWriter,
18
- )
19
-
20
- # Professional NLE format writers (re-exported from formats/)
21
- from .formats.nle.avid import AvidDSConfig, AvidDSWriter, FrameRate
22
- from .formats.nle.fcpxml import FCPXMLConfig, FCPXMLStyle, FCPXMLWriter
23
- from .formats.nle.premiere import PremiereXMLConfig, PremiereXMLWriter
24
- from .formats.ttml import TTMLConfig, TTMLFormat, TTMLRegion, TTMLStyle
25
- from .parsers.text_parser import normalize_text
26
- from .standardize import (
27
- CaptionStandardizer,
28
- CaptionValidator,
29
- StandardizationConfig,
30
- ValidationResult,
31
- apply_margins_to_captions,
32
- standardize_captions,
33
- )
34
- from .supervision import Supervision
35
-
36
- # Create TTMLWriter alias for backward compatibility
37
- TTMLWriter = TTMLFormat
38
-
39
- # Utility functions
40
- from .utils import (
41
- CollisionMode,
42
- TimecodeOffset,
43
- apply_timecode_offset,
44
- detect_overlaps,
45
- format_srt_timestamp,
46
- generate_srt_content,
47
- resolve_overlaps,
48
- split_long_lines,
49
- )
50
-
51
- __all__ = [
52
- # Core classes
53
- "Caption",
54
- "Supervision",
55
- # Standardization
56
- "CaptionStandardizer",
57
- "CaptionValidator",
58
- "StandardizationConfig",
59
- "ValidationResult",
60
- "standardize_captions",
61
- "apply_margins_to_captions",
62
- # Gemini format support
63
- "GeminiReader",
64
- "GeminiWriter",
65
- "GeminiSegment",
66
- # Text utilities
67
- "normalize_text",
68
- # Format types
69
- "InputCaptionFormat",
70
- "OutputCaptionFormat",
71
- # Professional format writers
72
- "AvidDSWriter",
73
- "AvidDSConfig",
74
- "FCPXMLWriter",
75
- "FCPXMLConfig",
76
- "FCPXMLStyle",
77
- "PremiereXMLWriter",
78
- "PremiereXMLConfig",
79
- "AuditionCSVWriter",
80
- "AuditionCSVConfig",
81
- "EdiMarkerWriter",
82
- "EdiMarkerConfig",
83
- "TTMLWriter",
84
- "TTMLConfig",
85
- "TTMLStyle",
86
- "TTMLRegion",
87
- # Utilities
88
- "CollisionMode",
89
- "TimecodeOffset",
90
- "apply_timecode_offset",
91
- "resolve_overlaps",
92
- "detect_overlaps",
93
- "split_long_lines",
94
- "format_srt_timestamp",
95
- "generate_srt_content",
96
- ]