lattifai 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their registry. It is provided for informational purposes only.
- lattifai/_init.py +20 -0
- lattifai/alignment/__init__.py +2 -3
- lattifai/alignment/lattice1_aligner.py +117 -4
- lattifai/alignment/lattice1_worker.py +47 -4
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/text_align.py +2 -1
- lattifai/alignment/tokenizer.py +56 -29
- lattifai/audio2.py +162 -183
- lattifai/cli/alignment.py +5 -0
- lattifai/cli/caption.py +6 -6
- lattifai/cli/transcribe.py +1 -5
- lattifai/cli/youtube.py +3 -0
- lattifai/client.py +41 -12
- lattifai/config/__init__.py +21 -3
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +13 -243
- lattifai/config/client.py +16 -0
- lattifai/config/event.py +102 -0
- lattifai/config/transcription.py +25 -1
- lattifai/data/__init__.py +8 -0
- lattifai/data/caption.py +228 -0
- lattifai/errors.py +78 -53
- lattifai/event/__init__.py +65 -0
- lattifai/event/lattifai.py +166 -0
- lattifai/mixin.py +22 -17
- lattifai/transcription/base.py +2 -1
- lattifai/transcription/gemini.py +147 -16
- lattifai/transcription/lattifai.py +8 -11
- lattifai/types.py +1 -1
- lattifai/youtube/client.py +143 -48
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/METADATA +117 -54
- lattifai-1.3.0.dist-info/RECORD +57 -0
- lattifai/__init__.py +0 -88
- lattifai/alignment/sentence_splitter.py +0 -350
- lattifai/caption/__init__.py +0 -96
- lattifai/caption/caption.py +0 -661
- lattifai/caption/formats/__init__.py +0 -199
- lattifai/caption/formats/base.py +0 -211
- lattifai/caption/formats/gemini.py +0 -722
- lattifai/caption/formats/json.py +0 -194
- lattifai/caption/formats/lrc.py +0 -309
- lattifai/caption/formats/nle/__init__.py +0 -9
- lattifai/caption/formats/nle/audition.py +0 -561
- lattifai/caption/formats/nle/avid.py +0 -423
- lattifai/caption/formats/nle/fcpxml.py +0 -549
- lattifai/caption/formats/nle/premiere.py +0 -589
- lattifai/caption/formats/pysubs2.py +0 -642
- lattifai/caption/formats/sbv.py +0 -147
- lattifai/caption/formats/tabular.py +0 -338
- lattifai/caption/formats/textgrid.py +0 -193
- lattifai/caption/formats/ttml.py +0 -652
- lattifai/caption/formats/vtt.py +0 -469
- lattifai/caption/parsers/__init__.py +0 -9
- lattifai/caption/parsers/text_parser.py +0 -147
- lattifai/caption/standardize.py +0 -636
- lattifai/caption/supervision.py +0 -34
- lattifai/caption/utils.py +0 -474
- lattifai-1.2.2.dist-info/RECORD +0 -76
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
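The listing above shows the entire `lattifai.caption` package (formats, parsers, standardization) removed, replaced by a much smaller `lattifai/data/caption.py`, alongside a new `lattifai/event` package. For downstream code, a plausible import migration, hypothetical and inferred only from these file moves (check the project's release notes for the actual path):

```python
# 1.2.2: caption types lived in the lattifai.caption package (removed in 1.3.0)
from lattifai.caption import Caption, Supervision

# 1.3.0: hypothetical equivalent, inferred from the new lattifai/data/caption.py
# and lattifai/data/__init__.py entries in the RECORD below
from lattifai.data import Caption
```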
lattifai-1.3.0.dist-info/RECORD
ADDED
@@ -0,0 +1,57 @@
+lattifai/_init.py,sha256=ZpiYRpP7kPh6XpQK6nX1zKKvtQE3Xr0T2Ed5S4wDiwU,609
+lattifai/audio2.py,sha256=JpMvwB0e09hLoffji8zOSlgrIhqUUq_zq0ubT_QMspc,15317
+lattifai/client.py,sha256=d6WpodzseoSv6aA8jkj4hwiMOz3irSkD15i1hr2JUsQ,18182
+lattifai/errors.py,sha256=aPTMhTaeQjY0KMifnm-Kpo9MQPsoYsEanZB2f0AA6qQ,11911
+lattifai/logging.py,sha256=MbUEeOUFlF92pA9v532DiPPWKl03S7UHCJ6Z652cf0w,2860
+lattifai/mixin.py,sha256=_d-kzu2w5Y4mZTFMEumh30baWzKsQ9n_IS432woi5vQ,26295
+lattifai/types.py,sha256=JK7KVaZhX89BiKPm4okY0DWLHY1S8aj-YiZXoVH1akw,667
+lattifai/utils.py,sha256=5LeunAN0OQ1jWoKMIThpXSEOxFYD2dCRTdsglosodUU,7963
+lattifai/alignment/__init__.py,sha256=aOyC1P5DqESNLpDh6Gu6LyUZAVMba-IKI7Ugz7v9G4w,344
+lattifai/alignment/lattice1_aligner.py,sha256=MvBpPnhfF8NYtH2ANhQszKNRQROUiSYrBz3aN1bdT3U,11120
+lattifai/alignment/lattice1_worker.py,sha256=Z7hxaS-nucNsUmrphbD8tgBBYPkJOgQb-85nFON94_I,13041
+lattifai/alignment/phonemizer.py,sha256=fbhN2DOl39lW4nQWKzyUUTMUabg7v61lB1kj8SKK-Sw,1761
+lattifai/alignment/punctuation.py,sha256=qLcvuXhBzoEa6bznWZiAB5TAxR6eLr_ZV-PnnCY90UA,1218
+lattifai/alignment/segmenter.py,sha256=JTbBYEXn8hkFwy0tITORy7nKoUPiNYLfi3w1DJNeHZ0,6303
+lattifai/alignment/text_align.py,sha256=sF-6Tsf863BhJcii3joeNa6Auv-7l3SiOhh9j8oPGME,14935
+lattifai/alignment/tokenizer.py,sha256=OIpMGHg1rJ7n97zncDMPpXy32uGOSt1yXiNO4sO6eP0,18839
+lattifai/cli/__init__.py,sha256=PdqoCTqRSFSWrqL3FjBTa5VzJy_e6Rq0OzyT7YkyHpc,541
+lattifai/cli/alignment.py,sha256=rqg6wU2vf6RJ058yWVoXft_UJfOCrEpmE-ye5fhTphg,6129
+lattifai/cli/caption.py,sha256=jkMme73sJ16dkVpRh7O6qjbr14SUeBif00vCTBn7ed0,10339
+lattifai/cli/diarization.py,sha256=GTd2vnTm6cJN6Q3mFP-ShY9bZBl1_zKzWFu-4HHcMzk,4075
+lattifai/cli/transcribe.py,sha256=vZIV0TCbZG_IL2F_Mg49cCGSCBinOOFAtROajVTpNWE,7853
+lattifai/cli/youtube.py,sha256=FJwDl48-cuacP1sdPvX19vdszXdT7EoOZgGYzJpoLeM,6360
+lattifai/config/__init__.py,sha256=nJUVk03JRj4rujoEmkCkQ8akZF7kqIj7ci3XphU9uVA,1249
+lattifai/config/alignment.py,sha256=3JUtgHBueIK_lH9PgeBPjuHGL4VvDEYVs9fvylir6bc,5392
+lattifai/config/caption.py,sha256=OMLsW8QKDWM6A3G5V3Gf-9bgB3D1PC5gO8LiiNNeOwM,7195
+lattifai/config/client.py,sha256=qqHKFPV4iEjVHCDOuGx7kj-tYFtgZZAszOQRFsNFbO8,2359
+lattifai/config/diarization.py,sha256=cIkwCfsYqfMns3i6tKWcwBBBkdnhhmB_Eo0TuOPCw9o,2484
+lattifai/config/event.py,sha256=P-_2yOzSATZSXz-ctlWeJQGOKCbNLFnWLBvUZ8JclyA,3845
+lattifai/config/media.py,sha256=nxvgC7zeLsthCARPPUbnK2eMJY8R1d-1XgiAsy8kroA,15568
+lattifai/config/transcription.py,sha256=V0WtZ_p-WsBienRbGyd-zLdX6F_XRsDWGlba_qzwet0,4115
+lattifai/data/__init__.py,sha256=hdUhvlUjPgb3_Hd_cJ30f2oXHBMZRGzaSafd64b3vYA,168
+lattifai/data/caption.py,sha256=MVuZiQ47Lr3A1afFqGkqFzpWjPakmsQusQ86t210Y2Y,7800
+lattifai/diarization/__init__.py,sha256=-ZZ_a5hIQgnlHIOehCTtmVmWOWC2H6eOhSs4AcVtRtk,1782
+lattifai/diarization/lattifai.py,sha256=tCnFL6ywITqeKR8YoCsYvyJxNoIwoC6GsnI9zkXNB-Q,3128
+lattifai/event/__init__.py,sha256=PPAWzrkRK8YgWhG6CtIUkb7nH8svd9_zGOhxjz0_dcM,2448
+lattifai/event/lattifai.py,sha256=QJqUxJsIWryVVoud_qE8af6zoJ89ZyPgHDvQp4OzXg0,5826
+lattifai/transcription/__init__.py,sha256=vMHciyCEPKhhfM3KjMCeDqnyxU1oghF8g5o5SvpnT_4,2669
+lattifai/transcription/base.py,sha256=ywRjIGg6emTx1v8PCSPyHcdugR6PvdTl10H64Iu1iqs,4617
+lattifai/transcription/gemini.py,sha256=p6uZlhPQuzzUsj226Jk-INOt7NF5g4TIN6yEn1ZwrBI,18030
+lattifai/transcription/lattifai.py,sha256=DA7QSN-a_yIZq79Nc_f6lf8_VWW4qqhyXfoZ1Um-31M,3451
+lattifai/transcription/prompts/README.md,sha256=X49KWSQVdjWxxWUp4R2w3ZqKrAOi6_kDNHh1hMaQ4PE,694
+lattifai/transcription/prompts/__init__.py,sha256=G9b42COaCYv3sPPNkHsGDLOMBuVGKt4mXGYal_BYtYQ,1351
+lattifai/transcription/prompts/gemini/README.md,sha256=rt7f7yDGtaobKBo95LG3u56mqa3ABOXQd0UVgJYtYuo,781
+lattifai/transcription/prompts/gemini/transcription_gem.txt,sha256=cljzZ--BDgnnKzqVCakr-fTp2Xk38UOsUquvruNX-LU,4600
+lattifai/workflow/__init__.py,sha256=INpQgc9gZ2Fp-aTHcpR3TEHGtEtPzjOB8T7-jLzVM0E,1547
+lattifai/workflow/agents.py,sha256=yEOnxnhcTvr1iOhCorNvp8B76P6nQsLRXJCu_rCYFfM,38
+lattifai/workflow/base.py,sha256=8QoVIBZwJfr5mppJbtUFafHv5QR9lL-XrULjTWD0oBg,6257
+lattifai/workflow/file_manager.py,sha256=yc29Vb7JNUMJ9rwM_YjkAHfDInl8HMVAl9A7z7XiIOU,32974
+lattifai/youtube/__init__.py,sha256=_uO3KCx-t6I-JaYFpcYLYpvkbmEOOni3xBqGEbExg68,1587
+lattifai/youtube/client.py,sha256=VU8FC1N7YYpbc4LeJNAsahNAI1R7e3_7Yjmb1rz7tyI,52878
+lattifai/youtube/types.py,sha256=80RgBmvM4tRbxqyNv9GU6hr9vPp_yhKrK0RJ_vG2h4E,472
+lattifai-1.3.0.dist-info/licenses/LICENSE,sha256=xGMLmdFJy6Jkz3Hd0znyQLmcxC93FSZB5isKnEDMoQQ,1066
+lattifai-1.3.0.dist-info/METADATA,sha256=WMgLRzKiJv_Zn1aoxPjWofNZRE4tSjTWxa16zWNYVTk,23008
+lattifai-1.3.0.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+lattifai-1.3.0.dist-info/entry_points.txt,sha256=MfoqXNjXrhD7VMApHgaHmAECTcGVUMUiR0uqnTg7Ads,502
+lattifai-1.3.0.dist-info/top_level.txt,sha256=tHSoXF26r-IGfbIP_JoYATqbmf14h5NrnNJGH4j5reI,9
+lattifai-1.3.0.dist-info/RECORD,,
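Each row in the RECORD above follows the wheel convention (PEP 376): `path,algorithm=urlsafe-b64-digest,size`, where the digest is unpadded URL-safe base64 and the RECORD file lists itself with empty hash and size fields. A minimal sketch for re-checking these entries against an unpacked wheel (the extraction root is an assumption):

```python
import base64
import csv
import hashlib
from pathlib import Path

def verify_record(root: Path) -> None:
    """Re-check every RECORD entry of a wheel unpacked under `root`."""
    record = root / "lattifai-1.3.0.dist-info" / "RECORD"
    with record.open(newline="") as f:
        for path, hash_spec, size in csv.reader(f):
            if not hash_spec:  # RECORD lists itself without hash/size
                continue
            algo, _, expected = hash_spec.partition("=")
            data = (root / path).read_bytes()
            digest = base64.urlsafe_b64encode(hashlib.new(algo, data).digest())
            assert digest.rstrip(b"=").decode() == expected, f"hash mismatch: {path}"
            assert int(size) == len(data), f"size mismatch: {path}"
```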
lattifai/__init__.py
DELETED
@@ -1,88 +0,0 @@
-import os
-import warnings
-from importlib.metadata import version
-
-# Suppress SWIG deprecation warnings before any imports
-warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*SwigPy.*")
-
-# Suppress PyTorch transformer nested tensor warning
-warnings.filterwarnings("ignore", category=UserWarning, message=".*enable_nested_tensor.*")
-
-# Disable tokenizers parallelism warning
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-# Re-export I/O classes
-from .caption import Caption
-
-# Re-export client classes
-from .client import LattifAI
-
-# Re-export config classes
-from .config import (
-    AUDIO_FORMATS,
-    MEDIA_FORMATS,
-    VIDEO_FORMATS,
-    AlignmentConfig,
-    CaptionConfig,
-    ClientConfig,
-    DiarizationConfig,
-    MediaConfig,
-)
-from .errors import (
-    AlignmentError,
-    APIError,
-    AudioFormatError,
-    AudioLoadError,
-    AudioProcessingError,
-    CaptionParseError,
-    CaptionProcessingError,
-    ConfigurationError,
-    DependencyError,
-    LatticeDecodingError,
-    LatticeEncodingError,
-    LattifAIError,
-    ModelLoadError,
-)
-from .logging import get_logger, set_log_level, setup_logger
-
-try:
-    __version__ = version("lattifai")
-except Exception:
-    __version__ = "0.1.0"  # fallback version
-
-
-__all__ = [
-    # Client classes
-    "LattifAI",
-    # Config classes
-    "AlignmentConfig",
-    "ClientConfig",
-    "CaptionConfig",
-    "DiarizationConfig",
-    "MediaConfig",
-    "AUDIO_FORMATS",
-    "VIDEO_FORMATS",
-    "MEDIA_FORMATS",
-    # Error classes
-    "LattifAIError",
-    "AudioProcessingError",
-    "AudioLoadError",
-    "AudioFormatError",
-    "CaptionProcessingError",
-    "CaptionParseError",
-    "AlignmentError",
-    "LatticeEncodingError",
-    "LatticeDecodingError",
-    "ModelLoadError",
-    "DependencyError",
-    "APIError",
-    "ConfigurationError",
-    # Logging
-    "setup_logger",
-    "get_logger",
-    "set_log_level",
-    # I/O
-    "Caption",
-    # Version
-    "__version__",
-]
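Note that the deleted `__init__.py` registered its warning filters and set `TOKENIZERS_PARALLELISM` before any heavy imports. That ordering matters: `warnings.filterwarnings` only affects warnings raised after it runs. A standalone illustration of the pattern (not lattifai code):

```python
import warnings

# Register the filter first: it only applies to warnings raised afterwards,
# which is why the deleted __init__.py did this before importing anything heavy.
warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*SwigPy.*")

warnings.warn("SwigPyObject is deprecated", DeprecationWarning)  # silenced
warnings.warn("other API is deprecated", DeprecationWarning)  # not matched; default rules apply
```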
lattifai/alignment/sentence_splitter.py
DELETED
@@ -1,350 +0,0 @@
-import re
-from typing import List, Optional
-
-from lattifai.alignment.punctuation import END_PUNCTUATION
-from lattifai.caption import Supervision
-from lattifai.utils import _resolve_model_path
-
-
-class SentenceSplitter:
-    """Lazy-initialized sentence splitter using wtpsplit."""
-
-    def __init__(self, device: str = "cpu", model_hub: Optional[str] = "modelscope", lazy_init: bool = True):
-        """Initialize sentence splitter with lazy loading.
-
-        Args:
-            device: Device to run the model on (cpu, cuda, mps)
-            model_hub: Model hub to use (None for huggingface, "modelscope" for modelscope)
-        """
-        self.device = device
-        self.model_hub = model_hub
-        self._splitter = None
-        if not lazy_init:
-            self._init_splitter()
-
-    def _init_splitter(self):
-        """Initialize the sentence splitter model on first use."""
-        if self._splitter is not None:
-            return
-
-        import onnxruntime as ort
-        from wtpsplit import SaT
-
-        providers = []
-        device = self.device
-        if device.startswith("cuda") and ort.get_all_providers().count("CUDAExecutionProvider") > 0:
-            providers.append("CUDAExecutionProvider")
-        elif device.startswith("mps") and ort.get_all_providers().count("MPSExecutionProvider") > 0:
-            providers.append("MPSExecutionProvider")
-
-        if self.model_hub == "modelscope":
-            downloaded_path = _resolve_model_path("LattifAI/OmniTokenizer", model_hub="modelscope")
-            sat = SaT(
-                f"{downloaded_path}/sat-3l-sm",
-                tokenizer_name_or_path=f"{downloaded_path}/xlm-roberta-base",
-                ort_providers=providers + ["CPUExecutionProvider"],
-            )
-        else:
-            sat_path = _resolve_model_path("segment-any-text/sat-3l-sm", model_hub="huggingface")
-            sat = SaT(
-                sat_path,
-                tokenizer_name_or_path="facebookAI/xlm-roberta-base",
-                hub_prefix="segment-any-text",
-                ort_providers=providers + ["CPUExecutionProvider"],
-            )
-        self._splitter = sat
-
-    @staticmethod
-    def _distribute_time_info(
-        input_supervisions: List[Supervision],
-        split_texts: List[str],
-    ) -> List[Supervision]:
-        """Distribute time information from input supervisions to split sentences.
-
-        Args:
-            input_supervisions: Original supervisions with time information
-            split_texts: List of split sentence texts
-
-        Returns:
-            List of Supervision objects with distributed time information.
-            Custom attributes are inherited from first_sup with conflict markers.
-        """
-        if not input_supervisions:
-            return [Supervision(text=text, id="", recording_id="", start=0, duration=0) for text in split_texts]
-
-        # Build concatenated input text
-        input_text = " ".join(sup.text for sup in input_supervisions)
-
-        # Pre-compute supervision position mapping for O(1) lookup
-        # Format: [(start_pos, end_pos, supervision), ...]
-        sup_ranges = []
-        char_pos = 0
-        for sup in input_supervisions:
-            sup_start = char_pos
-            sup_end = char_pos + len(sup.text)
-            sup_ranges.append((sup_start, sup_end, sup))
-            char_pos = sup_end + 1  # +1 for space separator
-
-        # Process each split text
-        result = []
-        search_start = 0
-        sup_idx = 0  # Track current supervision index to skip processed ones
-
-        for split_text in split_texts:
-            text_start = input_text.find(split_text, search_start)
-            if text_start == -1:
-                raise ValueError(f"Could not find split text '{split_text}' in input supervisions.")
-
-            text_end = text_start + len(split_text)
-            search_start = text_end
-
-            # Find overlapping supervisions, starting from last used index
-            first_sup = None
-            last_sup = None
-            first_char_idx = None
-            last_char_idx = None
-            overlapping_customs = []  # Track all custom dicts for conflict detection
-
-            # Start from sup_idx, which is the first supervision that might overlap
-            for i in range(sup_idx, len(sup_ranges)):
-                sup_start, sup_end, sup = sup_ranges[i]
-
-                # Skip if no overlap (before text_start)
-                if sup_end <= text_start:
-                    sup_idx = i + 1  # Update starting point for next iteration
-                    continue
-
-                # Stop if no overlap (after text_end)
-                if sup_start >= text_end:
-                    break
-
-                # Found overlap
-                if first_sup is None:
-                    first_sup = sup
-                    first_char_idx = max(0, text_start - sup_start)
-
-                last_sup = sup
-                last_char_idx = min(len(sup.text) - 1, text_end - 1 - sup_start)
-
-                # Collect custom dict for conflict detection
-                if getattr(sup, "custom", None):
-                    overlapping_customs.append(sup.custom)
-
-            if first_sup is None or last_sup is None:
-                raise ValueError(f"Could not find supervisions for split text: {split_text}")
-
-            # Calculate timing
-            start_time = first_sup.start + (first_char_idx / len(first_sup.text)) * first_sup.duration
-            end_time = last_sup.start + ((last_char_idx + 1) / len(last_sup.text)) * last_sup.duration
-
-            # Inherit custom from first_sup, mark conflicts if multiple sources
-            merged_custom = None
-            if overlapping_customs:
-                # Start with first_sup's custom (inherit strategy)
-                merged_custom = overlapping_customs[0].copy() if overlapping_customs[0] else {}
-
-                # Detect conflicts if multiple overlapping supervisions have different custom values
-                if len(overlapping_customs) > 1:
-                    has_conflict = False
-                    for other_custom in overlapping_customs[1:]:
-                        if other_custom and other_custom != overlapping_customs[0]:
-                            has_conflict = True
-                            break
-
-                    if has_conflict:
-                        # Mark that this supervision spans multiple sources with different customs
-                        merged_custom["_split_from_multiple"] = True
-                        merged_custom["_source_count"] = len(overlapping_customs)
-
-            result.append(
-                Supervision(
-                    id="",
-                    text=split_text,
-                    start=start_time,
-                    duration=end_time - start_time,
-                    recording_id=first_sup.recording_id,
-                    custom=merged_custom,
-                )
-            )
-
-        return result
-
-    @staticmethod
-    def _resplit_special_sentence_types(sentence: str) -> List[str]:
-        """
-        Re-split special sentence types.
-
-        Examples:
-            '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']
-            '[MUSIC] >> SPEAKER:' -> ['[MUSIC]', '>> SPEAKER:']
-
-        Special handling patterns:
-        1. Separate special marks at the beginning (e.g., [APPLAUSE], [MUSIC], etc.) from subsequent speaker marks
-        2. Use speaker marks (>> or other separators) as split points
-
-        Args:
-            sentence: Input sentence string
-
-        Returns:
-            List of re-split sentences. If no special marks are found, returns the original sentence in a list
-        """
-        # Detect special mark patterns: [SOMETHING] >> SPEAKER:
-        # or other forms like [SOMETHING] SPEAKER:
-
-        # Pattern 1: [mark] HTML-encoded separator speaker:
-        pattern1 = r"^(\[[^\]]+\])\s+(&gt;&gt;|>>)\s+(.+)$"
-        match1 = re.match(pattern1, sentence.strip())
-        if match1:
-            special_mark = match1.group(1)
-            separator = match1.group(2)
-            speaker_part = match1.group(3)
-            return [special_mark, f"{separator} {speaker_part}"]
-
-        # Pattern 2: [mark] speaker:
-        pattern2 = r"^(\[[^\]]+\])\s+([^:]+:)(.*)$"
-        match2 = re.match(pattern2, sentence.strip())
-        if match2:
-            special_mark = match2.group(1)
-            speaker_label = match2.group(2)
-            remaining = match2.group(3).strip()
-            if remaining:
-                return [special_mark, f"{speaker_label} {remaining}"]
-            else:
-                return [special_mark, speaker_label]
-
-        # If no special pattern matches, return the original sentence
-        return [sentence]
-
-    def split_sentences(self, supervisions: List[Supervision], strip_whitespace=True) -> List[Supervision]:
-        """Split supervisions into sentences using the sentence splitter.
-
-        Careful about speaker changes.
-
-        Args:
-            supervisions: List of Supervision objects to split
-            strip_whitespace: Whether to strip whitespace from split sentences
-
-        Returns:
-            List of Supervision objects with split sentences
-        """
-        self._init_splitter()
-
-        texts, speakers = [], []
-        text_len, sidx = 0, 0
-
-        def flush_segment(end_idx: int, speaker: Optional[str] = None):
-            """Flush accumulated text from sidx to end_idx with given speaker."""
-            nonlocal text_len, sidx
-            if sidx <= end_idx:
-                if len(speakers) < len(texts) + 1:
-                    speakers.append(speaker)
-                text = " ".join(sup.text for sup in supervisions[sidx : end_idx + 1])
-                texts.append(text)
-                sidx = end_idx + 1
-                text_len = 0
-
-        for s, supervision in enumerate(supervisions):
-            text_len += len(supervision.text)
-            is_last = s == len(supervisions) - 1
-
-            if supervision.speaker:
-                # Flush previous segment without speaker (if any)
-                if sidx < s:
-                    flush_segment(s - 1, None)
-                    text_len = len(supervision.text)
-
-                # Check if we should flush this speaker's segment now
-                next_has_speaker = not is_last and supervisions[s + 1].speaker
-                if is_last or next_has_speaker:
-                    flush_segment(s, supervision.speaker)
-                else:
-                    speakers.append(supervision.speaker)
-
-            elif text_len >= 2000 or is_last:
-                flush_segment(s, None)
-
-        if len(speakers) != len(texts):
-            raise ValueError(f"len(speakers)={len(speakers)} != len(texts)={len(texts)}")
-        sentences = self._splitter.split(texts, threshold=0.15, strip_whitespace=strip_whitespace, batch_size=8)
-
-        # First pass: collect all split texts with their speakers
-        split_texts_with_speakers = []
-        remainder = ""
-        remainder_speaker = None
-
-        for k, (_speaker, _sentences) in enumerate(zip(speakers, sentences)):
-            # Prepend remainder from previous iteration to the first sentence
-            if _sentences and remainder:
-                _sentences[0] = remainder + _sentences[0]
-                _speaker = remainder_speaker if remainder_speaker else _speaker
-                remainder = ""
-                remainder_speaker = None
-
-            if not _sentences:
-                continue
-
-            # Process and re-split special sentence types
-            processed_sentences = []
-            for s, _sentence in enumerate(_sentences):
-                if remainder:
-                    _sentence = remainder + _sentence
-                    remainder = ""
-                # Detect and split special sentence types: e.g., '[APPLAUSE] >> MIRA MURATI:' -> ['[APPLAUSE]', '>> MIRA MURATI:']  # noqa: E501
-                resplit_parts = self._resplit_special_sentence_types(_sentence)
-                if any(resplit_parts[-1].endswith(sp) for sp in [":", "："]):
-                    if s < len(_sentences) - 1:
-                        _sentences[s + 1] = resplit_parts[-1] + " " + _sentences[s + 1]
-                    else:  # last part
-                        remainder = resplit_parts[-1] + " "
-                    processed_sentences.extend(resplit_parts[:-1])
-                else:
-                    processed_sentences.extend(resplit_parts)
-            _sentences = processed_sentences
-
-            if not _sentences:
-                if remainder:
-                    _sentences, remainder = [remainder.strip()], ""
-                else:
-                    continue
-
-            if any(_sentences[-1].endswith(ep) for ep in END_PUNCTUATION):
-                split_texts_with_speakers.extend(
-                    (text, _speaker if s == 0 else None) for s, text in enumerate(_sentences)
-                )
-                _speaker = None  # reset speaker after use
-            else:
-                split_texts_with_speakers.extend(
-                    (text, _speaker if s == 0 else None) for s, text in enumerate(_sentences[:-1])
-                )
-                remainder = _sentences[-1] + " " + remainder
-                if k < len(speakers) - 1 and speakers[k + 1] is not None:  # next speaker is set
-                    split_texts_with_speakers.append((remainder.strip(), _speaker if len(_sentences) == 1 else None))
-                    remainder = ""
-                    remainder_speaker = None
-                elif len(_sentences) == 1:
-                    remainder_speaker = _speaker
-                    if k == len(speakers) - 1:
-                        pass  # keep _speaker for the last supervision
-                    elif speakers[k + 1] is not None:
-                        raise ValueError(f"Expected speakers[{k + 1}] to be None, got {speakers[k + 1]}")
-                    else:
-                        speakers[k + 1] = _speaker
-                elif len(_sentences) > 1:
-                    _speaker = None  # reset speaker if sentence not ended
-                    remainder_speaker = None
-                else:
-                    raise ValueError(f"Unexpected state: len(_sentences)={len(_sentences)}")
-
-        if remainder.strip():
-            split_texts_with_speakers.append((remainder.strip(), remainder_speaker))
-
-        # Second pass: distribute time information
-        split_texts = [text for text, _ in split_texts_with_speakers]
-        result_supervisions = self._distribute_time_info(supervisions, split_texts)
-
-        # Third pass: add speaker information
-        for sup, (_, speaker) in zip(result_supervisions, split_texts_with_speakers):
-            if speaker:
-                sup.speaker = speaker
-
-        return result_supervisions
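The deleted `_distribute_time_info` assigns each split sentence a time span by linear interpolation over character offsets within the first and last overlapping supervisions. A self-contained sketch of that timing rule (`Segment` is a hypothetical stand-in for lattifai's `Supervision`):

```python
from dataclasses import dataclass

@dataclass
class Segment:  # hypothetical stand-in for lattifai's Supervision
    text: str
    start: float
    duration: float

def interpolate_span(seg: Segment, first_char: int, last_char: int) -> tuple[float, float]:
    """Map the character range [first_char, last_char] to a time span,
    proportionally to position in the text, as _distribute_time_info does."""
    start = seg.start + (first_char / len(seg.text)) * seg.duration
    end = seg.start + ((last_char + 1) / len(seg.text)) * seg.duration
    return start, end

# A 10-character segment spanning 0.0-5.0s: characters 5..9 map to 2.5-5.0s.
print(interpolate_span(Segment("0123456789", 0.0, 5.0), 5, 9))  # (2.5, 5.0)
```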
lattifai/caption/__init__.py
DELETED
@@ -1,96 +0,0 @@
-"""Caption processing module for LattifAI.
-
-This module provides comprehensive caption/subtitle processing capabilities:
-- Multi-format reading and writing (SRT, VTT, ASS, TTML, etc.)
-- Professional NLE integration (Avid, Final Cut Pro, Premiere Pro, DaVinci Resolve)
-- Audio workstation support (Pro Tools, Adobe Audition)
-- Advanced features: timecode offset, overlap resolution, word-level timing
-"""
-
-from ..config.caption import InputCaptionFormat, OutputCaptionFormat
-from .caption import Caption
-from .formats.gemini import GeminiReader, GeminiSegment, GeminiWriter
-from .formats.nle.audition import (
-    AuditionCSVConfig,
-    AuditionCSVWriter,
-    EdiMarkerConfig,
-    EdiMarkerWriter,
-)
-
-# Professional NLE format writers (re-exported from formats/)
-from .formats.nle.avid import AvidDSConfig, AvidDSWriter, FrameRate
-from .formats.nle.fcpxml import FCPXMLConfig, FCPXMLStyle, FCPXMLWriter
-from .formats.nle.premiere import PremiereXMLConfig, PremiereXMLWriter
-from .formats.ttml import TTMLConfig, TTMLFormat, TTMLRegion, TTMLStyle
-from .parsers.text_parser import normalize_text
-from .standardize import (
-    CaptionStandardizer,
-    CaptionValidator,
-    StandardizationConfig,
-    ValidationResult,
-    apply_margins_to_captions,
-    standardize_captions,
-)
-from .supervision import Supervision
-
-# Create TTMLWriter alias for backward compatibility
-TTMLWriter = TTMLFormat
-
-# Utility functions
-from .utils import (
-    CollisionMode,
-    TimecodeOffset,
-    apply_timecode_offset,
-    detect_overlaps,
-    format_srt_timestamp,
-    generate_srt_content,
-    resolve_overlaps,
-    split_long_lines,
-)
-
-__all__ = [
-    # Core classes
-    "Caption",
-    "Supervision",
-    # Standardization
-    "CaptionStandardizer",
-    "CaptionValidator",
-    "StandardizationConfig",
-    "ValidationResult",
-    "standardize_captions",
-    "apply_margins_to_captions",
-    # Gemini format support
-    "GeminiReader",
-    "GeminiWriter",
-    "GeminiSegment",
-    # Text utilities
-    "normalize_text",
-    # Format types
-    "InputCaptionFormat",
-    "OutputCaptionFormat",
-    # Professional format writers
-    "AvidDSWriter",
-    "AvidDSConfig",
-    "FCPXMLWriter",
-    "FCPXMLConfig",
-    "FCPXMLStyle",
-    "PremiereXMLWriter",
-    "PremiereXMLConfig",
-    "AuditionCSVWriter",
-    "AuditionCSVConfig",
-    "EdiMarkerWriter",
-    "EdiMarkerConfig",
-    "TTMLWriter",
-    "TTMLConfig",
-    "TTMLStyle",
-    "TTMLRegion",
-    # Utilities
-    "CollisionMode",
-    "TimecodeOffset",
-    "apply_timecode_offset",
-    "resolve_overlaps",
-    "detect_overlaps",
-    "split_long_lines",
-    "format_srt_timestamp",
-    "generate_srt_content",
-]