lattifai 1.2.1__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/alignment/__init__.py +10 -1
- lattifai/alignment/lattice1_aligner.py +66 -58
- lattifai/alignment/punctuation.py +38 -0
- lattifai/alignment/sentence_splitter.py +152 -21
- lattifai/alignment/text_align.py +440 -0
- lattifai/alignment/tokenizer.py +82 -40
- lattifai/caption/__init__.py +82 -6
- lattifai/caption/caption.py +335 -1141
- lattifai/caption/formats/__init__.py +199 -0
- lattifai/caption/formats/base.py +211 -0
- lattifai/caption/{gemini_reader.py → formats/gemini.py} +320 -60
- lattifai/caption/formats/json.py +194 -0
- lattifai/caption/formats/lrc.py +309 -0
- lattifai/caption/formats/nle/__init__.py +9 -0
- lattifai/caption/formats/nle/audition.py +561 -0
- lattifai/caption/formats/nle/avid.py +423 -0
- lattifai/caption/formats/nle/fcpxml.py +549 -0
- lattifai/caption/formats/nle/premiere.py +589 -0
- lattifai/caption/formats/pysubs2.py +642 -0
- lattifai/caption/formats/sbv.py +147 -0
- lattifai/caption/formats/tabular.py +338 -0
- lattifai/caption/formats/textgrid.py +193 -0
- lattifai/caption/formats/ttml.py +652 -0
- lattifai/caption/formats/vtt.py +469 -0
- lattifai/caption/parsers/__init__.py +9 -0
- lattifai/caption/{text_parser.py → parsers/text_parser.py} +4 -2
- lattifai/caption/standardize.py +636 -0
- lattifai/caption/utils.py +474 -0
- lattifai/cli/__init__.py +2 -1
- lattifai/cli/caption.py +108 -1
- lattifai/cli/transcribe.py +1 -1
- lattifai/cli/youtube.py +4 -1
- lattifai/client.py +33 -113
- lattifai/config/__init__.py +11 -1
- lattifai/config/alignment.py +7 -0
- lattifai/config/caption.py +267 -23
- lattifai/config/media.py +20 -0
- lattifai/diarization/__init__.py +41 -1
- lattifai/mixin.py +27 -15
- lattifai/transcription/base.py +6 -1
- lattifai/transcription/lattifai.py +19 -54
- lattifai/utils.py +7 -13
- lattifai/workflow/__init__.py +28 -4
- lattifai/workflow/file_manager.py +2 -5
- lattifai/youtube/__init__.py +43 -0
- lattifai/youtube/client.py +1170 -0
- lattifai/youtube/types.py +23 -0
- lattifai-1.2.2.dist-info/METADATA +615 -0
- lattifai-1.2.2.dist-info/RECORD +76 -0
- {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/entry_points.txt +1 -2
- lattifai/caption/gemini_writer.py +0 -173
- lattifai/cli/app_installer.py +0 -142
- lattifai/cli/server.py +0 -44
- lattifai/server/app.py +0 -427
- lattifai/workflow/youtube.py +0 -577
- lattifai-1.2.1.dist-info/METADATA +0 -1134
- lattifai-1.2.1.dist-info/RECORD +0 -58
- {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/WHEEL +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.1.dist-info → lattifai-1.2.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""SubViewer (SBV) format handler.
|
|
2
|
+
|
|
3
|
+
SBV is YouTube's native subtitle format with the following structure:
|
|
4
|
+
0:00:00.000,0:00:02.000
|
|
5
|
+
Text line 1
|
|
6
|
+
|
|
7
|
+
0:00:02.000,0:00:04.000
|
|
8
|
+
Text line 2
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List
|
|
13
|
+
|
|
14
|
+
from ..parsers.text_parser import normalize_text as normalize_text_fn
|
|
15
|
+
from ..parsers.text_parser import parse_speaker_text
|
|
16
|
+
from ..supervision import Supervision
|
|
17
|
+
from . import register_format
|
|
18
|
+
from .base import FormatHandler
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@register_format("sbv")
class SBVFormat(FormatHandler):
    """SubViewer (SBV) format - YouTube's native format.

    A file is a sequence of cues separated by blank lines. Each cue is a
    ``start,end`` timestamp line (``H:MM:SS.mmm``) followed by its text lines.
    """

    extensions = [".sbv"]
    description = "SubViewer - YouTube native subtitle format"

    @classmethod
    def _parse_sbv_timestamp(cls, timestamp: str) -> float:
        """Parse SBV timestamp (H:MM:SS.mmm) to seconds.

        The fractional part is interpreted as a decimal fraction of a second,
        so ``0:00:01.5`` yields 1.5 rather than 1.005.

        Args:
            timestamp: Timestamp text; surrounding whitespace is ignored.

        Returns:
            The time in seconds, or ``0.0`` when the value does not consist of
            exactly three ``:``-separated fields.

        Raises:
            ValueError: If any field is not a valid integer.
        """
        parts = timestamp.strip().split(":")
        if len(parts) != 3:
            return 0.0

        hours, minutes, sec_field = parts
        sec_parts = sec_field.split(".")
        seconds = float(int(hours) * 3600 + int(minutes) * 60 + int(sec_parts[0]))
        if len(sec_parts) > 1:
            fraction = sec_parts[1]
            # Scale by the number of digits actually present instead of
            # assuming the fraction is always three digits (milliseconds).
            seconds += int(fraction) / (10 ** len(fraction))
        return seconds

    @classmethod
    def _format_sbv_timestamp(cls, seconds: float) -> str:
        """Format seconds to SBV timestamp (H:MM:SS.mmm).

        The value is rounded to the nearest millisecond before being split
        into fields. Truncating ``(seconds % 1) * 1000`` loses a millisecond
        for values such as 2.3, whose binary representation sits just below
        the decimal value. Negative inputs are clamped to zero.

        Args:
            seconds: Time offset in seconds.

        Returns:
            The timestamp formatted as ``H:MM:SS.mmm``.
        """
        total_ms = max(0, int(round(float(seconds) * 1000)))
        total_seconds, ms = divmod(total_ms, 1000)
        total_minutes, s = divmod(total_seconds, 60)
        h, m = divmod(total_minutes, 60)
        return f"{h}:{m:02d}:{s:02d}.{ms:03d}"

    @classmethod
    def read(
        cls,
        source,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Read SBV format.

        Args:
            source: SBV content, or a path to an SBV file (decided by
                ``cls.is_content``).
            normalize_text: Whether to pass each cue text through the text
                normalizer.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            One supervision per well-formed cue whose end time is after its
            start time. Malformed cues are skipped.
        """
        import re

        if cls.is_content(source):
            content = source
        else:
            # utf-8-sig transparently drops a leading BOM, which would
            # otherwise make the first timestamp unparsable.
            content = Path(source).read_text(encoding="utf-8-sig")

        # Normalize line endings and strip a BOM from in-memory content so
        # that CRLF input splits into cues the same way LF input does.
        content = content.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")

        supervisions = []
        # Cues are separated by one or more blank (or whitespace-only) lines.
        for entry in re.split(r"\n\s*\n", content.strip()):
            lines = entry.strip().split("\n")
            if len(lines) < 2:
                continue

            # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm)
            timestamp_line = lines[0].strip()
            if "," not in timestamp_line:
                continue

            try:
                start_str, end_str = timestamp_line.split(",", 1)
                start = cls._parse_sbv_timestamp(start_str)
                end = cls._parse_sbv_timestamp(end_str)

                # Remaining lines are the cue text, joined into one line.
                text = " ".join(lines[1:]).strip()
                speaker, text = parse_speaker_text(text)

                if normalize_text:
                    text = normalize_text_fn(text)

                if end > start:
                    supervisions.append(
                        Supervision(
                            text=text,
                            start=start,
                            duration=end - start,
                            speaker=speaker,
                        )
                    )
            except (ValueError, IndexError):
                # Unparsable timestamps: skip this cue, keep the rest.
                continue

        return supervisions

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        **kwargs,
    ) -> Path:
        """Write SBV format.

        Args:
            supervisions: Supervisions to serialize.
            output_path: Destination file path.
            include_speaker: Whether to prefix cue text with the speaker.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            The output path as a ``Path``.
        """
        output_path = Path(output_path)
        content = cls.to_bytes(supervisions, include_speaker=include_speaker)
        output_path.write_bytes(content)
        return output_path

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        **kwargs,
    ) -> bytes:
        """Convert to SBV format bytes.

        Args:
            supervisions: Supervisions to serialize.
            include_speaker: Whether to prefix cue text with ``speaker: ``.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            UTF-8 encoded SBV content; cues are separated by one blank line
            and there is no trailing newline.
        """
        cues = []
        for sup in supervisions:
            start_time = cls._format_sbv_timestamp(sup.start)
            end_time = cls._format_sbv_timestamp(sup.end)

            text = sup.text.strip() if sup.text else ""
            if include_speaker and sup.speaker:
                # A supervision can opt out through custom["original_speaker"];
                # a missing key counts as opted in.
                custom = getattr(sup, "custom", None)
                if not (custom and not custom.get("original_speaker", True)):
                    text = f"{sup.speaker}: {text}"

            cues.append(f"{start_time},{end_time}\n{text}")

        return "\n\n".join(cues).encode("utf-8")
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
"""Tabular and plain text format handlers.
|
|
2
|
+
|
|
3
|
+
Handles: CSV, TSV, AUD (Audacity labels), TXT. JSON now lives in json.py and is re-exported from this module for backwards compatibility.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import csv
|
|
7
|
+
import json
|
|
8
|
+
from io import StringIO
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import List
|
|
11
|
+
|
|
12
|
+
from ..parsers.text_parser import normalize_text as normalize_text_fn
|
|
13
|
+
from ..parsers.text_parser import parse_speaker_text, parse_timestamp_text
|
|
14
|
+
from ..supervision import Supervision
|
|
15
|
+
from . import register_format
|
|
16
|
+
from .base import FormatHandler
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@register_format("csv")
class CSVFormat(FormatHandler):
    """CSV (Comma-Separated Values) format.

    Format: speaker,start,end,text (with header)
    Times are in milliseconds.
    """

    extensions = [".csv"]
    description = "CSV - tabular subtitle format"

    @classmethod
    def read(
        cls,
        source,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Read CSV format.

        Columns are positional: ``speaker,start,end,text`` when the header
        names a ``speaker`` column, otherwise ``start,end,text``. Extra
        trailing cells are joined back into the text with commas.

        Args:
            source: CSV content, or a path to a CSV file (decided by
                ``cls.is_content``).
            normalize_text: Whether to pass each text through the text
                normalizer.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            One supervision per row whose end time is after its start time.
            Rows with fewer than three cells or non-numeric times are skipped.
        """
        if cls.is_content(source):
            # Drop a BOM so the first header cell still compares equal to
            # its expected name.
            rows = list(csv.reader(StringIO(source.lstrip("\ufeff"))))
        else:
            # utf-8-sig strips a BOM (written by many spreadsheet exports).
            with open(source, "r", encoding="utf-8-sig", newline="") as f:
                rows = list(csv.reader(f))

        if not rows:
            return []

        # Check for header
        first_row = [col.strip().lower() for col in rows[0]]
        has_header = "start" in first_row and "end" in first_row and "text" in first_row
        has_speaker = "speaker" in first_row

        supervisions = []
        for parts in rows[1 if has_header else 0:]:
            if len(parts) < 3:
                continue
            try:
                if has_speaker and len(parts) >= 4:
                    speaker = parts[0].strip() or None
                    start = float(parts[1]) / 1000.0
                    end = float(parts[2]) / 1000.0
                    text = ",".join(parts[3:]).strip()
                else:
                    start = float(parts[0]) / 1000.0
                    end = float(parts[1]) / 1000.0
                    text = ",".join(parts[2:]).strip()
                    speaker = None

                if normalize_text:
                    text = normalize_text_fn(text)

                if end > start:
                    supervisions.append(Supervision(text=text, start=start, duration=end - start, speaker=speaker))
            except (ValueError, IndexError):
                # Non-numeric time cell: skip the row, keep the rest.
                continue

        return supervisions

    @classmethod
    def write(cls, supervisions: List[Supervision], output_path, include_speaker: bool = True, **kwargs) -> Path:
        """Write CSV format.

        Args:
            supervisions: Supervisions to serialize.
            output_path: Destination file path.
            include_speaker: Whether to emit the ``speaker`` column.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            The output path as a ``Path``.
        """
        output_path = Path(output_path)
        content = cls.to_bytes(supervisions, include_speaker=include_speaker)
        output_path.write_bytes(content)
        return output_path

    @classmethod
    def to_bytes(cls, supervisions: List[Supervision], include_speaker: bool = True, **kwargs) -> bytes:
        """Convert to CSV format bytes.

        The speaker is written only to its own column. It is not also
        prefixed to the text, because ``read`` would then return the prefix
        as part of the text and each read/write round trip would duplicate
        the speaker once more. This matches the TSV writer.

        Args:
            supervisions: Supervisions to serialize.
            include_speaker: Whether to emit the ``speaker`` column.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            UTF-8 encoded CSV content with a header row; times are rounded
            to whole milliseconds.
        """
        output = StringIO()
        writer = csv.writer(output)

        if include_speaker:
            writer.writerow(["speaker", "start", "end", "text"])
        else:
            writer.writerow(["start", "end", "text"])

        for sup in supervisions:
            # Guard against a missing text, as the other writers do.
            row = [round(1000 * sup.start), round(1000 * sup.end), (sup.text or "").strip()]
            if include_speaker:
                speaker = (sup.speaker or "") if cls._should_include_speaker(sup, include_speaker) else ""
                row.insert(0, speaker)
            writer.writerow(row)

        return output.getvalue().encode("utf-8")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@register_format("tsv")
class TSVFormat(FormatHandler):
    """TSV (Tab-Separated Values) format.

    Format: speaker\tstart\tend\ttext (with header)
    Times are in milliseconds.
    """

    extensions = [".tsv"]
    description = "TSV - tab-separated subtitle format"

    @classmethod
    def read(cls, source, normalize_text: bool = True, **kwargs) -> List[Supervision]:
        """Read TSV format.

        Columns are positional: speaker, start, end, text when a speaker
        column is expected, otherwise start, end, text. Extra trailing cells
        are joined back into the text with tabs.

        Args:
            source: TSV content, or a path to a TSV file (decided by
                ``cls.is_content``).
            normalize_text: Whether to pass each text through the text
                normalizer.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            One supervision per row whose end time is after its start time.
            Rows with fewer than three cells or non-numeric times are skipped.
        """
        if cls.is_content(source):
            lines = source.strip().split("\n")
        else:
            # utf-8-sig strips a BOM that would otherwise corrupt the first cell.
            with open(source, "r", encoding="utf-8-sig") as f:
                lines = f.readlines()

        if not lines:
            return []

        first_line = lines[0].lstrip("\ufeff").strip().lower()
        # Compare whole header cells. A substring test on the line would
        # treat a first data row whose text merely contains the words
        # "start", "end" and "text" as a header and drop it.
        header_cells = {cell.strip() for cell in first_line.split("\t")}
        has_header = {"start", "end", "text"} <= header_cells
        # Kept as a substring test for compatibility with headerless files
        # whose first row begins with a speaker label.
        has_speaker = "speaker" in first_line

        supervisions = []
        for raw_line in lines[1 if has_header else 0:]:
            # Remove only the line terminator. Stripping tabs here would
            # delete an empty speaker or text cell and shift the columns.
            line = raw_line.rstrip("\r\n")
            if not has_speaker:
                # No leading cell may legitimately be empty in this layout,
                # so leading whitespace is noise.
                line = line.lstrip()
            if not line.strip():
                continue

            parts = line.split("\t")
            if len(parts) < 3:
                continue

            try:
                if has_speaker and len(parts) >= 4:
                    speaker = parts[0].strip() or None
                    start = float(parts[1]) / 1000.0
                    end = float(parts[2]) / 1000.0
                    text = "\t".join(parts[3:]).strip()
                else:
                    start = float(parts[0]) / 1000.0
                    end = float(parts[1]) / 1000.0
                    text = "\t".join(parts[2:]).strip()
                    speaker = None

                if normalize_text:
                    text = normalize_text_fn(text)

                if end > start:
                    supervisions.append(Supervision(text=text, start=start, duration=end - start, speaker=speaker))
            except (ValueError, IndexError):
                # Non-numeric time cell (including an unrecognized header
                # row): skip the row, keep the rest.
                continue

        return supervisions

    @classmethod
    def write(cls, supervisions: List[Supervision], output_path, include_speaker: bool = True, **kwargs) -> Path:
        """Write TSV format.

        Args:
            supervisions: Supervisions to serialize.
            output_path: Destination file path.
            include_speaker: Whether to emit the speaker column.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            The output path as a ``Path``.
        """
        output_path = Path(output_path)
        content = cls.to_bytes(supervisions, include_speaker=include_speaker)
        output_path.write_bytes(content)
        return output_path

    @classmethod
    def to_bytes(cls, supervisions: List[Supervision], include_speaker: bool = True, **kwargs) -> bytes:
        """Convert to TSV format bytes.

        Tabs and line breaks inside the text are replaced with spaces, since
        either would split the record into extra cells or rows.

        Args:
            supervisions: Supervisions to serialize.
            include_speaker: Whether to emit the speaker column.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            UTF-8 encoded TSV content with a header row; times are rounded
            to whole milliseconds.
        """
        lines = ["speaker\tstart\tend\ttext" if include_speaker else "start\tend\ttext"]

        for sup in supervisions:
            # Guard against a missing text, as the other writers do.
            text = (sup.text or "").strip()
            text = text.replace("\t", " ").replace("\r", " ").replace("\n", " ")
            timing = f"{round(1000 * sup.start)}\t{round(1000 * sup.end)}"
            if include_speaker:
                speaker = sup.speaker if cls._should_include_speaker(sup, include_speaker) else ""
                lines.append(f"{speaker}\t{timing}\t{text}")
            else:
                lines.append(f"{timing}\t{text}")

        return "\n".join(lines).encode("utf-8")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
@register_format("aud")
class AUDFormat(FormatHandler):
    """Audacity Labels format.

    Format: start\tend\t[[speaker]]text
    Times are in seconds.
    """

    extensions = [".aud", ".txt"]
    description = "Audacity Labels format"

    @classmethod
    def can_read(cls, path) -> bool:
        """Only handle .aud extension for reading.

        ``.txt`` is listed in ``extensions`` but is not claimed here.
        """
        return str(path).lower().endswith(".aud")

    @classmethod
    def read(cls, source, normalize_text: bool = True, **kwargs) -> List[Supervision]:
        """Read AUD format.

        Args:
            source: Label content, or a path to a label file (decided by
                ``cls.is_content``).
            normalize_text: Whether to pass each label text through the text
                normalizer.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            One supervision per label whose end time is after its start
            time. Lines with fewer than three tab-separated fields or
            non-numeric times are skipped.
        """
        import re

        if cls.is_content(source):
            lines = source.strip().split("\n")
        else:
            # utf-8-sig strips a BOM, which would otherwise make the first
            # start time unparsable and silently drop the first label.
            with open(source, "r", encoding="utf-8-sig") as f:
                lines = f.readlines()

        supervisions = []
        for line in lines:
            line = line.lstrip("\ufeff").strip()
            if not line:
                continue

            parts = line.split("\t")
            if len(parts) < 3:
                continue

            try:
                start = float(parts[0])
                end = float(parts[1])
                text = "\t".join(parts[2:]).strip()

                # Extract speaker from [[speaker]] prefix
                speaker = None
                speaker_match = re.match(r"^\[\[([^\]]+)\]\]\s*(.*)$", text)
                if speaker_match:
                    speaker = speaker_match.group(1)
                    text = speaker_match.group(2)

                if normalize_text:
                    text = normalize_text_fn(text)

                if end > start:
                    supervisions.append(Supervision(text=text, start=start, duration=end - start, speaker=speaker))
            except (ValueError, IndexError):
                # Non-numeric time field: skip the line, keep the rest.
                continue

        return supervisions

    @classmethod
    def write(cls, supervisions: List[Supervision], output_path, include_speaker: bool = True, **kwargs) -> Path:
        """Write AUD format.

        Args:
            supervisions: Supervisions to serialize.
            output_path: Destination file path.
            include_speaker: Whether to prefix label text with the speaker.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            The output path as a ``Path``.
        """
        output_path = Path(output_path)
        content = cls.to_bytes(supervisions, include_speaker=include_speaker)
        output_path.write_bytes(content)
        return output_path

    @classmethod
    def to_bytes(cls, supervisions: List[Supervision], include_speaker: bool = True, **kwargs) -> bytes:
        """Convert to AUD format bytes.

        Tabs and line breaks inside the text are replaced with spaces, since
        either would split the label into extra fields or lines.

        Args:
            supervisions: Supervisions to serialize.
            include_speaker: Whether to prefix label text with the speaker.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            UTF-8 encoded label lines of the form ``start<TAB>end<TAB>text``.
        """
        lines = []
        for sup in supervisions:
            # Guard against a missing text, as the other writers do.
            text = (sup.text or "").strip()
            text = text.replace("\t", " ").replace("\r", " ").replace("\n", " ")
            if cls._should_include_speaker(sup, include_speaker):
                # NOTE(review): read() recognizes a "[[speaker]]" prefix, but
                # this emits "speaker text"; confirm which form is intended.
                text = f"{sup.speaker} {text}"
            lines.append(f"{sup.start}\t{sup.end}\t{text}")

        return "\n".join(lines).encode("utf-8")
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
@register_format("txt")
class TXTFormat(FormatHandler):
    """Plain text format with optional timestamps.

    Format: [start-end] text or [start-end] [speaker]: text
    """

    extensions = [".txt"]
    description = "Plain text with optional timestamps"

    @classmethod
    def read(cls, source, normalize_text: bool = True, **kwargs) -> List[Supervision]:
        """Read TXT format.

        Each non-empty line becomes one supervision. A line with a parsable
        timestamp prefix gets ``start`` and ``duration``; any other line
        yields a supervision carrying only text and speaker.

        Args:
            source: Text content, or a path to a text file (decided by
                ``cls.is_content``).
            normalize_text: Whether to pass each line through the text
                normalizer before parsing.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            Supervisions in input order.
        """
        if cls.is_content(source):
            raw_lines = source.lstrip("\ufeff").split("\n")
        else:
            # utf-8-sig strips a BOM that would otherwise end up in the
            # first line.
            with open(source, "r", encoding="utf-8-sig") as f:
                raw_lines = f.readlines()

        # Strip every line on both paths, so in-memory content with CRLF
        # endings or indentation parses the same as a file.
        lines = [line.strip() for line in raw_lines]

        if normalize_text:
            lines = [normalize_text_fn(line) for line in lines]

        supervisions = []
        for line in lines:
            if not line:
                continue

            start, end, remaining_text = parse_timestamp_text(line)
            if start is not None and end is not None:
                speaker, text = parse_speaker_text(remaining_text)
                supervisions.append(Supervision(text=text, start=start, duration=end - start, speaker=speaker))
            else:
                # No timestamp: keep the line as untimed text.
                speaker, text = parse_speaker_text(line)
                supervisions.append(Supervision(text=text, speaker=speaker))

        return supervisions

    @classmethod
    def write(cls, supervisions: List[Supervision], output_path, include_speaker: bool = True, **kwargs) -> Path:
        """Write TXT format.

        Args:
            supervisions: Supervisions to serialize.
            output_path: Destination file path.
            include_speaker: Whether to prefix each line's text with the
                speaker.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            The output path as a ``Path``.
        """
        output_path = Path(output_path)
        content = cls.to_bytes(supervisions, include_speaker=include_speaker)
        output_path.write_bytes(content)
        return output_path

    @classmethod
    def to_bytes(cls, supervisions: List[Supervision], include_speaker: bool = True, **kwargs) -> bytes:
        """Convert to TXT format bytes.

        Args:
            supervisions: Supervisions to serialize.
            include_speaker: Whether to prefix each line's text with the
                speaker.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            UTF-8 encoded lines of the form ``[start-end] text``, with times
            printed to two decimals.
        """
        lines = []
        for sup in supervisions:
            text = sup.text or ""
            if cls._should_include_speaker(sup, include_speaker):
                text = f"{sup.speaker} {text}"
            lines.append(f"[{sup.start:.2f}-{sup.end:.2f}] {text}")

        return "\n".join(lines).encode("utf-8")
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# JSON format moved to json.py for better organization
|
|
337
|
+
# Import here for backwards compatibility
|
|
338
|
+
from .json import JSONFormat # noqa: F401
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Praat TextGrid format handler.
|
|
2
|
+
|
|
3
|
+
TextGrid is Praat's native annotation format, commonly used in phonetics research.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, List, Optional, Union
|
|
9
|
+
|
|
10
|
+
from lhotse.utils import Pathlike
|
|
11
|
+
|
|
12
|
+
from ..supervision import Supervision
|
|
13
|
+
from . import register_format
|
|
14
|
+
from .base import FormatHandler
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@register_format("textgrid")
class TextGridFormat(FormatHandler):
    """Praat TextGrid format for phonetic analysis."""

    extensions = [".textgrid"]
    description = "Praat TextGrid - phonetics research format"

    @classmethod
    def read(
        cls,
        source,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Read TextGrid format using tgt library.

        Preserves tier information in Supervision.custom:
        - textgrid_tier: Original tier name
        - textgrid_tier_index: Original tier index (for ordering)

        Args:
            source: TextGrid content, or a path to a TextGrid file (decided
                by ``cls.is_content``).
            normalize_text: Accepted for API consistency.
                NOTE(review): not applied by this reader, unlike the other
                format readers; confirm whether that is intended.
            **kwargs: Accepted for API consistency; unused.

        Returns:
            One supervision per interval across all tiers, sorted by start
            time, with the tier name as the speaker.
        """
        from tgt import read_textgrid

        if cls.is_content(source):
            # tgt only reads from a path, so spill the content to a temp
            # file. Write it as UTF-8 explicitly: the platform default
            # encoding may differ from the UTF-8 that tgt reads by default.
            with tempfile.NamedTemporaryFile(
                suffix=".textgrid", delete=False, mode="w", encoding="utf-8"
            ) as f:
                f.write(source)
                temp_path = f.name
            try:
                tgt = read_textgrid(temp_path)
            finally:
                Path(temp_path).unlink(missing_ok=True)
        else:
            tgt = read_textgrid(str(source))

        supervisions = []
        for tier_idx, tier in enumerate(tgt.tiers):
            for interval in tier.intervals:
                supervisions.append(
                    Supervision(
                        text=interval.text,
                        start=interval.start_time,
                        duration=interval.end_time - interval.start_time,
                        speaker=tier.name,
                        custom={
                            "textgrid_tier": tier.name,
                            "textgrid_tier_index": tier_idx,
                        },
                    )
                )

        return sorted(supervisions, key=lambda x: x.start)

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Path:
        """Write TextGrid format using tgt library.

        Always writes an ``utterances`` tier. A ``words`` tier is added when
        any supervision carries word-level alignment, and
        ``utterance_scores`` / ``word_scores`` tiers when scores are present.

        Args:
            supervisions: List of supervisions to write
            output_path: Output file path
            include_speaker: Whether to include speaker in text
            metadata: Optional metadata (for API consistency)
            **kwargs: Accepted for API consistency; unused.

        Returns:
            The output path as a ``Path``.
        """
        from tgt import Interval, IntervalTier, TextGrid, write_to_file

        output_path = Path(output_path)
        tg = TextGrid()

        utterances = []
        words = []
        scores = {"utterances": [], "words": []}

        for sup in sorted(supervisions, key=lambda x: x.start):
            custom = getattr(sup, "custom", None)

            text = sup.text or ""
            if include_speaker and sup.speaker:
                # A supervision can opt out through custom["original_speaker"];
                # a missing key counts as opted in.
                if not (custom and not custom.get("original_speaker", True)):
                    text = f"{sup.speaker} {text}"

            utterances.append(Interval(sup.start, sup.end, text))

            # Extract word-level alignment if present
            alignment = getattr(sup, "alignment", None)
            if alignment and "word" in alignment:
                for item in alignment["word"]:
                    words.append(Interval(item.start, item.end, item.symbol))
                    if item.score is not None:
                        scores["words"].append(Interval(item.start, item.end, f"{item.score:.2f}"))

            if custom and "score" in custom:
                scores["utterances"].append(Interval(sup.start, sup.end, f"{custom['score']:.2f}"))

        tg.add_tier(IntervalTier(name="utterances", objects=utterances))

        if words:
            tg.add_tier(IntervalTier(name="words", objects=words))

        if scores["utterances"]:
            tg.add_tier(IntervalTier(name="utterance_scores", objects=scores["utterances"]))
        if scores["words"]:
            tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))

        write_to_file(tg, str(output_path), format="long")
        return output_path

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> bytes:
        """Convert to TextGrid format bytes.

        Args:
            supervisions: List of supervisions to convert
            include_speaker: Whether to include speaker in text
            metadata: Optional metadata (currently unused, for API consistency)
            **kwargs: Forwarded to ``write``.

        Returns:
            The bytes of the TextGrid file produced by ``write``.
        """
        # TextGrid requires file I/O due to tgt library implementation:
        # reserve a temp path, let write() fill it, then read it back.
        with tempfile.NamedTemporaryFile(suffix=".textgrid", delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            cls.write(supervisions, tmp_path, include_speaker, metadata=metadata, **kwargs)
            return tmp_path.read_bytes()
        finally:
            tmp_path.unlink(missing_ok=True)

    @classmethod
    def extract_metadata(cls, source: Union[Pathlike, str], **kwargs) -> Dict[str, Any]:
        """Extract metadata from TextGrid.

        This is a best-effort text scan; the file is not parsed with tgt.

        Args:
            source: TextGrid content, or a path to a TextGrid file (decided
                by ``cls.is_content``).
            **kwargs: Accepted for API consistency; unused.

        Returns:
            Dict containing:
            - textgrid_xmin: Minimum time boundary
            - textgrid_xmax: Maximum time boundary
            - textgrid_tiers: List of tier names

            Keys are present only when found. An empty dict is returned when
            the file cannot be read as UTF-8 text.
        """
        import re

        metadata: Dict[str, Any] = {}
        if cls.is_content(source):
            content = source
        else:
            try:
                with open(source, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeError, ValueError, TypeError):
                # Unreadable or non-UTF-8 file, or an unusable path value:
                # metadata is optional, so report nothing.
                return {}

        # A well-formed decimal number (optional sign and exponent), so the
        # captured text always converts with float().
        number = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
        for bound in ("xmin", "xmax"):
            # re.search returns the first occurrence in the content.
            match = re.search(bound + r"\s*=\s*" + number, content)
            if match:
                metadata[f"textgrid_{bound}"] = float(match.group(1))

        # Extract tier names
        tier_names = re.findall(r'name\s*=\s*"([^"]+)"', content)
        if tier_names:
            metadata["textgrid_tiers"] = tier_names

        return metadata
|