lattifai 1.2.2__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. lattifai/_init.py +20 -0
  2. lattifai/alignment/__init__.py +2 -3
  3. lattifai/alignment/lattice1_aligner.py +117 -4
  4. lattifai/alignment/lattice1_worker.py +47 -4
  5. lattifai/alignment/segmenter.py +3 -2
  6. lattifai/alignment/text_align.py +2 -1
  7. lattifai/alignment/tokenizer.py +56 -29
  8. lattifai/audio2.py +162 -183
  9. lattifai/cli/alignment.py +5 -0
  10. lattifai/cli/caption.py +6 -6
  11. lattifai/cli/transcribe.py +1 -5
  12. lattifai/cli/youtube.py +3 -0
  13. lattifai/client.py +41 -12
  14. lattifai/config/__init__.py +21 -3
  15. lattifai/config/alignment.py +7 -0
  16. lattifai/config/caption.py +13 -243
  17. lattifai/config/client.py +16 -0
  18. lattifai/config/event.py +102 -0
  19. lattifai/config/transcription.py +25 -1
  20. lattifai/data/__init__.py +8 -0
  21. lattifai/data/caption.py +228 -0
  22. lattifai/errors.py +78 -53
  23. lattifai/event/__init__.py +65 -0
  24. lattifai/event/lattifai.py +166 -0
  25. lattifai/mixin.py +22 -17
  26. lattifai/transcription/base.py +2 -1
  27. lattifai/transcription/gemini.py +147 -16
  28. lattifai/transcription/lattifai.py +8 -11
  29. lattifai/types.py +1 -1
  30. lattifai/youtube/client.py +143 -48
  31. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/METADATA +117 -54
  32. lattifai-1.3.0.dist-info/RECORD +57 -0
  33. lattifai/__init__.py +0 -88
  34. lattifai/alignment/sentence_splitter.py +0 -350
  35. lattifai/caption/__init__.py +0 -96
  36. lattifai/caption/caption.py +0 -661
  37. lattifai/caption/formats/__init__.py +0 -199
  38. lattifai/caption/formats/base.py +0 -211
  39. lattifai/caption/formats/gemini.py +0 -722
  40. lattifai/caption/formats/json.py +0 -194
  41. lattifai/caption/formats/lrc.py +0 -309
  42. lattifai/caption/formats/nle/__init__.py +0 -9
  43. lattifai/caption/formats/nle/audition.py +0 -561
  44. lattifai/caption/formats/nle/avid.py +0 -423
  45. lattifai/caption/formats/nle/fcpxml.py +0 -549
  46. lattifai/caption/formats/nle/premiere.py +0 -589
  47. lattifai/caption/formats/pysubs2.py +0 -642
  48. lattifai/caption/formats/sbv.py +0 -147
  49. lattifai/caption/formats/tabular.py +0 -338
  50. lattifai/caption/formats/textgrid.py +0 -193
  51. lattifai/caption/formats/ttml.py +0 -652
  52. lattifai/caption/formats/vtt.py +0 -469
  53. lattifai/caption/parsers/__init__.py +0 -9
  54. lattifai/caption/parsers/text_parser.py +0 -147
  55. lattifai/caption/standardize.py +0 -636
  56. lattifai/caption/supervision.py +0 -34
  57. lattifai/caption/utils.py +0 -474
  58. lattifai-1.2.2.dist-info/RECORD +0 -76
  59. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/WHEEL +0 -0
  60. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/entry_points.txt +0 -0
  61. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/licenses/LICENSE +0 -0
  62. {lattifai-1.2.2.dist-info → lattifai-1.3.0.dist-info}/top_level.txt +0 -0
@@ -1,147 +0,0 @@
1
- """SubViewer (SBV) format handler.
2
-
3
- SBV is YouTube's native subtitle format with the following structure:
4
- 0:00:00.000,0:00:02.000
5
- Text line 1
6
-
7
- 0:00:02.000,0:00:04.000
8
- Text line 2
9
- """
10
-
11
- from pathlib import Path
12
- from typing import List
13
-
14
- from ..parsers.text_parser import normalize_text as normalize_text_fn
15
- from ..parsers.text_parser import parse_speaker_text
16
- from ..supervision import Supervision
17
- from . import register_format
18
- from .base import FormatHandler
19
-
20
-
21
@register_format("sbv")
class SBVFormat(FormatHandler):
    """SubViewer (SBV) format - YouTube's native format.

    Each entry is a ``start,end`` timestamp line followed by one or more text
    lines; entries are separated by a blank line.
    """

    extensions = [".sbv"]
    description = "SubViewer - YouTube native subtitle format"

    @classmethod
    def _parse_sbv_timestamp(cls, timestamp: str) -> float:
        """Parse an SBV timestamp (``H:MM:SS.mmm``) into seconds.

        The fractional part is read positionally, so ``0:00:01.5`` is 1.5
        seconds (not 1.005) and ``0:00:01.500`` is the same value.

        Args:
            timestamp: Timestamp text; surrounding whitespace is ignored.

        Returns:
            The time in seconds, or ``0.0`` when the text does not consist of
            exactly three colon-separated fields.

        Raises:
            ValueError: If any field is not an integer literal.
        """
        parts = timestamp.strip().split(":")
        if len(parts) != 3:
            return 0.0

        hours, minutes, sec_field = parts
        sec_parts = sec_field.split(".")
        seconds = float(int(hours) * 3600 + int(minutes) * 60 + int(sec_parts[0]))
        if len(sec_parts) > 1:
            fraction = sec_parts[1]
            # Scale by the number of digits actually present instead of
            # assuming exactly three (milliseconds).
            seconds += int(fraction) / 10 ** len(fraction)
        return seconds

    @classmethod
    def _format_sbv_timestamp(cls, seconds: float) -> str:
        """Format seconds as an SBV timestamp (``H:MM:SS.mmm``).

        The value is rounded to the nearest millisecond first, so binary
        floating-point error (e.g. ``1.001`` stored as ``1.00099...``) cannot
        drop a millisecond, and a rounded-up fraction carries into the
        seconds/minutes/hours fields.
        """
        total_ms = int(round(seconds * 1000))
        total_seconds, ms = divmod(total_ms, 1000)
        total_minutes, s = divmod(total_seconds, 60)
        h, m = divmod(total_minutes, 60)
        return f"{h}:{m:02d}:{s:02d}.{ms:03d}"

    @classmethod
    def read(
        cls,
        source,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Read SBV captions into a list of supervisions.

        Args:
            source: Either the SBV text itself or a path to an SBV file; the
                inherited ``is_content`` helper decides which.
            normalize_text: When true, pass each caption's text through
                ``normalize_text_fn``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            One ``Supervision`` per entry whose end time is after its start
            time. Entries that are malformed or fail to parse are skipped.
        """
        if cls.is_content(source):
            content = source
        else:
            content = Path(source).read_text(encoding="utf-8")

        # Content passed in directly may still carry CRLF / CR line endings,
        # which would otherwise hide the blank-line entry separators.
        content = content.replace("\r\n", "\n").replace("\r", "\n")

        supervisions = []
        for entry in content.strip().split("\n\n"):
            lines = entry.strip().split("\n")
            if len(lines) < 2:
                continue

            # First line: timestamp pair (H:MM:SS.mmm,H:MM:SS.mmm)
            timestamp_line = lines[0].strip()
            if "," not in timestamp_line:
                continue

            try:
                start_str, end_str = timestamp_line.split(",", 1)
                start = cls._parse_sbv_timestamp(start_str)
                end = cls._parse_sbv_timestamp(end_str)

                # Remaining lines are the caption text, folded onto one line.
                text = " ".join(lines[1:]).strip()
                speaker, text = parse_speaker_text(text)

                if normalize_text:
                    text = normalize_text_fn(text)

                if end > start:
                    supervisions.append(
                        Supervision(
                            text=text,
                            start=start,
                            duration=end - start,
                            speaker=speaker,
                        )
                    )
            except (ValueError, IndexError):
                # Best effort: a single bad entry must not abort the file.
                continue

        return supervisions

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        **kwargs,
    ) -> Path:
        """Write supervisions to ``output_path`` in SBV format.

        Returns:
            ``output_path`` as a ``Path``.
        """
        output_path = Path(output_path)
        output_path.write_bytes(cls.to_bytes(supervisions, include_speaker=include_speaker))
        return output_path

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        **kwargs,
    ) -> bytes:
        """Serialize supervisions to UTF-8 encoded SBV text.

        Args:
            supervisions: Captions to serialize, in output order.
            include_speaker: When true, prefix the text with ``"<speaker>: "``
                for supervisions that have a speaker, unless the
                supervision's ``custom`` mapping sets ``original_speaker`` to
                a falsy value.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            The encoded document; entries are separated by one blank line and
            there is no trailing newline.
        """
        entries = []
        for sup in supervisions:
            start_time = cls._format_sbv_timestamp(sup.start)
            end_time = cls._format_sbv_timestamp(sup.end)

            text = sup.text.strip() if sup.text else ""
            if include_speaker and sup.speaker:
                custom = getattr(sup, "custom", None)
                # Only an explicit falsy ``original_speaker`` flag suppresses
                # the speaker prefix.
                if not custom or custom.get("original_speaker", True):
                    text = f"{sup.speaker}: {text}"

            entries.append(f"{start_time},{end_time}\n{text}")

        return "\n\n".join(entries).encode("utf-8")
@@ -1,338 +0,0 @@
1
- """Tabular and plain text format handlers.
2
-
3
- Handles: CSV, TSV, AUD (Audacity labels), TXT, JSON
4
- """
5
-
6
- import csv
7
- import json
8
- from io import StringIO
9
- from pathlib import Path
10
- from typing import List
11
-
12
- from ..parsers.text_parser import normalize_text as normalize_text_fn
13
- from ..parsers.text_parser import parse_speaker_text, parse_timestamp_text
14
- from ..supervision import Supervision
15
- from . import register_format
16
- from .base import FormatHandler
17
-
18
-
19
@register_format("csv")
class CSVFormat(FormatHandler):
    """CSV (Comma-Separated Values) format.

    Format: speaker,start,end,text (with header)
    Times are in milliseconds.
    """

    extensions = [".csv"]
    description = "CSV - tabular subtitle format"

    @classmethod
    def read(
        cls,
        source,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Read CSV captions into a list of supervisions.

        Args:
            source: Either the CSV text itself or a path to a CSV file; the
                inherited ``is_content`` helper decides which.
            normalize_text: When true, pass each row's text through
                ``normalize_text_fn``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            One ``Supervision`` per row whose end time is after its start
            time. Rows with fewer than three columns or non-numeric times
            are skipped.
        """
        if cls.is_content(source):
            rows = list(csv.reader(StringIO(source)))
        else:
            with open(source, "r", encoding="utf-8", newline="") as f:
                rows = list(csv.reader(f))

        if not rows:
            return []

        # A header is recognised by exact (case-insensitive) column names.
        first_line = [col.strip().lower() for col in rows[0]]
        has_header = "start" in first_line and "end" in first_line and "text" in first_line
        has_speaker = "speaker" in first_line

        supervisions = []
        for parts in rows[1 if has_header else 0:]:
            if len(parts) < 3:
                continue
            try:
                if has_speaker and len(parts) >= 4:
                    # Layout: speaker, start, end, text...
                    speaker = parts[0].strip() or None
                    start = float(parts[1]) / 1000.0
                    end = float(parts[2]) / 1000.0
                    text = ",".join(parts[3:]).strip()
                else:
                    # Layout: start, end, text...
                    start = float(parts[0]) / 1000.0
                    end = float(parts[1]) / 1000.0
                    text = ",".join(parts[2:]).strip()
                    speaker = None

                if normalize_text:
                    text = normalize_text_fn(text)

                if end > start:
                    supervisions.append(Supervision(text=text, start=start, duration=end - start, speaker=speaker))
            except (ValueError, IndexError):
                # Best effort: a single bad row must not abort the file.
                continue

        return supervisions

    @classmethod
    def write(cls, supervisions: List[Supervision], output_path, include_speaker: bool = True, **kwargs) -> Path:
        """Write supervisions to ``output_path`` in CSV format.

        Returns:
            ``output_path`` as a ``Path``.
        """
        output_path = Path(output_path)
        output_path.write_bytes(cls.to_bytes(supervisions, include_speaker=include_speaker))
        return output_path

    @classmethod
    def to_bytes(cls, supervisions: List[Supervision], include_speaker: bool = True, **kwargs) -> bytes:
        """Serialize supervisions to UTF-8 encoded CSV with a header row.

        Times are written as integer milliseconds. A supervision whose text
        is ``None`` is written with an empty text field instead of raising.

        With ``include_speaker`` the rows are ``speaker,start,end,text``; the
        speaker column is always filled from ``sup.speaker`` and, when the
        inherited ``_should_include_speaker`` helper returns true, the speaker
        is additionally prefixed to the text column.
        NOTE(review): that prefix means ``read`` returns the speaker twice
        (column and text) - confirm this is intended.
        """
        output = StringIO()
        writer = csv.writer(output)

        if include_speaker:
            writer.writerow(["speaker", "start", "end", "text"])
            for sup in supervisions:
                text = (sup.text or "").strip()
                if cls._should_include_speaker(sup, include_speaker):
                    text = f"{sup.speaker} {text}"
                writer.writerow([sup.speaker or "", round(1000 * sup.start), round(1000 * sup.end), text])
        else:
            writer.writerow(["start", "end", "text"])
            for sup in supervisions:
                writer.writerow([round(1000 * sup.start), round(1000 * sup.end), (sup.text or "").strip()])

        return output.getvalue().encode("utf-8")
108
-
109
-
110
@register_format("tsv")
class TSVFormat(FormatHandler):
    """TSV (Tab-Separated Values) format.

    Format: speaker, start, end, text separated by tabs (with header).
    Times are in milliseconds.
    """

    extensions = [".tsv"]
    description = "TSV - tab-separated subtitle format"

    @staticmethod
    def _starts_with_times(line: str) -> bool:
        """Return True when the first two tab-separated fields are numbers."""
        parts = line.strip().split("\t")
        if len(parts) < 2:
            return False
        try:
            float(parts[0])
            float(parts[1])
        except ValueError:
            return False
        return True

    @classmethod
    def read(cls, source, normalize_text: bool = True, **kwargs) -> List[Supervision]:
        """Read TSV captions into a list of supervisions.

        Args:
            source: Either the TSV text itself or a path to a TSV file; the
                inherited ``is_content`` helper decides which.
            normalize_text: When true, pass each row's text through
                ``normalize_text_fn``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            One ``Supervision`` per row whose end time is after its start
            time. Blank rows, rows with fewer than three columns and rows
            with non-numeric times are skipped.
        """
        if cls.is_content(source):
            lines = source.strip().split("\n")
        else:
            with open(source, "r", encoding="utf-8") as f:
                lines = f.readlines()

        if not lines:
            return []

        first_line = lines[0].strip().lower()
        # A first row that begins with two numbers is caption data, even if
        # its text happens to contain the words "start", "end", "text" or
        # "speaker"; it must not be mistaken for (and dropped as) a header.
        first_is_data = cls._starts_with_times(lines[0])
        has_header = (
            not first_is_data and "start" in first_line and "end" in first_line and "text" in first_line
        )
        has_speaker = not first_is_data and "speaker" in first_line

        supervisions = []
        for line in lines[1 if has_header else 0:]:
            line = line.strip()
            if not line:
                continue

            parts = line.split("\t")
            if len(parts) < 3:
                continue

            try:
                if has_speaker and len(parts) >= 4:
                    # Layout: speaker, start, end, text...
                    speaker = parts[0].strip() or None
                    start = float(parts[1]) / 1000.0
                    end = float(parts[2]) / 1000.0
                    text = "\t".join(parts[3:]).strip()
                else:
                    # Layout: start, end, text...
                    start = float(parts[0]) / 1000.0
                    end = float(parts[1]) / 1000.0
                    text = "\t".join(parts[2:]).strip()
                    speaker = None

                if normalize_text:
                    text = normalize_text_fn(text)

                if end > start:
                    supervisions.append(Supervision(text=text, start=start, duration=end - start, speaker=speaker))
            except (ValueError, IndexError):
                # Best effort: a single bad row must not abort the file.
                continue

        return supervisions

    @classmethod
    def write(cls, supervisions: List[Supervision], output_path, include_speaker: bool = True, **kwargs) -> Path:
        """Write supervisions to ``output_path`` in TSV format.

        Returns:
            ``output_path`` as a ``Path``.
        """
        output_path = Path(output_path)
        output_path.write_bytes(cls.to_bytes(supervisions, include_speaker=include_speaker))
        return output_path

    @classmethod
    def to_bytes(cls, supervisions: List[Supervision], include_speaker: bool = True, **kwargs) -> bytes:
        """Serialize supervisions to UTF-8 encoded TSV with a header row.

        Times are written as integer milliseconds and tabs inside the text
        are replaced by spaces so they cannot split the text column. A
        supervision whose text is ``None`` is written with an empty text
        field instead of raising. With ``include_speaker`` the speaker column
        is filled only when the inherited ``_should_include_speaker`` helper
        returns true.
        """
        lines = []
        if include_speaker:
            lines.append("speaker\tstart\tend\ttext")
            for sup in supervisions:
                speaker = sup.speaker if cls._should_include_speaker(sup, include_speaker) else ""
                text = (sup.text or "").strip().replace("\t", " ")
                lines.append(f"{speaker}\t{round(1000 * sup.start)}\t{round(1000 * sup.end)}\t{text}")
        else:
            lines.append("start\tend\ttext")
            for sup in supervisions:
                text = (sup.text or "").strip().replace("\t", " ")
                lines.append(f"{round(1000 * sup.start)}\t{round(1000 * sup.end)}\t{text}")

        return "\n".join(lines).encode("utf-8")
196
-
197
-
198
@register_format("aud")
class AUDFormat(FormatHandler):
    """Audacity Labels format.

    Format: start, end and ``[[speaker]]text`` separated by tabs.
    Times are in seconds.
    """

    extensions = [".aud", ".txt"]
    description = "Audacity Labels format"

    @classmethod
    def can_read(cls, path) -> bool:
        """Only handle .aud extension for reading."""
        return str(path).lower().endswith(".aud")

    @classmethod
    def read(cls, source, normalize_text: bool = True, **kwargs) -> List[Supervision]:
        """Read Audacity label rows into a list of supervisions.

        Args:
            source: Either the label text itself or a path to a label file;
                the inherited ``is_content`` helper decides which.
            normalize_text: When true, pass each label's text through
                ``normalize_text_fn``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            One ``Supervision`` per row whose end time is after its start
            time. Blank rows, rows with fewer than three columns and rows
            with non-numeric times are skipped.
        """
        import re

        if cls.is_content(source):
            lines = source.strip().split("\n")
        else:
            with open(source, "r", encoding="utf-8") as f:
                lines = f.readlines()

        supervisions = []
        for line in lines:
            line = line.strip()
            if not line:
                continue

            parts = line.split("\t")
            if len(parts) < 3:
                continue

            try:
                start = float(parts[0])
                end = float(parts[1])
                text = "\t".join(parts[2:]).strip()

                # Extract speaker from a leading [[speaker]] prefix
                speaker = None
                speaker_match = re.match(r"^\[\[([^\]]+)\]\]\s*(.*)$", text)
                if speaker_match:
                    speaker = speaker_match.group(1)
                    text = speaker_match.group(2)

                if normalize_text:
                    text = normalize_text_fn(text)

                if end > start:
                    supervisions.append(Supervision(text=text, start=start, duration=end - start, speaker=speaker))
            except (ValueError, IndexError):
                # Best effort: a single bad row must not abort the file.
                continue

        return supervisions

    @classmethod
    def write(cls, supervisions: List[Supervision], output_path, include_speaker: bool = True, **kwargs) -> Path:
        """Write supervisions to ``output_path`` in Audacity label format.

        Returns:
            ``output_path`` as a ``Path``.
        """
        output_path = Path(output_path)
        output_path.write_bytes(cls.to_bytes(supervisions, include_speaker=include_speaker))
        return output_path

    @classmethod
    def to_bytes(cls, supervisions: List[Supervision], include_speaker: bool = True, **kwargs) -> bytes:
        """Serialize supervisions to UTF-8 encoded Audacity label rows.

        Tabs inside the text are replaced by spaces so they cannot split the
        label column. A supervision whose text is ``None`` is written with an
        empty label instead of raising. When the inherited
        ``_should_include_speaker`` helper returns true the label becomes
        ``"<speaker> <text>"``.
        NOTE(review): ``read`` only recognises a ``[[speaker]]`` prefix, so a
        speaker written here is not recovered as a speaker on re-read unless
        ``sup.speaker`` already carries that markup - confirm the intent.
        """
        lines = []
        for sup in supervisions:
            text = (sup.text or "").strip().replace("\t", " ")
            if cls._should_include_speaker(sup, include_speaker):
                text = f"{sup.speaker} {text}"
            lines.append(f"{sup.start}\t{sup.end}\t{text}")

        return "\n".join(lines).encode("utf-8")
276
-
277
-
278
@register_format("txt")
class TXTFormat(FormatHandler):
    """Plain text format with optional timestamps.

    Format: [start-end] text or [start-end] [speaker]: text
    """

    extensions = [".txt"]
    description = "Plain text with optional timestamps"

    @classmethod
    def read(cls, source, normalize_text: bool = True, **kwargs) -> List[Supervision]:
        """Read plain-text captions, one supervision per non-empty line.

        Args:
            source: Either the text itself or a path to a text file; the
                inherited ``is_content`` helper decides which.
            normalize_text: When true, pass every line through
                ``normalize_text_fn`` before it is parsed.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A ``Supervision`` for each non-empty line. Lines for which
            ``parse_timestamp_text`` yields both a start and an end carry
            ``start``/``duration``; other lines carry text and speaker only.
        """
        if cls.is_content(source):
            raw_lines = source.strip().split("\n")
        else:
            with open(source, "r", encoding="utf-8") as f:
                raw_lines = f.readlines()

        # Strip every line on both input paths so whitespace-only lines and
        # stray carriage returns never turn into captions.
        lines = [line.strip() for line in raw_lines]

        if normalize_text:
            lines = [normalize_text_fn(line) for line in lines]

        supervisions = []
        for line in lines:
            if not line:
                continue

            start, end, remaining_text = parse_timestamp_text(line)
            if start is not None and end is not None:
                speaker, text = parse_speaker_text(remaining_text)
                supervisions.append(Supervision(text=text, start=start, duration=end - start, speaker=speaker))
            else:
                # No usable timestamp: keep the line as untimed text.
                speaker, text = parse_speaker_text(line)
                supervisions.append(Supervision(text=text, speaker=speaker))

        return supervisions

    @classmethod
    def write(cls, supervisions: List[Supervision], output_path, include_speaker: bool = True, **kwargs) -> Path:
        """Write supervisions to ``output_path`` as timestamped plain text.

        Returns:
            ``output_path`` as a ``Path``.
        """
        output_path = Path(output_path)
        output_path.write_bytes(cls.to_bytes(supervisions, include_speaker=include_speaker))
        return output_path

    @classmethod
    def to_bytes(cls, supervisions: List[Supervision], include_speaker: bool = True, **kwargs) -> bytes:
        """Serialize supervisions to UTF-8 text, one ``[start-end] text`` line each.

        Times are written in seconds with two decimals. When the inherited
        ``_should_include_speaker`` helper returns true the text becomes
        ``"<speaker> <text>"``.
        """
        lines = []
        for sup in supervisions:
            text = sup.text or ""
            if cls._should_include_speaker(sup, include_speaker):
                text = f"{sup.speaker} {text}"
            lines.append(f"[{sup.start:.2f}-{sup.end:.2f}] {text}")

        return "\n".join(lines).encode("utf-8")
334
-
335
-
336
- # JSON format moved to json.py for better organization
337
- # Import here for backwards compatibility
338
- from .json import JSONFormat # noqa: F401
@@ -1,193 +0,0 @@
1
- """Praat TextGrid format handler.
2
-
3
- TextGrid is Praat's native annotation format, commonly used in phonetics research.
4
- """
5
-
6
- import tempfile
7
- from pathlib import Path
8
- from typing import Any, Dict, List, Optional, Union
9
-
10
- from lhotse.utils import Pathlike
11
-
12
- from ..supervision import Supervision
13
- from . import register_format
14
- from .base import FormatHandler
15
-
16
-
17
@register_format("textgrid")
class TextGridFormat(FormatHandler):
    """Praat TextGrid format for phonetic analysis."""

    extensions = [".textgrid"]
    description = "Praat TextGrid - phonetics research format"

    @classmethod
    def read(
        cls,
        source,
        normalize_text: bool = True,
        **kwargs,
    ) -> List[Supervision]:
        """Read TextGrid format using tgt library.

        Preserves tier information in Supervision.custom:
        - textgrid_tier: Original tier name
        - textgrid_tier_index: Original tier index (for ordering)

        Args:
            source: Either the TextGrid text itself or a path to a TextGrid
                file; the inherited ``is_content`` helper decides which.
            normalize_text: Accepted for interface compatibility; interval
                text is returned as stored and is not normalized here.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            One ``Supervision`` per interval of every tier, with the tier
            name as speaker, sorted by start time.
        """
        from tgt import read_textgrid

        if cls.is_content(source):
            # The tgt reader works on files, so spill the content to a
            # temporary file. It is written as UTF-8 explicitly so that
            # non-ASCII labels do not depend on the platform's locale
            # encoding.
            tmp = tempfile.NamedTemporaryFile(suffix=".textgrid", delete=False, mode="w", encoding="utf-8")
            temp_path = Path(tmp.name)
            try:
                with tmp:
                    tmp.write(source)
                grid = read_textgrid(str(temp_path))
            finally:
                # Also runs when the write itself fails, so the temporary
                # file is never left behind.
                temp_path.unlink(missing_ok=True)
        else:
            grid = read_textgrid(str(source))

        supervisions = []
        for tier_idx, tier in enumerate(grid.tiers):
            for interval in tier.intervals:
                supervisions.append(
                    Supervision(
                        text=interval.text,
                        start=interval.start_time,
                        duration=interval.end_time - interval.start_time,
                        speaker=tier.name,
                        custom={
                            "textgrid_tier": tier.name,
                            "textgrid_tier_index": tier_idx,
                        },
                    )
                )

        return sorted(supervisions, key=lambda x: x.start)

    @classmethod
    def write(
        cls,
        supervisions: List[Supervision],
        output_path,
        include_speaker: bool = True,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Path:
        """Write TextGrid format using tgt library.

        An ``utterances`` tier is always written. ``words``,
        ``utterance_scores`` and ``word_scores`` tiers are added only when
        the supervisions provide word alignments or scores.

        Args:
            supervisions: List of supervisions to write
            output_path: Output file path
            include_speaker: Whether to include speaker in text
            metadata: Optional metadata (for API consistency)

        Returns:
            ``output_path`` as a ``Path``.
        """
        from tgt import Interval, IntervalTier, TextGrid, write_to_file

        output_path = Path(output_path)
        tg = TextGrid()

        utterances = []
        words = []
        utterance_scores = []
        word_scores = []

        for sup in sorted(supervisions, key=lambda x: x.start):
            custom = getattr(sup, "custom", None)

            text = sup.text or ""
            if include_speaker and sup.speaker:
                # Only an explicit falsy ``original_speaker`` flag suppresses
                # the speaker prefix.
                if not custom or custom.get("original_speaker", True):
                    text = f"{sup.speaker} {text}"

            utterances.append(Interval(sup.start, sup.end, text))

            # Extract word-level alignment if present
            alignment = getattr(sup, "alignment", None)
            if alignment and "word" in alignment:
                for item in alignment["word"]:
                    words.append(Interval(item.start, item.end, item.symbol))
                    if item.score is not None:
                        word_scores.append(Interval(item.start, item.end, f"{item.score:.2f}"))

            if custom and "score" in custom:
                utterance_scores.append(Interval(sup.start, sup.end, f"{custom['score']:.2f}"))

        tg.add_tier(IntervalTier(name="utterances", objects=utterances))

        if words:
            tg.add_tier(IntervalTier(name="words", objects=words))

        if utterance_scores:
            tg.add_tier(IntervalTier(name="utterance_scores", objects=utterance_scores))
        if word_scores:
            tg.add_tier(IntervalTier(name="word_scores", objects=word_scores))

        write_to_file(tg, str(output_path), format="long")
        return output_path

    @classmethod
    def to_bytes(
        cls,
        supervisions: List[Supervision],
        include_speaker: bool = True,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> bytes:
        """Convert to TextGrid format bytes.

        Args:
            supervisions: List of supervisions to convert
            include_speaker: Whether to include speaker in text
            metadata: Optional metadata (currently unused, for API consistency)

        Returns:
            The bytes of the file produced by ``write``.
        """
        # TextGrid requires file I/O due to tgt library implementation
        with tempfile.NamedTemporaryFile(suffix=".textgrid", delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            cls.write(supervisions, tmp_path, include_speaker, metadata=metadata, **kwargs)
            return tmp_path.read_bytes()
        finally:
            tmp_path.unlink(missing_ok=True)

    @classmethod
    def extract_metadata(cls, source: Union[Pathlike, str], **kwargs) -> Dict[str, Any]:
        """Extract metadata from TextGrid.

        The values are taken from the raw text with regular expressions; the
        first ``xmin`` / ``xmax`` assignment found is used.

        Returns:
            Dict containing (each key only when found):
            - textgrid_xmin: Minimum time boundary
            - textgrid_xmax: Maximum time boundary
            - textgrid_tiers: List of tier names
            An empty dict is returned when the file cannot be read.
        """
        import re

        if cls.is_content(source):
            content = source
        else:
            try:
                with open(source, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, ValueError, TypeError):
                # Metadata is best effort: an unreadable or undecodable file
                # simply yields no metadata.
                return {}

        metadata: Dict[str, Any] = {}

        match = re.search(r"xmin\s*=\s*([\d.]+)", content)
        if match:
            metadata["textgrid_xmin"] = float(match.group(1))
        match = re.search(r"xmax\s*=\s*([\d.]+)", content)
        if match:
            metadata["textgrid_xmax"] = float(match.group(1))

        # Extract tier names
        tier_names = re.findall(r'name\s*=\s*"([^"]+)"', content)
        if tier_names:
            metadata["textgrid_tiers"] = tier_names

        return metadata