lattifai 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +10 -0
- lattifai/alignment/lattice1_aligner.py +64 -15
- lattifai/alignment/lattice1_worker.py +135 -50
- lattifai/alignment/segmenter.py +3 -2
- lattifai/alignment/tokenizer.py +14 -13
- lattifai/audio2.py +269 -70
- lattifai/caption/caption.py +213 -19
- lattifai/cli/__init__.py +2 -0
- lattifai/cli/alignment.py +2 -1
- lattifai/cli/app_installer.py +35 -33
- lattifai/cli/caption.py +9 -19
- lattifai/cli/diarization.py +108 -0
- lattifai/cli/server.py +3 -1
- lattifai/cli/transcribe.py +55 -38
- lattifai/cli/youtube.py +1 -0
- lattifai/client.py +42 -121
- lattifai/config/alignment.py +37 -2
- lattifai/config/caption.py +1 -1
- lattifai/config/media.py +23 -3
- lattifai/config/transcription.py +4 -0
- lattifai/diarization/lattifai.py +18 -7
- lattifai/errors.py +7 -3
- lattifai/mixin.py +45 -16
- lattifai/server/app.py +2 -1
- lattifai/transcription/__init__.py +1 -1
- lattifai/transcription/base.py +21 -2
- lattifai/transcription/gemini.py +127 -1
- lattifai/transcription/lattifai.py +30 -2
- lattifai/utils.py +96 -28
- lattifai/workflow/file_manager.py +15 -13
- lattifai/workflow/youtube.py +16 -1
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/METADATA +86 -22
- lattifai-1.1.0.dist-info/RECORD +57 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/entry_points.txt +2 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/licenses/LICENSE +1 -1
- lattifai-1.0.4.dist-info/RECORD +0 -56
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/WHEEL +0 -0
- {lattifai-1.0.4.dist-info → lattifai-1.1.0.dist-info}/top_level.txt +0 -0
lattifai/caption/caption.py
CHANGED
|
@@ -4,17 +4,19 @@ import json
|
|
|
4
4
|
import re
|
|
5
5
|
from dataclasses import dataclass, field
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Any, Dict, List, Optional,
|
|
7
|
+
from typing import Any, Dict, List, Optional, TypeVar
|
|
8
8
|
|
|
9
9
|
from lhotse.supervision import AlignmentItem
|
|
10
10
|
from lhotse.utils import Pathlike
|
|
11
11
|
from tgt import TextGrid
|
|
12
12
|
|
|
13
|
-
from ..config.caption import InputCaptionFormat, OutputCaptionFormat
|
|
13
|
+
from ..config.caption import InputCaptionFormat, OutputCaptionFormat # noqa: F401
|
|
14
14
|
from .supervision import Supervision
|
|
15
15
|
from .text_parser import normalize_text as normalize_text_fn
|
|
16
16
|
from .text_parser import parse_speaker_text, parse_timestamp_text
|
|
17
17
|
|
|
18
|
+
DiarizationOutput = TypeVar("DiarizationOutput")
|
|
19
|
+
|
|
18
20
|
|
|
19
21
|
@dataclass
|
|
20
22
|
class Caption:
|
|
@@ -40,7 +42,7 @@ class Caption:
|
|
|
40
42
|
# Audio Event Detection results
|
|
41
43
|
audio_events: Optional[TextGrid] = None
|
|
42
44
|
# Speaker Diarization results
|
|
43
|
-
speaker_diarization: Optional[
|
|
45
|
+
speaker_diarization: Optional[DiarizationOutput] = None
|
|
44
46
|
# Alignment results
|
|
45
47
|
alignments: List[Supervision] = field(default_factory=list)
|
|
46
48
|
|
|
@@ -272,7 +274,7 @@ class Caption:
|
|
|
272
274
|
cls,
|
|
273
275
|
transcription: List[Supervision],
|
|
274
276
|
audio_events: Optional[TextGrid] = None,
|
|
275
|
-
speaker_diarization: Optional[
|
|
277
|
+
speaker_diarization: Optional[DiarizationOutput] = None,
|
|
276
278
|
language: Optional[str] = None,
|
|
277
279
|
source_path: Optional[Pathlike] = None,
|
|
278
280
|
metadata: Optional[Dict[str, str]] = None,
|
|
@@ -283,7 +285,7 @@ class Caption:
|
|
|
283
285
|
Args:
|
|
284
286
|
transcription: List of transcription supervision segments
|
|
285
287
|
audio_events: Optional TextGrid with audio event detection results
|
|
286
|
-
speaker_diarization: Optional
|
|
288
|
+
speaker_diarization: Optional DiarizationOutput with speaker diarization results
|
|
287
289
|
language: Language code
|
|
288
290
|
source_path: Source file path
|
|
289
291
|
metadata: Additional metadata
|
|
@@ -307,7 +309,7 @@ class Caption:
|
|
|
307
309
|
cls,
|
|
308
310
|
path: Pathlike,
|
|
309
311
|
format: Optional[str] = None,
|
|
310
|
-
normalize_text: bool =
|
|
312
|
+
normalize_text: bool = True,
|
|
311
313
|
) -> "Caption":
|
|
312
314
|
"""
|
|
313
315
|
Read caption file and return Caption object.
|
|
@@ -384,9 +386,9 @@ class Caption:
|
|
|
384
386
|
"""
|
|
385
387
|
Read speaker diarization TextGrid from file.
|
|
386
388
|
"""
|
|
387
|
-
from
|
|
389
|
+
from lattifai_core.diarization import DiarizationOutput
|
|
388
390
|
|
|
389
|
-
self.speaker_diarization =
|
|
391
|
+
self.speaker_diarization = DiarizationOutput.read(path)
|
|
390
392
|
return self.speaker_diarization
|
|
391
393
|
|
|
392
394
|
def write_speaker_diarization(
|
|
@@ -399,9 +401,7 @@ class Caption:
|
|
|
399
401
|
if not self.speaker_diarization:
|
|
400
402
|
raise ValueError("No speaker diarization data to write.")
|
|
401
403
|
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
write_to_file(self.speaker_diarization, path, format="long")
|
|
404
|
+
self.speaker_diarization.write(path)
|
|
405
405
|
return path
|
|
406
406
|
|
|
407
407
|
@staticmethod
|
|
@@ -451,7 +451,10 @@ class Caption:
|
|
|
451
451
|
else:
|
|
452
452
|
if include_speaker_in_text and sup.speaker is not None:
|
|
453
453
|
# Use [SPEAKER]: format for consistency with parsing
|
|
454
|
-
|
|
454
|
+
if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
|
|
455
|
+
text = f"[{sup.speaker}]: {sup.text}"
|
|
456
|
+
else:
|
|
457
|
+
text = f"{sup.text}"
|
|
455
458
|
else:
|
|
456
459
|
text = sup.text
|
|
457
460
|
f.write(f"[{sup.start:.2f}-{sup.end:.2f}] {text}\n")
|
|
@@ -471,7 +474,12 @@ class Caption:
|
|
|
471
474
|
tg = TextGrid()
|
|
472
475
|
supervisions, words, scores = [], [], {"utterances": [], "words": []}
|
|
473
476
|
for supervision in sorted(alignments, key=lambda x: x.start):
|
|
474
|
-
|
|
477
|
+
# Respect `original_speaker` custom flag: default to include speaker when missing
|
|
478
|
+
if (
|
|
479
|
+
include_speaker_in_text
|
|
480
|
+
and supervision.speaker is not None
|
|
481
|
+
and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
|
|
482
|
+
):
|
|
475
483
|
text = f"{supervision.speaker} {supervision.text}"
|
|
476
484
|
else:
|
|
477
485
|
text = supervision.text
|
|
@@ -505,6 +513,8 @@ class Caption:
|
|
|
505
513
|
cls._write_csv(alignments, output_path, include_speaker_in_text)
|
|
506
514
|
elif str(output_path)[-4:].lower() == ".aud":
|
|
507
515
|
cls._write_aud(alignments, output_path, include_speaker_in_text)
|
|
516
|
+
elif str(output_path)[-4:].lower() == ".sbv":
|
|
517
|
+
cls._write_sbv(alignments, output_path, include_speaker_in_text)
|
|
508
518
|
else:
|
|
509
519
|
import pysubs2
|
|
510
520
|
|
|
@@ -524,7 +534,10 @@ class Caption:
|
|
|
524
534
|
)
|
|
525
535
|
else:
|
|
526
536
|
if include_speaker_in_text and sup.speaker is not None:
|
|
527
|
-
|
|
537
|
+
if not sup.has_custom("original_speaker") or sup.custom["original_speaker"]:
|
|
538
|
+
text = f"{sup.speaker} {sup.text}"
|
|
539
|
+
else:
|
|
540
|
+
text = f"{sup.text}"
|
|
528
541
|
else:
|
|
529
542
|
text = sup.text
|
|
530
543
|
subs.append(
|
|
@@ -535,7 +548,14 @@ class Caption:
|
|
|
535
548
|
name=sup.speaker or "",
|
|
536
549
|
)
|
|
537
550
|
)
|
|
538
|
-
|
|
551
|
+
|
|
552
|
+
# MicroDVD format requires framerate to be specified
|
|
553
|
+
output_ext = str(output_path).lower().split(".")[-1]
|
|
554
|
+
if output_ext == "sub":
|
|
555
|
+
# Default to 25 fps for MicroDVD format if not specified
|
|
556
|
+
subs.save(output_path, fps=25.0)
|
|
557
|
+
else:
|
|
558
|
+
subs.save(output_path)
|
|
539
559
|
|
|
540
560
|
return output_path
|
|
541
561
|
|
|
@@ -821,7 +841,16 @@ class Caption:
|
|
|
821
841
|
if cls._is_youtube_vtt_with_word_timestamps(content):
|
|
822
842
|
return cls._parse_youtube_vtt_with_word_timestamps(content, normalize_text)
|
|
823
843
|
|
|
824
|
-
|
|
844
|
+
# Match Gemini format: explicit format, or files ending with Gemini.md/Gemini3.md,
|
|
845
|
+
# or files containing "gemini" in the name with .md extension
|
|
846
|
+
caption_str = str(caption).lower()
|
|
847
|
+
is_gemini_format = (
|
|
848
|
+
format == "gemini"
|
|
849
|
+
or str(caption).endswith("Gemini.md")
|
|
850
|
+
or str(caption).endswith("Gemini3.md")
|
|
851
|
+
or ("gemini" in caption_str and caption_str.endswith(".md"))
|
|
852
|
+
)
|
|
853
|
+
if is_gemini_format:
|
|
825
854
|
from .gemini_reader import GeminiReader
|
|
826
855
|
|
|
827
856
|
supervisions = GeminiReader.extract_for_alignment(caption)
|
|
@@ -850,6 +879,8 @@ class Caption:
|
|
|
850
879
|
supervisions = cls._parse_csv(caption, normalize_text)
|
|
851
880
|
elif format == "aud" or str(caption)[-4:].lower() == ".aud":
|
|
852
881
|
supervisions = cls._parse_aud(caption, normalize_text)
|
|
882
|
+
elif format == "sbv" or str(caption)[-4:].lower() == ".sbv":
|
|
883
|
+
supervisions = cls._parse_sbv(caption, normalize_text)
|
|
853
884
|
elif format == "txt" or (format == "auto" and str(caption)[-4:].lower() == ".txt"):
|
|
854
885
|
if not Path(str(caption)).exists(): # str
|
|
855
886
|
lines = [line.strip() for line in str(caption).split("\n")]
|
|
@@ -1113,6 +1144,101 @@ class Caption:
|
|
|
1113
1144
|
|
|
1114
1145
|
return supervisions
|
|
1115
1146
|
|
|
1147
|
+
@classmethod
def _parse_sbv(cls, caption: Pathlike, normalize_text: Optional[bool] = False) -> List[Supervision]:
    """
    Parse SubViewer (SBV) format caption file.

    Format:
        0:00:00.000,0:00:02.000
        Text line 1

        0:00:02.000,0:00:04.000
        Text line 2

    Entries are separated by blank lines (lines that are empty or contain
    only whitespace). Multi-line cue text is joined with single spaces.

    Args:
        caption: Caption file path
        normalize_text: Whether to normalize text

    Returns:
        List of Supervision objects. Entries without a text line, with a
        malformed timestamp line, or whose end time precedes the start time
        are skipped rather than raising.

    Raises:
        FileNotFoundError: If the caption file does not exist.
    """

    def _to_seconds(stamp: str) -> float:
        """Convert an ``H:MM:SS[.fff]`` timestamp into seconds."""
        fields = stamp.strip().split(":")
        if len(fields) != 3:
            raise ValueError(f"Invalid SBV timestamp: {stamp!r}")
        hours, minutes, seconds = fields
        whole, _, fraction = seconds.partition(".")
        total = int(hours) * 3600 + int(minutes) * 60 + int(whole)
        if fraction:
            if not fraction.isdigit():
                raise ValueError(f"Invalid SBV timestamp: {stamp!r}")
            # Scale by the number of digits actually present so that ".5"
            # means 500 ms, not 5 ms. For the usual three-digit form this is
            # the same division by 1000 as before.
            total += int(fraction) / 10 ** len(fraction)
        return total

    caption_path = Path(str(caption))
    if not caption_path.exists():
        raise FileNotFoundError(f"Caption file not found: {caption}")

    # "utf-8-sig" decodes plain UTF-8 unchanged but strips a leading BOM,
    # which would otherwise make the first timestamp line unparseable.
    with open(caption_path, "r", encoding="utf-8-sig") as f:
        content = f.read()

    supervisions = []

    # Split on blank lines; tolerate separator lines that contain only
    # whitespace so neighbouring entries are not merged into one cue.
    for entry in re.split(r"\n\s*\n", content.strip()):
        lines = entry.strip().split("\n")
        if len(lines) < 2:
            continue

        # First line: timestamp (H:MM:SS.mmm,H:MM:SS.mmm); remaining lines: text
        timestamp_line = lines[0].strip()
        if "," not in timestamp_line:
            continue

        try:
            start_str, end_str = timestamp_line.split(",", 1)
            start = _to_seconds(start_str)
            end = _to_seconds(end_str)

            # Parse text and speaker
            text = " ".join(lines[1:]).strip()
            speaker, text = parse_speaker_text(text)

            if normalize_text:
                text = normalize_text_fn(text)

            duration = end - start
            if duration < 0:
                continue

            supervisions.append(
                Supervision(
                    text=text,
                    start=start,
                    duration=duration,
                    speaker=speaker,
                )
            )
        except (ValueError, IndexError):
            # Skip malformed entries (best-effort parsing)
            continue

    return supervisions
|
|
1241
|
+
|
|
1116
1242
|
@classmethod
|
|
1117
1243
|
def _write_tsv(
|
|
1118
1244
|
cls,
|
|
@@ -1136,7 +1262,11 @@ class Caption:
|
|
|
1136
1262
|
if include_speaker_in_text:
|
|
1137
1263
|
file.write("speaker\tstart\tend\ttext\n")
|
|
1138
1264
|
for supervision in alignments:
|
|
1139
|
-
|
|
1265
|
+
# Respect `original_speaker` custom flag: default to True when missing
|
|
1266
|
+
include_speaker = supervision.speaker and (
|
|
1267
|
+
not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
|
|
1268
|
+
)
|
|
1269
|
+
speaker = supervision.speaker if include_speaker else ""
|
|
1140
1270
|
start_ms = round(1000 * supervision.start)
|
|
1141
1271
|
end_ms = round(1000 * supervision.end)
|
|
1142
1272
|
text = supervision.text.strip().replace("\t", " ")
|
|
@@ -1174,7 +1304,10 @@ class Caption:
|
|
|
1174
1304
|
writer = csv.writer(file)
|
|
1175
1305
|
writer.writerow(["speaker", "start", "end", "text"])
|
|
1176
1306
|
for supervision in alignments:
|
|
1177
|
-
|
|
1307
|
+
include_speaker = supervision.speaker and (
|
|
1308
|
+
not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"]
|
|
1309
|
+
)
|
|
1310
|
+
speaker = supervision.speaker if include_speaker else ""
|
|
1178
1311
|
start_ms = round(1000 * supervision.start)
|
|
1179
1312
|
end_ms = round(1000 * supervision.end)
|
|
1180
1313
|
text = supervision.text.strip()
|
|
@@ -1212,11 +1345,72 @@ class Caption:
|
|
|
1212
1345
|
end = supervision.end
|
|
1213
1346
|
text = supervision.text.strip().replace("\t", " ")
|
|
1214
1347
|
|
|
1215
|
-
|
|
1348
|
+
# Respect `original_speaker` custom flag when adding speaker prefix
|
|
1349
|
+
if (
|
|
1350
|
+
include_speaker_in_text
|
|
1351
|
+
and supervision.speaker
|
|
1352
|
+
and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
|
|
1353
|
+
):
|
|
1216
1354
|
text = f"[[{supervision.speaker}]]{text}"
|
|
1217
1355
|
|
|
1218
1356
|
file.write(f"{start}\t{end}\t{text}\n")
|
|
1219
1357
|
|
|
1358
|
+
@classmethod
def _write_sbv(
    cls,
    alignments: List[Supervision],
    output_path: Pathlike,
    include_speaker_in_text: bool = True,
) -> None:
    """
    Write caption to SubViewer (SBV) format.

    Format:
        0:00:00.000,0:00:02.000
        Text line 1

        0:00:02.000,0:00:04.000
        Text line 2

    Args:
        alignments: List of supervision segments to write
        output_path: Path to output SBV file
        include_speaker_in_text: Whether to include speaker in text. The
            speaker prefix is omitted for a segment whose ``original_speaker``
            custom flag is present and falsy.
    """

    def _format_timestamp(seconds: float) -> str:
        """Format seconds as ``H:MM:SS.mmm``, rounded to the nearest millisecond."""
        # Round to whole milliseconds first. Truncating the fractional part
        # (``int((t % 1) * 1000)``) turns 1.001 into ".000" because of binary
        # float error and never carries into the seconds field.
        total_ms = int(round(1000 * seconds))
        hours, remainder = divmod(total_ms, 3_600_000)
        minutes, remainder = divmod(remainder, 60_000)
        secs, millis = divmod(remainder, 1000)
        return f"{hours}:{minutes:02d}:{secs:02d}.{millis:03d}"

    with open(output_path, "w", encoding="utf-8") as file:
        for i, supervision in enumerate(alignments):
            start_time = _format_timestamp(supervision.start)
            end_time = _format_timestamp(supervision.end)

            # Write timestamp line
            file.write(f"{start_time},{end_time}\n")

            # Write text (with optional speaker). Respect `original_speaker` custom flag.
            text = supervision.text.strip()
            if (
                include_speaker_in_text
                and supervision.speaker
                and (not supervision.has_custom("original_speaker") or supervision.custom["original_speaker"])
            ):
                text = f"{supervision.speaker}: {text}"

            file.write(f"{text}\n")

            # Add blank line between entries (except after last one)
            if i < len(alignments) - 1:
                file.write("\n")
|
|
1413
|
+
|
|
1220
1414
|
@classmethod
|
|
1221
1415
|
def _parse_caption(
|
|
1222
1416
|
cls, caption: Pathlike, format: Optional[OutputCaptionFormat], normalize_text: Optional[bool] = False
|
lattifai/cli/__init__.py
CHANGED
|
@@ -5,12 +5,14 @@ import nemo_run as run # noqa: F401
|
|
|
5
5
|
# Import and re-export entrypoints at package level so NeMo Run can find them
|
|
6
6
|
from lattifai.cli.alignment import align
|
|
7
7
|
from lattifai.cli.caption import convert
|
|
8
|
+
from lattifai.cli.diarization import diarize
|
|
8
9
|
from lattifai.cli.transcribe import transcribe, transcribe_align
|
|
9
10
|
from lattifai.cli.youtube import youtube
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
13
|
"align",
|
|
13
14
|
"convert",
|
|
15
|
+
"diarize",
|
|
14
16
|
"transcribe",
|
|
15
17
|
"transcribe_align",
|
|
16
18
|
"youtube",
|
lattifai/cli/alignment.py
CHANGED
|
@@ -81,7 +81,7 @@ def align(
|
|
|
81
81
|
caption.word_level=true \\
|
|
82
82
|
caption.normalize_text=true \\
|
|
83
83
|
alignment.device=mps \\
|
|
84
|
-
alignment.model_name=
|
|
84
|
+
alignment.model_name=LattifAI/Lattice-1-Alpha
|
|
85
85
|
"""
|
|
86
86
|
media_config = media or MediaConfig()
|
|
87
87
|
|
|
@@ -142,6 +142,7 @@ def align(
|
|
|
142
142
|
output_caption_path=caption_config.output_path,
|
|
143
143
|
split_sentence=caption_config.split_sentence,
|
|
144
144
|
channel_selector=media_config.channel_selector,
|
|
145
|
+
streaming_chunk_secs=media_config.streaming_chunk_secs,
|
|
145
146
|
)
|
|
146
147
|
|
|
147
148
|
|
lattifai/cli/app_installer.py
CHANGED
|
@@ -5,6 +5,8 @@ import subprocess
|
|
|
5
5
|
import sys
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
+
from lattifai.utils import safe_print
|
|
9
|
+
|
|
8
10
|
|
|
9
11
|
def check_command_exists(cmd: str) -> bool:
|
|
10
12
|
"""Check if a command exists in PATH."""
|
|
@@ -19,17 +21,17 @@ def install_nodejs():
|
|
|
19
21
|
"""Install Node.js based on the operating system."""
|
|
20
22
|
system = platform.system().lower()
|
|
21
23
|
|
|
22
|
-
|
|
24
|
+
safe_print("📦 Node.js not found. Installing Node.js...\n")
|
|
23
25
|
|
|
24
26
|
try:
|
|
25
27
|
if system == "darwin": # macOS
|
|
26
28
|
# Check if Homebrew is installed
|
|
27
29
|
if check_command_exists("brew"):
|
|
28
|
-
|
|
30
|
+
safe_print("🍺 Using Homebrew to install Node.js...")
|
|
29
31
|
subprocess.run(["brew", "install", "node"], check=True)
|
|
30
|
-
|
|
32
|
+
safe_print("✓ Node.js installed via Homebrew\n")
|
|
31
33
|
else:
|
|
32
|
-
|
|
34
|
+
safe_print("❌ Homebrew not found.")
|
|
33
35
|
print(" Please install Homebrew first:")
|
|
34
36
|
print(
|
|
35
37
|
' /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"'
|
|
@@ -40,46 +42,46 @@ def install_nodejs():
|
|
|
40
42
|
elif system == "linux":
|
|
41
43
|
# Try common package managers
|
|
42
44
|
if check_command_exists("apt"):
|
|
43
|
-
|
|
45
|
+
safe_print("🐧 Using apt to install Node.js...")
|
|
44
46
|
subprocess.run(["sudo", "apt", "update"], check=True)
|
|
45
47
|
subprocess.run(["sudo", "apt", "install", "-y", "nodejs", "npm"], check=True)
|
|
46
|
-
|
|
48
|
+
safe_print("✓ Node.js installed via apt\n")
|
|
47
49
|
elif check_command_exists("yum"):
|
|
48
|
-
|
|
50
|
+
safe_print("🐧 Using yum to install Node.js...")
|
|
49
51
|
subprocess.run(["sudo", "yum", "install", "-y", "nodejs", "npm"], check=True)
|
|
50
|
-
|
|
52
|
+
safe_print("✓ Node.js installed via yum\n")
|
|
51
53
|
elif check_command_exists("dnf"):
|
|
52
|
-
|
|
54
|
+
safe_print("🐧 Using dnf to install Node.js...")
|
|
53
55
|
subprocess.run(["sudo", "dnf", "install", "-y", "nodejs", "npm"], check=True)
|
|
54
|
-
|
|
56
|
+
safe_print("✓ Node.js installed via dnf\n")
|
|
55
57
|
elif check_command_exists("pacman"):
|
|
56
|
-
|
|
58
|
+
safe_print("🐧 Using pacman to install Node.js...")
|
|
57
59
|
subprocess.run(["sudo", "pacman", "-S", "--noconfirm", "nodejs", "npm"], check=True)
|
|
58
|
-
|
|
60
|
+
safe_print("✓ Node.js installed via pacman\n")
|
|
59
61
|
else:
|
|
60
|
-
|
|
62
|
+
safe_print("❌ No supported package manager found (apt/yum/dnf/pacman).")
|
|
61
63
|
print(" Please install Node.js manually from: https://nodejs.org/")
|
|
62
64
|
sys.exit(1)
|
|
63
65
|
|
|
64
66
|
elif system == "windows":
|
|
65
|
-
|
|
67
|
+
safe_print("❌ Automatic installation on Windows is not supported.")
|
|
66
68
|
print(" Please download and install Node.js from: https://nodejs.org/")
|
|
67
69
|
print(" Then run this command again.")
|
|
68
70
|
sys.exit(1)
|
|
69
71
|
|
|
70
72
|
else:
|
|
71
|
-
|
|
73
|
+
safe_print(f"❌ Unsupported operating system: {system}")
|
|
72
74
|
print(" Please install Node.js manually from: https://nodejs.org/")
|
|
73
75
|
sys.exit(1)
|
|
74
76
|
|
|
75
77
|
# Verify installation
|
|
76
78
|
if not check_command_exists("npm"):
|
|
77
|
-
|
|
79
|
+
safe_print("❌ Node.js installation verification failed.")
|
|
78
80
|
print(" Please restart your terminal and try again.")
|
|
79
81
|
sys.exit(1)
|
|
80
82
|
|
|
81
83
|
except subprocess.CalledProcessError as e:
|
|
82
|
-
|
|
84
|
+
safe_print(f"\n❌ Error during Node.js installation: {e}")
|
|
83
85
|
print(" Please install Node.js manually from: https://nodejs.org/")
|
|
84
86
|
sys.exit(1)
|
|
85
87
|
|
|
@@ -90,49 +92,49 @@ def main():
|
|
|
90
92
|
app_dir = Path(__file__).parent.parent.parent.parent / "app"
|
|
91
93
|
|
|
92
94
|
if not app_dir.exists():
|
|
93
|
-
|
|
95
|
+
safe_print(f"❌ Error: app directory not found at {app_dir}")
|
|
94
96
|
print(" Make sure you're in the lattifai-python repository.")
|
|
95
97
|
sys.exit(1)
|
|
96
98
|
|
|
97
|
-
|
|
99
|
+
safe_print("🚀 Installing lai-app (LattifAI Web Application)...\n")
|
|
98
100
|
|
|
99
101
|
# Check if npm is installed, if not, install Node.js
|
|
100
102
|
if not check_command_exists("npm"):
|
|
101
103
|
install_nodejs()
|
|
102
104
|
else:
|
|
103
105
|
npm_version = subprocess.run(["npm", "--version"], capture_output=True, text=True, check=True).stdout.strip()
|
|
104
|
-
|
|
106
|
+
safe_print(f"✓ npm is already installed (v{npm_version})\n")
|
|
105
107
|
|
|
106
108
|
# Change to app directory and run installation
|
|
107
109
|
try:
|
|
108
|
-
|
|
110
|
+
safe_print(f"📁 Working directory: {app_dir}\n")
|
|
109
111
|
|
|
110
112
|
# Install dependencies
|
|
111
|
-
|
|
113
|
+
safe_print("📦 Installing dependencies...")
|
|
112
114
|
subprocess.run(["npm", "install"], cwd=app_dir, check=True)
|
|
113
|
-
|
|
115
|
+
safe_print("✓ Dependencies installed\n")
|
|
114
116
|
|
|
115
117
|
# Build the application
|
|
116
|
-
|
|
118
|
+
safe_print("🔨 Building application...")
|
|
117
119
|
subprocess.run(["npm", "run", "build"], cwd=app_dir, check=True)
|
|
118
|
-
|
|
120
|
+
safe_print("✓ Application built\n")
|
|
119
121
|
|
|
120
122
|
# Link globally
|
|
121
|
-
|
|
123
|
+
safe_print("🔗 Linking lai-app command globally...")
|
|
122
124
|
subprocess.run(["npm", "link"], cwd=app_dir, check=True)
|
|
123
|
-
|
|
125
|
+
safe_print("✓ lai-app command linked globally\n")
|
|
124
126
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
127
|
+
safe_print("=" * 60)
|
|
128
|
+
safe_print("✅ lai-app installed successfully!")
|
|
129
|
+
safe_print("=" * 60)
|
|
130
|
+
safe_print("\n🎉 You can now run:")
|
|
129
131
|
print(" lai-app # Start the web application")
|
|
130
132
|
print(" lai-app --help # Show help")
|
|
131
133
|
print(" lai-app --port 8080 # Use custom port")
|
|
132
|
-
|
|
134
|
+
safe_print("\n📖 For more information, see app/CLI_USAGE.md\n")
|
|
133
135
|
|
|
134
136
|
except subprocess.CalledProcessError as e:
|
|
135
|
-
|
|
137
|
+
safe_print(f"\n❌ Error during installation: {e}")
|
|
136
138
|
sys.exit(1)
|
|
137
139
|
|
|
138
140
|
|
lattifai/cli/caption.py
CHANGED
|
@@ -7,13 +7,14 @@ from lhotse.utils import Pathlike
|
|
|
7
7
|
from typing_extensions import Annotated
|
|
8
8
|
|
|
9
9
|
from lattifai.config import CaptionConfig
|
|
10
|
+
from lattifai.utils import safe_print
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
@run.cli.entrypoint(name="convert", namespace="caption")
|
|
13
14
|
def convert(
|
|
14
15
|
input_path: Pathlike,
|
|
15
16
|
output_path: Pathlike,
|
|
16
|
-
include_speaker_in_text: bool =
|
|
17
|
+
include_speaker_in_text: bool = False,
|
|
17
18
|
normalize_text: bool = False,
|
|
18
19
|
):
|
|
19
20
|
"""
|
|
@@ -55,7 +56,7 @@ def convert(
|
|
|
55
56
|
caption = Caption.read(input_path, normalize_text=normalize_text)
|
|
56
57
|
caption.write(output_path, include_speaker_in_text=include_speaker_in_text)
|
|
57
58
|
|
|
58
|
-
|
|
59
|
+
safe_print(f"✅ Converted {input_path} -> {output_path}")
|
|
59
60
|
return output_path
|
|
60
61
|
|
|
61
62
|
|
|
@@ -63,7 +64,6 @@ def convert(
|
|
|
63
64
|
def normalize(
|
|
64
65
|
input_path: Pathlike,
|
|
65
66
|
output_path: Pathlike,
|
|
66
|
-
caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
|
|
67
67
|
):
|
|
68
68
|
"""
|
|
69
69
|
Normalize caption text by cleaning HTML entities and whitespace.
|
|
@@ -81,9 +81,6 @@ def normalize(
|
|
|
81
81
|
Args:
|
|
82
82
|
input_path: Path to input caption file to normalize
|
|
83
83
|
output_path: Path to output caption file (defaults to overwriting input file)
|
|
84
|
-
caption: Caption configuration for text normalization.
|
|
85
|
-
Fields: input_format, output_format, normalize_text (automatically enabled),
|
|
86
|
-
encoding
|
|
87
84
|
|
|
88
85
|
Examples:
|
|
89
86
|
# Normalize and save to new file (positional arguments)
|
|
@@ -92,13 +89,9 @@ def normalize(
|
|
|
92
89
|
# Normalize with format conversion
|
|
93
90
|
lai caption normalize input.vtt output.srt
|
|
94
91
|
|
|
95
|
-
# Normalize with custom caption config
|
|
96
|
-
lai caption normalize input.srt output.srt \\
|
|
97
|
-
caption.encoding=utf-8
|
|
98
|
-
|
|
99
92
|
# Using keyword arguments (traditional syntax)
|
|
100
|
-
lai caption normalize
|
|
101
|
-
input_path=input.srt
|
|
93
|
+
lai caption normalize \\
|
|
94
|
+
input_path=input.srt \\
|
|
102
95
|
output_path=output.srt
|
|
103
96
|
"""
|
|
104
97
|
from pathlib import Path
|
|
@@ -112,9 +105,9 @@ def normalize(
|
|
|
112
105
|
caption_obj.write(output_path, include_speaker_in_text=True)
|
|
113
106
|
|
|
114
107
|
if output_path == input_path:
|
|
115
|
-
|
|
108
|
+
safe_print(f"✅ Normalized {input_path} (in-place)")
|
|
116
109
|
else:
|
|
117
|
-
|
|
110
|
+
safe_print(f"✅ Normalized {input_path} -> {output_path}")
|
|
118
111
|
|
|
119
112
|
return output_path
|
|
120
113
|
|
|
@@ -124,7 +117,6 @@ def shift(
|
|
|
124
117
|
input_path: Pathlike,
|
|
125
118
|
output_path: Pathlike,
|
|
126
119
|
seconds: float,
|
|
127
|
-
caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
|
|
128
120
|
):
|
|
129
121
|
"""
|
|
130
122
|
Shift caption timestamps by a specified number of seconds.
|
|
@@ -140,8 +132,6 @@ def shift(
|
|
|
140
132
|
output_path: Path to output caption file (can be same as input for in-place modification)
|
|
141
133
|
seconds: Number of seconds to shift timestamps. Positive values delay captions,
|
|
142
134
|
negative values advance them earlier.
|
|
143
|
-
caption: Caption configuration for reading/writing.
|
|
144
|
-
Fields: input_format, output_format, encoding
|
|
145
135
|
|
|
146
136
|
Examples:
|
|
147
137
|
# Delay captions by 2 seconds (positional arguments)
|
|
@@ -181,9 +171,9 @@ def shift(
|
|
|
181
171
|
direction = f"advanced by {abs(seconds)}s"
|
|
182
172
|
|
|
183
173
|
if output_path == input_path:
|
|
184
|
-
|
|
174
|
+
safe_print(f"✅ Shifted timestamps {direction} in {input_path} (in-place)")
|
|
185
175
|
else:
|
|
186
|
-
|
|
176
|
+
safe_print(f"✅ Shifted timestamps {direction}: {input_path} -> {output_path}")
|
|
187
177
|
|
|
188
178
|
return output_path
|
|
189
179
|
|