lattifai 0.4.5__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,9 @@
  """Reader for YouTube transcript files with speaker labels and timestamps."""

  import re
- from dataclasses import dataclass, field
+ from dataclasses import dataclass
  from pathlib import Path
- from typing import List, Optional, Tuple
+ from typing import List, Optional

  from lhotse.utils import Pathlike

@@ -18,7 +18,7 @@ class GeminiSegment:
  timestamp: Optional[float] = None
  speaker: Optional[str] = None
  section: Optional[str] = None
- segment_type: str = 'dialogue' # 'dialogue', 'event', or 'section_header'
+ segment_type: str = "dialogue" # 'dialogue', 'event', or 'section_header'
  line_number: int = 0

  @property
@@ -31,15 +31,15 @@ class GeminiReader:
  """Parser for YouTube transcript format with speaker labels and timestamps."""

  # Regex patterns for parsing (supports both [HH:MM:SS] and [MM:SS] formats)
- TIMESTAMP_PATTERN = re.compile(r'\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]')
- SECTION_HEADER_PATTERN = re.compile(r'^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$')
- SPEAKER_PATTERN = re.compile(r'^\*\*(.+?[::])\*\*\s*(.+)$')
- EVENT_PATTERN = re.compile(r'^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$')
- INLINE_TIMESTAMP_PATTERN = re.compile(r'^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$')
+ TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
+ SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
+ SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
+ EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+ INLINE_TIMESTAMP_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")

  # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
- YOUTUBE_SECTION_PATTERN = re.compile(r'^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$')
- YOUTUBE_INLINE_PATTERN = re.compile(r'^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$')
+ YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
+ YOUTUBE_INLINE_PATTERN = re.compile(r"^(.+?)\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]$")

  @classmethod
  def parse_timestamp(cls, *args) -> float:
@@ -61,7 +61,7 @@ class GeminiReader:
  # Direct seconds (from YouTube &t= parameter)
  return int(args[0])
  else:
- raise ValueError(f'Invalid timestamp args: {args}')
+ raise ValueError(f"Invalid timestamp args: {args}")

  @classmethod
  def read(
@@ -82,13 +82,13 @@ class GeminiReader:
  """
  transcript_path = Path(transcript_path).expanduser().resolve()
  if not transcript_path.exists():
- raise FileNotFoundError(f'Transcript file not found: {transcript_path}')
+ raise FileNotFoundError(f"Transcript file not found: {transcript_path}")

  segments: List[GeminiSegment] = []
  current_section = None
  current_speaker = None

- with open(transcript_path, 'r', encoding='utf-8') as f:
+ with open(transcript_path, "r", encoding="utf-8") as f:
  lines = f.readlines()

  for line_num, line in enumerate(lines, start=1):
@@ -97,9 +97,9 @@ class GeminiReader:
  continue

  # Skip table of contents
- if line.startswith('* ['):
+ if line.startswith("* ["):
  continue
- if line.startswith('## Table of Contents'):
+ if line.startswith("## Table of Contents"):
  continue

  # Parse section headers
@@ -114,7 +114,7 @@ class GeminiReader:
  text=section_title.strip(),
  timestamp=timestamp,
  section=current_section,
- segment_type='section_header',
+ segment_type="section_header",
  line_number=line_num,
  )
  )
@@ -133,7 +133,7 @@ class GeminiReader:
  text=section_title.strip(),
  timestamp=timestamp,
  section=current_section,
- segment_type='section_header',
+ segment_type="section_header",
  line_number=line_num,
  )
  )
@@ -158,7 +158,7 @@ class GeminiReader:
  text=event_text.strip(),
  timestamp=timestamp,
  section=current_section,
- segment_type='event',
+ segment_type="event",
  line_number=line_num,
  )
  )
@@ -200,7 +200,7 @@ class GeminiReader:
  timestamp=timestamp,
  speaker=current_speaker,
  section=current_section,
- segment_type='dialogue',
+ segment_type="dialogue",
  line_number=line_num,
  )
  )
@@ -228,7 +228,7 @@ class GeminiReader:
  timestamp=timestamp,
  speaker=current_speaker,
  section=current_section,
- segment_type='dialogue',
+ segment_type="dialogue",
  line_number=line_num,
  )
  )
@@ -246,14 +246,14 @@ class GeminiReader:
  timestamp=timestamp,
  speaker=current_speaker,
  section=current_section,
- segment_type='dialogue',
+ segment_type="dialogue",
  line_number=line_num,
  )
  )
  continue

  # Skip markdown headers and other formatting
- if line.startswith('#'):
+ if line.startswith("#"):
  continue

  return segments
@@ -283,10 +283,10 @@ class GeminiReader:
  segments = cls.read(transcript_path, include_events=False, include_sections=False)

  # Filter to only dialogue segments with timestamps
- dialogue_segments = [s for s in segments if s.segment_type == 'dialogue' and s.timestamp is not None]
+ dialogue_segments = [s for s in segments if s.segment_type == "dialogue" and s.timestamp is not None]

  if not dialogue_segments:
- raise ValueError(f'No dialogue segments with timestamps found in {transcript_path}')
+ raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")

  # Sort by timestamp
  dialogue_segments.sort(key=lambda x: x.timestamp)
@@ -308,7 +308,7 @@ class GeminiReader:
  text=segment.text,
  start=segment.timestamp,
  duration=max(duration, min_duration),
- id=f'segment_{i:05d}',
+ id=f"segment_{i:05d}",
  speaker=segment.speaker,
  )
  )
@@ -337,13 +337,13 @@ class GeminiReader:
  else:
  # Different speaker or gap too large, save previous segment
  if current_texts:
- merged_text = ' '.join(current_texts)
+ merged_text = " ".join(current_texts)
  merged.append(
  Supervision(
  text=merged_text,
  start=current_start,
  duration=last_end_time - current_start,
- id=f'merged_{len(merged):05d}',
+ id=f"merged_{len(merged):05d}",
  )
  )
  current_speaker = segment.speaker
@@ -353,13 +353,13 @@ class GeminiReader:

  # Add final segment
  if current_texts:
- merged_text = ' '.join(current_texts)
+ merged_text = " ".join(current_texts)
  merged.append(
  Supervision(
  text=merged_text,
  start=current_start,
  duration=last_end_time - current_start,
- id=f'merged_{len(merged):05d}',
+ id=f"merged_{len(merged):05d}",
  )
  )

@@ -368,4 +368,4 @@ class GeminiReader:
  return supervisions


- __all__ = ['GeminiReader', 'GeminiSegment']
+ __all__ = ["GeminiReader", "GeminiSegment"]
@@ -19,7 +19,7 @@ class GeminiWriter:
  hours = int(seconds // 3600)
  minutes = int((seconds % 3600) // 60)
  secs = int(seconds % 60)
- return f'[{hours:02d}:{minutes:02d}:{secs:02d}]'
+ return f"[{hours:02d}:{minutes:02d}:{secs:02d}]"

  @classmethod
  def update_timestamps(
@@ -44,7 +44,7 @@ class GeminiWriter:
  output_path = Path(output_path)

  # Read original file
- with open(original_path, 'r', encoding='utf-8') as f:
+ with open(original_path, "r", encoding="utf-8") as f:
  lines = f.readlines()

  # Parse original segments to get line numbers
@@ -66,7 +66,7 @@ class GeminiWriter:

  # Write updated content
  output_path.parent.mkdir(parents=True, exist_ok=True)
- with open(output_path, 'w', encoding='utf-8') as f:
+ with open(output_path, "w", encoding="utf-8") as f:
  f.writelines(updated_lines)

  return output_path
@@ -83,7 +83,7 @@ class GeminiWriter:
  mapping = {}

  # Create a simple text-based matching
- dialogue_segments = [s for s in original_segments if s.segment_type == 'dialogue']
+ dialogue_segments = [s for s in original_segments if s.segment_type == "dialogue"]

  # Try to match based on text content
  for aligned_sup in aligned_supervisions:
@@ -120,7 +120,7 @@ class GeminiWriter:

  # Replace timestamp patterns
  # Pattern 1: [HH:MM:SS] at the end or in brackets
- line = re.sub(r'\[\d{2}:\d{2}:\d{2}\]', new_ts_str, line)
+ line = re.sub(r"\[\d{2}:\d{2}:\d{2}\]", new_ts_str, line)

  return line

@@ -146,28 +146,28 @@ class GeminiWriter:
  output_path = Path(output_path)
  output_path.parent.mkdir(parents=True, exist_ok=True)

- with open(output_path, 'w', encoding='utf-8') as f:
- f.write('# Aligned Transcript\n\n')
+ with open(output_path, "w", encoding="utf-8") as f:
+ f.write("# Aligned Transcript\n\n")

  for i, sup in enumerate(aligned_supervisions):
  # Write segment with timestamp
  start_ts = cls.format_timestamp(sup.start)
- f.write(f'{start_ts} {sup.text}\n')
+ f.write(f"{start_ts} {sup.text}\n")

  # Optionally write word-level timestamps
- if include_word_timestamps and hasattr(sup, 'alignment') and sup.alignment:
- if 'word' in sup.alignment:
- f.write(' Words: ')
+ if include_word_timestamps and hasattr(sup, "alignment") and sup.alignment:
+ if "word" in sup.alignment:
+ f.write(" Words: ")
  word_parts = []
- for word_info in sup.alignment['word']:
- word_ts = cls.format_timestamp(word_info['start'])
+ for word_info in sup.alignment["word"]:
+ word_ts = cls.format_timestamp(word_info["start"])
  word_parts.append(f'{word_info["symbol"]}{word_ts}')
- f.write(' '.join(word_parts))
- f.write('\n')
+ f.write(" ".join(word_parts))
+ f.write("\n")

- f.write('\n')
+ f.write("\n")

  return output_path


- __all__ = ['GeminiWriter']
+ __all__ = ["GeminiWriter"]
lattifai/io/reader.py CHANGED
@@ -7,7 +7,7 @@ from lhotse.utils import Pathlike
  from .supervision import Supervision
  from .text_parser import parse_speaker_text

- SubtitleFormat = Literal['txt', 'srt', 'vtt', 'ass', 'auto']
+ SubtitleFormat = Literal["txt", "srt", "vtt", "ass", "auto"]


  class SubtitleReader(ABCMeta):
@@ -27,28 +27,27 @@ class SubtitleReader(ABCMeta):
  Parsed text in Lhotse Cut
  """
  if not format and Path(str(subtitle)).exists():
- format = Path(str(subtitle)).suffix.lstrip('.').lower()
+ format = Path(str(subtitle)).suffix.lstrip(".").lower()
  elif format:
  format = format.lower()

- if format == 'gemini' or str(subtitle).endswith('Gemini.md'):
+ if format == "gemini" or str(subtitle).endswith("Gemini.md"):
  from .gemini_reader import GeminiReader

  supervisions = GeminiReader.extract_for_alignment(subtitle)
- elif format == 'txt' or (format == 'auto' and str(subtitle)[-4:].lower() == '.txt'):
+ elif format == "txt" or (format == "auto" and str(subtitle)[-4:].lower() == ".txt"):
  if not Path(str(subtitle)).exists(): # str
- lines = [line.strip() for line in str(subtitle).split('\n')]
+ lines = [line.strip() for line in str(subtitle).split("\n")]
  else: # file
  path_str = str(subtitle)
- with open(path_str, encoding='utf-8') as f:
+ with open(path_str, encoding="utf-8") as f:
  lines = [line.strip() for line in f.readlines()]
  supervisions = [Supervision(text=line) for line in lines if line]
  else:
  try:
  supervisions = cls._parse_subtitle(subtitle, format=format)
  except Exception as e:
- del e
- print(f"Failed to parse subtitle with format {format}, trying 'gemini' parser.")
+ print(f"Failed to parse subtitle with Format: {format}, Exception: {e}, trying 'gemini' parser.")
  from .gemini_reader import GeminiReader

  supervisions = GeminiReader.extract_for_alignment(subtitle)
@@ -61,18 +60,20 @@ class SubtitleReader(ABCMeta):

  try:
  subs: pysubs2.SSAFile = pysubs2.load(
- subtitle, encoding='utf-8', format_=format if format != 'auto' else None
+ subtitle, encoding="utf-8", format_=format if format != "auto" else None
  ) # file
  except IOError:
  try:
  subs: pysubs2.SSAFile = pysubs2.SSAFile.from_string(
- subtitle, format_=format if format != 'auto' else None
+ subtitle, format_=format if format != "auto" else None
  ) # str
- except:
- subs: pysubs2.SSAFile = pysubs2.load(subtitle, encoding='utf-8') # auto detect format
+ except Exception as e:
+ del e
+ subs: pysubs2.SSAFile = pysubs2.load(subtitle, encoding="utf-8") # auto detect format

  supervisions = []
  for event in subs.events:
+ # NOT apply text_parser.py:normalize_html_text here, to keep original text in subtitles
  speaker, text = parse_speaker_text(event.text)
  supervisions.append(
  Supervision(
@@ -24,10 +24,10 @@ class Supervision(SupervisionSegment):
  """

  text: Optional[str] = None
- id: str = ''
- recording_id: str = ''
+ id: str = ""
+ recording_id: str = ""
  start: Seconds = 0.0
  duration: Seconds = 0.0


- __all__ = ['Supervision']
+ __all__ = ["Supervision"]
@@ -3,23 +3,50 @@ import re
  from typing import Optional, Tuple

  # 来自于字幕中常见的说话人标记格式
- SPEAKER_PATTERN = re.compile(r'((?:>>|>>|>|>).*?[::])\s*(.*)')
+ SPEAKER_PATTERN = re.compile(r"((?:>>|>>|>|>).*?[::])\s*(.*)")

  # Transcriber Output Example:
  # 26:19.919 --> 26:34.921
  # [SPEAKER_01]: 越来越多的科技巨头入...
- SPEAKER_LATTIFAI = re.compile(r'(^\[SPEAKER_.*?\][::])\s*(.*)')
+ SPEAKER_LATTIFAI = re.compile(r"(^\[SPEAKER_.*?\][::])\s*(.*)")

  # NISHTHA BHATIA: Hey, everyone.
  # DIETER: Oh, hey, Nishtha.
  # GEMINI: That might
- SPEAKER_PATTERN2 = re.compile(r'^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[::])\s*(.*)$')
+ SPEAKER_PATTERN2 = re.compile(r"^([A-Z]{1,15}(?:\s+[A-Z]{1,15})?[::])\s*(.*)$")
+
+
+ def normalize_html_text(text: str) -> str:
+ """Normalize HTML text by decoding entities and stripping whitespace."""
+ html_entities = {
+ "&": "&",
+ "&lt;": "<",
+ "&gt;": ">",
+ "&quot;": '"',
+ "&#39;": "'",
+ "&nbsp;": " ",
+ "\\N": " ",
+ "…": " ",
+ }
+ for entity, char in html_entities.items():
+ text = text.replace(entity, char)
+
+ text = re.sub(r"\s+", " ", text) # Replace multiple spaces with a single space
+
+ # Convert curly apostrophes to straight apostrophes for common English contractions
+ # Handles: 't 's 'll 're 've 'd 'm
+ # For example, convert "don't" to "don't"
+ text = re.sub(r"([a-zA-Z])’([tsdm]|ll|re|ve)\b", r"\1'\2", text, flags=re.IGNORECASE)
+ # For example, convert "5’s" to "5's"
+ text = re.sub(r"([0-9])’([s])\b", r"\1'\2", text, flags=re.IGNORECASE)
+
+ return text.strip()


  def parse_speaker_text(line) -> Tuple[Optional[str], str]:
- line = line.replace('\\N', ' ')
+ """Parse a line of text to extract speaker and content."""

- if ':' not in line and '' not in line:
+ if ":" not in line and "" not in line:
  return None, line

  # 匹配以 >> 开头的行,并去除开头的名字和冒号
@@ -31,7 +58,7 @@ def parse_speaker_text(line) -> Tuple[Optional[str], str]:
  if match:
  assert len(match.groups()) == 2, match.groups()
  if not match.group(1):
- logging.error(f'ParseSub LINE [{line}]')
+ logging.error(f"ParseSub LINE [{line}]")
  else:
  return match.group(1).strip(), match.group(2).strip()

@@ -43,15 +70,15 @@ def parse_speaker_text(line) -> Tuple[Optional[str], str]:
  return None, line


- if __name__ == '__main__':
- pattern = re.compile(r'>>\s*(.*?)\s*[::]\s*(.*)')
- pattern = re.compile(r'(>>.*?[::])\s*(.*)')
+ if __name__ == "__main__":
+ pattern = re.compile(r">>\s*(.*?)\s*[::]\s*(.*)")
+ pattern = re.compile(r"(>>.*?[::])\s*(.*)")

  test_strings = [
- '>>Key: Value',
- '>> Key with space : Value with space ',
- '>> 全角键 : 全角值',
- '>>Key:Value xxx. >>Key:Value',
+ ">>Key: Value",
+ ">> Key with space : Value with space ",
+ ">> 全角键 : 全角值",
+ ">>Key:Value xxx. >>Key:Value",
  ]

  for text in test_strings:
@@ -60,16 +87,16 @@ if __name__ == '__main__':
  print(f"Input: '{text}'")
  print(f" Key: '{match.group(1)}'")
  print(f" Value: '{match.group(2)}'")
- print('-------------')
+ print("-------------")

  # pattern2
- test_strings2 = ['NISHTHA BHATIA: Hey, everyone.', 'DIETER: Oh, hey, Nishtha.', 'GEMINI: That might']
+ test_strings2 = ["NISHTHA BHATIA: Hey, everyone.", "DIETER: Oh, hey, Nishtha.", "GEMINI: That might"]
  for text in test_strings2:
  match = SPEAKER_PATTERN2.match(text)
  if match:
  print(f" Input: '{text}'")
  print(f"Speaker: '{match.group(1)}'")
  print(f"Content: '{match.group(2)}'")
- print('-------------')
+ print("-------------")
  else:
  raise ValueError(f"No match for: '{text}'")
lattifai/io/utils.py CHANGED
@@ -3,13 +3,13 @@ Utility constants and helper functions for subtitle I/O operations
  """

  # Supported subtitle formats for reading/writing
- SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'md']
+ SUBTITLE_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "md"]

  # Input subtitle formats (includes special formats like 'auto' and 'gemini')
- INPUT_SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'auto', 'gemini']
+ INPUT_SUBTITLE_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "auto", "gemini"]

  # Output subtitle formats (includes special formats like 'TextGrid' and 'json')
- OUTPUT_SUBTITLE_FORMATS = ['srt', 'vtt', 'ass', 'ssa', 'sub', 'sbv', 'txt', 'TextGrid', 'json']
+ OUTPUT_SUBTITLE_FORMATS = ["srt", "vtt", "ass", "ssa", "sub", "sbv", "txt", "TextGrid", "json"]

  # All subtitle formats combined (for file detection)
- ALL_SUBTITLE_FORMATS = list(set(SUBTITLE_FORMATS + ['TextGrid', 'json', 'gemini']))
+ ALL_SUBTITLE_FORMATS = list(set(SUBTITLE_FORMATS + ["TextGrid", "json", "gemini"]))
lattifai/io/writer.py CHANGED
@@ -14,45 +14,57 @@ class SubtitleWriter(ABCMeta):

  @classmethod
  def write(cls, alignments: List[Supervision], output_path: Pathlike) -> Pathlike:
- if str(output_path)[-4:].lower() == '.txt':
- with open(output_path, 'w', encoding='utf-8') as f:
+ if str(output_path)[-4:].lower() == ".txt":
+ with open(output_path, "w", encoding="utf-8") as f:
  for sup in alignments:
  word_items = parse_alignment_from_supervision(sup)
  if word_items:
  for item in word_items:
- f.write(f'[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n')
+ f.write(f"[{item.start:.2f}-{item.end:.2f}] {item.symbol}\n")
  else:
- text = f'{sup.speaker} {sup.text}' if sup.speaker is not None else sup.text
- f.write(f'[{sup.start:.2f}-{sup.end:.2f}] {text}\n')
+ text = f"{sup.speaker} {sup.text}" if sup.speaker is not None else sup.text
+ f.write(f"[{sup.start:.2f}-{sup.end:.2f}] {text}\n")

- elif str(output_path)[-5:].lower() == '.json':
- with open(output_path, 'w', encoding='utf-8') as f:
+ elif str(output_path)[-5:].lower() == ".json":
+ with open(output_path, "w", encoding="utf-8") as f:
  # Enhanced JSON export with word-level alignment
  json_data = []
  for sup in alignments:
  sup_dict = sup.to_dict()
  json_data.append(sup_dict)
  json.dump(json_data, f, ensure_ascii=False, indent=4)
- elif str(output_path).endswith('.TextGrid') or str(output_path).endswith('.textgrid'):
+ elif str(output_path).lower().endswith(".textgrid"):
  from tgt import Interval, IntervalTier, TextGrid, write_to_file

  tg = TextGrid()
- supervisions, words = [], []
+ supervisions, words, scores = [], [], {"utterances": [], "words": []}
  for supervision in sorted(alignments, key=lambda x: x.start):
  text = (
- f'{supervision.speaker} {supervision.text}' if supervision.speaker is not None else supervision.text
+ f"{supervision.speaker} {supervision.text}" if supervision.speaker is not None else supervision.text
  )
- supervisions.append(Interval(supervision.start, supervision.end, text or ''))
+ supervisions.append(Interval(supervision.start, supervision.end, text or ""))
  # Extract word-level alignment using helper function
  word_items = parse_alignment_from_supervision(supervision)
  if word_items:
  for item in word_items:
  words.append(Interval(item.start, item.end, item.symbol))
+ if item.score is not None:
+ scores["words"].append(Interval(item.start, item.end, f"{item.score:.2f}"))
+ if supervision.has_custom("score"):
+ scores["utterances"].append(
+ Interval(supervision.start, supervision.end, f"{supervision.score:.2f}")
+ )

- tg.add_tier(IntervalTier(name='utterances', objects=supervisions))
+ tg.add_tier(IntervalTier(name="utterances", objects=supervisions))
  if words:
- tg.add_tier(IntervalTier(name='words', objects=words))
- write_to_file(tg, output_path, format='long')
+ tg.add_tier(IntervalTier(name="words", objects=words))
+
+ if scores["utterances"]:
+ tg.add_tier(IntervalTier(name="utterance_scores", objects=scores["utterances"]))
+ if scores["words"]:
+ tg.add_tier(IntervalTier(name="word_scores", objects=scores["words"]))
+
+ write_to_file(tg, output_path, format="long")
  else:
  subs = pysubs2.SSAFile()
  for sup in alignments:
@@ -64,8 +76,8 @@ class SubtitleWriter(ABCMeta):
  pysubs2.SSAEvent(start=int(word.start * 1000), end=int(word.end * 1000), text=word.symbol)
  )
  else:
- text = f'{sup.speaker} {sup.text}' if sup.speaker is not None else sup.text
- subs.append(pysubs2.SSAEvent(start=int(sup.start * 1000), end=int(sup.end * 1000), text=text or ''))
+ text = f"{sup.speaker} {sup.text}" if sup.speaker is not None else sup.text
+ subs.append(pysubs2.SSAEvent(start=int(sup.start * 1000), end=int(sup.end * 1000), text=text or ""))
  subs.save(output_path)

  return output_path
@@ -81,10 +93,10 @@ def parse_alignment_from_supervision(supervision: Any) -> Optional[List[Alignmen
  Returns:
  List of AlignmentItem objects, or None if no alignment data present
  """
- if not hasattr(supervision, 'alignment') or not supervision.alignment:
+ if not hasattr(supervision, "alignment") or not supervision.alignment:
  return None

- if 'word' not in supervision.alignment:
+ if "word" not in supervision.alignment:
  return None

- return supervision.alignment['word']
+ return supervision.alignment["word"]
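
As a rough illustration of the new score tiers added to the TextGrid branch, the sketch below writes one aligned supervision. The lattifai.io module paths are inferred from this diff, AlignmentItem is the lhotse type referenced by parse_alignment_from_supervision, and attaching the utterance score through a custom "score" field is only an assumption about how has_custom("score") is satisfied upstream.

    # Hedged sketch: module paths and the custom "score" field are assumptions.
    from lhotse.supervision import AlignmentItem

    from lattifai.io.supervision import Supervision
    from lattifai.io.writer import SubtitleWriter

    sup = Supervision(
        id="segment_00000",
        text="hello world",
        start=0.0,
        duration=1.2,
        alignment={
            "word": [
                AlignmentItem(symbol="hello", start=0.0, duration=0.5, score=0.93),
                AlignmentItem(symbol="world", start=0.6, duration=0.6, score=0.88),
            ]
        },
        custom={"score": 0.90},  # assumed way to make has_custom("score") true
    )

    # Produces "utterances" and "words" tiers, plus the new
    # "utterance_scores" and "word_scores" tiers when scores are present.
    SubtitleWriter.write([sup], "aligned.TextGrid")
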
@@ -1,3 +1,3 @@
  from .tokenizer import AsyncLatticeTokenizer, LatticeTokenizer

- __all__ = ['LatticeTokenizer', 'AsyncLatticeTokenizer']
+ __all__ = ["LatticeTokenizer", "AsyncLatticeTokenizer"]
@@ -4,13 +4,13 @@ from typing import List, Optional, Union
  from dp.phonemizer import Phonemizer # g2p-phonemizer
  from num2words import num2words

- LANGUAGE = 'omni'
+ LANGUAGE = "omni"


  class G2Phonemizer:
  def __init__(self, model_checkpoint, device):
  self.phonemizer = Phonemizer.from_checkpoint(model_checkpoint, device=device).predictor
- self.pattern = re.compile(r'\d+')
+ self.pattern = re.compile(r"\d+")

  def num2words(self, word, lang: str):
  matches = self.pattern.findall(word)
@@ -31,7 +31,7 @@ class G2Phonemizer:
  is_list = False

  predictions = self.phonemizer(
- [self.num2words(word.replace(' .', '.').replace('.', ' .'), lang=lang or 'en') for word in words],
+ [self.num2words(word.replace(" .", ".").replace(".", " ."), lang=lang or "en") for word in words],
  lang=LANGUAGE,
  batch_size=min(batch_size or len(words), 128),
  num_prons=num_prons,