lattifai 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -25
- lattifai/alignment/lattice1_aligner.py +12 -9
- lattifai/alignment/lattice1_worker.py +124 -155
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +219 -0
- lattifai/alignment/tokenizer.py +23 -179
- lattifai/audio2.py +1 -1
- lattifai/caption/caption.py +0 -2
- lattifai/caption/gemini_reader.py +151 -60
- lattifai/cli/diarization.py +3 -1
- lattifai/cli/transcribe.py +3 -8
- lattifai/cli/youtube.py +11 -0
- lattifai/client.py +96 -47
- lattifai/config/alignment.py +2 -2
- lattifai/config/client.py +5 -0
- lattifai/mixin.py +17 -8
- lattifai/utils.py +40 -4
- lattifai/workflow/youtube.py +55 -57
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/METADATA +331 -48
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/RECORD +24 -23
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/WHEEL +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.1.0.dist-info → lattifai-1.2.1.dist-info}/top_level.txt +0 -0
lattifai/caption/gemini_reader.py
CHANGED

@@ -15,7 +15,8 @@ class GeminiSegment:
     """Represents a segment in the Gemini transcript with metadata."""
 
     text: str
-    timestamp: Optional[float] = None
+    timestamp: Optional[float] = None  # For backward compatibility (start time)
+    end_timestamp: Optional[float] = None  # End time when timestamp is at the end
     speaker: Optional[str] = None
     section: Optional[str] = None
     segment_type: str = "dialogue"  # 'dialogue', 'event', or 'section_header'
@@ -26,6 +27,11 @@ class GeminiSegment:
         """Return start time in seconds."""
         return self.timestamp if self.timestamp is not None else 0.0
 
+    @property
+    def end(self) -> Optional[float]:
+        """Return end time in seconds if available."""
+        return self.end_timestamp
+
 
 class GeminiReader:
     """Parser for YouTube transcript format with speaker labels and timestamps."""
@@ -34,8 +40,12 @@ class GeminiReader:
     TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
     SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
     SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
-
-
+    # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
+    EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
+    # Timestamp at the end indicates end time
+    INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+    # Timestamp at the beginning indicates start time
+    INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
 
     # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
     YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
@@ -144,18 +154,22 @@ class GeminiReader:
             if event_match:
                 groups = event_match.groups()
                 event_text = groups[0]
-                # Parse timestamp -
-
-
-
-
+                # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
+                hours_or_minutes = groups[1]
+                minutes_or_seconds = groups[2]
+                seconds_optional = groups[3]
+
+                if seconds_optional is not None:
+                    # HH:MM:SS format
+                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
                 else:
-
+                    # MM:SS format
+                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)
 
                 if include_events and timestamp is not None:
                     segments.append(
                         GeminiSegment(
-                            text=event_text.strip(),
+                            text=f"[{event_text.strip()}]",
                             timestamp=timestamp,
                             section=current_section,
                             segment_type="event",
@@ -170,34 +184,44 @@ class GeminiReader:
                 speaker, text_with_timestamp = speaker_match.groups()
                 current_speaker = speaker.strip()
 
-                #
-
+                # Check for timestamp at the beginning (start time)
+                start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
+                # Check for timestamp at the end (end time)
+                end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
                 youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
 
-
-
-
+                start_timestamp = None
+                end_timestamp = None
+                text = text_with_timestamp.strip()
+
+                if start_match:
+                    groups = start_match.groups()
+                    # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+                    if groups[0] is not None:  # HH:MM:SS format
+                        start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                    elif groups[3] is not None:  # MM:SS format
+                        start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+                    text = groups[5]  # Text is after timestamp
+                elif end_match:
+                    groups = end_match.groups()
+                    text = groups[0]  # Text is before timestamp
                     # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
                     if groups[1] is not None:  # HH:MM:SS format
-
+                        end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                     elif groups[4] is not None:  # MM:SS format
-
-                    else:
-                        timestamp = None
+                        end_timestamp = cls.parse_timestamp(groups[4], groups[5])
                 elif youtube_match:
                     groups = youtube_match.groups()
                     text = groups[0]
-                    # Extract seconds from URL parameter
+                    # Extract seconds from URL parameter (treat as end time)
                     url_seconds = groups[3]
-
-                else:
-                    text = text_with_timestamp.strip()
-                    timestamp = None
+                    end_timestamp = cls.parse_timestamp(url_seconds)
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=start_timestamp,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -207,25 +231,50 @@ class GeminiReader:
                 current_speaker = None  # Reset speaker after use
                 continue
 
-            # Parse plain text with timestamp
-
+            # Parse plain text with timestamp (check both positions)
+            start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
+            end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
             youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
 
-
-
-
+            start_timestamp = None
+            end_timestamp = None
+            text = None
+
+            if start_match:
+                groups = start_match.groups()
+                # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+                if groups[0] is not None:  # HH:MM:SS format
+                    start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                elif groups[3] is not None:  # MM:SS format
+                    start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+                text = groups[5]  # Text is after timestamp
+
+                segments.append(
+                    GeminiSegment(
+                        text=text.strip(),
+                        timestamp=start_timestamp,
+                        end_timestamp=None,
+                        speaker=current_speaker,
+                        section=current_section,
+                        segment_type="dialogue",
+                        line_number=line_num,
+                    )
+                )
+                continue
+            elif end_match:
+                groups = end_match.groups()
+                text = groups[0]  # Text is before timestamp
                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
                 if groups[1] is not None:  # HH:MM:SS format
-
+                    end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                 elif groups[4] is not None:  # MM:SS format
-
-                else:
-                    timestamp = None
+                    end_timestamp = cls.parse_timestamp(groups[4], groups[5])
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=None,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -236,14 +285,15 @@ class GeminiReader:
             elif youtube_inline_match:
                 groups = youtube_inline_match.groups()
                 text = groups[0]
-                # Extract seconds from URL parameter
+                # Extract seconds from URL parameter (treat as end time)
                 url_seconds = groups[3]
-
+                end_timestamp = cls.parse_timestamp(url_seconds)
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=None,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -280,38 +330,79 @@ class GeminiReader:
        Returns:
            List of Supervision objects ready for alignment
        """
-       segments = cls.read(transcript_path, include_events=
+       segments = cls.read(transcript_path, include_events=True, include_sections=False)
 
-       # Filter to
-       dialogue_segments = [
+       # Filter to dialogue and event segments with timestamps (either start or end)
+       dialogue_segments = [
+           s
+           for s in segments
+           if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
+       ]
 
        if not dialogue_segments:
            raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
 
-       # Sort by timestamp
-       dialogue_segments.sort(key=lambda x: x.timestamp)
+       # Sort by timestamp (use start time if available, otherwise end time)
+       dialogue_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)
 
        # Convert to Supervision objects
        supervisions: List[Supervision] = []
+       prev_end_time = 0.0
 
        for i, segment in enumerate(dialogue_segments):
-
-
-
-
-
-
-
-
-
-
-
-           start
-
-
-
-
-
+           seg_start = None
+           seg_end = None
+
+           # Determine start and end times based on available timestamps
+           if segment.timestamp is not None:
+               # Has start time
+               seg_start = segment.timestamp
+               if segment.end_timestamp is not None:
+                   # Has both start and end
+                   seg_end = segment.end_timestamp
+               else:
+                   # Only has start, estimate end
+                   if i < len(dialogue_segments) - 1:
+                       # Use next segment's time
+                       next_seg = dialogue_segments[i + 1]
+                       if next_seg.timestamp is not None:
+                           seg_end = next_seg.timestamp
+                       elif next_seg.end_timestamp is not None:
+                           # Next has only end, estimate its start and use that
+                           words_next = len(next_seg.text.split())
+                           estimated_duration_next = words_next * 0.3
+                           seg_end = next_seg.end_timestamp - estimated_duration_next
+
+                   if seg_end is None:
+                       # Estimate based on text length
+                       words = len(segment.text.split())
+                       seg_end = seg_start + words * 0.3
+
+           elif segment.end_timestamp is not None:
+               # Only has end time, need to infer start
+               seg_end = segment.end_timestamp
+               # Use previous segment's end time as start, or estimate based on text
+               if prev_end_time > 0:
+                   seg_start = prev_end_time
+               else:
+                   # Estimate start based on text length
+                   words = len(segment.text.split())
+                   estimated_duration = words * 0.3
+                   seg_start = seg_end - estimated_duration
+
+           if seg_start is not None and seg_end is not None:
+               duration = max(seg_end - seg_start, min_duration)
+               if segment.segment_type == "dialogue":
+                   supervisions.append(
+                       Supervision(
+                           text=segment.text,
+                           start=seg_start,
+                           duration=duration,
+                           id=f"segment_{i:05d}",
+                           speaker=segment.speaker,
+                       )
+                   )
+               prev_end_time = seg_start + duration
 
        # Optionally merge consecutive segments from same speaker
        if merge_consecutive:
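The key behavioral change above is that a timestamp at the start of a line is now read as the segment's start time, while a timestamp at the end of a line is read as its end time; any missing boundary is later estimated at roughly 0.3 seconds per word. Below is a minimal standalone sketch of that classification, reusing the two regexes added in this diff; `to_seconds` and `classify` are illustrative helpers, not package APIs.

import re
from typing import Optional, Tuple

# Patterns copied from the GeminiReader class attributes added in this diff.
INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")


def to_seconds(*parts: str) -> float:
    """Convert ("HH", "MM", "SS") or ("MM", "SS") strings to seconds."""
    seconds = 0.0
    for p in parts:
        seconds = seconds * 60 + int(p)
    return seconds


def classify(line: str) -> Tuple[Optional[float], Optional[float], str]:
    """Hypothetical helper: return (start, end, text) for one transcript line."""
    m = INLINE_TIMESTAMP_START_PATTERN.match(line)
    if m:
        g = m.groups()
        start = to_seconds(g[0], g[1], g[2]) if g[0] is not None else to_seconds(g[3], g[4])
        return start, None, g[5]
    m = INLINE_TIMESTAMP_END_PATTERN.match(line)
    if m:
        g = m.groups()
        end = to_seconds(g[1], g[2], g[3]) if g[1] is not None else to_seconds(g[4], g[5])
        return None, end, g[0]
    return None, None, line


print(classify("[01:02] Hello there"))     # (62.0, None, 'Hello there')  -> start time
print(classify("Hello there [00:01:05]"))  # (None, 65.0, 'Hello there')  -> end time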
lattifai/cli/diarization.py
CHANGED

@@ -8,7 +8,7 @@ import nemo_run as run
 from typing_extensions import Annotated
 
 from lattifai.client import LattifAI
-from lattifai.config import CaptionConfig, ClientConfig, DiarizationConfig, MediaConfig
+from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig, MediaConfig
 from lattifai.utils import safe_print
 
 __all__ = ["diarize"]
@@ -22,6 +22,7 @@ def diarize(
     media: Annotated[Optional[MediaConfig], run.Config[MediaConfig]] = None,
     caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
     client: Annotated[Optional[ClientConfig], run.Config[ClientConfig]] = None,
+    alignment: Annotated[Optional[AlignmentConfig], run.Config[AlignmentConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
 ):
     """Run speaker diarization on aligned captions and audio."""
@@ -53,6 +54,7 @@ def diarize(
 
     client_instance = LattifAI(
         client_config=client,
+        alignment_config=alignment,
         caption_config=caption_config,
         diarization_config=diarization_config,
     )
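With this change the diarize entry point can forward alignment settings to the client. A rough non-CLI equivalent of the wiring above; this assumes each config can be default-constructed, and the field value shown is only illustrative.

from lattifai.client import LattifAI
from lattifai.config import AlignmentConfig, CaptionConfig, ClientConfig, DiarizationConfig

client_instance = LattifAI(
    client_config=ClientConfig(),
    alignment_config=AlignmentConfig(device="cpu"),  # now forwarded by the diarize CLI
    caption_config=CaptionConfig(),
    diarization_config=DiarizationConfig(),
)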
lattifai/cli/transcribe.py
CHANGED

@@ -108,12 +108,7 @@ def transcribe(
     is_url = media_config.is_input_remote()
 
     # Prepare output paths
-
-        # For URLs, use output_dir from media_config or current directory
-        output_path = media_config.output_dir
-    else:
-        # For files, use input path directory
-        output_path = Path(media_config.input_path).parent
+    output_dir = media_config.output_dir or Path(media_config.input_path).parent
 
     # Create transcriber
     if not transcription_config.lattice_model_path:
@@ -140,7 +135,7 @@ def transcribe(
     input_path = asyncio.run(
         downloader.download_media(
             url=media_config.input_path,
-            output_dir=str(
+            output_dir=str(output_dir),
             media_format=media_config.normalize_format(),
             force_overwrite=media_config.force_overwrite,
         )
@@ -167,7 +162,7 @@ def transcribe(
     if is_url:
         # For URLs, generate output filename based on transcriber
        output_format = transcriber.file_suffix.lstrip(".")
-        final_output =
+        final_output = output_dir / f"youtube_LattifAI_{transcriber.name}.{output_format}"
     else:
         # For files, use input filename with suffix
         final_output = Path(media_config.input_path).with_suffix(".LattifAI.srt")
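The output-path branches collapse into a single fallback expression: when `media_config.output_dir` is unset, the parent directory of the input path is used, and for URL inputs the final filename is derived from the transcriber. A tiny illustration of the fallback and naming; the values are made up and `transcriber_name` stands in for `transcriber.name`.

from pathlib import Path

output_dir_cfg = None                        # media_config.output_dir not set
input_path = "/data/audio/episode.mp3"
output_dir = output_dir_cfg or Path(input_path).parent

transcriber_name, output_format = "gemini", "json"
final_output = output_dir / f"youtube_LattifAI_{transcriber_name}.{output_format}"
print(final_output)                          # /data/audio/youtube_LattifAI_gemini.json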
lattifai/cli/youtube.py
CHANGED

@@ -25,6 +25,7 @@ def youtube(
     caption: Annotated[Optional[CaptionConfig], run.Config[CaptionConfig]] = None,
     transcription: Annotated[Optional[TranscriptionConfig], run.Config[TranscriptionConfig]] = None,
     diarization: Annotated[Optional[DiarizationConfig], run.Config[DiarizationConfig]] = None,
+    use_transcription: bool = False,
 ):
     """
     Download media from YouTube (when needed) and align captions.
@@ -55,6 +56,11 @@ def youtube(
             Fields: gemini_api_key, model_name, language, device
         diarization: Speaker diarization configuration.
             Fields: enabled, num_speakers, min_speakers, max_speakers, device
+        use_transcription: If True, skip YouTube caption download and directly use
+            transcription.model_name to transcribe. If False (default), first try to
+            download YouTube captions; if download fails (no captions available or
+            errors like HTTP 429), automatically fallback to transcription if
+            transcription.model_name is configured.
 
     Examples:
         # Download from YouTube and align (positional argument)
@@ -108,7 +114,11 @@ def youtube(
         transcription_config=transcription,
         diarization_config=diarization,
     )
+
     # Call the client's youtube method
+    # If use_transcription=True, skip YouTube caption download and use transcription directly.
+    # If use_transcription=False (default), try YouTube captions first; on failure,
+    # automatically fallback to transcription if transcription.model_name is configured.
     return lattifai_client.youtube(
         url=media_config.input_path,
         output_dir=media_config.output_dir,
@@ -118,6 +128,7 @@ def youtube(
         split_sentence=caption_config.split_sentence,
         channel_selector=media_config.channel_selector,
         streaming_chunk_secs=media_config.streaming_chunk_secs,
+        use_transcription=use_transcription,
     )
 
 
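The new `use_transcription` flag changes where captions come from. The control flow described in the docstring, sketched as a standalone function; the names here are hypothetical and not the client's internals.

def choose_caption_source(use_transcription, download_captions, transcribe, model_configured):
    """Illustrative fallback order: transcribe directly, or try captions then fall back."""
    if use_transcription:
        return transcribe()            # skip YouTube caption download entirely
    try:
        return download_captions()     # try YouTube captions first
    except Exception:                  # e.g. no captions available, HTTP 429
        if model_configured:
            return transcribe()        # automatic fallback to transcription
        raise


caption = choose_caption_source(
    use_transcription=False,
    download_captions=lambda: "captions.vtt",
    transcribe=lambda: "gemini_transcript.md",
    model_configured=True,
)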
lattifai/client.py
CHANGED

@@ -56,6 +56,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
 
         # Initialize base API client
         super().__init__(config=client_config)
+        self.config = client_config
 
         # Initialize all configs with defaults
         alignment_config, transcription_config, diarization_config = self._init_configs(
@@ -125,38 +126,20 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
        safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
 
        if caption.supervisions and alignment_strategy == "transcription":
-           # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
-           assert (
-               "gemini" not in self.transcriber.name.lower()
-           ), "Transcription-based alignment is not supported with Gemini transcriber."
-           assert (
-               caption.supervisions
-           ), "Input caption should contain supervisions when using transcription-based alignment."
            if not caption.transcription:
-
-
-
-
-
-
-
-
-
-               # print(colorful.cyan(f"Reading existing transcription from {transcript_file}"))
-               transcript = self._read_caption(transcript_file, verbose=False)
-               caption.transcription = transcript.supervisions
-               caption.audio_events = transcript.audio_events
-
-           if not caption.transcription:
-               transcript = asyncio.run(
-                   self.transcriber.transcribe(media_audio, language=self.caption_config.source_lang)
-               )
-               caption.transcription = transcript.transcription
-               caption.audio_events = transcript.audio_events
+               transcript = self._transcribe(
+                   media_audio,
+                   source_lang=self.caption_config.source_lang,
+                   is_async=False,
+                   output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
+               )
+               caption.transcription = transcript.supervisions or transcript.transcription
+               caption.audio_events = transcript.audio_events
+           assert caption.transcription, "Transcription is empty after transcription step."
 
            # Align caption.supervisions with transcription to get segments
            import regex
-           from error_align import
+           from error_align import error_align  # noqa: F401
            from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType
 
            JOIN_TOKEN = "❄"
@@ -183,21 +166,82 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                )
            )
 
-
-
-               f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription),
-               tokenizer=custom_tokenizer,
-           )
+           if split_sentence or self.caption_config.split_sentence:
+               caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)
 
-           for
-
-
+           ref = f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions)
+           hyp = f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription)
+           alignments = error_align(ref, hyp, tokenizer=custom_tokenizer)
 
-
-
-
-
-
+           idx = 0
+           for k, align in enumerate(alignments):
+               if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
+                   # safe_print(f"Segment {k}: JOIN_TOKEN detected, creating segment.")
+
+                   # Find first non-None ref_slice starting from idx
+                   ref_start = 0
+                   for i in range(idx, k + 1):
+                       if i < len(alignments) and alignments[i].ref_slice is not None:
+                           ref_start = alignments[i].ref_slice.start
+                           break
+
+                   # Find last non-None ref_slice up to current position
+                   ref_stop = len(ref)
+                   for i in range(k, idx - 1, -1):
+                       if i < len(alignments) and alignments[i].ref_slice is not None:
+                           ref_stop = alignments[i].ref_slice.stop
+                           break
+
+                   # Find first non-None hyp_slice starting from idx
+                   hyp_start = 0
+                   for i in range(idx, k + 1):
+                       if i < len(alignments) and alignments[i].hyp_slice is not None:
+                           hyp_start = alignments[i].hyp_slice.start
+                           break
+
+                   # Find last non-None hyp_slice up to current position
+                   hyp_stop = len(hyp)
+                   for i in range(k, idx - 1, -1):
+                       if i < len(alignments) and alignments[i].hyp_slice is not None:
+                           hyp_stop = alignments[i].hyp_slice.stop
+                           break
+
+                   safe_print(f"[REF]: {ref[ref_start:ref_stop]}")
+                   safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop]}\n")
+                   idx = k + 1
+
+           # last part - handle remaining alignments after last JOIN_TOKEN
+           if idx < len(alignments):
+               # Find first non-None ref_slice starting from idx
+               ref_start = 0
+               for i in range(idx, len(alignments)):
+                   if alignments[i].ref_slice is not None:
+                       ref_start = alignments[i].ref_slice.start
+                       break
+
+               # Find last non-None ref_slice from end
+               ref_stop = len(ref)
+               for i in range(len(alignments) - 1, idx - 1, -1):
+                   if alignments[i].ref_slice is not None:
+                       ref_stop = alignments[i].ref_slice.stop
+                       break
+
+               # Find first non-None hyp_slice starting from idx
+               hyp_start = 0
+               for i in range(idx, len(alignments)):
+                   if alignments[i].hyp_slice is not None:
+                       hyp_start = alignments[i].hyp_slice.start
+                       break
+
+               # Find last non-None hyp_slice from end
+               hyp_stop = len(hyp)
+               for i in range(len(alignments) - 1, idx - 1, -1):
+                   if alignments[i].hyp_slice is not None:
+                       hyp_stop = alignments[i].hyp_slice.stop
+                       break
+
+               safe_print(f"[REF]: {ref[ref_start:ref_stop + 1]}")
+               safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop + 1]}\n")
 
            raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
        else:
@@ -219,6 +263,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
            )
 
            # align each segment
+           sr = media_audio.sampling_rate
            supervisions, alignments = [], []
            for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
                print(
@@ -233,10 +278,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
 
                offset = round(start, 4)
                # Extract audio slice
-
-
-               ]
-               emission = self.aligner.emission(audio_slice_ndarray)
+               audio_slice = media_audio.ndarray[:, int(start * sr) : int(end * sr)]
+               emission = self.aligner.emission(audio_slice)
 
                # Align segment
                _supervisions, _alignments = self.aligner.alignment(
@@ -269,6 +312,10 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
            if output_caption_path:
                self._write_caption(caption, output_caption_path)
 
+           # Profile if enabled
+           if self.config.profile:
+               self.aligner.profile()
+
        except (CaptionProcessingError, LatticeEncodingError, AlignmentError, LatticeDecodingError):
            # Re-raise our specific errors as-is
            raise
@@ -363,7 +410,9 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
        # Step 1: Download media
        media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
 
-       media_audio = self.audio_loader(
+       media_audio = self.audio_loader(
+           media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
+       )
 
        # Step 2: Get or create captions (download or transcribe)
        caption = self._download_or_transcribe_caption(
@@ -388,7 +437,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
            output_caption_path=output_caption_path,
            split_sentence=split_sentence,
            channel_selector=channel_selector,
-           streaming_chunk_secs=
+           streaming_chunk_secs=None,
        )
 
        return caption
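Segmented alignment now slices the loaded waveform directly by sample index, as in `media_audio.ndarray[:, int(start * sr) : int(end * sr)]` above. A small NumPy sketch of that indexing, assuming a (channels, samples) array; the sampling rate and durations are illustrative.

import numpy as np

sr = 16000                                        # sampling rate in Hz (assumed)
audio = np.zeros((1, 60 * sr), dtype=np.float32)  # 60 s of mono audio

start, end = 12.5, 17.25                          # segment boundaries in seconds
audio_slice = audio[:, int(start * sr): int(end * sr)]
print(audio_slice.shape)                          # (1, 76000), i.e. 4.75 s * 16000 samples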
lattifai/config/alignment.py
CHANGED

@@ -28,11 +28,11 @@ class AlignmentConfig:
     """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""
 
     batch_size: int = 1
-    """Batch size for inference (number of samples processed simultaneously)."""
+    """Batch size for inference (number of samples processed simultaneously, NotImplemented yet)."""
 
     # Segmented Alignment for Long Audio
     trust_caption_timestamps: bool = False
-    """When True, use original caption timestamps as strong reference constraints during alignment.
+    """When True, use original caption.supervisions' timestamps as strong reference constraints during alignment.
     The alignment process will still adjust timestamps but stay close to the input timing.
     Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
     while preserving the approximate timing from the original captions.
lattifai/config/client.py
CHANGED

@@ -26,6 +26,11 @@ class ClientConfig:
     default_headers: Optional[Dict[str, str]] = field(default=None)
     """Optional static headers to include in all requests."""
 
+    profile: bool = False
+    """Enable profiling of client operations tasks.
+    When True, prints detailed timing information for various stages of the process.
+    """
+
     def __post_init__(self):
         """Validate and auto-populate configuration after initialization."""
 
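Combined with the new `self.config = client_config` assignment in `client.py`, this flag gates a call to `self.aligner.profile()` after alignment. A hedged usage sketch; it assumes a bare `ClientConfig()` is otherwise valid in your environment (e.g. remaining fields auto-populated in `__post_init__`).

from lattifai.client import LattifAI
from lattifai.config import ClientConfig

client = LattifAI(client_config=ClientConfig(profile=True))
# After an alignment run, the client calls self.aligner.profile() and prints
# per-stage timing information.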