lattifai 1.2.0__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lattifai/__init__.py +0 -24
- lattifai/alignment/lattice1_aligner.py +1 -1
- lattifai/alignment/lattice1_worker.py +1 -6
- lattifai/alignment/segmenter.py +1 -1
- lattifai/alignment/sentence_splitter.py +219 -0
- lattifai/alignment/tokenizer.py +10 -181
- lattifai/caption/caption.py +0 -2
- lattifai/caption/gemini_reader.py +151 -60
- lattifai/cli/transcribe.py +3 -8
- lattifai/client.py +91 -47
- lattifai/config/alignment.py +2 -2
- lattifai/mixin.py +10 -4
- lattifai/utils.py +74 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/METADATA +2 -1
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/RECORD +19 -18
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/WHEEL +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/entry_points.txt +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/licenses/LICENSE +0 -0
- {lattifai-1.2.0.dist-info → lattifai-1.2.1.dist-info}/top_level.txt +0 -0
lattifai/caption/gemini_reader.py
CHANGED

@@ -15,7 +15,8 @@ class GeminiSegment:
     """Represents a segment in the Gemini transcript with metadata."""
 
     text: str
-    timestamp: Optional[float] = None
+    timestamp: Optional[float] = None  # For backward compatibility (start time)
+    end_timestamp: Optional[float] = None  # End time when timestamp is at the end
     speaker: Optional[str] = None
     section: Optional[str] = None
     segment_type: str = "dialogue"  # 'dialogue', 'event', or 'section_header'
@@ -26,6 +27,11 @@ class GeminiSegment:
         """Return start time in seconds."""
         return self.timestamp if self.timestamp is not None else 0.0
 
+    @property
+    def end(self) -> Optional[float]:
+        """Return end time in seconds if available."""
+        return self.end_timestamp
+
 
 class GeminiReader:
     """Parser for YouTube transcript format with speaker labels and timestamps."""
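A segment can now carry a start time, an end time, or both. A minimal sketch (illustrative only, not package code; field and property names follow the hunks above) of how the two fields and the new end property behave:

# Illustrative sketch only - standalone dataclass mirroring the fields from the diff above.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Segment:
    text: str
    timestamp: Optional[float] = None       # start time, kept for backward compatibility
    end_timestamp: Optional[float] = None   # end time when the timestamp sits at the end of the line

    @property
    def start(self) -> float:
        return self.timestamp if self.timestamp is not None else 0.0

    @property
    def end(self) -> Optional[float]:
        return self.end_timestamp

seg = Segment(text="Hello there", end_timestamp=12.0)
print(seg.start, seg.end)  # 0.0 12.0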
@@ -34,8 +40,12 @@ class GeminiReader:
     TIMESTAMP_PATTERN = re.compile(r"\[(\d{1,2}):(\d{2}):(\d{2})\]|\[(\d{1,2}):(\d{2})\]")
     SECTION_HEADER_PATTERN = re.compile(r"^##\s*\[(\d{1,2}):(\d{2}):(\d{2})\]\s*(.+)$")
    SPEAKER_PATTERN = re.compile(r"^\*\*(.+?[::])\*\*\s*(.+)$")
-
-
+    # Event pattern: [Event] [HH:MM:SS] or [Event] [MM:SS] - prioritize HH:MM:SS format
+    EVENT_PATTERN = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")
+    # Timestamp at the end indicates end time
+    INLINE_TIMESTAMP_END_PATTERN = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")
+    # Timestamp at the beginning indicates start time
+    INLINE_TIMESTAMP_START_PATTERN = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
 
     # New patterns for YouTube link format: [[MM:SS](URL&t=seconds)]
     YOUTUBE_SECTION_PATTERN = re.compile(r"^##\s*\[\[(\d{1,2}):(\d{2})\]\([^)]*&t=(\d+)\)\]\s*(.+)$")
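The two inline patterns differ only in where the bracketed time sits, which is what later decides whether the value is stored as a start or an end time. A small illustrative check (patterns copied from the hunk above; not package code):

import re

START = re.compile(r"^\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]\s*(.+)$")
END = re.compile(r"^(.+?)\s*\[(?:(\d{1,2}):(\d{2}):(\d{2})|(\d{1,2}):(\d{2}))\]$")

m = START.match("[00:12] Hello there")
print(m.groups())  # (None, None, None, '00', '12', 'Hello there') -> MM:SS start time

m = END.match("Hello there [01:02:03]")
print(m.groups())  # ('Hello there', '01', '02', '03', None, None) -> HH:MM:SS end time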
@@ -144,18 +154,22 @@ class GeminiReader:
             if event_match:
                 groups = event_match.groups()
                 event_text = groups[0]
-                # Parse timestamp -
-
-
-
-
+                # Parse timestamp - groups: (event_text, hours/minutes, minutes/seconds, seconds_optional)
+                hours_or_minutes = groups[1]
+                minutes_or_seconds = groups[2]
+                seconds_optional = groups[3]
+
+                if seconds_optional is not None:
+                    # HH:MM:SS format
+                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds, seconds_optional)
                 else:
-
+                    # MM:SS format
+                    timestamp = cls.parse_timestamp(hours_or_minutes, minutes_or_seconds)
 
                 if include_events and timestamp is not None:
                     segments.append(
                         GeminiSegment(
-                            text=event_text.strip(),
+                            text=f"[{event_text.strip()}]",
                             timestamp=timestamp,
                             section=current_section,
                             segment_type="event",
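The optional third capture group is what distinguishes the two clock formats. An illustrative run of the event pattern (regex copied from the diff; the seconds conversion below is a hypothetical stand-in for cls.parse_timestamp, whose exact signature is not shown here):

import re

EVENT = re.compile(r"^\[([^\]]+)\]\s*\[(\d{1,2}):(\d{2})(?::(\d{2}))?\]$")

def to_seconds(*parts: str) -> int:
    # Hypothetical helper standing in for GeminiReader.parse_timestamp
    vals = [int(p) for p in parts]
    if len(vals) == 3:           # HH:MM:SS
        h, m, s = vals
        return h * 3600 + m * 60 + s
    m, s = vals                  # MM:SS
    return m * 60 + s

for line in ("[Applause] [01:02:03]", "[Music] [04:05]"):
    text, a, b, maybe_s = EVENT.match(line).groups()
    ts = to_seconds(a, b, maybe_s) if maybe_s is not None else to_seconds(a, b)
    print(f"[{text}]", ts)       # events are stored with the text wrapped in brackets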
@@ -170,34 +184,44 @@ class GeminiReader:
                 speaker, text_with_timestamp = speaker_match.groups()
                 current_speaker = speaker.strip()
 
-                #
-
+                # Check for timestamp at the beginning (start time)
+                start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(text_with_timestamp.strip())
+                # Check for timestamp at the end (end time)
+                end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(text_with_timestamp.strip())
                 youtube_match = cls.YOUTUBE_INLINE_PATTERN.match(text_with_timestamp.strip())
 
-
-
-
+                start_timestamp = None
+                end_timestamp = None
+                text = text_with_timestamp.strip()
+
+                if start_match:
+                    groups = start_match.groups()
+                    # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+                    if groups[0] is not None:  # HH:MM:SS format
+                        start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                    elif groups[3] is not None:  # MM:SS format
+                        start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+                    text = groups[5]  # Text is after timestamp
+                elif end_match:
+                    groups = end_match.groups()
+                    text = groups[0]  # Text is before timestamp
                     # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
                     if groups[1] is not None:  # HH:MM:SS format
-
+                        end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                     elif groups[4] is not None:  # MM:SS format
-
-                    else:
-                        timestamp = None
+                        end_timestamp = cls.parse_timestamp(groups[4], groups[5])
                 elif youtube_match:
                     groups = youtube_match.groups()
                     text = groups[0]
-                    # Extract seconds from URL parameter
+                    # Extract seconds from URL parameter (treat as end time)
                     url_seconds = groups[3]
-
-                else:
-                    text = text_with_timestamp.strip()
-                    timestamp = None
+                    end_timestamp = cls.parse_timestamp(url_seconds)
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=start_timestamp,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -207,25 +231,50 @@ class GeminiReader:
                 current_speaker = None  # Reset speaker after use
                 continue
 
-            # Parse plain text with timestamp
-
+            # Parse plain text with timestamp (check both positions)
+            start_match = cls.INLINE_TIMESTAMP_START_PATTERN.match(line)
+            end_match = cls.INLINE_TIMESTAMP_END_PATTERN.match(line)
             youtube_inline_match = cls.YOUTUBE_INLINE_PATTERN.match(line)
 
-
-
-
+            start_timestamp = None
+            end_timestamp = None
+            text = None
+
+            if start_match:
+                groups = start_match.groups()
+                # Parse timestamp - can be HH:MM:SS (groups 0,1,2) or MM:SS (groups 3,4)
+                if groups[0] is not None:  # HH:MM:SS format
+                    start_timestamp = cls.parse_timestamp(groups[0], groups[1], groups[2])
+                elif groups[3] is not None:  # MM:SS format
+                    start_timestamp = cls.parse_timestamp(groups[3], groups[4])
+                text = groups[5]  # Text is after timestamp
+
+                segments.append(
+                    GeminiSegment(
+                        text=text.strip(),
+                        timestamp=start_timestamp,
+                        end_timestamp=None,
+                        speaker=current_speaker,
+                        section=current_section,
+                        segment_type="dialogue",
+                        line_number=line_num,
+                    )
+                )
+                continue
+            elif end_match:
+                groups = end_match.groups()
+                text = groups[0]  # Text is before timestamp
                 # Parse timestamp - can be HH:MM:SS (groups 1,2,3) or MM:SS (groups 4,5)
                 if groups[1] is not None:  # HH:MM:SS format
-
+                    end_timestamp = cls.parse_timestamp(groups[1], groups[2], groups[3])
                 elif groups[4] is not None:  # MM:SS format
-
-                else:
-                    timestamp = None
+                    end_timestamp = cls.parse_timestamp(groups[4], groups[5])
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=None,
+                        end_timestamp=end_timestamp,
                        speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -236,14 +285,15 @@ class GeminiReader:
             elif youtube_inline_match:
                 groups = youtube_inline_match.groups()
                 text = groups[0]
-                # Extract seconds from URL parameter
+                # Extract seconds from URL parameter (treat as end time)
                 url_seconds = groups[3]
-
+                end_timestamp = cls.parse_timestamp(url_seconds)
 
                 segments.append(
                     GeminiSegment(
                         text=text.strip(),
-                        timestamp=
+                        timestamp=None,
+                        end_timestamp=end_timestamp,
                         speaker=current_speaker,
                         section=current_section,
                         segment_type="dialogue",
@@ -280,38 +330,79 @@ class GeminiReader:
         Returns:
             List of Supervision objects ready for alignment
         """
-        segments = cls.read(transcript_path, include_events=
+        segments = cls.read(transcript_path, include_events=True, include_sections=False)
 
-        # Filter to
-        dialogue_segments = [
+        # Filter to dialogue and event segments with timestamps (either start or end)
+        dialogue_segments = [
+            s
+            for s in segments
+            if s.segment_type in ("dialogue", "event") and (s.timestamp is not None or s.end_timestamp is not None)
+        ]
 
         if not dialogue_segments:
             raise ValueError(f"No dialogue segments with timestamps found in {transcript_path}")
 
-        # Sort by timestamp
-        dialogue_segments.sort(key=lambda x: x.timestamp)
+        # Sort by timestamp (use start time if available, otherwise end time)
+        dialogue_segments.sort(key=lambda x: x.timestamp if x.timestamp is not None else x.end_timestamp)
 
         # Convert to Supervision objects
         supervisions: List[Supervision] = []
+        prev_end_time = 0.0
 
         for i, segment in enumerate(dialogue_segments):
-
-
-
-
-
-
-
-
-
-
-            start
-
-
-
-
-
+            seg_start = None
+            seg_end = None
+
+            # Determine start and end times based on available timestamps
+            if segment.timestamp is not None:
+                # Has start time
+                seg_start = segment.timestamp
+                if segment.end_timestamp is not None:
+                    # Has both start and end
+                    seg_end = segment.end_timestamp
+                else:
+                    # Only has start, estimate end
+                    if i < len(dialogue_segments) - 1:
+                        # Use next segment's time
+                        next_seg = dialogue_segments[i + 1]
+                        if next_seg.timestamp is not None:
+                            seg_end = next_seg.timestamp
+                        elif next_seg.end_timestamp is not None:
+                            # Next has only end, estimate its start and use that
+                            words_next = len(next_seg.text.split())
+                            estimated_duration_next = words_next * 0.3
+                            seg_end = next_seg.end_timestamp - estimated_duration_next
+
+                    if seg_end is None:
+                        # Estimate based on text length
+                        words = len(segment.text.split())
+                        seg_end = seg_start + words * 0.3
+
+            elif segment.end_timestamp is not None:
+                # Only has end time, need to infer start
+                seg_end = segment.end_timestamp
+                # Use previous segment's end time as start, or estimate based on text
+                if prev_end_time > 0:
+                    seg_start = prev_end_time
+                else:
+                    # Estimate start based on text length
+                    words = len(segment.text.split())
+                    estimated_duration = words * 0.3
+                    seg_start = seg_end - estimated_duration
+
+            if seg_start is not None and seg_end is not None:
+                duration = max(seg_end - seg_start, min_duration)
+                if segment.segment_type == "dialogue":
+                    supervisions.append(
+                        Supervision(
+                            text=segment.text,
+                            start=seg_start,
+                            duration=duration,
+                            id=f"segment_{i:05d}",
+                            speaker=segment.speaker,
+                        )
+                    )
+                prev_end_time = seg_start + duration
 
         # Optionally merge consecutive segments from same speaker
         if merge_consecutive:
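When a segment carries only one of the two times, the missing one is estimated from neighbouring segments or from a rough speaking-rate heuristic of 0.3 seconds per word. A standalone illustration of that heuristic (illustrative only; the default min_duration value is an assumption):

def estimate_bounds(text, start=None, end=None, prev_end=0.0, min_duration=0.5):
    """Rough sketch of the start/end inference used above (0.3 s per word)."""
    words = len(text.split())
    approx = words * 0.3
    if start is not None and end is None:
        end = start + approx                                   # only start known: extend by estimated duration
    elif end is not None and start is None:
        start = prev_end if prev_end > 0 else end - approx     # only end known: back off from it
    duration = max(end - start, min_duration)
    return start, duration

print(estimate_bounds("three words here", start=10.0))             # (10.0, 0.9)
print(estimate_bounds("a much longer spoken sentence", end=42.0))  # (40.5, 1.5)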
lattifai/cli/transcribe.py
CHANGED

@@ -108,12 +108,7 @@ def transcribe(
     is_url = media_config.is_input_remote()
 
     # Prepare output paths
-
-        # For URLs, use output_dir from media_config or current directory
-        output_path = media_config.output_dir
-    else:
-        # For files, use input path directory
-        output_path = Path(media_config.input_path).parent
+    output_dir = media_config.output_dir or Path(media_config.input_path).parent
 
     # Create transcriber
     if not transcription_config.lattice_model_path:
@@ -140,7 +135,7 @@ def transcribe(
         input_path = asyncio.run(
             downloader.download_media(
                 url=media_config.input_path,
-                output_dir=str(
+                output_dir=str(output_dir),
                 media_format=media_config.normalize_format(),
                 force_overwrite=media_config.force_overwrite,
             )
@@ -167,7 +162,7 @@ def transcribe(
     if is_url:
         # For URLs, generate output filename based on transcriber
         output_format = transcriber.file_suffix.lstrip(".")
-        final_output =
+        final_output = output_dir / f"youtube_LattifAI_{transcriber.name}.{output_format}"
     else:
         # For files, use input filename with suffix
         final_output = Path(media_config.input_path).with_suffix(".LattifAI.srt")
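Both the download step and the final output filename now resolve against a single output_dir, which falls back to the input file's directory when no explicit output directory is configured. Schematically (paths below are made up):

from pathlib import Path

configured_output_dir = None                    # e.g. media_config.output_dir
input_path = "/data/podcasts/episode01.mp3"     # e.g. media_config.input_path

output_dir = configured_output_dir or Path(input_path).parent
print(output_dir)                               # /data/podcasts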
lattifai/client.py
CHANGED

@@ -126,38 +126,20 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         safe_print(colorful.cyan(f"🔄 Using segmented alignment strategy: {alignment_strategy}"))
 
         if caption.supervisions and alignment_strategy == "transcription":
-            # raise NotImplementedError("Transcription-based alignment is not yet implemented.")
-            assert (
-                "gemini" not in self.transcriber.name.lower()
-            ), "Transcription-based alignment is not supported with Gemini transcriber."
-            assert (
-                caption.supervisions
-            ), "Input caption should contain supervisions when using transcription-based alignment."
             if not caption.transcription:
-
-
-
-
-
-
-
-
-
-                # print(colorful.cyan(f"Reading existing transcription from {transcript_file}"))
-                transcript = self._read_caption(transcript_file, verbose=False)
-                caption.transcription = transcript.supervisions
-                caption.audio_events = transcript.audio_events
-
-            if not caption.transcription:
-                transcript = asyncio.run(
-                    self.transcriber.transcribe(media_audio, language=self.caption_config.source_lang)
-                )
-                caption.transcription = transcript.transcription
-                caption.audio_events = transcript.audio_events
+                transcript = self._transcribe(
+                    media_audio,
+                    source_lang=self.caption_config.source_lang,
+                    is_async=False,
+                    output_dir=Path(str(output_caption_path)).parent if output_caption_path else None,
+                )
+                caption.transcription = transcript.supervisions or transcript.transcription
+                caption.audio_events = transcript.audio_events
+                assert caption.transcription, "Transcription is empty after transcription step."
 
             # Align caption.supervisions with transcription to get segments
             import regex
-            from error_align import
+            from error_align import error_align  # noqa: F401
             from error_align.utils import DELIMITERS, NUMERIC_TOKEN, STANDARD_TOKEN, OpType
 
             JOIN_TOKEN = "❄"
@@ -184,21 +166,82 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
                 )
             )
 
-
-
-                f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription),
-                tokenizer=custom_tokenizer,
-            )
+            if split_sentence or self.caption_config.split_sentence:
+                caption.supervisions = self.aligner.tokenizer.split_sentences(caption.supervisions)
 
-            for
-
-
+            ref = f"{JOIN_TOKEN}".join(sup.text for sup in caption.supervisions)
+            hyp = f"{JOIN_TOKEN}".join(sup.text for sup in caption.transcription)
+            alignments = error_align(ref, hyp, tokenizer=custom_tokenizer)
 
-
-
-
-
-
+            idx = 0
+            for k, align in enumerate(alignments):
+                if align.hyp == JOIN_TOKEN and align.op_type == OpType.MATCH:
+                    # safe_print(f"Segment {k}: JOIN_TOKEN detected, creating segment.")
+
+                    # Find first non-None ref_slice starting from idx
+                    ref_start = 0
+                    for i in range(idx, k + 1):
+                        if i < len(alignments) and alignments[i].ref_slice is not None:
+                            ref_start = alignments[i].ref_slice.start
+                            break
+
+                    # Find last non-None ref_slice up to current position
+                    ref_stop = len(ref)
+                    for i in range(k, idx - 1, -1):
+                        if i < len(alignments) and alignments[i].ref_slice is not None:
+                            ref_stop = alignments[i].ref_slice.stop
+                            break
+
+                    # Find first non-None hyp_slice starting from idx
+                    hyp_start = 0
+                    for i in range(idx, k + 1):
+                        if i < len(alignments) and alignments[i].hyp_slice is not None:
+                            hyp_start = alignments[i].hyp_slice.start
+                            break
+
+                    # Find last non-None hyp_slice up to current position
+                    hyp_stop = len(hyp)
+                    for i in range(k, idx - 1, -1):
+                        if i < len(alignments) and alignments[i].hyp_slice is not None:
+                            hyp_stop = alignments[i].hyp_slice.stop
+                            break
+
+                    safe_print(f"[REF]: {ref[ref_start:ref_stop]}")
+                    safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop]}\n")
+                    idx = k + 1
+
+            # last part - handle remaining alignments after last JOIN_TOKEN
+            if idx < len(alignments):
+                # Find first non-None ref_slice starting from idx
+                ref_start = 0
+                for i in range(idx, len(alignments)):
+                    if alignments[i].ref_slice is not None:
+                        ref_start = alignments[i].ref_slice.start
+                        break
+
+                # Find last non-None ref_slice from end
+                ref_stop = len(ref)
+                for i in range(len(alignments) - 1, idx - 1, -1):
+                    if alignments[i].ref_slice is not None:
+                        ref_stop = alignments[i].ref_slice.stop
+                        break
+
+                # Find first non-None hyp_slice starting from idx
+                hyp_start = 0
+                for i in range(idx, len(alignments)):
+                    if alignments[i].hyp_slice is not None:
+                        hyp_start = alignments[i].hyp_slice.start
+                        break
+
+                # Find last non-None hyp_slice from end
+                hyp_stop = len(hyp)
+                for i in range(len(alignments) - 1, idx - 1, -1):
+                    if alignments[i].hyp_slice is not None:
+                        hyp_stop = alignments[i].hyp_slice.stop
+                        break
+
+                safe_print(f"[REF]: {ref[ref_start:ref_stop + 1]}")
+                safe_print(f"[HYP]: {hyp[hyp_start:hyp_stop + 1]}\n")
 
             raise NotImplementedError("Transcription-based segmentation is not yet implemented.")
         else:
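The four nearly identical scans above just locate the first and last non-empty slice inside an alignment window. A hedged illustration of that window scan, using a stand-in record with the same ref_slice/hyp_slice attributes the diff relies on (not the actual error_align objects):

from dataclasses import dataclass
from typing import Optional

@dataclass
class Align:                      # stand-in for the error_align alignment records used above
    ref_slice: Optional[slice]
    hyp_slice: Optional[slice]

def window_span(alignments, lo, hi, attr, default_stop):
    """Return (start, stop) of the first/last non-None slice in alignments[lo:hi+1]."""
    start, stop = 0, default_stop
    for a in alignments[lo:hi + 1]:
        s = getattr(a, attr)
        if s is not None:
            start = s.start
            break
    for a in reversed(alignments[lo:hi + 1]):
        s = getattr(a, attr)
        if s is not None:
            stop = s.stop
            break
    return start, stop

aligns = [Align(slice(0, 5), slice(0, 4)), Align(None, None), Align(slice(6, 11), slice(5, 9))]
print(window_span(aligns, 0, 2, "ref_slice", default_stop=11))  # (0, 11)
print(window_span(aligns, 0, 2, "hyp_slice", default_stop=9))   # (0, 9)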
@@ -220,6 +263,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             )
 
         # align each segment
+        sr = media_audio.sampling_rate
         supervisions, alignments = [], []
         for i, (start, end, _supervisions, skipalign) in enumerate(segments, 1):
             print(
@@ -234,10 +278,8 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
 
             offset = round(start, 4)
             # Extract audio slice
-
-
-            ]
-            emission = self.aligner.emission(audio_slice_ndarray)
+            audio_slice = media_audio.ndarray[:, int(start * sr) : int(end * sr)]
+            emission = self.aligner.emission(audio_slice)
 
             # Align segment
             _supervisions, _alignments = self.aligner.alignment(
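The audio slice is now taken directly by sample index, converting the segment's start/end seconds with the sampling rate. A self-contained illustration (array shape and values are assumptions; the package's media_audio object is not used here):

import numpy as np

sr = 16000                               # sampling rate in Hz
audio = np.zeros((1, 10 * sr))           # 10 seconds of mono audio, shape (channels, samples)

start, end = 2.5, 4.0                    # segment boundaries in seconds
audio_slice = audio[:, int(start * sr):int(end * sr)]
print(audio_slice.shape)                 # (1, 24000) -> 1.5 s of samples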
@@ -368,7 +410,9 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
         # Step 1: Download media
         media_file = self._download_media_sync(url, output_dir, media_format, force_overwrite)
 
-        media_audio = self.audio_loader(
+        media_audio = self.audio_loader(
+            media_file, channel_selector=channel_selector, streaming_chunk_secs=streaming_chunk_secs
+        )
 
         # Step 2: Get or create captions (download or transcribe)
         caption = self._download_or_transcribe_caption(
@@ -393,7 +437,7 @@ class LattifAI(LattifAIClientMixin, SyncAPIClient):
             output_caption_path=output_caption_path,
             split_sentence=split_sentence,
             channel_selector=channel_selector,
-            streaming_chunk_secs=
+            streaming_chunk_secs=None,
         )
 
         return caption
lattifai/config/alignment.py
CHANGED

@@ -28,11 +28,11 @@ class AlignmentConfig:
     """Computation device: 'cpu' for CPU, 'cuda' for NVIDIA GPU, 'mps' for Apple Silicon."""
 
     batch_size: int = 1
-    """Batch size for inference (number of samples processed simultaneously)."""
+    """Batch size for inference (number of samples processed simultaneously, NotImplemented yet)."""
 
     # Segmented Alignment for Long Audio
     trust_caption_timestamps: bool = False
-    """When True, use original caption timestamps as strong reference constraints during alignment.
+    """When True, use original caption.supervisions' timestamps as strong reference constraints during alignment.
     The alignment process will still adjust timestamps but stay close to the input timing.
     Use this when you want to re-segment caption sentence boundaries (caption.split_sentence=True)
     while preserving the approximate timing from the original captions.
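trust_caption_timestamps keeps the aligner anchored to the incoming caption timing while still allowing sentence re-segmentation. A hedged usage sketch (only the fields visible in this hunk; the import path is inferred from the file location and other fields/defaults are assumptions):

# Illustrative only - field names taken from the hunk above.
from lattifai.config.alignment import AlignmentConfig

config = AlignmentConfig(
    device="cpu",                    # 'cpu', 'cuda', or 'mps'
    batch_size=1,                    # batching is not implemented yet
    trust_caption_timestamps=True,   # stay close to the original caption timing
)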
lattifai/mixin.py
CHANGED

@@ -290,12 +290,12 @@ class LattifAIClientMixin:
         diarization_file = Path(str(input_caption)).with_suffix(".SpkDiar")
         if diarization_file.exists():
             if verbose:
-                safe_print(colorful.cyan(f"📖
+                safe_print(colorful.cyan(f"📖 Step1b: Reading speaker diarization from {diarization_file}"))
             caption.read_speaker_diarization(diarization_file)
         events_file = Path(str(input_caption)).with_suffix(".AED")
         if events_file.exists():
             if verbose:
-                safe_print(colorful.cyan(f"📖
+                safe_print(colorful.cyan(f"📖 Step1c: Reading audio events from {events_file}"))
             from tgt import read_textgrid
 
             caption.audio_events = read_textgrid(events_file)
@@ -404,6 +404,14 @@ class LattifAIClientMixin:
         # Transcription mode: use Transcriber to transcribe
         self._validate_transcription_setup()
 
+        if output_dir:
+            # Generate transcript file path
+            transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
+            if transcript_file.exists():
+                safe_print(colorful.cyan(f" Using existing transcript file: {transcript_file}"))
+                transcription = self._read_caption(transcript_file, normalize_text=False)
+                return transcription
+
         safe_print(colorful.cyan(f"🎤 Transcribing({self.transcriber.name}) media: {str(media_file)} ..."))
         transcription = await self.transcriber.transcribe_file(media_file, language=source_lang)
         safe_print(colorful.green(" ✓ Transcription completed."))
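With this change the transcription step becomes idempotent per output directory: if a transcript file with the expected name already exists, it is read back instead of re-transcribing. A simplified sketch of that check (file naming copied from the hunk; the read/transcribe calls are placeholders, not the package API):

from pathlib import Path

def get_transcript(media_file: Path, output_dir: Path, transcriber_file_name: str) -> str:
    """Reuse an existing transcript if present, otherwise produce and save one (placeholder logic)."""
    transcript_file = output_dir / f"{media_file.stem}_{transcriber_file_name}"
    if transcript_file.exists():
        return transcript_file.read_text(encoding="utf-8")    # stand-in for _read_caption(...)
    text = "...run the transcriber here..."                   # stand-in for transcriber.transcribe_file(...)
    transcript_file.write_text(text, encoding="utf-8")
    return text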
@@ -442,8 +450,6 @@ class LattifAIClientMixin:
             safe_print(colorful.yellow(f"First segment: {transcription.transcription[0].text}"))
 
         if output_dir:
-            # Generate transcript file path
-            transcript_file = output_dir / f"{Path(str(media_file)).stem}_{self.transcriber.file_name}"
             await asyncio.to_thread(self.transcriber.write, transcription, transcript_file, encoding="utf-8")
             safe_print(colorful.green(f" ✓ Transcription saved to: {transcript_file}"))
 