PyPI - mkv-episode-matcher - Versions diffs - 0.3.1__tar.gz → 0.3.3__tar.gz - Mend

mkv-episode-matcher 0.3.1 (tar.gz) → 0.3.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mkv-episode-matcher might be problematic. Click here for more details.

Files changed (52)

{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.coverage RENAMED Viewed

Binary file

{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mkv-episode-matcher
-Version: 0.3.1
+Version: 0.3.3
 Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
 Home-page: https://github.com/Jsakkos/mkv-episode-matcher
 Author: Jonathan Sakkos

mkv_episode_matcher-0.3.3/mkv_episode_matcher/episode_identification.py ADDED Viewed

@@ -0,0 +1,150 @@
+import json
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+import torch
+from rapidfuzz import fuzz
+from loguru import logger
+import whisper
+import numpy as np
+import re
+class EpisodeMatcher:
+    def __init__(self, cache_dir, show_name, min_confidence=0.6):
+        self.cache_dir = Path(cache_dir)
+        self.min_confidence = min_confidence
+        self.show_name = show_name
+        self.chunk_duration = 300  # 5 minutes
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
+        self.temp_dir.mkdir(exist_ok=True)
+    def clean_text(self, text):
+        text = text.lower().strip()
+        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
+        text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
+        return ' '.join(text.split())
+    def chunk_score(self, whisper_chunk, ref_chunk):
+        whisper_clean = self.clean_text(whisper_chunk)
+        ref_clean = self.clean_text(ref_chunk)
+        return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
+                fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0
+    def extract_audio_chunk(self, mkv_file, start_time):
+        """Extract a chunk of audio from MKV file."""
+        chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
+        if not chunk_path.exists():
+            cmd = [
+                'ffmpeg',
+                '-ss', str(start_time),
+                '-t', str(self.chunk_duration),
+                '-i', mkv_file,
+                '-vn',
+                '-acodec', 'pcm_s16le',
+                '-ar', '16000',
+                '-ac', '1',
+                str(chunk_path)
+            ]
+            subprocess.run(cmd, capture_output=True)
+        return str(chunk_path)
+    def load_reference_chunk(self, srt_file, chunk_idx):
+        """Load reference subtitles for a specific time chunk."""
+        chunk_start = chunk_idx * self.chunk_duration
+        chunk_end = chunk_start + self.chunk_duration
+        text_lines = []
+        with open(srt_file, 'r', encoding='utf-8') as f:
+            content = f.read().strip()
+        for block in content.split('\n\n'):
+            lines = block.split('\n')
+            if len(lines) < 3 or '-->' not in lines[1]:  # Skip malformed blocks
+                continue
+            try:
+                timestamp = lines[1]
+                text = ' '.join(lines[2:])
+                end_time = timestamp.split(' --> ')[1].strip()
+                hours, minutes, seconds = map(float, end_time.replace(',','.').split(':'))
+                total_seconds = hours * 3600 + minutes * 60 + seconds
+                if chunk_start <= total_seconds <= chunk_end:
+                    text_lines.append(text)
+            except (IndexError, ValueError):
+                continue
+        return ' '.join(text_lines)
+    def identify_episode(self, video_file, temp_dir, season_number):
+        try:
+            # Get video duration
+            duration = float(subprocess.check_output([
+                'ffprobe', '-v', 'error',
+                '-show_entries', 'format=duration',
+                '-of', 'default=noprint_wrappers=1:nokey=1',
+                video_file
+            ]).decode())
+            total_chunks = int(np.ceil(duration / self.chunk_duration))
+            # Load Whisper model
+            model = whisper.load_model("base", device=self.device)
+            # Get season-specific reference files
+            reference_dir = self.cache_dir / "data" / self.show_name
+            season_pattern = f"S{season_number:02d}E"
+            reference_files = [
+                f for f in reference_dir.glob("*.srt")
+                if season_pattern in f.name
+            ]
+            if not reference_files:
+                logger.error(f"No reference files found for season {season_number}")
+                return None
+            # Process chunks until match found
+            for chunk_idx in range(min(3, total_chunks)):  # Only try first 3 chunks
+                start_time = chunk_idx * self.chunk_duration
+                audio_path = self.extract_audio_chunk(video_file, start_time)
+                # Transcribe chunk
+                result = model.transcribe(
+                    audio_path,
+                    task="transcribe",
+                    language="en"
+                )
+                chunk_text = result["text"]
+                best_confidence = 0
+                best_match = None
+                # Compare with reference chunks
+                for ref_file in reference_files:
+                    ref_text = self.load_reference_chunk(ref_file, chunk_idx)
+                    confidence = self.chunk_score(chunk_text, ref_text)
+                    if confidence > best_confidence:
+                        best_confidence = confidence
+                        best_match = ref_file
+                    if confidence > self.min_confidence:
+                        season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
+                        if season_ep:
+                            season, episode = map(int, season_ep.groups())
+                            return {
+                                'season': season,
+                                'episode': episode,
+                                'confidence': best_confidence,
+                                'reference_file': str(best_match),
+                            }
+            return None
+        finally:
+            # Cleanup temp files
+            for file in self.temp_dir.glob("chunk_*.wav"):
+                file.unlink()

{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/episode_matcher.py RENAMED Viewed

@@ -5,7 +5,7 @@ import shutil
 import glob
 import os
 from loguru import logger
+import re
 from mkv_episode_matcher.__main__ import CONFIG_FILE, CACHE_DIR
 from mkv_episode_matcher.config import get_config
 from mkv_episode_matcher.mkv_to_srt import convert_mkv_to_srt
@@ -17,20 +17,18 @@ from mkv_episode_matcher.utils import (
     get_subtitles,
     process_reference_srt_files,
     process_srt_files,
-    compare_and_rename_files,get_valid_seasons
+    compare_and_rename_files,get_valid_seasons,rename_episode_file
 )
 from mkv_episode_matcher.speech_to_text import process_speech_to_text
 from mkv_episode_matcher.episode_identification import EpisodeMatcher
 def process_show(season=None, dry_run=False, get_subs=False):
-    """Process the show using both speech recognition and OCR fallback."""
+    """Process the show using streaming speech recognition with OCR fallback."""
     config = get_config(CONFIG_FILE)
     show_dir = config.get("show_dir")
+    show_name = clean_text(os.path.basename(show_dir))
+    matcher = EpisodeMatcher(CACHE_DIR, show_name)
-    # Initialize episode matcher
-    matcher = EpisodeMatcher(CACHE_DIR)
-    # Get valid season directories
     season_paths = get_valid_seasons(show_dir)
     if not season_paths:
         logger.warning(f"No seasons with .mkv files found")
@@ -43,9 +41,7 @@ def process_show(season=None, dry_run=False, get_subs=False):
             return
         season_paths = [season_path]
-    # Process each season
     for season_path in season_paths:
-        # Get MKV files that haven't been processed
         mkv_files = [f for f in glob.glob(os.path.join(season_path, "*.mkv"))
                     if not check_filename(f)]
@@ -53,66 +49,52 @@ def process_show(season=None, dry_run=False, get_subs=False):
             logger.info(f"No new files to process in {season_path}")
             continue
-        # Create temp directories
+        season_num = int(re.search(r'Season (\d+)', season_path).group(1))
         temp_dir = Path(season_path) / "temp"
         ocr_dir = Path(season_path) / "ocr"
         temp_dir.mkdir(exist_ok=True)
         ocr_dir.mkdir(exist_ok=True)
         try:
-            # Download subtitles if requested
             if get_subs:
-                show_id = fetch_show_id(matcher.series_name)
+                show_id = fetch_show_id(matcher.show_name)
                 if show_id:
-                    seasons = {int(os.path.basename(p).split()[-1]) for p in season_paths}
-                    get_subtitles(show_id, seasons=seasons)
+                    get_subtitles(show_id, seasons={season_num})
             unmatched_files = []
-            # First pass: Try speech recognition matching
             for mkv_file in mkv_files:
                 logger.info(f"Attempting speech recognition match for {mkv_file}")
+                match = matcher.identify_episode(mkv_file, temp_dir, season_num)
-                # Extract audio and run speech recognition
-                process_speech_to_text(mkv_file, str(temp_dir))
-                match = matcher.identify_episode(mkv_file, temp_dir)
-                if match and match['confidence'] >= matcher.min_confidence:
-                    # Rename the file
-                    new_name = f"{matcher.series_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
+                if match:
+                    new_name = f"{matcher.show_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
                     new_path = os.path.join(season_path, new_name)
                     logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
                               f"(confidence: {match['confidence']:.2f})")
                     if not dry_run:
-                        os.rename(mkv_file, new_path)
+                        logger.info(f"Renaming {mkv_file} to {new_name}")
+                        rename_episode_file(mkv_file, new_name)
                 else:
-                    logger.info(f"Speech recognition match failed for {mkv_file}, will try OCR")
+                    logger.info(f"Speech recognition match failed for {mkv_file}, trying OCR")
                     unmatched_files.append(mkv_file)
-            # Second pass: Try OCR for unmatched files
+            # OCR fallback for unmatched files
             if unmatched_files:
                 logger.info(f"Attempting OCR matching for {len(unmatched_files)} unmatched files")
-                # Convert files to SRT using OCR
                 convert_mkv_to_srt(season_path, unmatched_files)
-                # Process OCR results
-                reference_text_dict = process_reference_srt_files(matcher.series_name)
+                reference_text_dict = process_reference_srt_files(matcher.show_name)
                 srt_text_dict = process_srt_files(str(ocr_dir))
-                # Compare and rename
                 compare_and_rename_files(
                     srt_text_dict,
                     reference_text_dict,
                     dry_run=dry_run,
-                    min_confidence=0.1  # Lower threshold for OCR
                 )
         finally:
-            # Cleanup
             if not dry_run:
                 shutil.rmtree(temp_dir)
                 cleanup_ocr_files(show_dir)

{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/utils.py RENAMED Viewed

@@ -117,8 +117,10 @@ def rename_episode_file(original_file_path, new_filename):
     except OSError as e:
         logger.error(f"Failed to rename file: {e}")
         return None
+    except FileExistsError as e:
+        logger.error(f"Failed to rename file: {e}")
+        return None
 def get_subtitles(show_id, seasons: set[int]):
     """
     Retrieves and saves subtitles for a given TV show and seasons.
@@ -233,9 +235,7 @@ def clean_text(text):
     cleaned_text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)
     # Strip leading/trailing whitespace
     return cleaned_text.strip()
-# mkv_episode_matcher/utils.py
-# Add this to your existing utils.py, keeping all other functions
 def process_reference_srt_files(series_name):
     """
@@ -357,12 +357,9 @@ def compare_and_rename_files(srt_files, reference_files, dry_run=False):
                 logger.info(f"Matching lines: {matching_lines}")
                 logger.info(f"Found matching file: {mkv_file} ->{reference}")
                 new_filename = os.path.join(parent_dir, reference)
-                if not os.path.exists(new_filename):
-                    if os.path.exists(mkv_file) and not dry_run:
-                        logger.info(f"Renaming {mkv_file} to {new_filename}")
-                        os.rename(mkv_file, new_filename)
-                else:
-                    logger.info(f"File {new_filename} already exists, skipping")
+                if not dry_run:
+                    logger.info(f"Renaming {mkv_file} to {new_filename}")
+                    rename_episode_file(mkv_file, new_filename)
 def compare_text(text1, text2):
     """

{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mkv-episode-matcher
-Version: 0.3.1
+Version: 0.3.3
 Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
 Home-page: https://github.com/Jsakkos/mkv-episode-matcher
 Author: Jonathan Sakkos

{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/SOURCES.txt RENAMED Viewed

@@ -46,4 +46,5 @@ mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py
 mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py
 mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py
 tests/__init__.py
-tests/test_improvements.py
+tests/test_improvements.py
+tests/test_main.py

{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = mkv_episode_matcher
-version = 0.3.1
+version = 0.3.3
 author = Jonathan Sakkos
 author_email = jonathansakkos@gmail.com
 description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.

mkv_episode_matcher-0.3.3/tests/test_main.py ADDED Viewed

@@ -0,0 +1,137 @@
+import pytest
+import os
+import shutil
+from pathlib import Path
+from unittest.mock import Mock, patch, mock_open
+from mkv_episode_matcher.episode_matcher import process_show
+from mkv_episode_matcher.utils import (
+    get_valid_seasons,
+    check_filename,
+    rename_episode_file,
+    clean_text,
+    extract_season_episode
+)
+from mkv_episode_matcher.episode_identification import EpisodeMatcher
+from mkv_episode_matcher.config import get_config, set_config
+@pytest.fixture
+def temp_show_dir(tmp_path):
+    show_dir = tmp_path / "Test Show"
+    show_dir.mkdir()
+    season_dir = show_dir / "Season 1"
+    season_dir.mkdir()
+    (season_dir / "episode1.mkv").touch()
+    (season_dir / "episode2.mkv").touch()
+    return show_dir
+@pytest.fixture
+def mock_config():
+    return {
+        "tmdb_api_key": "test_key",
+        "show_dir": "/test/path",
+        "max_threads": 4,
+        "open_subtitles_api_key": "test_key",
+        "open_subtitles_user_agent": "test_agent",
+        "open_subtitles_username": "test_user",
+        "open_subtitles_password": "test_pass",
+        "tesseract_path": "/test/tesseract"
+    }
+class TestUtilities:
+    def test_get_valid_seasons(self, temp_show_dir):
+        seasons = get_valid_seasons(str(temp_show_dir))
+        assert len(seasons) == 1
+        assert str(temp_show_dir / "Season 1") in seasons
+    def test_check_filename(self):
+        assert check_filename("Show - S01E02.mkv") == True
+        assert check_filename("random_file.mkv") == False
+    def test_rename_episode_file(self, temp_show_dir):
+        original = temp_show_dir / "Season 1" / "episode1.mkv"
+        new_name = "Show - S01E01.mkv"
+        result = rename_episode_file(str(original), new_name)
+        assert result is not None
+        assert Path(result).name == new_name
+    def test_clean_text(self):
+        text = "Test [action] (note) {tag}"
+        assert clean_text(text) == "Test"
+    def test_extract_season_episode(self):
+        filename = "Show - S01E02.mkv"
+        season, episode = extract_season_episode(filename)
+        assert season == 1
+        assert episode == 2
+class TestConfiguration:
+    def test_set_config(self, tmp_path, mock_config):
+        config_file = tmp_path / "config.ini"
+        set_config(
+            mock_config["tmdb_api_key"],
+            mock_config["open_subtitles_api_key"],
+            mock_config["open_subtitles_user_agent"],
+            mock_config["open_subtitles_username"],
+            mock_config["open_subtitles_password"],
+            mock_config["show_dir"],
+            str(config_file),
+            mock_config["tesseract_path"]
+        )
+        assert config_file.exists()
+    def test_get_config(self, tmp_path, mock_config):
+        config_file = tmp_path / "config.ini"
+        set_config(
+            mock_config["tmdb_api_key"],
+            mock_config["open_subtitles_api_key"],
+            mock_config["open_subtitles_user_agent"],
+            mock_config["open_subtitles_username"],
+            mock_config["open_subtitles_password"],
+            mock_config["show_dir"],
+            str(config_file),
+            mock_config["tesseract_path"]
+        )
+        config = get_config(str(config_file))
+        assert config["tmdb_api_key"] == mock_config["tmdb_api_key"]
+        assert config["show_dir"] == mock_config["show_dir"]
+class TestEpisodeMatcher:
+    @pytest.fixture
+    def matcher(self, tmp_path):
+        return EpisodeMatcher(tmp_path, "Test Show")
+    def test_clean_text(self, matcher):
+        text = "Test [action] <tag> T-t-test"
+        assert matcher.clean_text(text) == "test action tag test"
+    def test_chunk_score(self, matcher):
+        score = matcher.chunk_score("Test dialogue", "test dialog")
+        assert 0 <= score <= 1
+    @patch('subprocess.run')
+    def test_extract_audio_chunk(self, mock_run, matcher, tmp_path):
+        mkv_file = tmp_path / "test.mkv"
+        mkv_file.touch()
+        chunk = matcher.extract_audio_chunk(str(mkv_file), 0)
+        assert isinstance(chunk, str)
+        assert mock_run.called
+class TestProcessShow:
+    @patch('mkv_episode_matcher.episode_matcher.get_valid_seasons')
+    @patch('mkv_episode_matcher.episode_matcher.get_config')
+    def test_process_show_no_seasons(self, mock_config, mock_seasons, mock_config_data):
+        mock_seasons.return_value = []
+        mock_config.return_value = mock_config_data
+        process_show()
+        mock_seasons.assert_called_once()
+    @patch('mkv_episode_matcher.episode_matcher.get_valid_seasons')
+    @patch('mkv_episode_matcher.episode_matcher.get_config')
+    def test_process_show_with_season(self, mock_config, mock_seasons, temp_show_dir, mock_config_data):
+        mock_seasons.return_value = [str(temp_show_dir / "Season 1")]
+        mock_config.return_value = mock_config_data
+        process_show(season=1)
+        mock_seasons.assert_called_once()
+if __name__ == '__main__':
+    pytest.main(['-v'])

{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/uv.lock RENAMED Viewed

@@ -374,7 +374,7 @@ wheels = [
 [[package]]
 name = "mkv-episode-matcher"
-version = "0.3.0.post1.dev4+g3a186f5.d20241126"
+version = "0.3.2.post1.dev0+g2c513fa.d20241126"
 source = { editable = "." }
 dependencies = [
     { name = "configparser" },

mkv_episode_matcher-0.3.1/mkv_episode_matcher/episode_identification.py DELETED Viewed

@@ -1,208 +0,0 @@
-# mkv_episode_matcher/episode_identification.py
-import os
-import glob
-from pathlib import Path
-from rapidfuzz import fuzz
-from collections import defaultdict
-import re
-from loguru import logger
-import json
-import shutil
-class EpisodeMatcher:
-    def __init__(self, cache_dir, min_confidence=0.6):
-        self.cache_dir = Path(cache_dir)
-        self.min_confidence = min_confidence
-        self.whisper_segments = None
-        self.series_name = None
-    def clean_text(self, text):
-        """Clean text by removing stage directions and normalizing repeated words."""
-        # Remove stage directions like [groans] and <i>SHIP:</i>
-        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
-        # Remove repeated words with dashes (e.g., "Y-y-you" -> "you")
-        text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
-        # Remove multiple spaces
-        text = ' '.join(text.split())
-        return text.lower()
-    def chunk_score(self, whisper_chunk, ref_chunk):
-        """Calculate fuzzy match score between two chunks of text."""
-        whisper_clean = self.clean_text(whisper_chunk)
-        ref_clean = self.clean_text(ref_chunk)
-        # Use token sort ratio to handle word order differences
-        token_sort = fuzz.token_sort_ratio(whisper_clean, ref_clean)
-        # Use partial ratio to catch substring matches
-        partial = fuzz.partial_ratio(whisper_clean, ref_clean)
-        # Weight token sort more heavily but consider partial matches
-        return (token_sort * 0.7 + partial * 0.3) / 100.0
-    def identify_episode(self, video_file, temp_dir):
-        """Identify which episode matches this video file."""
-        # Get series name from parent directory
-        self.series_name = Path(video_file).parent.parent.name
-        # Load whisper transcript if not already processed
-        segments_file = Path(temp_dir) / f"{Path(video_file).stem}.segments.json"
-        if not segments_file.exists():
-            logger.error(f"No transcript found for {video_file}. Run speech recognition first.")
-            return None
-        with open(segments_file) as f:
-            self.whisper_segments = json.load(f)
-        # Get reference directory for this series
-        reference_dir = self.cache_dir / "data" / self.series_name
-        if not reference_dir.exists():
-            logger.error(f"No reference files found for {self.series_name}")
-            return None
-        # Match against reference files
-        match = self.match_all_references(reference_dir)
-        if match and match['confidence'] >= self.min_confidence:
-            # Extract season and episode from filename
-            match_file = Path(match['file'])
-            season_ep = re.search(r'S(\d+)E(\d+)', match_file.stem)
-            if season_ep:
-                season, episode = map(int, season_ep.groups())
-                return {
-                    'season': season,
-                    'episode': episode,
-                    'confidence': match['confidence'],
-                    'reference_file': str(match_file),
-                    'chunk_scores': match['chunk_scores']
-                }
-        return None
-    def match_all_references(self, reference_dir):
-        """Process all reference files and track matching scores."""
-        results = defaultdict(list)
-        best_match = None
-        best_confidence = 0
-        def process_chunks(ref_segments, filename):
-            nonlocal best_match, best_confidence
-            chunk_size = 300  # 5 minute chunks
-            whisper_chunks = defaultdict(list)
-            ref_chunks = defaultdict(list)
-            # Group segments into time chunks
-            for seg in self.whisper_segments:
-                chunk_idx = int(float(seg['start']) // chunk_size)
-                whisper_chunks[chunk_idx].append(seg['text'])
-            for seg in ref_segments:
-                chunk_idx = int(seg['start'] // chunk_size)
-                ref_chunks[chunk_idx].append(seg['text'])
-            # Score each chunk
-            for chunk_idx in whisper_chunks:
-                whisper_text = ' '.join(whisper_chunks[chunk_idx])
-                # Look for matching reference chunk and adjacent chunks
-                scores = []
-                for ref_idx in range(max(0, chunk_idx-1), chunk_idx+2):
-                    if ref_idx in ref_chunks:
-                        ref_text = ' '.join(ref_chunks[ref_idx])
-                        score = self.chunk_score(whisper_text, ref_text)
-                        scores.append(score)
-                if scores:
-                    chunk_confidence = max(scores)
-                    logger.info(f"File: {filename}, "
-                              f"Time: {chunk_idx*chunk_size}-{(chunk_idx+1)*chunk_size}s, "
-                              f"Confidence: {chunk_confidence:.2f}")
-                    results[filename].append({
-                        'chunk_idx': chunk_idx,
-                        'confidence': chunk_confidence
-                    })
-                    # Early exit if we find a very good match
-                    if chunk_confidence > self.min_confidence:
-                        chunk_scores = results[filename]
-                        confidence = sum(c['confidence'] * (0.9 ** c['chunk_idx'])
-                                      for c in chunk_scores) / len(chunk_scores)
-                        if confidence > best_confidence:
-                            best_confidence = confidence
-                            best_match = {
-                                'file': filename,
-                                'confidence': confidence,
-                                'chunk_scores': chunk_scores
-                            }
-                        return True
-            return False
-        # Process each reference file
-        for ref_file in glob.glob(os.path.join(reference_dir, "*.srt")):
-            ref_segments = self.parse_srt_to_segments(ref_file)
-            filename = os.path.basename(ref_file)
-            if process_chunks(ref_segments, filename):
-                break
-        # If no early match found, find best overall match
-        if not best_match:
-            for filename, chunks in results.items():
-                # Weight earlier chunks more heavily
-                confidence = sum(c['confidence'] * (0.9 ** c['chunk_idx'])
-                               for c in chunks) / len(chunks)
-                if confidence > best_confidence:
-                    best_confidence = confidence
-                    best_match = {
-                        'file': filename,
-                        'confidence': confidence,
-                        'chunk_scores': chunks
-                    }
-        return best_match
-    def parse_srt_to_segments(self, srt_file):
-        """Parse SRT file into list of segments with start/end times and text."""
-        segments = []
-        current_segment = {}
-        with open(srt_file, 'r', encoding='utf-8') as f:
-            lines = f.readlines()
-        i = 0
-        while i < len(lines):
-            line = lines[i].strip()
-            if line.isdigit():  # Index
-                if current_segment:
-                    segments.append(current_segment)
-                current_segment = {}
-            elif '-->' in line:  # Timestamp
-                start, end = line.split(' --> ')
-                current_segment['start'] = self.timestr_to_seconds(start)
-                current_segment['end'] = self.timestr_to_seconds(end)
-            elif line:  # Text
-                if 'text' in current_segment:
-                    current_segment['text'] += ' ' + line
-                else:
-                    current_segment['text'] = line
-            i += 1
-        if current_segment:
-            segments.append(current_segment)
-        return segments
-    def timestr_to_seconds(self, timestr):
-        """Convert SRT timestamp to seconds."""
-        h, m, s = timestr.replace(',','.').split(':')
-        return float(h) * 3600 + float(m) * 60 + float(s)