mkv-episode-matcher 0.4.5__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mkv-episode-matcher might be problematic.
- mkv_episode_matcher/__init__.py +2 -2
- mkv_episode_matcher/__main__.py +14 -29
- mkv_episode_matcher/config.py +0 -3
- mkv_episode_matcher/episode_identification.py +222 -136
- mkv_episode_matcher/episode_matcher.py +19 -42
- mkv_episode_matcher/subtitle_utils.py +26 -25
- mkv_episode_matcher/utils.py +61 -54
- {mkv_episode_matcher-0.4.5.dist-info → mkv_episode_matcher-0.6.0.dist-info}/METADATA +7 -13
- mkv_episode_matcher-0.6.0.dist-info/RECORD +14 -0
- {mkv_episode_matcher-0.4.5.dist-info → mkv_episode_matcher-0.6.0.dist-info}/WHEEL +1 -1
- mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
- mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
- mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
- mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
- mkv_episode_matcher/mkv_to_srt.py +0 -302
- mkv_episode_matcher/speech_to_text.py +0 -96
- mkv_episode_matcher-0.4.5.dist-info/RECORD +0 -26
- {mkv_episode_matcher-0.4.5.dist-info → mkv_episode_matcher-0.6.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.4.5.dist-info → mkv_episode_matcher-0.6.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/__init__.py
CHANGED
@@ -1,9 +1,9 @@
 """MKV Episode Matcher package."""
-from importlib.metadata import PackageNotFoundError, version
+
+from importlib.metadata import PackageNotFoundError, version
 
 try:
     __version__ = version("mkv-episode-matcher")
 except PackageNotFoundError:
     # package is not installed
     __version__ = "unknown"
-
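The rewritten __init__.py keeps version resolution in importlib.metadata. A minimal usage sketch (the printed value is illustrative):

    # Reading the resolved package version after installation
    from mkv_episode_matcher import __version__

    print(__version__)  # e.g. "0.6.0"; falls back to "unknown" if metadata is absent
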
mkv_episode_matcher/__main__.py
CHANGED
@@ -1,9 +1,9 @@
 # __main__.py
 import argparse
 import os
-import sys
 
 from loguru import logger
+
 from mkv_episode_matcher import __version__
 from mkv_episode_matcher.config import get_config, set_config
 
@@ -34,7 +34,7 @@ if not os.path.exists(log_dir):
 logger.add(
     os.path.join(log_dir, "stdout.log"),
     format="{time} {level} {message}",
-    level="
+    level="INFO",
     rotation="10 MB",
 )
 
@@ -56,7 +56,6 @@ def main():
     --season: The season number to be processed. If not provided, all seasons will be processed.
     --dry-run: A boolean flag indicating whether to perform a dry run (i.e., not rename any files). If not provided, the function will rename files.
     --get-subs: A boolean flag indicating whether to download subtitles for the show. If not provided, the function will not download subtitles.
-    --tesseract-path: The path to the tesseract executable. If not provided, the function will try to get it from the cache or prompt the user to input it.
 
     The function logs its progress to two separate log files: one for standard output and one for errors.
     """
@@ -67,7 +66,7 @@ def main():
         "--version",
         action="version",
         version=f"%(prog)s {__version__}",
-        help="Show the version number and exit"
+        help="Show the version number and exit",
     )
     parser.add_argument("--tmdb-api-key", help="TMDb API key")
     parser.add_argument("--show-dir", help="Main directory of the show")
@@ -92,13 +91,6 @@ def main():
         nargs="?",
         help="Download subtitles for the show (default: None)",
     )
-    parser.add_argument(
-        "--tesseract-path",
-        type=str,
-        default=None,
-        nargs="?",
-        help="Path to the tesseract executable (default: None)",
-    )
     parser.add_argument(
         "--check-gpu",
         type=bool,
@@ -108,7 +100,8 @@ def main():
     )
     args = parser.parse_args()
     if args.check_gpu:
-        from mkv_episode_matcher.
+        from mkv_episode_matcher.utils import check_gpu_support
+
         check_gpu_support()
         return
     logger.debug(f"Command-line arguments: {args}")
@@ -118,17 +111,17 @@ def main():
 
     # Get TMDb API key
     tmdb_api_key = args.tmdb_api_key or config.get("tmdb_api_key")
-
-    tmdb_api_key = input("Enter your TMDb API key: ")
-    logger.debug(f"TMDb API Key: {tmdb_api_key}")
-
+
     logger.debug("Getting OpenSubtitles API key")
     open_subtitles_api_key = config.get("open_subtitles_api_key")
     open_subtitles_user_agent = config.get("open_subtitles_user_agent")
     open_subtitles_username = config.get("open_subtitles_username")
     open_subtitles_password = config.get("open_subtitles_password")
-
+
     if args.get_subs:
+        if not tmdb_api_key:
+            tmdb_api_key = input("Enter your TMDb API key: ")
+        logger.debug(f"TMDb API Key: {tmdb_api_key}")
         if not open_subtitles_api_key:
             open_subtitles_api_key = input("Enter your OpenSubtitles API key: ")
         if not open_subtitles_user_agent:
@@ -137,24 +130,17 @@ def main():
             open_subtitles_username = input("Enter your OpenSubtitles Username: ")
         if not open_subtitles_password:
             open_subtitles_password = input("Enter your OpenSubtitles Password: ")
-
-    # Use config for show directory
+
+    # Use config for show directory
     show_dir = args.show_dir or config.get("show_dir")
     if not show_dir:
        show_dir = input("Enter the main directory of the show:")
     logger.info(f"Show Directory: {show_dir}")
     if not show_dir:
         show_dir = os.getcwd()
-
-    if not args.tesseract_path:
-        tesseract_path = config.get("tesseract_path")
-        if not tesseract_path:
-            tesseract_path = input(r"Enter the path to the tesseract executable: ['C:\Program Files\Tesseract-OCR\tesseract.exe']")
-    else:
-        tesseract_path = args.tesseract_path
-    logger.debug(f"Teesseract Path: {tesseract_path}")
+
     logger.debug(f"Show Directory: {show_dir}")
-
+
     # Set the configuration
     set_config(
         tmdb_api_key,
@@ -164,7 +150,6 @@ def main():
         open_subtitles_password,
         show_dir,
         CONFIG_FILE,
-        tesseract_path=tesseract_path,
     )
     logger.info("Configuration set")
 
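With the Tesseract/OCR options removed, the CLI surface is reduced to the TMDb/OpenSubtitles settings and the show directory. A hedged invocation sketch (the console-script name is not visible in this diff, so the module entry point is used; the path and key are placeholders):

    # Hypothetical invocation of the updated CLI via the package's __main__ module.
    # Flag names are taken from the argparse definitions above; values are placeholders.
    import subprocess

    subprocess.run(
        [
            "python", "-m", "mkv_episode_matcher",
            "--show-dir", "/media/TV/Some Show",
            "--tmdb-api-key", "YOUR_TMDB_KEY",
        ],
        check=True,
    )
    # Version check only: python -m mkv_episode_matcher --version
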
mkv_episode_matcher/config.py
CHANGED
@@ -27,7 +27,6 @@ def set_config(
     open_subtitles_password,
     show_dir,
     file,
-    tesseract_path=None,
 ):
     """
     Sets the configuration values and writes them to a file.
@@ -40,7 +39,6 @@ def set_config(
         open_subtitles_password (str): The password for OpenSubtitles.
         show_dir (str): The directory where the TV show episodes are located.
         file (str): The path to the configuration file.
-        tesseract_path (str, optional): The path to the Tesseract OCR executable.
 
     Returns:
         None
@@ -54,7 +52,6 @@ def set_config(
         "open_subtitles_user_agent": str(open_subtitles_user_agent),
         "open_subtitles_username": str(open_subtitles_username),
         "open_subtitles_password": str(open_subtitles_password),
-        "tesseract_path": str(tesseract_path),
     }
     logger.info(
         f"Setting config with API:{tmdb_api_key}, show_dir: {show_dir}, and max_threads: {MAX_THREADS}"
mkv_episode_matcher/episode_identification.py
CHANGED

@@ -1,54 +1,62 @@
-import
-import os
+import re
 import subprocess
 import tempfile
 from pathlib import Path
+
+import chardet
+import numpy as np
 import torch
-from rapidfuzz import fuzz
-from loguru import logger
 import whisper
-import numpy as np
-import re
-from pathlib import Path
-import chardet
 from loguru import logger
+from rapidfuzz import fuzz
+
 
 class EpisodeMatcher:
     def __init__(self, cache_dir, show_name, min_confidence=0.6):
         self.cache_dir = Path(cache_dir)
         self.min_confidence = min_confidence
         self.show_name = show_name
-        self.chunk_duration =
+        self.chunk_duration = 30
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
-
+
     def clean_text(self, text):
         text = text.lower().strip()
-        text = re.sub(r
-        text = re.sub(r
-        return
+        text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
+        text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
+        return " ".join(text.split())
 
     def chunk_score(self, whisper_chunk, ref_chunk):
         whisper_clean = self.clean_text(whisper_chunk)
         ref_clean = self.clean_text(ref_chunk)
-        return (
-
+        return (
+            fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7
+            + fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3
+        ) / 100.0
 
     def extract_audio_chunk(self, mkv_file, start_time):
         """Extract a chunk of audio from MKV file."""
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
-
-
-
-
-
-
-
-
-
+                "ffmpeg",
+                "-ss",
+                str(start_time),
+                "-t",
+                str(self.chunk_duration),
+                "-i",
+                mkv_file,
+                "-vn",  # Disable video
+                "-sn",  # Disable subtitles
+                "-dn",  # Disable data streams
+                "-acodec",
+                "pcm_s16le",
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                str(chunk_path),
             ]
             subprocess.run(cmd, capture_output=True)
         return str(chunk_path)
@@ -56,227 +64,305 @@ class EpisodeMatcher:
     def load_reference_chunk(self, srt_file, chunk_idx):
         """
         Load reference subtitles for a specific time chunk with robust encoding handling.
-
+
         Args:
             srt_file (str or Path): Path to the SRT file
             chunk_idx (int): Index of the chunk to load
-
+
         Returns:
             str: Combined text from the subtitle chunk
         """
         chunk_start = chunk_idx * self.chunk_duration
         chunk_end = chunk_start + self.chunk_duration
-
+
         try:
             # Read the file content using our robust reader
             reader = SubtitleReader()
             content = reader.read_srt_file(srt_file)
-
+
             # Extract subtitles for the time chunk
             text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-            return
-
+
+            return " ".join(text_lines)
+
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
-            return
+            return ""
+
+    def _try_match_with_model(
+        self, video_file, model_name, max_duration, reference_files
+    ):
+        """
+        Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
+
+        Args:
+            video_file: Path to the video file
+            model_name: Name of the Whisper model to use
+            max_duration: Maximum duration in seconds to check
+            reference_files: List of reference subtitle files
+        """
+        # Use cached model
+        model = get_whisper_model(model_name, self.device)
+
+        # Calculate number of chunks to check (30 seconds each)
+        num_chunks = max_duration // self.chunk_duration
+
+        for chunk_idx in range(num_chunks):
+            start_time = chunk_idx * self.chunk_duration
+            logger.debug(f"Trying {model_name} model at {start_time} seconds")
+
+            audio_path = self.extract_audio_chunk(video_file, start_time)
+
+            result = model.transcribe(audio_path, task="transcribe", language="en")
+
+            chunk_text = result["text"]
+            best_confidence = 0
+            best_match = None
+
+            # Compare with reference chunks
+            for ref_file in reference_files:
+                ref_text = self.load_reference_chunk(ref_file, chunk_idx)
+                confidence = self.chunk_score(chunk_text, ref_text)
+
+                if confidence > best_confidence:
+                    best_confidence = confidence
+                    best_match = ref_file
+
+                if confidence > self.min_confidence:
+                    season_ep = re.search(r"S(\d+)E(\d+)", best_match.stem)
+                    if season_ep:
+                        season, episode = map(int, season_ep.groups())
+                        return {
+                            "season": season,
+                            "episode": episode,
+                            "confidence": best_confidence,
+                            "reference_file": str(best_match),
+                            "matched_at": start_time,
+                        }
+
+            logger.info(
+                f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})"
+            )
+
+        return None
 
     def identify_episode(self, video_file, temp_dir, season_number):
+        """Progressive episode identification with faster initial attempt."""
         try:
-            # Get
-            duration = float(subprocess.check_output([
-                'ffprobe', '-v', 'error',
-                '-show_entries', 'format=duration',
-                '-of', 'default=noprint_wrappers=1:nokey=1',
-                video_file
-            ]).decode())
-
-            total_chunks = int(np.ceil(duration / self.chunk_duration))
-
-            # Load Whisper model
-            model = whisper.load_model("base", device=self.device)
-
-            # Get season-specific reference files using multiple patterns
+            # Get reference files first
             reference_dir = self.cache_dir / "data" / self.show_name
-
-            # Create season patterns for different formats
             patterns = [
-                f"S{season_number:02d}E",
-                f"S{season_number}E",
-                f"{season_number:02d}x",
-                f"{season_number}x",
+                f"S{season_number:02d}E",
+                f"S{season_number}E",
+                f"{season_number:02d}x",
+                f"{season_number}x",
             ]
-
+
             reference_files = []
-
-
-
-
+            # TODO Figure our why patterns is not being used
+            for _pattern in patterns:
+                files = [
+                    f
+                    for f in reference_dir.glob("*.srt")
+                    if any(
+                        re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                    )
+                ]
                 reference_files.extend(files)
-
-            # Remove duplicates while preserving order
+
             reference_files = list(dict.fromkeys(reference_files))
-
+
             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
-
-
-
-
-
-
-
-
-
-
-
+            duration = float(
+                subprocess.check_output([
+                    "ffprobe",
+                    "-v",
+                    "error",
+                    "-show_entries",
+                    "format=duration",
+                    "-of",
+                    "default=noprint_wrappers=1:nokey=1",
+                    video_file,
+                ]).decode()
+            )
+
+            duration = int(np.ceil(duration))
+            # Try with tiny model first (fastest)
+            logger.info("Attempting match with tiny model...")
+            match = self._try_match_with_model(
+                video_file, "tiny", duration, reference_files
+            )
+            if (
+                match and match["confidence"] > 0.65
+            ):  # Slightly lower threshold for tiny
+                logger.info(
+                    f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                )
+                return match
+
+            # If no match, try base model
+            logger.info(
+                "No match in first 3 minutes, extending base model search to 10 minutes..."
+            )
+            match = self._try_match_with_model(
+                video_file, "base", duration, reference_files
+            )
+            if match:
+                logger.info(
+                    f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
                 )
-
-
-
-                best_match = None
-
-                # Compare with reference chunks
-                for ref_file in reference_files:
-                    ref_text = self.load_reference_chunk(ref_file, chunk_idx)
-                    confidence = self.chunk_score(chunk_text, ref_text)
-
-                    if confidence > best_confidence:
-                        best_confidence = confidence
-                        best_match = ref_file
-
-                    if confidence > self.min_confidence:
-                        season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
-                        if season_ep:
-                            season, episode = map(int, season_ep.groups())
-                            return {
-                                'season': season,
-                                'episode': episode,
-                                'confidence': best_confidence,
-                                'reference_file': str(best_match),
-                            }
-
+                return match
+
+            logger.info("Speech recognition match failed")
             return None
-
+
         finally:
             # Cleanup temp files
             for file in self.temp_dir.glob("chunk_*.wav"):
-
+                try:
+                    file.unlink()
+                except Exception as e:
+                    logger.warning(f"Failed to delete temp file {file}: {e}")
+
 
 def detect_file_encoding(file_path):
     """
     Detect the encoding of a file using chardet.
-
+
     Args:
         file_path (str or Path): Path to the file
-
+
     Returns:
         str: Detected encoding, defaults to 'utf-8' if detection fails
     """
     try:
-        with open(file_path,
+        with open(file_path, "rb") as f:
             raw_data = f.read()
             result = chardet.detect(raw_data)
-            encoding = result[
-            confidence = result[
-
-            logger.debug(
-
+            encoding = result["encoding"]
+            confidence = result["confidence"]
+
+            logger.debug(
+                f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}"
+            )
+            return encoding if encoding else "utf-8"
     except Exception as e:
         logger.warning(f"Error detecting encoding for {file_path}: {e}")
-        return
+        return "utf-8"
+
 
 def read_file_with_fallback(file_path, encodings=None):
     """
     Read a file trying multiple encodings in order of preference.
-
+
     Args:
         file_path (str or Path): Path to the file
         encodings (list): List of encodings to try, defaults to common subtitle encodings
-
+
     Returns:
         str: File contents
-
+
     Raises:
         ValueError: If file cannot be read with any encoding
     """
     if encodings is None:
         # First try detected encoding, then fallback to common subtitle encodings
         detected = detect_file_encoding(file_path)
-        encodings = [detected,
-
+        encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+
     file_path = Path(file_path)
     errors = []
-
+
     for encoding in encodings:
         try:
-            with open(file_path,
+            with open(file_path, encoding=encoding) as f:
                 content = f.read()
                 logger.debug(f"Successfully read {file_path} using {encoding} encoding")
                 return content
         except UnicodeDecodeError as e:
             errors.append(f"{encoding}: {str(e)}")
             continue
-
-    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+
+    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+        errors
+    )
     logger.error(error_msg)
     raise ValueError(error_msg)
 
+
 class SubtitleReader:
     """Helper class for reading and parsing subtitle files."""
-
+
     @staticmethod
     def parse_timestamp(timestamp):
         """Parse SRT timestamp into seconds."""
-        hours, minutes, seconds = timestamp.replace(
+        hours, minutes, seconds = timestamp.replace(",", ".").split(":")
         return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
-
+
     @staticmethod
     def read_srt_file(file_path):
         """
         Read an SRT file and return its contents with robust encoding handling.
-
+
         Args:
             file_path (str or Path): Path to the SRT file
-
+
         Returns:
             str: Contents of the SRT file
         """
         return read_file_with_fallback(file_path)
-
+
     @staticmethod
     def extract_subtitle_chunk(content, start_time, end_time):
         """
         Extract subtitle text for a specific time window.
-
+
         Args:
             content (str): Full SRT file content
             start_time (float): Chunk start time in seconds
             end_time (float): Chunk end time in seconds
-
+
         Returns:
             list: List of subtitle texts within the time window
         """
         text_lines = []
-
-        for block in content.strip().split(
-            lines = block.split(
-            if len(lines) < 3 or
+
+        for block in content.strip().split("\n\n"):
+            lines = block.split("\n")
+            if len(lines) < 3 or "-->" not in lines[1]:
                 continue
-
+
             try:
                 timestamp = lines[1]
-                text =
-
-                end_stamp = timestamp.split(
+                text = " ".join(lines[2:])
+
+                end_stamp = timestamp.split(" --> ")[1].strip()
                 total_seconds = SubtitleReader.parse_timestamp(end_stamp)
-
+
                 if start_time <= total_seconds <= end_time:
                     text_lines.append(text)
-
+
             except (IndexError, ValueError) as e:
                 logger.warning(f"Error parsing subtitle block: {e}")
                 continue
-
-        return text_lines
+
+        return text_lines
+
+
+_whisper_models = {}
+
+
+def get_whisper_model(model_name="tiny", device=None):
+    """Cache whisper models to avoid reloading."""
+    global _whisper_models
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    key = f"{model_name}_{device}"
+    if key not in _whisper_models:
+        _whisper_models[key] = whisper.load_model(model_name, device=device)
+        logger.info(f"Loaded {model_name} model on {device}")
+
+    return _whisper_models[key]