mkv-episode-matcher 0.1.13__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of mkv-episode-matcher has been flagged as a potentially problematic release.

@@ -10,7 +10,6 @@ from mkv_episode_matcher.config import get_config, set_config
 logger.info("Starting the application")
 
 
-
 # Check if the configuration directory exists, if not create it
 if not os.path.exists(os.path.join(os.path.expanduser("~"), ".mkv-episode-matcher")):
     os.makedirs(os.path.join(os.path.expanduser("~"), ".mkv-episode-matcher"))
@@ -31,10 +30,16 @@ if not os.path.exists(log_dir):
     os.mkdir(log_dir)
 
 # Add a new handler for stdout logs
-logger.add(os.path.join(log_dir,"stdout.log"), format="{time} {level} {message}", level="DEBUG", rotation="10 MB")
+logger.add(
+    os.path.join(log_dir, "stdout.log"),
+    format="{time} {level} {message}",
+    level="DEBUG",
+    rotation="10 MB",
+)
 
 # Add a new handler for error logs
-logger.add(os.path.join(log_dir,"stderr.log"), level="ERROR", rotation="10 MB")
+logger.add(os.path.join(log_dir, "stderr.log"), level="ERROR", rotation="10 MB")
+
 
 @logger.catch
 def main():
@@ -55,7 +60,6 @@ def main():
     The function logs its progress to two separate log files: one for standard output and one for errors.
     """
 
-
     # Parse command-line arguments
     parser = argparse.ArgumentParser(description="Process shows with TMDb API")
     parser.add_argument("--tmdb-api-key", help="TMDb API key")
@@ -0,0 +1,208 @@
+# mkv_episode_matcher/episode_identification.py
+
+import os
+import glob
+from pathlib import Path
+from rapidfuzz import fuzz
+from collections import defaultdict
+import re
+from loguru import logger
+import json
+import shutil
+
+class EpisodeMatcher:
+    def __init__(self, cache_dir, min_confidence=0.6):
+        self.cache_dir = Path(cache_dir)
+        self.min_confidence = min_confidence
+        self.whisper_segments = None
+        self.series_name = None
+
+    def clean_text(self, text):
+        """Clean text by removing stage directions and normalizing repeated words."""
+        # Remove stage directions like [groans] and <i>SHIP:</i>
+        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
+        # Remove repeated words with dashes (e.g., "Y-y-you" -> "you")
+        text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
+        # Remove multiple spaces
+        text = ' '.join(text.split())
+        return text.lower()
+
+    def chunk_score(self, whisper_chunk, ref_chunk):
+        """Calculate fuzzy match score between two chunks of text."""
+        whisper_clean = self.clean_text(whisper_chunk)
+        ref_clean = self.clean_text(ref_chunk)
+
+        # Use token sort ratio to handle word order differences
+        token_sort = fuzz.token_sort_ratio(whisper_clean, ref_clean)
+        # Use partial ratio to catch substring matches
+        partial = fuzz.partial_ratio(whisper_clean, ref_clean)
+
+        # Weight token sort more heavily but consider partial matches
+        return (token_sort * 0.7 + partial * 0.3) / 100.0
+
+    def identify_episode(self, video_file, temp_dir):
+        """Identify which episode matches this video file."""
+
+        # Get series name from parent directory
+        self.series_name = Path(video_file).parent.parent.name
+
+        # Load whisper transcript if not already processed
+        segments_file = Path(temp_dir) / f"{Path(video_file).stem}.segments.json"
+        if not segments_file.exists():
+            logger.error(f"No transcript found for {video_file}. Run speech recognition first.")
+            return None
+
+        with open(segments_file) as f:
+            self.whisper_segments = json.load(f)
+
+        # Get reference directory for this series
+        reference_dir = self.cache_dir / "data" / self.series_name
+        if not reference_dir.exists():
+            logger.error(f"No reference files found for {self.series_name}")
+            return None
+
+        # Match against reference files
+        match = self.match_all_references(reference_dir)
+
+        if match and match['confidence'] >= self.min_confidence:
+            # Extract season and episode from filename
+            match_file = Path(match['file'])
+            season_ep = re.search(r'S(\d+)E(\d+)', match_file.stem)
+            if season_ep:
+                season, episode = map(int, season_ep.groups())
+                return {
+                    'season': season,
+                    'episode': episode,
+                    'confidence': match['confidence'],
+                    'reference_file': str(match_file),
+                    'chunk_scores': match['chunk_scores']
+                }
+
+        return None
+
+    def match_all_references(self, reference_dir):
+        """Process all reference files and track matching scores."""
+        results = defaultdict(list)
+        best_match = None
+        best_confidence = 0
+
+        def process_chunks(ref_segments, filename):
+            nonlocal best_match, best_confidence
+
+            chunk_size = 300  # 5 minute chunks
+            whisper_chunks = defaultdict(list)
+            ref_chunks = defaultdict(list)
+
+            # Group segments into time chunks
+            for seg in self.whisper_segments:
+                chunk_idx = int(float(seg['start']) // chunk_size)
+                whisper_chunks[chunk_idx].append(seg['text'])
+
+            for seg in ref_segments:
+                chunk_idx = int(seg['start'] // chunk_size)
+                ref_chunks[chunk_idx].append(seg['text'])
+
+            # Score each chunk
+            for chunk_idx in whisper_chunks:
+                whisper_text = ' '.join(whisper_chunks[chunk_idx])
+
+                # Look for matching reference chunk and adjacent chunks
+                scores = []
+                for ref_idx in range(max(0, chunk_idx-1), chunk_idx+2):
+                    if ref_idx in ref_chunks:
+                        ref_text = ' '.join(ref_chunks[ref_idx])
+                        score = self.chunk_score(whisper_text, ref_text)
+                        scores.append(score)
+
+                if scores:
+                    chunk_confidence = max(scores)
+                    logger.info(f"File: {filename}, "
+                                f"Time: {chunk_idx*chunk_size}-{(chunk_idx+1)*chunk_size}s, "
+                                f"Confidence: {chunk_confidence:.2f}")
+
+                    results[filename].append({
+                        'chunk_idx': chunk_idx,
+                        'confidence': chunk_confidence
+                    })
+
+                    # Early exit if we find a very good match
+                    if chunk_confidence > self.min_confidence:
+                        chunk_scores = results[filename]
+                        confidence = sum(c['confidence'] * (0.9 ** c['chunk_idx'])
+                                         for c in chunk_scores) / len(chunk_scores)
+
+                        if confidence > best_confidence:
+                            best_confidence = confidence
+                            best_match = {
+                                'file': filename,
+                                'confidence': confidence,
+                                'chunk_scores': chunk_scores
+                            }
+                        return True
+
+            return False
+
+        # Process each reference file
+        for ref_file in glob.glob(os.path.join(reference_dir, "*.srt")):
+            ref_segments = self.parse_srt_to_segments(ref_file)
+            filename = os.path.basename(ref_file)
+
+            if process_chunks(ref_segments, filename):
+                break
+
+        # If no early match found, find best overall match
+        if not best_match:
+            for filename, chunks in results.items():
+                # Weight earlier chunks more heavily
+                confidence = sum(c['confidence'] * (0.9 ** c['chunk_idx'])
+                                 for c in chunks) / len(chunks)
+
+                if confidence > best_confidence:
+                    best_confidence = confidence
+                    best_match = {
+                        'file': filename,
+                        'confidence': confidence,
+                        'chunk_scores': chunks
+                    }
+
+        return best_match
+
+    def parse_srt_to_segments(self, srt_file):
+        """Parse SRT file into list of segments with start/end times and text."""
+        segments = []
+        current_segment = {}
+
+        with open(srt_file, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+
+        i = 0
+        while i < len(lines):
+            line = lines[i].strip()
+
+            if line.isdigit():  # Index
+                if current_segment:
+                    segments.append(current_segment)
+                    current_segment = {}
+
+            elif '-->' in line:  # Timestamp
+                start, end = line.split(' --> ')
+                current_segment['start'] = self.timestr_to_seconds(start)
+                current_segment['end'] = self.timestr_to_seconds(end)
+
+            elif line:  # Text
+                if 'text' in current_segment:
+                    current_segment['text'] += ' ' + line
+                else:
+                    current_segment['text'] = line
+
+            i += 1
+
+        if current_segment:
+            segments.append(current_segment)
+
+        return segments
+
+    def timestr_to_seconds(self, timestr):
+        """Convert SRT timestamp to seconds."""
+        h, m, s = timestr.replace(',','.').split(':')
+        return float(h) * 3600 + float(m) * 60 + float(s)
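The file above is entirely new in 0.3.0. EpisodeMatcher expects a Whisper-style `<video stem>.segments.json` transcript in a temp directory and reference .srt files with `S01E01`-style names under `<cache_dir>/data/<series name>/`. A minimal usage sketch, assuming those files already exist (all paths below are illustrative, not taken from the package):

from mkv_episode_matcher.episode_identification import EpisodeMatcher

# Illustrative paths; in the package these come from CACHE_DIR and the season folder.
matcher = EpisodeMatcher(cache_dir="/home/user/.mkv-episode-matcher/cache", min_confidence=0.6)
match = matcher.identify_episode(
    "/shows/Some Show/Season 1/unknown_episode.mkv",  # series name is taken from the grandparent folder
    "/shows/Some Show/Season 1/temp",                 # must contain unknown_episode.segments.json
)

if match:
    print(f"S{match['season']:02d}E{match['episode']:02d} "
          f"(confidence {match['confidence']:.2f}, matched {match['reference_file']})")
else:
    print("No match above the confidence threshold")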
@@ -1,261 +1,117 @@
-# episode_matcher.py
-import os
-import re
+# mkv_episode_matcher/episode_matcher.py
 
+from pathlib import Path
+import shutil
+import glob
+import os
 from loguru import logger
 
-from mkv_episode_matcher.__main__ import CACHE_DIR, CONFIG_FILE
+from mkv_episode_matcher.__main__ import CONFIG_FILE, CACHE_DIR
 from mkv_episode_matcher.config import get_config
 from mkv_episode_matcher.mkv_to_srt import convert_mkv_to_srt
 from mkv_episode_matcher.tmdb_client import fetch_show_id
-from mkv_episode_matcher.utils import check_filename, cleanup_ocr_files, get_subtitles,clean_text
-
+from mkv_episode_matcher.utils import (
+    check_filename,
+    clean_text,
+    cleanup_ocr_files,
+    get_subtitles,
+    process_reference_srt_files,
+    process_srt_files,
+    compare_and_rename_files,get_valid_seasons
+)
+from mkv_episode_matcher.speech_to_text import process_speech_to_text
+from mkv_episode_matcher.episode_identification import EpisodeMatcher
 
-# hash_data = {}
-@logger.catch
 def process_show(season=None, dry_run=False, get_subs=False):
-    """
-    Process the show by downloading episode images and finding matching episodes.
-    Args:
-        season (int, optional): The season number to process. If provided, only that season will be processed. Defaults to None.
-        dry_run (bool, optional): Whether to perform a dry run without actually processing the episodes. Defaults to False.
-        get_subs (bool, optional): Whether to download subtitles for the episodes. Defaults to False.
-    """
+    """Process the show using both speech recognition and OCR fallback."""
     config = get_config(CONFIG_FILE)
     show_dir = config.get("show_dir")
-    show_name = clean_text(os.path.basename(show_dir))
-    logger.info(f"Processing show '{show_name}'...")
 
-    show_id = fetch_show_id(show_name)
-    if show_id is None:
-        logger.error(f"Could not find show '{os.path.basename(show_dir)}' on TMDb.")
-        return
-
-    # Get all season directories
-    season_paths = [
-        os.path.join(show_dir, d)
-        for d in os.listdir(show_dir)
-        if os.path.isdir(os.path.join(show_dir, d))
-    ]
-
-    # Filter seasons to only include those with .mkv files
-    valid_season_paths = []
-    for season_path in season_paths:
-        mkv_files = [
-            f for f in os.listdir(season_path)
-            if f.endswith(".mkv")
-        ]
-        if mkv_files:
-            valid_season_paths.append(season_path)
-
-    if not valid_season_paths:
-        logger.warning(f"No seasons with .mkv files found in show '{show_name}'")
+    # Initialize episode matcher
+    matcher = EpisodeMatcher(CACHE_DIR)
+
+    # Get valid season directories
+    season_paths = get_valid_seasons(show_dir)
+    if not season_paths:
+        logger.warning(f"No seasons with .mkv files found")
         return
 
-    logger.info(
-        f"Found {len(valid_season_paths)} seasons with .mkv files for show '{show_name}'"
-    )
-
-    # Extract season numbers from valid paths
-    seasons_to_process = [
-        int(os.path.basename(season_path).split()[-1])
-        for season_path in valid_season_paths
-    ]
-
-    if get_subs:
-        get_subtitles(show_id, seasons=set(seasons_to_process))
-
     if season is not None:
-        # If specific season requested, check if it has .mkv files
         season_path = os.path.join(show_dir, f"Season {season}")
-        if season_path not in valid_season_paths:
+        if season_path not in season_paths:
             logger.warning(f"Season {season} has no .mkv files to process")
             return
-
-        mkv_files = [
-            os.path.join(season_path, f)
-            for f in os.listdir(season_path)
-            if f.endswith(".mkv")
-        ]
-    else:
-        # Process all valid seasons
-        for season_path in valid_season_paths:
-            mkv_files = [
-                os.path.join(season_path, f)
-                for f in os.listdir(season_path)
-                if f.endswith(".mkv")
-            ]
-            # Filter out files that have already been processed
-            for f in mkv_files:
-                if check_filename(f):
-                    logger.info(f"Skipping {f}, already processed")
-                    mkv_files.remove(f)
-            if len(mkv_files) == 0:
-                logger.info("No new files to process")
-                return
-            convert_mkv_to_srt(season_path, mkv_files)
-    reference_text_dict = process_reference_srt_files(show_name)
-    srt_text_dict = process_srt_files(show_dir)
-    compare_and_rename_files(srt_text_dict, reference_text_dict, dry_run=dry_run)
-    cleanup_ocr_files(show_dir)
-
-def check_filename(filename):
-    """
-    Check if the filename is in the correct format.
-
-    Args:
-        filename (str): The filename to check.
-
-    Returns:
-        bool: True if the filename is in the correct format, False otherwise.
-    """
-    # Check if the filename matches the expected format
-    match = re.match(r".*S\d+E\d+", filename)
-    return bool(match)
-def extract_srt_text(filepath):
-    """
-    Extracts the text from an SRT file.
-
-    Args:
-        filepath (str): The path to the SRT file.
-
-    Returns:
-        list: A list of lists, where each inner list represents a block of text from the SRT file.
-              Each inner list contains the lines of text for that block.
-    """
-    # extract the text from the file
-    with open(filepath) as f:
-        filepath = f.read()
-    text_lines = [
-        filepath.split("\n\n")[i].split("\n")[2:]
-        for i in range(len(filepath.split("\n\n")))
-    ]
-    # remove empty lines
-    text_lines = [[line for line in lines if line] for lines in text_lines]
-    # remove <i> or </i> tags
-    text_lines = [
-        [re.sub(r"<i>|</i>|", "", line) for line in lines] for lines in text_lines
-    ]
-    # remove empty lists
-    text_lines = [lines for lines in text_lines if lines]
-    return text_lines
-
+        season_paths = [season_path]
 
-def compare_text(text1, text2):
-    """
-    Compare two lists of text lines and return the number of matching lines.
-
-    Args:
-        text1 (list): List of text lines from the first source.
-        text2 (list): List of text lines from the second source.
-
-    Returns:
-        int: Number of matching lines between the two sources.
-    """
-    # Flatten the list of text lines
-    flat_text1 = [line for lines in text1 for line in lines]
-    flat_text2 = [line for lines in text2 for line in lines]
-
-    # Compare the two lists of text lines
-    matching_lines = set(flat_text1).intersection(flat_text2)
-    return len(matching_lines)
-
-
-def extract_season_episode(filename):
-    """
-    Extract the season and episode number from the filename.
-
-    Args:
-        filename (str): The filename to extract the season and episode from.
-
-    Returns:
-        tuple: A tuple containing the season and episode number.
-    """
-    # Extract the season and episode number from the filename
-    match = re.search(r"S(\d+)E(\d+)", filename)
-    if match:
-        season = int(match.group(1))
-        episode = int(match.group(2))
-        return season, episode
-    else:
-        return None, None
-
-
-def process_reference_srt_files(series_name):
-    """
-    Process reference SRT files for a given series.
-
-    Args:
-        series_name (str): The name of the series.
-
-    Returns:
-        dict: A dictionary containing the reference files where the keys are the MKV filenames
-              and the values are the corresponding SRT texts.
-    """
-    reference_files = {}
-    reference_dir = os.path.join(CACHE_DIR, "data", series_name)
-    for dirpath, _, filenames in os.walk(reference_dir):
-        for filename in filenames:
-            if filename.lower().endswith(".srt"):
-                srt_file = os.path.join(dirpath, filename)
-                logger.info(f"Processing {srt_file}")
-                srt_text = extract_srt_text(srt_file)
-                season, episode = extract_season_episode(filename)
-                mkv_filename = f"{series_name} - S{season:02}E{episode:02}.mkv"
-                reference_files[mkv_filename] = srt_text
-    return reference_files
-
-
-def process_srt_files(show_dir):
-    """
-    Process all SRT files in the given directory and its subdirectories.
-
-    Args:
-        show_dir (str): The directory path where the SRT files are located.
-
-    Returns:
-        dict: A dictionary containing the SRT file paths as keys and their corresponding text content as values.
-    """
-    srt_files = {}
-    for dirpath, _, filenames in os.walk(show_dir):
-        for filename in filenames:
-            if filename.lower().endswith(".srt"):
-                srt_file = os.path.join(dirpath, filename)
-                logger.info(f"Processing {srt_file}")
-                srt_text = extract_srt_text(srt_file)
-                srt_files[srt_file] = srt_text
-    return srt_files
-
-
-def compare_and_rename_files(srt_files, reference_files, dry_run=False):
-    """
-    Compare the srt files with the reference files and rename the matching mkv files.
-
-    Args:
-        srt_files (dict): A dictionary containing the srt files as keys and their contents as values.
-        reference_files (dict): A dictionary containing the reference files as keys and their contents as values.
-        dry_run (bool, optional): If True, the function will only log the renaming actions without actually renaming the files. Defaults to False.
-    """
-    logger.info(
-        f"Comparing {len(srt_files)} srt files with {len(reference_files)} reference files"
-    )
-    for srt_text in srt_files.keys():
-        parent_dir = os.path.dirname(os.path.dirname(srt_text))
-        for reference in reference_files.keys():
-            season, episode = extract_season_episode(reference)
-            mkv_file = os.path.join(
-                parent_dir, os.path.basename(srt_text).replace(".srt", ".mkv")
-            )
-            matching_lines = compare_text(
-                reference_files[reference], srt_files[srt_text]
-            )
-            if matching_lines >= int(len(reference_files[reference]) * 0.1):
-                logger.info(f"Matching lines: {matching_lines}")
-                logger.info(f"Found matching file: {mkv_file} ->{reference}")
-                new_filename = os.path.join(parent_dir, reference)
-                if not os.path.exists(new_filename):
-                    if os.path.exists(mkv_file) and not dry_run:
-                        logger.info(f"Renaming {mkv_file} to {new_filename}")
-                        os.rename(mkv_file, new_filename)
+    # Process each season
+    for season_path in season_paths:
+        # Get MKV files that haven't been processed
+        mkv_files = [f for f in glob.glob(os.path.join(season_path, "*.mkv"))
+                     if not check_filename(f)]
+
+        if not mkv_files:
+            logger.info(f"No new files to process in {season_path}")
+            continue
+
+        # Create temp directories
+        temp_dir = Path(season_path) / "temp"
+        ocr_dir = Path(season_path) / "ocr"
+        temp_dir.mkdir(exist_ok=True)
+        ocr_dir.mkdir(exist_ok=True)
+
+        try:
+            unmatched_files = []
+
+            # First pass: Try speech recognition matching
+            for mkv_file in mkv_files:
+                logger.info(f"Attempting speech recognition match for {mkv_file}")
+
+                # Extract audio and run speech recognition
+                process_speech_to_text(mkv_file, str(temp_dir))
+                match = matcher.identify_episode(mkv_file, temp_dir)
+
+                if match and match['confidence'] >= matcher.min_confidence:
+                    # Rename the file
+                    new_name = f"{matcher.series_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
+                    new_path = os.path.join(season_path, new_name)
+
+                    logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
+                                f"(confidence: {match['confidence']:.2f})")
+
+                    if not dry_run:
+                        os.rename(mkv_file, new_path)
                 else:
-                    logger.info(f"File {new_filename} already exists, skipping")
+                    logger.info(f"Speech recognition match failed for {mkv_file}, will try OCR")
+                    unmatched_files.append(mkv_file)
+
+            # Second pass: Try OCR for unmatched files
+            if unmatched_files:
+                logger.info(f"Attempting OCR matching for {len(unmatched_files)} unmatched files")
+
+                # Convert files to SRT using OCR
+                convert_mkv_to_srt(season_path, unmatched_files)
+
+                # Process OCR results
+                reference_text_dict = process_reference_srt_files(matcher.series_name)
+                srt_text_dict = process_srt_files(str(ocr_dir))
+
+                # Compare and rename
+                compare_and_rename_files(
+                    srt_text_dict,
+                    reference_text_dict,
+                    dry_run=dry_run,
+                    min_confidence=0.1  # Lower threshold for OCR
+                )
+
+            # Download subtitles if requested
+            if get_subs:
+                show_id = fetch_show_id(matcher.series_name)
+                if show_id:
+                    seasons = {int(os.path.basename(p).split()[-1]) for p in season_paths}
+                    get_subtitles(show_id, seasons=seasons)
+
+        finally:
+            # Cleanup
+            if not dry_run:
+                shutil.rmtree(temp_dir)
+                cleanup_ocr_files(show_dir)
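As in 0.1.13, the rewritten process_show is driven by the saved config (show_dir must point at the show folder). A hypothetical dry run over a single season, not taken from the diff, would look roughly like:

from mkv_episode_matcher.episode_matcher import process_show

# Logs speech-recognition/OCR matches for Season 1 without renaming any files.
process_show(season=1, dry_run=True, get_subs=False)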