mkv-episode-matcher 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

Files changed (53)
  1. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/PKG-INFO +1 -1
  2. mkv_episode_matcher-0.3.4/mkv_episode_matcher/episode_identification.py +150 -0
  3. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/episode_matcher.py +16 -34
  4. mkv_episode_matcher-0.3.4/mkv_episode_matcher/subtitle_utils.py +82 -0
  5. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/utils.py +50 -53
  6. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher.egg-info/PKG-INFO +1 -1
  7. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher.egg-info/SOURCES.txt +3 -2
  8. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/setup.cfg +1 -1
  9. mkv_episode_matcher-0.3.4/tests/test_main.py +137 -0
  10. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/uv.lock +1 -1
  11. mkv_episode_matcher-0.3.2/.coverage +0 -0
  12. mkv_episode_matcher-0.3.2/mkv_episode_matcher/episode_identification.py +0 -208
  13. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/.gitattributes +0 -0
  14. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/.github/funding.yml +0 -0
  15. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/.github/workflows/documentation.yml +0 -0
  16. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/.github/workflows/python-publish.yml +0 -0
  17. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/.gitignore +0 -0
  18. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/.gitmodules +0 -0
  19. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/.python-version +0 -0
  20. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/.vscode/settings.json +0 -0
  21. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/README.md +0 -0
  22. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/docs/api/index.md +0 -0
  23. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/docs/cli.md +0 -0
  24. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/docs/configuration.md +0 -0
  25. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/docs/installation.md +0 -0
  26. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/docs/quickstart.md +0 -0
  27. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/docs/tips.md +0 -0
  28. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkdocs.yml +0 -0
  29. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/.gitattributes +0 -0
  30. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/__init__.py +0 -0
  31. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/__main__.py +0 -0
  32. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/config.py +0 -0
  33. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -0
  34. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -0
  35. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -0
  36. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -0
  37. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/README.md +0 -0
  38. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  39. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -0
  40. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -0
  41. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -0
  42. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -0
  43. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/mkv_to_srt.py +0 -0
  44. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/speech_to_text.py +0 -0
  45. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher/tmdb_client.py +0 -0
  46. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher.egg-info/dependency_links.txt +0 -0
  47. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher.egg-info/entry_points.txt +0 -0
  48. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher.egg-info/requires.txt +0 -0
  49. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/mkv_episode_matcher.egg-info/top_level.txt +0 -0
  50. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/pyproject.toml +0 -0
  51. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/setup.py +0 -0
  52. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/tests/__init__.py +0 -0
  53. {mkv_episode_matcher-0.3.2 → mkv_episode_matcher-0.3.4}/tests/test_improvements.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mkv-episode-matcher
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -0,0 +1,150 @@
1
+ import json
2
+ import os
3
+ import subprocess
4
+ import tempfile
5
+ from pathlib import Path
6
+ import torch
7
+ from rapidfuzz import fuzz
8
+ from loguru import logger
9
+ import whisper
10
+ import numpy as np
11
+ import re
12
+ class EpisodeMatcher:
13
+ def __init__(self, cache_dir, show_name, min_confidence=0.6):
14
+ self.cache_dir = Path(cache_dir)
15
+ self.min_confidence = min_confidence
16
+ self.show_name = show_name
17
+ self.chunk_duration = 300 # 5 minutes
18
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
19
+ self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
20
+ self.temp_dir.mkdir(exist_ok=True)
21
+
22
+ def clean_text(self, text):
23
+ text = text.lower().strip()
24
+ text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
25
+ text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
26
+ return ' '.join(text.split())
27
+
28
+ def chunk_score(self, whisper_chunk, ref_chunk):
29
+ whisper_clean = self.clean_text(whisper_chunk)
30
+ ref_clean = self.clean_text(ref_chunk)
31
+ return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
32
+ fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0
33
+
34
+ def extract_audio_chunk(self, mkv_file, start_time):
35
+ """Extract a chunk of audio from MKV file."""
36
+ chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
37
+ if not chunk_path.exists():
38
+ cmd = [
39
+ 'ffmpeg',
40
+ '-ss', str(start_time),
41
+ '-t', str(self.chunk_duration),
42
+ '-i', mkv_file,
43
+ '-vn',
44
+ '-acodec', 'pcm_s16le',
45
+ '-ar', '16000',
46
+ '-ac', '1',
47
+ str(chunk_path)
48
+ ]
49
+ subprocess.run(cmd, capture_output=True)
50
+ return str(chunk_path)
51
+
52
+ def load_reference_chunk(self, srt_file, chunk_idx):
53
+ """Load reference subtitles for a specific time chunk."""
54
+ chunk_start = chunk_idx * self.chunk_duration
55
+ chunk_end = chunk_start + self.chunk_duration
56
+ text_lines = []
57
+
58
+ with open(srt_file, 'r', encoding='utf-8') as f:
59
+ content = f.read().strip()
60
+
61
+ for block in content.split('\n\n'):
62
+ lines = block.split('\n')
63
+ if len(lines) < 3 or '-->' not in lines[1]: # Skip malformed blocks
64
+ continue
65
+
66
+ try:
67
+ timestamp = lines[1]
68
+ text = ' '.join(lines[2:])
69
+
70
+ end_time = timestamp.split(' --> ')[1].strip()
71
+ hours, minutes, seconds = map(float, end_time.replace(',','.').split(':'))
72
+ total_seconds = hours * 3600 + minutes * 60 + seconds
73
+
74
+ if chunk_start <= total_seconds <= chunk_end:
75
+ text_lines.append(text)
76
+
77
+ except (IndexError, ValueError):
78
+ continue
79
+
80
+ return ' '.join(text_lines)
81
+
82
+ def identify_episode(self, video_file, temp_dir, season_number):
83
+ try:
84
+ # Get video duration
85
+ duration = float(subprocess.check_output([
86
+ 'ffprobe', '-v', 'error',
87
+ '-show_entries', 'format=duration',
88
+ '-of', 'default=noprint_wrappers=1:nokey=1',
89
+ video_file
90
+ ]).decode())
91
+
92
+ total_chunks = int(np.ceil(duration / self.chunk_duration))
93
+
94
+ # Load Whisper model
95
+ model = whisper.load_model("base", device=self.device)
96
+
97
+ # Get season-specific reference files
98
+ reference_dir = self.cache_dir / "data" / self.show_name
99
+ season_pattern = f"S{season_number:02d}E"
100
+ reference_files = [
101
+ f for f in reference_dir.glob("*.srt")
102
+ if season_pattern in f.name
103
+ ]
104
+
105
+ if not reference_files:
106
+ logger.error(f"No reference files found for season {season_number}")
107
+ return None
108
+
109
+ # Process chunks until match found
110
+ for chunk_idx in range(min(3, total_chunks)): # Only try first 3 chunks
111
+ start_time = chunk_idx * self.chunk_duration
112
+ audio_path = self.extract_audio_chunk(video_file, start_time)
113
+
114
+ # Transcribe chunk
115
+ result = model.transcribe(
116
+ audio_path,
117
+ task="transcribe",
118
+ language="en"
119
+ )
120
+
121
+ chunk_text = result["text"]
122
+ best_confidence = 0
123
+ best_match = None
124
+
125
+ # Compare with reference chunks
126
+ for ref_file in reference_files:
127
+ ref_text = self.load_reference_chunk(ref_file, chunk_idx)
128
+ confidence = self.chunk_score(chunk_text, ref_text)
129
+
130
+ if confidence > best_confidence:
131
+ best_confidence = confidence
132
+ best_match = ref_file
133
+
134
+ if confidence > self.min_confidence:
135
+ season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
136
+ if season_ep:
137
+ season, episode = map(int, season_ep.groups())
138
+ return {
139
+ 'season': season,
140
+ 'episode': episode,
141
+ 'confidence': best_confidence,
142
+ 'reference_file': str(best_match),
143
+ }
144
+
145
+ return None
146
+
147
+ finally:
148
+ # Cleanup temp files
149
+ for file in self.temp_dir.glob("chunk_*.wav"):
150
+ file.unlink()
@@ -5,7 +5,7 @@ import shutil
5
5
  import glob
6
6
  import os
7
7
  from loguru import logger
8
-
8
+ import re
9
9
  from mkv_episode_matcher.__main__ import CONFIG_FILE, CACHE_DIR
10
10
  from mkv_episode_matcher.config import get_config
11
11
  from mkv_episode_matcher.mkv_to_srt import convert_mkv_to_srt
@@ -17,20 +17,18 @@ from mkv_episode_matcher.utils import (
17
17
  get_subtitles,
18
18
  process_reference_srt_files,
19
19
  process_srt_files,
20
- compare_and_rename_files,get_valid_seasons
20
+ compare_and_rename_files,get_valid_seasons,rename_episode_file
21
21
  )
22
22
  from mkv_episode_matcher.speech_to_text import process_speech_to_text
23
23
  from mkv_episode_matcher.episode_identification import EpisodeMatcher
24
24
 
25
25
  def process_show(season=None, dry_run=False, get_subs=False):
26
- """Process the show using both speech recognition and OCR fallback."""
26
+ """Process the show using streaming speech recognition with OCR fallback."""
27
27
  config = get_config(CONFIG_FILE)
28
28
  show_dir = config.get("show_dir")
29
29
  show_name = clean_text(os.path.basename(show_dir))
30
- # Initialize episode matcher
31
- matcher = EpisodeMatcher(CACHE_DIR,show_name)
30
+ matcher = EpisodeMatcher(CACHE_DIR, show_name)
32
31
 
33
- # Get valid season directories
34
32
  season_paths = get_valid_seasons(show_dir)
35
33
  if not season_paths:
36
34
  logger.warning(f"No seasons with .mkv files found")
@@ -43,9 +41,7 @@ def process_show(season=None, dry_run=False, get_subs=False):
43
41
  return
44
42
  season_paths = [season_path]
45
43
 
46
- # Process each season
47
44
  for season_path in season_paths:
48
- # Get MKV files that haven't been processed
49
45
  mkv_files = [f for f in glob.glob(os.path.join(season_path, "*.mkv"))
50
46
  if not check_filename(f)]
51
47
 
@@ -53,66 +49,52 @@ def process_show(season=None, dry_run=False, get_subs=False):
53
49
  logger.info(f"No new files to process in {season_path}")
54
50
  continue
55
51
 
56
- # Create temp directories
52
+ season_num = int(re.search(r'Season (\d+)', season_path).group(1))
57
53
  temp_dir = Path(season_path) / "temp"
58
54
  ocr_dir = Path(season_path) / "ocr"
59
55
  temp_dir.mkdir(exist_ok=True)
60
56
  ocr_dir.mkdir(exist_ok=True)
61
57
 
62
58
  try:
63
- # Download subtitles if requested
64
59
  if get_subs:
65
- show_id = fetch_show_id(matcher.series_name)
60
+ show_id = fetch_show_id(matcher.show_name)
66
61
  if show_id:
67
- seasons = {int(os.path.basename(p).split()[-1]) for p in season_paths}
68
- get_subtitles(show_id, seasons=seasons)
62
+ get_subtitles(show_id, seasons={season_num})
63
+
69
64
  unmatched_files = []
70
-
71
- # First pass: Try speech recognition matching
72
65
  for mkv_file in mkv_files:
73
66
  logger.info(f"Attempting speech recognition match for {mkv_file}")
67
+ match = matcher.identify_episode(mkv_file, temp_dir, season_num)
74
68
 
75
- # Extract audio and run speech recognition
76
- process_speech_to_text(mkv_file, str(temp_dir))
77
- match = matcher.identify_episode(mkv_file, temp_dir)
78
-
79
- if match and match['confidence'] >= matcher.min_confidence:
80
- # Rename the file
81
- new_name = f"{matcher.series_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
69
+ if match:
70
+ new_name = f"{matcher.show_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
82
71
  new_path = os.path.join(season_path, new_name)
83
72
 
84
73
  logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
85
74
  f"(confidence: {match['confidence']:.2f})")
86
75
 
87
76
  if not dry_run:
88
- os.rename(mkv_file, new_path)
77
+ logger.info(f"Renaming {mkv_file} to {new_name}")
78
+ rename_episode_file(mkv_file, new_name)
89
79
  else:
90
- logger.info(f"Speech recognition match failed for {mkv_file}, will try OCR")
80
+ logger.info(f"Speech recognition match failed for {mkv_file}, trying OCR")
91
81
  unmatched_files.append(mkv_file)
92
82
 
93
- # Second pass: Try OCR for unmatched files
83
+ # OCR fallback for unmatched files
94
84
  if unmatched_files:
95
85
  logger.info(f"Attempting OCR matching for {len(unmatched_files)} unmatched files")
96
-
97
- # Convert files to SRT using OCR
98
86
  convert_mkv_to_srt(season_path, unmatched_files)
99
87
 
100
- # Process OCR results
101
- reference_text_dict = process_reference_srt_files(matcher.series_name)
88
+ reference_text_dict = process_reference_srt_files(matcher.show_name)
102
89
  srt_text_dict = process_srt_files(str(ocr_dir))
103
90
 
104
- # Compare and rename
105
91
  compare_and_rename_files(
106
92
  srt_text_dict,
107
93
  reference_text_dict,
108
94
  dry_run=dry_run,
109
- min_confidence=0.1 # Lower threshold for OCR
110
95
  )
111
-
112
-
113
96
 
114
97
  finally:
115
- # Cleanup
116
98
  if not dry_run:
117
99
  shutil.rmtree(temp_dir)
118
100
  cleanup_ocr_files(show_dir)
@@ -0,0 +1,82 @@
1
+ from typing import List, Optional, Union
2
+ import os
3
+ import re
4
+
5
+ def generate_subtitle_patterns(series_name: str, season: int, episode: int) -> List[str]:
6
+ """
7
+ Generate various common subtitle filename patterns.
8
+
9
+ Args:
10
+ series_name (str): Name of the series
11
+ season (int): Season number
12
+ episode (int): Episode number
13
+
14
+ Returns:
15
+ List[str]: List of possible subtitle filenames
16
+ """
17
+ patterns = [
18
+ # Standard format: "Show Name - S01E02.srt"
19
+ f"{series_name} - S{season:02d}E{episode:02d}.srt",
20
+
21
+ # Season x Episode format: "Show Name - 1x02.srt"
22
+ f"{series_name} - {season}x{episode:02d}.srt",
23
+
24
+ # Separate season/episode: "Show Name - Season 1 Episode 02.srt"
25
+ f"{series_name} - Season {season} Episode {episode:02d}.srt",
26
+
27
+ # Compact format: "ShowName.S01E02.srt"
28
+ f"{series_name.replace(' ', '')}.S{season:02d}E{episode:02d}.srt",
29
+
30
+ # Numbered format: "Show Name 102.srt"
31
+ f"{series_name} {season:01d}{episode:02d}.srt",
32
+
33
+ # Dot format: "Show.Name.1x02.srt"
34
+ f"{series_name.replace(' ', '.')}.{season}x{episode:02d}.srt",
35
+
36
+ # Underscore format: "Show_Name_S01E02.srt"
37
+ f"{series_name.replace(' ', '_')}_S{season:02d}E{episode:02d}.srt",
38
+ ]
39
+
40
+ return patterns
41
+
42
+ def find_existing_subtitle(series_cache_dir: str, series_name: str, season: int, episode: int) -> Optional[str]:
43
+ """
44
+ Check for existing subtitle files in various naming formats.
45
+
46
+ Args:
47
+ series_cache_dir (str): Directory containing subtitle files
48
+ series_name (str): Name of the series
49
+ season (int): Season number
50
+ episode (int): Episode number
51
+
52
+ Returns:
53
+ Optional[str]: Path to existing subtitle file if found, None otherwise
54
+ """
55
+ patterns = generate_subtitle_patterns(series_name, season, episode)
56
+
57
+ for pattern in patterns:
58
+ filepath = os.path.join(series_cache_dir, pattern)
59
+ if os.path.exists(filepath):
60
+ return filepath
61
+
62
+ return None
63
+
64
+ def sanitize_filename(filename: str) -> str:
65
+ """
66
+ Sanitize filename by removing/replacing invalid characters.
67
+
68
+ Args:
69
+ filename (str): Original filename
70
+
71
+ Returns:
72
+ str: Sanitized filename
73
+ """
74
+ # Replace problematic characters
75
+ filename = filename.replace(':', ' -')
76
+ filename = filename.replace('/', '-')
77
+ filename = filename.replace('\\', '-')
78
+
79
+ # Remove any other invalid characters
80
+ filename = re.sub(r'[<>:"/\\|?*]', '', filename)
81
+
82
+ return filename.strip()
@@ -10,7 +10,7 @@ from opensubtitlescom import OpenSubtitles
10
10
  from mkv_episode_matcher.__main__ import CACHE_DIR, CONFIG_FILE
11
11
  from mkv_episode_matcher.config import get_config
12
12
  from mkv_episode_matcher.tmdb_client import fetch_season_details
13
-
13
+ from mkv_episode_matcher.subtitle_utils import find_existing_subtitle,sanitize_filename
14
14
  def get_valid_seasons(show_dir):
15
15
  """
16
16
  Get all season directories that contain MKV files.
@@ -117,8 +117,10 @@ def rename_episode_file(original_file_path, new_filename):
117
117
  except OSError as e:
118
118
  logger.error(f"Failed to rename file: {e}")
119
119
  return None
120
-
121
-
120
+ except FileExistsError as e:
121
+ logger.error(f"Failed to rename file: {e}")
122
+ return None
123
+
122
124
  def get_subtitles(show_id, seasons: set[int]):
123
125
  """
124
126
  Retrieves and saves subtitles for a given TV show and seasons.
@@ -126,20 +128,17 @@ def get_subtitles(show_id, seasons: set[int]):
126
128
  Args:
127
129
  show_id (int): The ID of the TV show.
128
130
  seasons (Set[int]): A set of season numbers for which subtitles should be retrieved.
129
-
130
- Returns:
131
- None
132
131
  """
133
-
134
132
  logger.info(f"Getting subtitles for show ID {show_id}")
135
133
  config = get_config(CONFIG_FILE)
136
134
  show_dir = config.get("show_dir")
137
- series_name = os.path.basename(show_dir)
135
+ series_name = sanitize_filename(os.path.basename(show_dir))
138
136
  tmdb_api_key = config.get("tmdb_api_key")
139
137
  open_subtitles_api_key = config.get("open_subtitles_api_key")
140
138
  open_subtitles_user_agent = config.get("open_subtitles_user_agent")
141
139
  open_subtitles_username = config.get("open_subtitles_username")
142
140
  open_subtitles_password = config.get("open_subtitles_password")
141
+
143
142
  if not all([
144
143
  show_dir,
145
144
  tmdb_api_key,
@@ -149,63 +148,66 @@ def get_subtitles(show_id, seasons: set[int]):
149
148
  open_subtitles_password,
150
149
  ]):
151
150
  logger.error("Missing configuration settings. Please run the setup script.")
151
+ return
152
+
152
153
  try:
153
- # Initialize the OpenSubtitles client
154
154
  subtitles = OpenSubtitles(open_subtitles_user_agent, open_subtitles_api_key)
155
-
156
- # Log in (retrieve auth token)
157
155
  subtitles.login(open_subtitles_username, open_subtitles_password)
158
156
  except Exception as e:
159
157
  logger.error(f"Failed to log in to OpenSubtitles: {e}")
160
158
  return
159
+
161
160
  for season in seasons:
162
161
  episodes = fetch_season_details(show_id, season)
163
162
  logger.info(f"Found {episodes} episodes in Season {season}")
164
163
 
165
164
  for episode in range(1, episodes + 1):
166
165
  logger.info(f"Processing Season {season}, Episode {episode}...")
166
+
167
167
  series_cache_dir = os.path.join(CACHE_DIR, "data", series_name)
168
168
  os.makedirs(series_cache_dir, exist_ok=True)
169
+
170
+ # Check for existing subtitle in any supported format
171
+ existing_subtitle = find_existing_subtitle(
172
+ series_cache_dir, series_name, season, episode
173
+ )
174
+
175
+ if existing_subtitle:
176
+ logger.info(f"Subtitle already exists: {os.path.basename(existing_subtitle)}")
177
+ continue
178
+
179
+ # Default to standard format for new downloads
169
180
  srt_filepath = os.path.join(
170
181
  series_cache_dir,
171
182
  f"{series_name} - S{season:02d}E{episode:02d}.srt",
172
183
  )
173
- if not os.path.exists(srt_filepath):
174
- # get the episode info from TMDB
175
- url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season}/episode/{episode}?api_key={tmdb_api_key}"
176
- response = requests.get(url)
177
- response.raise_for_status()
178
- episode_data = response.json()
179
- episode_data["name"]
180
- episode_id = episode_data["id"]
181
- # search for the subtitle
182
- response = subtitles.search(tmdb_id=episode_id, languages="en")
183
- if len(response.data) == 0:
184
- logger.warning(
185
- f"No subtitles found for {series_name} - S{season:02d}E{episode:02d}"
186
- )
187
-
188
- for subtitle in response.data:
189
- subtitle_dict = subtitle.to_dict()
190
- # Remove special characters and convert to uppercase
191
- filename_clean = re.sub(
192
- r"\W+", " ", subtitle_dict["file_name"]
193
- ).upper()
194
- if f"E{episode:02d}" in filename_clean:
195
- logger.info(f"Original filename: {subtitle_dict['file_name']}")
196
- srt_file = subtitles.download_and_save(subtitle)
197
- series_name = series_name.replace(":", " -")
198
- shutil.move(srt_file, srt_filepath)
199
- logger.info(f"Subtitle saved to {srt_filepath}")
200
- break
201
- else:
202
- continue
203
- else:
204
- logger.info(
205
- f"Subtitle already exists for {series_name} - S{season:02d}E{episode:02d}"
184
+
185
+ # get the episode info from TMDB
186
+ url = f"https://api.themoviedb.org/3/tv/{show_id}/season/{season}/episode/{episode}?api_key={tmdb_api_key}"
187
+ response = requests.get(url)
188
+ response.raise_for_status()
189
+ episode_data = response.json()
190
+ episode_id = episode_data["id"]
191
+
192
+ # search for the subtitle
193
+ response = subtitles.search(tmdb_id=episode_id, languages="en")
194
+ if len(response.data) == 0:
195
+ logger.warning(
196
+ f"No subtitles found for {series_name} - S{season:02d}E{episode:02d}"
206
197
  )
207
198
  continue
208
199
 
200
+ for subtitle in response.data:
201
+ subtitle_dict = subtitle.to_dict()
202
+ # Remove special characters and convert to uppercase
203
+ filename_clean = re.sub(r"\W+", " ", subtitle_dict["file_name"]).upper()
204
+ if f"E{episode:02d}" in filename_clean:
205
+ logger.info(f"Original filename: {subtitle_dict['file_name']}")
206
+ srt_file = subtitles.download_and_save(subtitle)
207
+ shutil.move(srt_file, srt_filepath)
208
+ logger.info(f"Subtitle saved to {srt_filepath}")
209
+ break
210
+
209
211
 
210
212
  def cleanup_ocr_files(show_dir):
211
213
  """
@@ -233,10 +235,8 @@ def clean_text(text):
233
235
  cleaned_text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)
234
236
  # Strip leading/trailing whitespace
235
237
  return cleaned_text.strip()
236
- # mkv_episode_matcher/utils.py
237
-
238
- # Add this to your existing utils.py, keeping all other functions
239
238
 
239
+ @logger.catch
240
240
  def process_reference_srt_files(series_name):
241
241
  """
242
242
  Process reference SRT files for a given series.
@@ -357,12 +357,9 @@ def compare_and_rename_files(srt_files, reference_files, dry_run=False):
357
357
  logger.info(f"Matching lines: {matching_lines}")
358
358
  logger.info(f"Found matching file: {mkv_file} ->{reference}")
359
359
  new_filename = os.path.join(parent_dir, reference)
360
- if not os.path.exists(new_filename):
361
- if os.path.exists(mkv_file) and not dry_run:
362
- logger.info(f"Renaming {mkv_file} to {new_filename}")
363
- os.rename(mkv_file, new_filename)
364
- else:
365
- logger.info(f"File {new_filename} already exists, skipping")
360
+ if not dry_run:
361
+ logger.info(f"Renaming {mkv_file} to {new_filename}")
362
+ rename_episode_file(mkv_file, new_filename)
366
363
 
367
364
  def compare_text(text1, text2):
368
365
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mkv-episode-matcher
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -1,4 +1,3 @@
1
- .coverage
2
1
  .gitattributes
3
2
  .gitignore
4
3
  .gitmodules
@@ -27,6 +26,7 @@ mkv_episode_matcher/episode_identification.py
27
26
  mkv_episode_matcher/episode_matcher.py
28
27
  mkv_episode_matcher/mkv_to_srt.py
29
28
  mkv_episode_matcher/speech_to_text.py
29
+ mkv_episode_matcher/subtitle_utils.py
30
30
  mkv_episode_matcher/tmdb_client.py
31
31
  mkv_episode_matcher/utils.py
32
32
  mkv_episode_matcher.egg-info/PKG-INFO
@@ -46,4 +46,5 @@ mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py
46
46
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py
47
47
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py
48
48
  tests/__init__.py
49
- tests/test_improvements.py
49
+ tests/test_improvements.py
50
+ tests/test_main.py
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = mkv_episode_matcher
3
- version = 0.3.2
3
+ version = 0.3.4
4
4
  author = Jonathan Sakkos
5
5
  author_email = jonathansakkos@gmail.com
6
6
  description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
@@ -0,0 +1,137 @@
1
+ import pytest
2
+ import os
3
+ import shutil
4
+ from pathlib import Path
5
+ from unittest.mock import Mock, patch, mock_open
6
+ from mkv_episode_matcher.episode_matcher import process_show
7
+ from mkv_episode_matcher.utils import (
8
+ get_valid_seasons,
9
+ check_filename,
10
+ rename_episode_file,
11
+ clean_text,
12
+ extract_season_episode
13
+ )
14
+ from mkv_episode_matcher.episode_identification import EpisodeMatcher
15
+ from mkv_episode_matcher.config import get_config, set_config
16
+
17
+ @pytest.fixture
18
+ def temp_show_dir(tmp_path):
19
+ show_dir = tmp_path / "Test Show"
20
+ show_dir.mkdir()
21
+ season_dir = show_dir / "Season 1"
22
+ season_dir.mkdir()
23
+ (season_dir / "episode1.mkv").touch()
24
+ (season_dir / "episode2.mkv").touch()
25
+ return show_dir
26
+
27
+ @pytest.fixture
28
+ def mock_config():
29
+ return {
30
+ "tmdb_api_key": "test_key",
31
+ "show_dir": "/test/path",
32
+ "max_threads": 4,
33
+ "open_subtitles_api_key": "test_key",
34
+ "open_subtitles_user_agent": "test_agent",
35
+ "open_subtitles_username": "test_user",
36
+ "open_subtitles_password": "test_pass",
37
+ "tesseract_path": "/test/tesseract"
38
+ }
39
+
40
+ class TestUtilities:
41
+ def test_get_valid_seasons(self, temp_show_dir):
42
+ seasons = get_valid_seasons(str(temp_show_dir))
43
+ assert len(seasons) == 1
44
+ assert str(temp_show_dir / "Season 1") in seasons
45
+
46
+ def test_check_filename(self):
47
+ assert check_filename("Show - S01E02.mkv") == True
48
+ assert check_filename("random_file.mkv") == False
49
+
50
+ def test_rename_episode_file(self, temp_show_dir):
51
+ original = temp_show_dir / "Season 1" / "episode1.mkv"
52
+ new_name = "Show - S01E01.mkv"
53
+ result = rename_episode_file(str(original), new_name)
54
+ assert result is not None
55
+ assert Path(result).name == new_name
56
+
57
+ def test_clean_text(self):
58
+ text = "Test [action] (note) {tag}"
59
+ assert clean_text(text) == "Test"
60
+
61
+ def test_extract_season_episode(self):
62
+ filename = "Show - S01E02.mkv"
63
+ season, episode = extract_season_episode(filename)
64
+ assert season == 1
65
+ assert episode == 2
66
+
67
+ class TestConfiguration:
68
+ def test_set_config(self, tmp_path, mock_config):
69
+ config_file = tmp_path / "config.ini"
70
+ set_config(
71
+ mock_config["tmdb_api_key"],
72
+ mock_config["open_subtitles_api_key"],
73
+ mock_config["open_subtitles_user_agent"],
74
+ mock_config["open_subtitles_username"],
75
+ mock_config["open_subtitles_password"],
76
+ mock_config["show_dir"],
77
+ str(config_file),
78
+ mock_config["tesseract_path"]
79
+ )
80
+ assert config_file.exists()
81
+
82
+ def test_get_config(self, tmp_path, mock_config):
83
+ config_file = tmp_path / "config.ini"
84
+ set_config(
85
+ mock_config["tmdb_api_key"],
86
+ mock_config["open_subtitles_api_key"],
87
+ mock_config["open_subtitles_user_agent"],
88
+ mock_config["open_subtitles_username"],
89
+ mock_config["open_subtitles_password"],
90
+ mock_config["show_dir"],
91
+ str(config_file),
92
+ mock_config["tesseract_path"]
93
+ )
94
+ config = get_config(str(config_file))
95
+ assert config["tmdb_api_key"] == mock_config["tmdb_api_key"]
96
+ assert config["show_dir"] == mock_config["show_dir"]
97
+
98
+ class TestEpisodeMatcher:
99
+ @pytest.fixture
100
+ def matcher(self, tmp_path):
101
+ return EpisodeMatcher(tmp_path, "Test Show")
102
+
103
+ def test_clean_text(self, matcher):
104
+ text = "Test [action] <tag> T-t-test"
105
+ assert matcher.clean_text(text) == "test action tag test"
106
+
107
+ def test_chunk_score(self, matcher):
108
+ score = matcher.chunk_score("Test dialogue", "test dialog")
109
+ assert 0 <= score <= 1
110
+
111
+ @patch('subprocess.run')
112
+ def test_extract_audio_chunk(self, mock_run, matcher, tmp_path):
113
+ mkv_file = tmp_path / "test.mkv"
114
+ mkv_file.touch()
115
+ chunk = matcher.extract_audio_chunk(str(mkv_file), 0)
116
+ assert isinstance(chunk, str)
117
+ assert mock_run.called
118
+
119
+ class TestProcessShow:
120
+ @patch('mkv_episode_matcher.episode_matcher.get_valid_seasons')
121
+ @patch('mkv_episode_matcher.episode_matcher.get_config')
122
+ def test_process_show_no_seasons(self, mock_config, mock_seasons, mock_config_data):
123
+ mock_seasons.return_value = []
124
+ mock_config.return_value = mock_config_data
125
+ process_show()
126
+ mock_seasons.assert_called_once()
127
+
128
+ @patch('mkv_episode_matcher.episode_matcher.get_valid_seasons')
129
+ @patch('mkv_episode_matcher.episode_matcher.get_config')
130
+ def test_process_show_with_season(self, mock_config, mock_seasons, temp_show_dir, mock_config_data):
131
+ mock_seasons.return_value = [str(temp_show_dir / "Season 1")]
132
+ mock_config.return_value = mock_config_data
133
+ process_show(season=1)
134
+ mock_seasons.assert_called_once()
135
+
136
+ if __name__ == '__main__':
137
+ pytest.main(['-v'])
@@ -374,7 +374,7 @@ wheels = [
374
374
 
375
375
  [[package]]
376
376
  name = "mkv-episode-matcher"
377
- version = "0.3.1.post1.dev0+g11924a6.d20241126"
377
+ version = "0.3.3.post1.dev0+g40bb202.d20241207"
378
378
  source = { editable = "." }
379
379
  dependencies = [
380
380
  { name = "configparser" },
Binary file (mkv_episode_matcher-0.3.2/.coverage, present only in 0.3.2; contents not shown)
@@ -1,208 +0,0 @@
1
- # mkv_episode_matcher/episode_identification.py
2
-
3
- import os
4
- import glob
5
- from pathlib import Path
6
- from rapidfuzz import fuzz
7
- from collections import defaultdict
8
- import re
9
- from loguru import logger
10
- import json
11
- import shutil
12
-
13
class EpisodeMatcher:
    """Identify a video's season/episode by fuzzy-matching transcript text.

    A speech transcript (a list of segments with ``start`` and ``text``) is
    compared against reference ``.srt`` subtitle files; the reference whose
    text agrees best with the transcript is reported as the match.
    """

    def __init__(self, cache_dir, show_name, min_confidence=0.6):
        """Store matcher configuration.

        Args:
            cache_dir: Directory under which ``data/<series name>`` holds the
                reference subtitle files.
            show_name: Initial series name (``identify_episode`` replaces it
                with the name derived from the video path).
            min_confidence: Minimum confidence for a match to be accepted.
        """
        self.cache_dir = Path(cache_dir)
        self.min_confidence = min_confidence
        self.whisper_segments = None
        self.series_name = show_name

    def clean_text(self, text):
        """Normalize text for comparison.

        Removes bracketed stage directions (``[groans]``) and angle-bracket
        tags (``<i>``), collapses stuttered letters (``"Y-y-you"`` ->
        ``"you"``), squeezes whitespace and lower-cases the result.
        """
        # Remove stage directions like [groans] and tags like <i>...</i>
        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
        # Collapse stutters. The match is case-insensitive and repeated so
        # that "Y-y-you" (capital first letter, several repeats) is handled.
        text = re.sub(r'([A-Za-z])(?:-\1+)+', r'\1', text, flags=re.IGNORECASE)
        # Remove multiple spaces
        text = ' '.join(text.split())
        return text.lower()

    def chunk_score(self, whisper_chunk, ref_chunk):
        """Return a fuzzy similarity score for two chunks of text.

        The score is a 70/30 blend of rapidfuzz's token-sort ratio and
        partial ratio, divided by 100.
        """
        whisper_clean = self.clean_text(whisper_chunk)
        ref_clean = self.clean_text(ref_chunk)

        # Token sort ratio tolerates word-order differences
        token_sort = fuzz.token_sort_ratio(whisper_clean, ref_clean)
        # Partial ratio catches substring matches
        partial = fuzz.partial_ratio(whisper_clean, ref_clean)

        # Weight token sort more heavily but still consider partial matches
        return (token_sort * 0.7 + partial * 0.3) / 100.0

    def identify_episode(self, video_file, temp_dir):
        """Identify which episode matches ``video_file``.

        Args:
            video_file: Path to the video; the series name is taken from the
                name of its grandparent directory.
            temp_dir: Directory expected to contain
                ``<video stem>.segments.json`` with the transcript.

        Returns:
            A dict with ``season``, ``episode``, ``confidence``,
            ``reference_file`` and ``chunk_scores``; or ``None`` when the
            transcript or reference directory is missing, no reference
            reaches ``min_confidence``, or the matched file name carries no
            ``S<nn>E<nn>`` tag.
        """
        # Get series name from parent directory
        self.series_name = Path(video_file).parent.parent.name

        # Load the transcript produced by the speech-recognition step
        segments_file = Path(temp_dir) / f"{Path(video_file).stem}.segments.json"
        if not segments_file.exists():
            logger.error(f"No transcript found for {video_file}. Run speech recognition first.")
            return None

        with open(segments_file) as f:
            self.whisper_segments = json.load(f)

        # Get reference directory for this series
        reference_dir = self.cache_dir / "data" / self.series_name
        if not reference_dir.exists():
            logger.error(f"No reference files found for {self.series_name}")
            return None

        # Match against reference files
        match = self.match_all_references(reference_dir)
        if not match or match['confidence'] < self.min_confidence:
            return None

        # Extract season and episode from the reference file name
        match_file = Path(match['file'])
        season_ep = re.search(r'S(\d+)E(\d+)', match_file.stem)
        if not season_ep:
            return None

        season, episode = map(int, season_ep.groups())
        return {
            'season': season,
            'episode': episode,
            'confidence': match['confidence'],
            'reference_file': str(match_file),
            'chunk_scores': match['chunk_scores']
        }

    @staticmethod
    def _group_by_chunk(segments, chunk_size):
        """Group segment texts by time-window index (``start // chunk_size``)."""
        chunks = {}
        for seg in segments:
            chunk_idx = int(float(seg['start']) // chunk_size)
            chunks.setdefault(chunk_idx, []).append(seg['text'])
        return chunks

    @staticmethod
    def _weighted_confidence(chunk_scores):
        """Average chunk confidences, each scaled by ``0.9 ** chunk_idx``."""
        return sum(c['confidence'] * (0.9 ** c['chunk_idx'])
                   for c in chunk_scores) / len(chunk_scores)

    def match_all_references(self, reference_dir):
        """Score the loaded transcript against every ``*.srt`` in a directory.

        Transcript and reference segments are grouped into 300-second
        windows. Each transcript window is scored against the reference
        window with the same index and its two neighbours, keeping the best
        score. As soon as one window of a reference exceeds
        ``min_confidence`` that reference is returned immediately; otherwise
        the reference with the highest weighted confidence is returned.

        Args:
            reference_dir: Directory containing reference ``.srt`` files.

        Returns:
            A dict with ``file`` (base name), ``confidence`` and
            ``chunk_scores``, or ``None`` when nothing could be scored
            (including when no transcript has been loaded).
        """
        if not self.whisper_segments:
            return None

        chunk_size = 300  # 5 minute chunks
        # The transcript grouping does not depend on the reference file, so
        # it is computed once instead of once per file.
        whisper_chunks = self._group_by_chunk(self.whisper_segments, chunk_size)
        results = defaultdict(list)

        # Sorted so the early exit below does not depend on the arbitrary
        # order in which glob returns files.
        for ref_file in sorted(glob.glob(os.path.join(reference_dir, "*.srt"))):
            filename = os.path.basename(ref_file)
            ref_chunks = self._group_by_chunk(
                self.parse_srt_to_segments(ref_file), chunk_size)

            for chunk_idx, texts in whisper_chunks.items():
                whisper_text = ' '.join(texts)

                # Compare with the same window and its adjacent windows
                scores = [
                    self.chunk_score(whisper_text, ' '.join(ref_chunks[ref_idx]))
                    for ref_idx in range(max(0, chunk_idx - 1), chunk_idx + 2)
                    if ref_idx in ref_chunks
                ]
                if not scores:
                    continue

                chunk_confidence = max(scores)
                logger.info(f"File: {filename}, "
                            f"Time: {chunk_idx*chunk_size}-{(chunk_idx+1)*chunk_size}s, "
                            f"Confidence: {chunk_confidence:.2f}")

                results[filename].append({
                    'chunk_idx': chunk_idx,
                    'confidence': chunk_confidence
                })

                # Early exit if we find a very good match
                if chunk_confidence > self.min_confidence:
                    chunk_scores = results[filename]
                    confidence = self._weighted_confidence(chunk_scores)
                    if confidence > 0:
                        return {
                            'file': filename,
                            'confidence': confidence,
                            'chunk_scores': chunk_scores
                        }

        # No early match found: pick the best overall reference
        best_match = None
        best_confidence = 0
        for filename, chunks in results.items():
            # Earlier chunks are weighted more heavily
            confidence = self._weighted_confidence(chunks)
            if confidence > best_confidence:
                best_confidence = confidence
                best_match = {
                    'file': filename,
                    'confidence': confidence,
                    'chunk_scores': chunks
                }

        return best_match

    def _parse_timing(self, line):
        """Parse an SRT timing line into ``(start, end)`` seconds, or ``None``.

        Tolerates irregular spacing around ``-->`` and ignores anything after
        the end timestamp (e.g. cue position fields).
        """
        start_str, sep, rest = line.partition('-->')
        fields = rest.split()
        if not sep or not fields:
            return None
        try:
            return (self.timestr_to_seconds(start_str),
                    self.timestr_to_seconds(fields[0]))
        except ValueError:
            # Not a real timing line (e.g. dialogue that contains "-->")
            return None

    def parse_srt_to_segments(self, srt_file):
        """Parse an SRT file into a list of segments.

        Args:
            srt_file: Path to a UTF-8 encoded ``.srt`` file (a leading BOM is
                accepted).

        Returns:
            A list of dicts, each with ``start`` and ``end`` (seconds, float)
            and ``text`` (cue lines joined by single spaces; ``''`` for a cue
            without text). Content that precedes any timing line is dropped.
        """
        # 'utf-8-sig' strips a BOM if present, so the first index line is
        # still recognized as digits.
        with open(srt_file, 'r', encoding='utf-8-sig') as f:
            lines = [raw.strip() for raw in f]

        segments = []
        current_segment = {}

        def flush():
            """Store the cue being built (if it has timing) and start a new one."""
            nonlocal current_segment
            if 'start' in current_segment:
                current_segment.setdefault('text', '')
                segments.append(current_segment)
            current_segment = {}

        for i, line in enumerate(lines):
            timing = self._parse_timing(line) if '-->' in line else None

            if timing is not None:  # Timestamp
                # A second timing line without an index in between still
                # starts a new cue.
                if 'start' in current_segment:
                    flush()
                current_segment['start'], current_segment['end'] = timing

            elif (line.isdigit() and i + 1 < len(lines)
                    and '-->' in lines[i + 1]):  # Index
                # Only digits directly followed by a timing line are a cue
                # index; a digits-only dialogue line such as "42" is text.
                flush()

            elif line:  # Text
                if 'text' in current_segment:
                    current_segment['text'] += ' ' + line
                else:
                    current_segment['text'] = line

        flush()
        return segments

    def timestr_to_seconds(self, timestr):
        """Convert an SRT timestamp (``HH:MM:SS,mmm``) to seconds.

        Raises:
            ValueError: If the string does not have three ``:``-separated
                numeric fields.
        """
        h, m, s = timestr.strip().replace(',', '.').split(':')
        return float(h) * 3600 + float(m) * 60 + float(s)