mkv-episode-matcher 0.4.5__tar.gz → 0.5.0__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

Files changed (52)
  1. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.coverage +0 -0
  2. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/PKG-INFO +1 -1
  3. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/__main__.py +1 -1
  4. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/episode_identification.py +108 -61
  5. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/episode_matcher.py +0 -3
  6. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/utils.py +9 -2
  7. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/PKG-INFO +1 -1
  8. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/SOURCES.txt +0 -1
  9. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/setup.cfg +1 -1
  10. mkv_episode_matcher-0.4.5/mkv_episode_matcher/speech_to_text.py +0 -96
  11. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.gitattributes +0 -0
  12. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/funding.yml +0 -0
  13. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/workflows/documentation.yml +0 -0
  14. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/workflows/python-publish.yml +0 -0
  15. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/workflows/tests.yml +0 -0
  16. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.gitignore +0 -0
  17. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.gitmodules +0 -0
  18. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.python-version +0 -0
  19. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.vscode/settings.json +0 -0
  20. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/README.md +0 -0
  21. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/api/index.md +0 -0
  22. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/cli.md +0 -0
  23. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/configuration.md +0 -0
  24. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/installation.md +0 -0
  25. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/quickstart.md +0 -0
  26. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/tips.md +0 -0
  27. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkdocs.yml +0 -0
  28. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/.gitattributes +0 -0
  29. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/__init__.py +0 -0
  30. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/config.py +0 -0
  31. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -0
  32. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -0
  33. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -0
  34. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -0
  35. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/README.md +0 -0
  36. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  37. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -0
  38. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -0
  39. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -0
  40. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -0
  41. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/mkv_to_srt.py +0 -0
  42. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/subtitle_utils.py +0 -0
  43. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/tmdb_client.py +0 -0
  44. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/dependency_links.txt +0 -0
  45. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/entry_points.txt +0 -0
  46. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/requires.txt +0 -0
  47. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/top_level.txt +0 -0
  48. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/pyproject.toml +0 -0
  49. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/setup.py +0 -0
  50. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/tests/__init__.py +0 -0
  51. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/tests/test_main.py +0 -0
  52. {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mkv-episode-matcher
3
- Version: 0.4.5
3
+ Version: 0.5.0
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -108,7 +108,7 @@ def main():
108
108
  )
109
109
  args = parser.parse_args()
110
110
  if args.check_gpu:
111
- from mkv_episode_matcher.speech_to_text import check_gpu_support
111
+ from mkv_episode_matcher.utils import check_gpu_support
112
112
  check_gpu_support()
113
113
  return
114
114
  logger.debug(f"Command-line arguments: {args}")
@@ -18,7 +18,7 @@ class EpisodeMatcher:
18
18
  self.cache_dir = Path(cache_dir)
19
19
  self.min_confidence = min_confidence
20
20
  self.show_name = show_name
21
- self.chunk_duration = 300 # 5 minutes
21
+ self.chunk_duration = 30
22
22
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
23
23
  self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
24
24
  self.temp_dir.mkdir(exist_ok=True)
@@ -44,7 +44,9 @@ class EpisodeMatcher:
44
44
  '-ss', str(start_time),
45
45
  '-t', str(self.chunk_duration),
46
46
  '-i', mkv_file,
47
- '-vn',
47
+ '-vn', # Disable video
48
+ '-sn', # Disable subtitles
49
+ '-dn', # Disable data streams
48
50
  '-acodec', 'pcm_s16le',
49
51
  '-ar', '16000',
50
52
  '-ac', '1',
@@ -80,31 +82,73 @@ class EpisodeMatcher:
80
82
  except Exception as e:
81
83
  logger.error(f"Error loading reference chunk from {srt_file}: {e}")
82
84
  return ''
83
-
84
- def identify_episode(self, video_file, temp_dir, season_number):
85
- try:
86
- # Get video duration
87
- duration = float(subprocess.check_output([
88
- 'ffprobe', '-v', 'error',
89
- '-show_entries', 'format=duration',
90
- '-of', 'default=noprint_wrappers=1:nokey=1',
91
- video_file
92
- ]).decode())
85
+ def _try_match_with_model(self, video_file, model_name, max_duration, reference_files):
86
+ """
87
+ Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
88
+
89
+ Args:
90
+ video_file: Path to the video file
91
+ model_name: Name of the Whisper model to use
92
+ max_duration: Maximum duration in seconds to check
93
+ reference_files: List of reference subtitle files
94
+ """
95
+ # Use cached model
96
+ model = get_whisper_model(model_name, self.device)
97
+
98
+ # Calculate number of chunks to check (30 seconds each)
99
+ num_chunks = max_duration // self.chunk_duration
100
+
101
+ for chunk_idx in range(num_chunks):
102
+ start_time = chunk_idx * self.chunk_duration
103
+ logger.debug(f"Trying {model_name} model at {start_time} seconds")
93
104
 
94
- total_chunks = int(np.ceil(duration / self.chunk_duration))
105
+ audio_path = self.extract_audio_chunk(video_file, start_time)
95
106
 
96
- # Load Whisper model
97
- model = whisper.load_model("base", device=self.device)
107
+ result = model.transcribe(
108
+ audio_path,
109
+ task="transcribe",
110
+ language="en"
111
+ )
98
112
 
99
- # Get season-specific reference files using multiple patterns
100
- reference_dir = self.cache_dir / "data" / self.show_name
113
+ chunk_text = result["text"]
114
+ best_confidence = 0
115
+ best_match = None
101
116
 
102
- # Create season patterns for different formats
117
+ # Compare with reference chunks
118
+ for ref_file in reference_files:
119
+ ref_text = self.load_reference_chunk(ref_file, chunk_idx)
120
+ confidence = self.chunk_score(chunk_text, ref_text)
121
+
122
+ if confidence > best_confidence:
123
+ best_confidence = confidence
124
+ best_match = ref_file
125
+
126
+ if confidence > self.min_confidence:
127
+ season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
128
+ if season_ep:
129
+ season, episode = map(int, season_ep.groups())
130
+ return {
131
+ 'season': season,
132
+ 'episode': episode,
133
+ 'confidence': best_confidence,
134
+ 'reference_file': str(best_match),
135
+ 'matched_at': start_time
136
+ }
137
+
138
+ logger.debug(f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})")
139
+
140
+ return None
141
+
142
+ def identify_episode(self, video_file, temp_dir, season_number):
143
+ """Progressive episode identification with faster initial attempt."""
144
+ try:
145
+ # Get reference files first
146
+ reference_dir = self.cache_dir / "data" / self.show_name
103
147
  patterns = [
104
- f"S{season_number:02d}E", # S01E01
105
- f"S{season_number}E", # S1E01
106
- f"{season_number:02d}x", # 01x01
107
- f"{season_number}x", # 1x01
148
+ f"S{season_number:02d}E",
149
+ f"S{season_number}E",
150
+ f"{season_number:02d}x",
151
+ f"{season_number}x",
108
152
  ]
109
153
 
110
154
  reference_files = []
@@ -114,55 +158,43 @@ class EpisodeMatcher:
114
158
  for p in patterns)]
115
159
  reference_files.extend(files)
116
160
 
117
- # Remove duplicates while preserving order
118
161
  reference_files = list(dict.fromkeys(reference_files))
119
162
 
120
163
  if not reference_files:
121
164
  logger.error(f"No reference files found for season {season_number}")
122
165
  return None
123
-
124
- # Process chunks until match found
125
- for chunk_idx in range(min(3, total_chunks)): # Only try first 3 chunks
126
- start_time = chunk_idx * self.chunk_duration
127
- audio_path = self.extract_audio_chunk(video_file, start_time)
128
-
129
- # Transcribe chunk
130
- result = model.transcribe(
131
- audio_path,
132
- task="transcribe",
133
- language="en"
134
- )
135
-
136
- chunk_text = result["text"]
137
- best_confidence = 0
138
- best_match = None
139
-
140
- # Compare with reference chunks
141
- for ref_file in reference_files:
142
- ref_text = self.load_reference_chunk(ref_file, chunk_idx)
143
- confidence = self.chunk_score(chunk_text, ref_text)
144
-
145
- if confidence > best_confidence:
146
- best_confidence = confidence
147
- best_match = ref_file
148
-
149
- if confidence > self.min_confidence:
150
- season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
151
- if season_ep:
152
- season, episode = map(int, season_ep.groups())
153
- return {
154
- 'season': season,
155
- 'episode': episode,
156
- 'confidence': best_confidence,
157
- 'reference_file': str(best_match),
158
- }
166
+
167
+ # Try with tiny model first (fastest) - check first 2 minutes
168
+ logger.info("Attempting match with tiny model (first 2 minutes)...")
169
+ match = self._try_match_with_model(video_file, "tiny", 120, reference_files)
170
+ if match and match['confidence'] > 0.65: # Slightly lower threshold for tiny
171
+ logger.info(f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
172
+ return match
173
+
174
+ # If unsuccessful with tiny, try base model on first 3 minutes
175
+ logger.info("Tiny model match failed, trying base model (first 3 minutes)...")
176
+ match = self._try_match_with_model(video_file, "base", 180, reference_files)
177
+ if match and match['confidence'] > self.min_confidence:
178
+ logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
179
+ return match
159
180
 
181
+ # If still no match, try base model on up to 10 minutes
182
+ logger.info("No match in first 3 minutes, extending base model search to 10 minutes...")
183
+ match = self._try_match_with_model(video_file, "base", 600, reference_files)
184
+ if match:
185
+ logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
186
+ return match
187
+
188
+ logger.info("Speech recognition match failed")
160
189
  return None
161
190
 
162
191
  finally:
163
192
  # Cleanup temp files
164
193
  for file in self.temp_dir.glob("chunk_*.wav"):
165
- file.unlink()
194
+ try:
195
+ file.unlink()
196
+ except Exception as e:
197
+ logger.warning(f"Failed to delete temp file {file}: {e}")
166
198
 
167
199
  def detect_file_encoding(file_path):
168
200
  """
@@ -279,4 +311,19 @@ class SubtitleReader:
279
311
  logger.warning(f"Error parsing subtitle block: {e}")
280
312
  continue
281
313
 
282
- return text_lines
314
+ return text_lines
315
+
316
+ _whisper_models = {}
317
+
318
+ def get_whisper_model(model_name="tiny", device=None):
319
+ """Cache whisper models to avoid reloading."""
320
+ global _whisper_models
321
+ if device is None:
322
+ device = "cuda" if torch.cuda.is_available() else "cpu"
323
+
324
+ key = f"{model_name}_{device}"
325
+ if key not in _whisper_models:
326
+ _whisper_models[key] = whisper.load_model(model_name, device=device)
327
+ logger.info(f"Loaded {model_name} model on {device}")
328
+
329
+ return _whisper_models[key]
@@ -19,7 +19,6 @@ from mkv_episode_matcher.utils import (
19
19
  process_srt_files,
20
20
  compare_and_rename_files,get_valid_seasons,rename_episode_file
21
21
  )
22
- from mkv_episode_matcher.speech_to_text import process_speech_to_text
23
22
  from mkv_episode_matcher.episode_identification import EpisodeMatcher
24
23
 
25
24
  def process_show(season=None, dry_run=False, get_subs=False):
@@ -76,8 +75,6 @@ def process_show(season=None, dry_run=False, get_subs=False):
76
75
 
77
76
  if match:
78
77
  new_name = f"{matcher.show_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
79
- new_path = os.path.join(season_path, new_name)
80
-
81
78
  logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
82
79
  f"(confidence: {match['confidence']:.2f})")
83
80
 
@@ -2,7 +2,7 @@
2
2
  import os
3
3
  import re
4
4
  import shutil
5
-
5
+ import torch
6
6
  import requests
7
7
  from loguru import logger
8
8
  from opensubtitlescom import OpenSubtitles
@@ -389,4 +389,11 @@ def compare_text(text1, text2):
389
389
 
390
390
  # Compare the two lists of text lines
391
391
  matching_lines = set(flat_text1).intersection(flat_text2)
392
- return len(matching_lines)
392
+ return len(matching_lines)
393
+
394
+ def check_gpu_support():
395
+ logger.info('Checking GPU support...')
396
+ if torch.cuda.is_available():
397
+ logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
398
+ else:
399
+ logger.warning("CUDA not available. Using CPU. Refer to https://pytorch.org/get-started/locally/ for GPU support.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: mkv-episode-matcher
3
- Version: 0.4.5
3
+ Version: 0.5.0
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -27,7 +27,6 @@ mkv_episode_matcher/config.py
27
27
  mkv_episode_matcher/episode_identification.py
28
28
  mkv_episode_matcher/episode_matcher.py
29
29
  mkv_episode_matcher/mkv_to_srt.py
30
- mkv_episode_matcher/speech_to_text.py
31
30
  mkv_episode_matcher/subtitle_utils.py
32
31
  mkv_episode_matcher/tmdb_client.py
33
32
  mkv_episode_matcher/utils.py
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = mkv_episode_matcher
3
- version = 0.4.5
3
+ version = 0.5.0
4
4
  author = Jonathan Sakkos
5
5
  author_email = jonathansakkos@gmail.com
6
6
  description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
@@ -1,96 +0,0 @@
1
- # mkv_episode_matcher/speech_to_text.py
2
-
3
- import os
4
- import subprocess
5
- from pathlib import Path
6
- import whisper
7
- import torch
8
- from loguru import logger
9
-
10
- def process_speech_to_text(mkv_file, output_dir):
11
- """
12
- Convert MKV file to transcript using Whisper.
13
-
14
- Args:
15
- mkv_file (str): Path to MKV file
16
- output_dir (str): Directory to save transcript files
17
- """
18
- # Extract audio if not already done
19
- wav_file = extract_audio(mkv_file, output_dir)
20
- if not wav_file:
21
- return None
22
-
23
- # Load model
24
- device = "cuda" if torch.cuda.is_available() else "cpu"
25
- if device == "cuda":
26
- logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
27
- else:
28
- logger.info("CUDA not available. Using CPU.")
29
-
30
- model = whisper.load_model("base", device=device)
31
-
32
- # Generate transcript
33
- segments_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.segments.json")
34
- if not os.path.exists(segments_file):
35
- try:
36
- result = model.transcribe(
37
- wav_file,
38
- task="transcribe",
39
- language="en",
40
- )
41
-
42
- # Save segments
43
- import json
44
- with open(segments_file, 'w', encoding='utf-8') as f:
45
- json.dump(result["segments"], f, indent=2)
46
-
47
- logger.info(f"Transcript saved to {segments_file}")
48
-
49
- except Exception as e:
50
- logger.error(f"Error during transcription: {e}")
51
- return None
52
- else:
53
- logger.info(f"Using existing transcript: {segments_file}")
54
-
55
- return segments_file
56
-
57
- def extract_audio(mkv_file, output_dir):
58
- """
59
- Extract audio from MKV file using FFmpeg.
60
-
61
- Args:
62
- mkv_file (str): Path to MKV file
63
- output_dir (str): Directory to save WAV file
64
-
65
- Returns:
66
- str: Path to extracted WAV file
67
- """
68
- wav_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.wav")
69
-
70
- if not os.path.exists(wav_file):
71
- logger.info(f"Extracting audio from {mkv_file}")
72
- try:
73
- cmd = [
74
- 'ffmpeg',
75
- '-i', mkv_file,
76
- '-vn', # Disable video
77
- '-acodec', 'pcm_s16le', # Convert to PCM format
78
- '-ar', '16000', # Set sample rate to 16kHz
79
- '-ac', '1', # Convert to mono
80
- wav_file
81
- ]
82
- subprocess.run(cmd, check=True, capture_output=True)
83
- logger.info(f"Audio extracted to {wav_file}")
84
- except subprocess.CalledProcessError as e:
85
- logger.error(f"Error extracting audio: {e}")
86
- return None
87
- else:
88
- logger.info(f"Audio file {wav_file} already exists, skipping extraction")
89
-
90
- return wav_file
91
- def check_gpu_support():
92
- logger.info('Checking GPU support...')
93
- if torch.cuda.is_available():
94
- logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
95
- else:
96
- logger.warning("CUDA not available. Using CPU. Refer to https://pytorch.org/get-started/locally/ for GPU support.")