mkv-episode-matcher 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

@@ -0,0 +1,208 @@
1
+ # mkv_episode_matcher/episode_identification.py
2
+
3
+ import os
4
+ import glob
5
+ from pathlib import Path
6
+ from rapidfuzz import fuzz
7
+ from collections import defaultdict
8
+ import re
9
+ from loguru import logger
10
+ import json
11
+ import shutil
12
+
13
class EpisodeMatcher:
    """Match a video's speech transcript against reference subtitle files.

    The transcript is read from ``<temp_dir>/<video stem>.segments.json`` and
    the references are the ``*.srt`` files in ``<cache_dir>/data/<series>``.
    Both are grouped into fixed-length time chunks whose text is compared
    with rapidfuzz similarity ratios.
    """

    # Bracketed stage directions ("[groans]") and markup tags ("<i>").
    _NOISE_RE = re.compile(r'\[.*?\]|<.*?>')
    # Stuttered word starts ("Y-y-you", "w-what"). The leading \b keeps
    # ordinary hyphenated words such as "well-loved" intact.
    _STUTTER_RE = re.compile(r'\b([A-Za-z])-(?:\1-)*(?=\1)', re.IGNORECASE)
    # "HH:MM:SS,mmm --> HH:MM:SS,mmm", optionally followed by position data.
    _SRT_TIMING_RE = re.compile(
        r'(\d+:\d+:\d+(?:[.,]\d+)?)\s*-->\s*(\d+:\d+:\d+(?:[.,]\d+)?)'
    )

    def __init__(self, cache_dir, min_confidence=0.6):
        """Store the cache location and the acceptance threshold.

        Args:
            cache_dir: Directory that contains ``data/<series name>/*.srt``.
            min_confidence (float): Minimum confidence (0..1) for a match to
                be reported by ``identify_episode``.
        """
        self.cache_dir = Path(cache_dir)
        self.min_confidence = min_confidence
        self.whisper_segments = None
        self.series_name = None

    def clean_text(self, text):
        """Clean text by removing stage directions and collapsing stutters.

        Args:
            text (str): Raw subtitle or transcript text.

        Returns:
            str: Lower-cased text with ``[...]``/``<...>`` spans removed,
            stutters such as "Y-y-you" reduced to "you", and runs of
            whitespace collapsed to single spaces.
        """
        text = self._NOISE_RE.sub('', text)
        text = self._STUTTER_RE.sub('', text)
        return ' '.join(text.split()).lower()

    def chunk_score(self, whisper_chunk, ref_chunk):
        """Calculate fuzzy match score between two chunks of text.

        Returns:
            float: Score in 0..1, weighted 70% token-sort ratio (tolerates
            word-order differences) and 30% partial ratio (substring hits).
        """
        whisper_clean = self.clean_text(whisper_chunk)
        ref_clean = self.clean_text(ref_chunk)

        token_sort = fuzz.token_sort_ratio(whisper_clean, ref_clean)
        partial = fuzz.partial_ratio(whisper_clean, ref_clean)

        return (token_sort * 0.7 + partial * 0.3) / 100.0

    def identify_episode(self, video_file, temp_dir):
        """Identify which episode matches this video file.

        Args:
            video_file: Path to the video; the series name is taken from the
                name of its grandparent directory.
            temp_dir: Directory holding ``<video stem>.segments.json``.

        Returns:
            dict | None: ``season``, ``episode``, ``confidence``,
            ``reference_file`` and ``chunk_scores`` for an accepted match,
            otherwise ``None``.
        """
        video_path = Path(video_file)
        self.series_name = video_path.parent.parent.name

        segments_file = Path(temp_dir) / f"{video_path.stem}.segments.json"
        if not segments_file.exists():
            logger.error(f"No transcript found for {video_file}. Run speech recognition first.")
            return None

        try:
            with open(segments_file, encoding='utf-8') as f:
                self.whisper_segments = json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            # An unreadable transcript is treated like a missing one.
            logger.error(f"Could not read transcript {segments_file}: {e}")
            return None

        reference_dir = self.cache_dir / "data" / self.series_name
        if not reference_dir.exists():
            logger.error(f"No reference files found for {self.series_name}")
            return None

        match = self.match_all_references(reference_dir)
        if not match or match['confidence'] < self.min_confidence:
            return None

        # Season/episode numbers come from the reference file name.
        match_file = Path(match['file'])
        season_ep = re.search(r'S(\d+)E(\d+)', match_file.stem)
        if not season_ep:
            return None

        season, episode = map(int, season_ep.groups())
        return {
            'season': season,
            'episode': episode,
            'confidence': match['confidence'],
            'reference_file': str(match_file),
            'chunk_scores': match['chunk_scores']
        }

    @staticmethod
    def _group_by_chunk(segments, chunk_size):
        """Group segment texts by ``start // chunk_size``."""
        chunks = defaultdict(list)
        for seg in segments:
            chunks[int(float(seg['start']) // chunk_size)].append(seg['text'])
        return dict(chunks)

    @staticmethod
    def _weighted_confidence(chunk_scores):
        """Average chunk confidences, decaying each by 0.9 ** chunk_idx."""
        return sum(c['confidence'] * (0.9 ** c['chunk_idx'])
                   for c in chunk_scores) / len(chunk_scores)

    def match_all_references(self, reference_dir):
        """Process all reference files and track matching scores.

        Each transcript chunk is scored against the reference chunk with the
        same index and its two neighbours; the best of those is the chunk
        confidence. Scanning stops as soon as one reference file reaches an
        aggregate confidence of at least ``min_confidence``.

        Args:
            reference_dir: Directory containing the reference ``*.srt`` files.

        Returns:
            dict | None: ``file`` (base name), ``confidence`` and
            ``chunk_scores`` of the best candidate, or ``None`` when nothing
            could be scored.
        """
        chunk_size = 300  # 5 minute chunks
        # The transcript does not change between reference files: chunk once.
        whisper_chunks = self._group_by_chunk(self.whisper_segments or [], chunk_size)
        results = defaultdict(list)

        # Sorted so the scan order (and so the early exit) is deterministic.
        for ref_file in sorted(glob.glob(os.path.join(reference_dir, "*.srt"))):
            filename = os.path.basename(ref_file)
            ref_chunks = self._group_by_chunk(
                self.parse_srt_to_segments(ref_file), chunk_size
            )

            for chunk_idx, whisper_lines in whisper_chunks.items():
                whisper_text = ' '.join(whisper_lines)
                scores = [
                    self.chunk_score(whisper_text, ' '.join(ref_chunks[ref_idx]))
                    for ref_idx in range(max(0, chunk_idx - 1), chunk_idx + 2)
                    if ref_idx in ref_chunks
                ]
                if not scores:
                    continue

                chunk_confidence = max(scores)
                logger.info(f"File: {filename}, "
                            f"Time: {chunk_idx*chunk_size}-{(chunk_idx+1)*chunk_size}s, "
                            f"Confidence: {chunk_confidence:.2f}")

                results[filename].append({
                    'chunk_idx': chunk_idx,
                    'confidence': chunk_confidence
                })

                if chunk_confidence > self.min_confidence:
                    chunk_scores = results[filename]
                    confidence = self._weighted_confidence(chunk_scores)
                    # Stop only when the aggregate would be accepted by
                    # identify_episode; otherwise keep looking instead of
                    # returning a candidate that is going to be rejected.
                    if confidence >= self.min_confidence:
                        return {
                            'file': filename,
                            'confidence': confidence,
                            'chunk_scores': chunk_scores
                        }

        # No early match found: report the best overall candidate.
        best_match = None
        best_confidence = 0
        for filename, chunks in results.items():
            confidence = self._weighted_confidence(chunks)
            if confidence > best_confidence:
                best_confidence = confidence
                best_match = {
                    'file': filename,
                    'confidence': confidence,
                    'chunk_scores': chunks
                }

        return best_match

    def parse_srt_to_segments(self, srt_file):
        """Parse SRT file into list of segments with start/end times and text.

        A cue starts at its timing line. A digit-only line counts as a cue
        index only when the next line is a timing line, so numeric dialogue
        ("42") stays text. Cues without any text are dropped, so every
        returned segment has all three keys.

        Args:
            srt_file: Path to the SRT file (read as UTF-8; a BOM is ignored
                and undecodable bytes are replaced).

        Returns:
            list[dict]: Segments with ``start`` and ``end`` in seconds and
            ``text`` as the cue lines joined by single spaces.
        """
        with open(srt_file, 'r', encoding='utf-8-sig', errors='replace') as f:
            lines = [raw.strip() for raw in f]

        segments = []
        current = None
        for i, line in enumerate(lines):
            if not line:
                continue

            timing = self._SRT_TIMING_RE.search(line)
            if timing:
                if current is not None and current['text']:
                    segments.append(current)
                current = {
                    'start': self.timestr_to_seconds(timing.group(1)),
                    'end': self.timestr_to_seconds(timing.group(2)),
                    'text': ''
                }
            elif current is None:
                # Index (or stray text) before the first timing line.
                continue
            elif (line.isdigit() and i + 1 < len(lines)
                    and self._SRT_TIMING_RE.search(lines[i + 1])):
                # Index line of the next cue.
                continue
            elif current['text']:
                current['text'] += ' ' + line
            else:
                current['text'] = line

        if current is not None and current['text']:
            segments.append(current)

        return segments

    def timestr_to_seconds(self, timestr):
        """Convert an SRT timestamp (``HH:MM:SS,mmm``) to seconds."""
        h, m, s = timestr.strip().replace(',', '.').split(':')
        return float(h) * 3600 + float(m) * 60 + float(s)
@@ -1,10 +1,12 @@
1
- # episode_matcher.py
2
- import os
3
- import re
1
+ # mkv_episode_matcher/episode_matcher.py
4
2
 
3
+ from pathlib import Path
4
+ import shutil
5
+ import glob
6
+ import os
5
7
  from loguru import logger
6
8
 
7
- from mkv_episode_matcher.__main__ import CACHE_DIR, CONFIG_FILE
9
+ from mkv_episode_matcher.__main__ import CONFIG_FILE, CACHE_DIR
8
10
  from mkv_episode_matcher.config import get_config
9
11
  from mkv_episode_matcher.mkv_to_srt import convert_mkv_to_srt
10
12
  from mkv_episode_matcher.tmdb_client import fetch_show_id
@@ -13,262 +15,103 @@ from mkv_episode_matcher.utils import (
13
15
  clean_text,
14
16
  cleanup_ocr_files,
15
17
  get_subtitles,
18
+ process_reference_srt_files,
19
+ process_srt_files,
20
+ compare_and_rename_files,get_valid_seasons
16
21
  )
22
+ from mkv_episode_matcher.speech_to_text import process_speech_to_text
23
+ from mkv_episode_matcher.episode_identification import EpisodeMatcher
17
24
 
18
-
19
- # hash_data = {}
20
- @logger.catch
21
25
  def process_show(season=None, dry_run=False, get_subs=False):
22
- """
23
- Process the show by downloading episode images and finding matching episodes.
24
- Args:
25
- season (int, optional): The season number to process. If provided, only that season will be processed. Defaults to None.
26
- dry_run (bool, optional): Whether to perform a dry run without actually processing the episodes. Defaults to False.
27
- get_subs (bool, optional): Whether to download subtitles for the episodes. Defaults to False.
28
- """
26
+ """Process the show using both speech recognition and OCR fallback."""
29
27
  config = get_config(CONFIG_FILE)
30
28
  show_dir = config.get("show_dir")
31
- show_name = clean_text(os.path.basename(show_dir))
32
- logger.info(f"Processing show '{show_name}'...")
33
-
34
- show_id = fetch_show_id(show_name)
35
- if show_id is None:
36
- logger.error(f"Could not find show '{os.path.basename(show_dir)}' on TMDb.")
29
+
30
+ # Initialize episode matcher
31
+ matcher = EpisodeMatcher(CACHE_DIR)
32
+
33
+ # Get valid season directories
34
+ season_paths = get_valid_seasons(show_dir)
35
+ if not season_paths:
36
+ logger.warning(f"No seasons with .mkv files found")
37
37
  return
38
38
 
39
- # Get all season directories
40
- season_paths = [
41
- os.path.join(show_dir, d)
42
- for d in os.listdir(show_dir)
43
- if os.path.isdir(os.path.join(show_dir, d))
44
- ]
45
-
46
- # Filter seasons to only include those with .mkv files
47
- valid_season_paths = []
48
- for season_path in season_paths:
49
- mkv_files = [f for f in os.listdir(season_path) if f.endswith(".mkv")]
50
- if mkv_files:
51
- valid_season_paths.append(season_path)
52
-
53
- if not valid_season_paths:
54
- logger.warning(f"No seasons with .mkv files found in show '{show_name}'")
55
- return
56
-
57
- logger.info(
58
- f"Found {len(valid_season_paths)} seasons with .mkv files for show '{show_name}'"
59
- )
60
-
61
- # Extract season numbers from valid paths
62
- seasons_to_process = [
63
- int(os.path.basename(season_path).split()[-1])
64
- for season_path in valid_season_paths
65
- ]
66
-
67
- if get_subs:
68
- get_subtitles(show_id, seasons=set(seasons_to_process))
69
-
70
39
  if season is not None:
71
- # If specific season requested, check if it has .mkv files
72
40
  season_path = os.path.join(show_dir, f"Season {season}")
73
- if season_path not in valid_season_paths:
41
+ if season_path not in season_paths:
74
42
  logger.warning(f"Season {season} has no .mkv files to process")
75
43
  return
76
-
77
- season_paths_to_process = [season_path]
78
- else:
79
- # Process all valid seasons
80
- season_paths_to_process = valid_season_paths
44
+ season_paths = [season_path]
81
45
 
82
46
  # Process each season
83
- for season_path in season_paths_to_process:
84
- logger.info(f"Processing season path: {season_path}")
85
- mkv_files = [
86
- os.path.join(season_path, f)
87
- for f in os.listdir(season_path)
88
- if f.endswith(".mkv")
89
- ]
47
+ for season_path in season_paths:
48
+ # Get MKV files that haven't been processed
49
+ mkv_files = [f for f in glob.glob(os.path.join(season_path, "*.mkv"))
50
+ if not check_filename(f)]
90
51
 
91
- # Filter out files that have already been processed
92
- unprocessed_files = []
93
- for f in mkv_files:
94
- if check_filename(f):
95
- logger.info(f"Skipping {f}, already processed")
96
- else:
97
- unprocessed_files.append(f)
98
-
99
- if not unprocessed_files:
52
+ if not mkv_files:
100
53
  logger.info(f"No new files to process in {season_path}")
101
54
  continue
102
55
 
103
- logger.info(f"Processing {len(unprocessed_files)} files in {season_path}")
104
- convert_mkv_to_srt(season_path, unprocessed_files)
105
-
106
- # Process reference and SRT files after all seasons are converted
107
- reference_text_dict = process_reference_srt_files(show_name)
108
- srt_text_dict = process_srt_files(show_dir)
109
- compare_and_rename_files(srt_text_dict, reference_text_dict, dry_run=dry_run)
110
- cleanup_ocr_files(show_dir)
111
-
112
-
113
- def check_filename(filename):
114
- """
115
- Check if the filename is in the correct format.
116
-
117
- Args:
118
- filename (str): The filename to check.
119
-
120
- Returns:
121
- bool: True if the filename is in the correct format, False otherwise.
122
- """
123
- # Check if the filename matches the expected format
124
- match = re.match(r".*S\d+E\d+", filename)
125
- return bool(match)
126
-
127
-
128
- def extract_srt_text(filepath):
129
- """
130
- Extracts the text from an SRT file.
131
-
132
- Args:
133
- filepath (str): The path to the SRT file.
134
-
135
- Returns:
136
- list: A list of lists, where each inner list represents a block of text from the SRT file.
137
- Each inner list contains the lines of text for that block.
138
- """
139
- # extract the text from the file
140
- with open(filepath) as f:
141
- filepath = f.read()
142
- text_lines = [
143
- filepath.split("\n\n")[i].split("\n")[2:]
144
- for i in range(len(filepath.split("\n\n")))
145
- ]
146
- # remove empty lines
147
- text_lines = [[line for line in lines if line] for lines in text_lines]
148
- # remove <i> or </i> tags
149
- text_lines = [
150
- [re.sub(r"<i>|</i>|", "", line) for line in lines] for lines in text_lines
151
- ]
152
- # remove empty lists
153
- text_lines = [lines for lines in text_lines if lines]
154
- return text_lines
155
-
156
-
157
- def compare_text(text1, text2):
158
- """
159
- Compare two lists of text lines and return the number of matching lines.
160
-
161
- Args:
162
- text1 (list): List of text lines from the first source.
163
- text2 (list): List of text lines from the second source.
164
-
165
- Returns:
166
- int: Number of matching lines between the two sources.
167
- """
168
- # Flatten the list of text lines
169
- flat_text1 = [line for lines in text1 for line in lines]
170
- flat_text2 = [line for lines in text2 for line in lines]
171
-
172
- # Compare the two lists of text lines
173
- matching_lines = set(flat_text1).intersection(flat_text2)
174
- return len(matching_lines)
175
-
176
-
177
- def extract_season_episode(filename):
178
- """
179
- Extract the season and episode number from the filename.
180
-
181
- Args:
182
- filename (str): The filename to extract the season and episode from.
183
-
184
- Returns:
185
- tuple: A tuple containing the season and episode number.
186
- """
187
- # Extract the season and episode number from the filename
188
- match = re.search(r"S(\d+)E(\d+)", filename)
189
- if match:
190
- season = int(match.group(1))
191
- episode = int(match.group(2))
192
- return season, episode
193
- else:
194
- return None, None
195
-
196
-
197
- def process_reference_srt_files(series_name):
198
- """
199
- Process reference SRT files for a given series.
200
-
201
- Args:
202
- series_name (str): The name of the series.
203
-
204
- Returns:
205
- dict: A dictionary containing the reference files where the keys are the MKV filenames
206
- and the values are the corresponding SRT texts.
207
- """
208
- reference_files = {}
209
- reference_dir = os.path.join(CACHE_DIR, "data", series_name)
210
- for dirpath, _, filenames in os.walk(reference_dir):
211
- for filename in filenames:
212
- if filename.lower().endswith(".srt"):
213
- srt_file = os.path.join(dirpath, filename)
214
- logger.info(f"Processing {srt_file}")
215
- srt_text = extract_srt_text(srt_file)
216
- season, episode = extract_season_episode(filename)
217
- mkv_filename = f"{series_name} - S{season:02}E{episode:02}.mkv"
218
- reference_files[mkv_filename] = srt_text
219
- return reference_files
220
-
221
-
222
- def process_srt_files(show_dir):
223
- """
224
- Process all SRT files in the given directory and its subdirectories.
225
-
226
- Args:
227
- show_dir (str): The directory path where the SRT files are located.
228
-
229
- Returns:
230
- dict: A dictionary containing the SRT file paths as keys and their corresponding text content as values.
231
- """
232
- srt_files = {}
233
- for dirpath, _, filenames in os.walk(show_dir):
234
- for filename in filenames:
235
- if filename.lower().endswith(".srt"):
236
- srt_file = os.path.join(dirpath, filename)
237
- logger.info(f"Processing {srt_file}")
238
- srt_text = extract_srt_text(srt_file)
239
- srt_files[srt_file] = srt_text
240
- return srt_files
241
-
242
-
243
- def compare_and_rename_files(srt_files, reference_files, dry_run=False):
244
- """
245
- Compare the srt files with the reference files and rename the matching mkv files.
246
-
247
- Args:
248
- srt_files (dict): A dictionary containing the srt files as keys and their contents as values.
249
- reference_files (dict): A dictionary containing the reference files as keys and their contents as values.
250
- dry_run (bool, optional): If True, the function will only log the renaming actions without actually renaming the files. Defaults to False.
251
- """
252
- logger.info(
253
- f"Comparing {len(srt_files)} srt files with {len(reference_files)} reference files"
254
- )
255
- for srt_text in srt_files.keys():
256
- parent_dir = os.path.dirname(os.path.dirname(srt_text))
257
- for reference in reference_files.keys():
258
- _season, _episode = extract_season_episode(reference)
259
- mkv_file = os.path.join(
260
- parent_dir, os.path.basename(srt_text).replace(".srt", ".mkv")
261
- )
262
- matching_lines = compare_text(
263
- reference_files[reference], srt_files[srt_text]
264
- )
265
- if matching_lines >= int(len(reference_files[reference]) * 0.1):
266
- logger.info(f"Matching lines: {matching_lines}")
267
- logger.info(f"Found matching file: {mkv_file} ->{reference}")
268
- new_filename = os.path.join(parent_dir, reference)
269
- if not os.path.exists(new_filename):
270
- if os.path.exists(mkv_file) and not dry_run:
271
- logger.info(f"Renaming {mkv_file} to {new_filename}")
272
- os.rename(mkv_file, new_filename)
56
+ # Create temp directories
57
+ temp_dir = Path(season_path) / "temp"
58
+ ocr_dir = Path(season_path) / "ocr"
59
+ temp_dir.mkdir(exist_ok=True)
60
+ ocr_dir.mkdir(exist_ok=True)
61
+
62
+ try:
63
+ unmatched_files = []
64
+
65
+ # First pass: Try speech recognition matching
66
+ for mkv_file in mkv_files:
67
+ logger.info(f"Attempting speech recognition match for {mkv_file}")
68
+
69
+ # Extract audio and run speech recognition
70
+ process_speech_to_text(mkv_file, str(temp_dir))
71
+ match = matcher.identify_episode(mkv_file, temp_dir)
72
+
73
+ if match and match['confidence'] >= matcher.min_confidence:
74
+ # Rename the file
75
+ new_name = f"{matcher.series_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
76
+ new_path = os.path.join(season_path, new_name)
77
+
78
+ logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
79
+ f"(confidence: {match['confidence']:.2f})")
80
+
81
+ if not dry_run:
82
+ os.rename(mkv_file, new_path)
273
83
  else:
274
- logger.info(f"File {new_filename} already exists, skipping")
84
+ logger.info(f"Speech recognition match failed for {mkv_file}, will try OCR")
85
+ unmatched_files.append(mkv_file)
86
+
87
+ # Second pass: Try OCR for unmatched files
88
+ if unmatched_files:
89
+ logger.info(f"Attempting OCR matching for {len(unmatched_files)} unmatched files")
90
+
91
+ # Convert files to SRT using OCR
92
+ convert_mkv_to_srt(season_path, unmatched_files)
93
+
94
+ # Process OCR results
95
+ reference_text_dict = process_reference_srt_files(matcher.series_name)
96
+ srt_text_dict = process_srt_files(str(ocr_dir))
97
+
98
+ # Compare and rename
99
+ compare_and_rename_files(
100
+ srt_text_dict,
101
+ reference_text_dict,
102
+ dry_run=dry_run,
103
+ min_confidence=0.1 # Lower threshold for OCR
104
+ )
105
+
106
+ # Download subtitles if requested
107
+ if get_subs:
108
+ show_id = fetch_show_id(matcher.series_name)
109
+ if show_id:
110
+ seasons = {int(os.path.basename(p).split()[-1]) for p in season_paths}
111
+ get_subtitles(show_id, seasons=seasons)
112
+
113
+ finally:
114
+ # Cleanup
115
+ if not dry_run:
116
+ shutil.rmtree(temp_dir)
117
+ cleanup_ocr_files(show_dir)
@@ -0,0 +1,90 @@
1
+ # mkv_episode_matcher/speech_to_text.py
2
+
3
+ import os
4
+ import subprocess
5
+ from pathlib import Path
6
+ import whisper
7
+ import torch
8
+ from loguru import logger
9
+
10
def process_speech_to_text(mkv_file, output_dir):
    """
    Convert MKV file to transcript using Whisper.

    Args:
        mkv_file (str): Path to MKV file
        output_dir (str): Directory to save transcript files

    Returns:
        str | None: Path to ``<stem>.segments.json`` in ``output_dir``, or
        None when audio extraction or transcription failed.
    """
    segments_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.segments.json")

    # Reuse a cached transcript before extracting audio or loading the model.
    if os.path.exists(segments_file):
        logger.info(f"Using existing transcript: {segments_file}")
        return segments_file

    wav_file = extract_audio(mkv_file, output_dir)
    if not wav_file:
        return None

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        logger.info("CUDA not available. Using CPU.")

    try:
        import json

        model = whisper.load_model("base", device=device)
        result = model.transcribe(
            wav_file,
            task="transcribe",
            language="en",
        )

        # Write to a side file and swap it in, so an interrupted dump is
        # never picked up as a finished transcript on the next run.
        partial_file = f"{segments_file}.partial"
        with open(partial_file, 'w', encoding='utf-8') as f:
            json.dump(result["segments"], f, indent=2)
        os.replace(partial_file, segments_file)

        logger.info(f"Transcript saved to {segments_file}")

    except Exception as e:
        logger.error(f"Error during transcription: {e}")
        return None

    return segments_file
56
+
57
def extract_audio(mkv_file, output_dir):
    """
    Extract audio from MKV file using FFmpeg.

    The audio is written as 16 kHz mono 16-bit PCM to ``<stem>.wav`` in
    ``output_dir``. An existing WAV file is reused.

    Args:
        mkv_file (str): Path to MKV file
        output_dir (str): Directory to save WAV file

    Returns:
        str | None: Path to extracted WAV file, or None when FFmpeg is
        missing or fails.
    """
    wav_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.wav")

    if os.path.exists(wav_file):
        logger.info(f"Audio file {wav_file} already exists, skipping extraction")
        return wav_file

    logger.info(f"Extracting audio from {mkv_file}")
    cmd = [
        'ffmpeg',
        '-nostdin',  # Never wait for interactive input
        '-i', mkv_file,
        '-vn',  # Disable video
        '-acodec', 'pcm_s16le',  # Convert to PCM format
        '-ar', '16000',  # Set sample rate to 16kHz
        '-ac', '1',  # Convert to mono
        wav_file
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except FileNotFoundError as e:
        # Raised when the ffmpeg executable itself cannot be found.
        logger.error(f"Error extracting audio: {e}")
        return None
    except subprocess.CalledProcessError as e:
        details = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        logger.error(f"Error extracting audio: {e}")
        if details:
            logger.error(details)
        # Remove a partial file, otherwise the next run would reuse it.
        if os.path.exists(wav_file):
            os.remove(wav_file)
        return None

    logger.info(f"Audio extracted to {wav_file}")
    return wav_file
@@ -11,32 +11,51 @@ from mkv_episode_matcher.__main__ import CACHE_DIR, CONFIG_FILE
11
11
  from mkv_episode_matcher.config import get_config
12
12
  from mkv_episode_matcher.tmdb_client import fetch_season_details
13
13
 
14
-
15
- def check_filename(filename, series_title, season_number, episode_number):
14
def get_valid_seasons(show_dir):
    """
    Get all season directories that contain MKV files.

    Args:
        show_dir (str): Base directory for the TV show

    Returns:
        list: Paths (``os.path.join(show_dir, name)``) of the immediate
        subdirectories holding at least one ``.mkv`` file, sorted by name.
        Empty when ``show_dir`` is unset or not a directory.
    """
    # os.listdir(None) would silently list the current directory.
    if not show_dir or not os.path.isdir(show_dir):
        logger.warning(f"Show directory not found: {show_dir}")
        return []

    valid_season_paths = []
    for entry in sorted(os.listdir(show_dir)):
        season_path = os.path.join(show_dir, entry)
        if not os.path.isdir(season_path):
            continue
        try:
            has_mkv = any(f.endswith(".mkv") for f in os.listdir(season_path))
        except OSError as e:
            logger.warning(f"Could not read {season_path}: {e}")
            continue
        if has_mkv:
            valid_season_paths.append(season_path)

    if not valid_season_paths:
        logger.warning(f"No seasons with .mkv files found in show '{os.path.basename(show_dir)}'")
    else:
        logger.info(
            f"Found {len(valid_season_paths)} seasons with .mkv files in '{os.path.basename(show_dir)}'"
        )

    return valid_season_paths
46
def check_filename(filename):
    """
    Check if the filename is in the correct format (S01E02).

    Only the final path component is inspected, so a parent directory whose
    name happens to contain ``SxxExx`` does not count.

    Args:
        filename (str): The filename (or path) to check.

    Returns:
        bool: True if the file name contains ``S<digits>E<digits>``.
    """
    return bool(re.search(r'S\d+E\d+', os.path.basename(filename)))
40
59
 
41
60
 
42
61
  def scramble_filename(original_file_path, file_number):
@@ -63,57 +82,41 @@ def scramble_filename(original_file_path, file_number):
63
82
  os.rename(original_file_path, new_file_path)
64
83
 
65
84
 
66
- def rename_episode_file(original_file_path, season_number, episode_number):
85
def rename_episode_file(original_file_path, new_filename):
    """
    Rename an episode file with a standardized naming convention.

    The file stays in its directory. When the target name is taken by
    another file, ``_2``, ``_3``, ... is appended before the extension until
    a free name is found.

    Args:
        original_file_path (str): The original file path of the episode.
        new_filename (str): The new filename including season/episode info.

    Returns:
        str: Path to the renamed file, or None if rename failed.
    """
    original_dir = os.path.dirname(original_file_path)
    new_file_path = os.path.join(original_dir, new_filename)

    if os.path.exists(new_file_path):
        # The file already carries the requested name: nothing to do, and
        # in particular do not rename it to a "_2" variant of itself.
        if os.path.exists(original_file_path) and os.path.samefile(
            original_file_path, new_file_path
        ):
            return new_file_path

        logger.warning(f"File already exists: {new_filename}")

        # Add numeric suffix if file exists
        base, ext = os.path.splitext(new_filename)
        suffix = 2
        while True:
            new_filename = f"{base}_{suffix}{ext}"
            new_file_path = os.path.join(original_dir, new_filename)
            if not os.path.exists(new_file_path):
                break
            suffix += 1

    try:
        os.rename(original_file_path, new_file_path)
    except OSError as e:
        logger.error(f"Failed to rename file: {e}")
        return None

    logger.info(f"Renamed {os.path.basename(original_file_path)} -> {new_filename}")
    return new_file_path
117
120
 
118
121
 
119
122
  def get_subtitles(show_id, seasons: set[int]):
@@ -230,3 +233,152 @@ def clean_text(text):
230
233
  cleaned_text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)
231
234
  # Strip leading/trailing whitespace
232
235
  return cleaned_text.strip()
236
+ # mkv_episode_matcher/utils.py
237
+
238
+ # OCR fallback helpers: read SRT files, compare them with the reference subtitles, and rename matching MKV files.
239
+
240
def process_reference_srt_files(series_name):
    """
    Process reference SRT files for a given series.

    Walks ``<CACHE_DIR>/data/<series_name>`` and reads every ``.srt`` file
    whose name contains ``S<season>E<episode>``; other files are skipped.

    Args:
        series_name (str): The name of the series.

    Returns:
        dict: A dictionary containing the reference files where the keys are the MKV filenames
        (``"<series_name> - SxxEyy.mkv"``) and the values are the corresponding SRT texts.
    """
    from mkv_episode_matcher.__main__ import CACHE_DIR

    reference_files = {}
    reference_dir = os.path.join(CACHE_DIR, "data", series_name)

    for dirpath, _, filenames in os.walk(reference_dir):
        for filename in filenames:
            if not filename.lower().endswith(".srt"):
                continue

            season, episode = extract_season_episode(filename)
            if season is None or episode is None:
                # Formatting None with ":02" raises TypeError; a reference
                # without season/episode cannot name a target file anyway.
                logger.warning(f"Skipping {filename}: no SxxExx in file name")
                continue

            srt_file = os.path.join(dirpath, filename)
            logger.info(f"Processing {srt_file}")
            mkv_filename = f"{series_name} - S{season:02}E{episode:02}.mkv"
            reference_files[mkv_filename] = extract_srt_text(srt_file)

    return reference_files
268
+
269
def extract_srt_text(filepath):
    """
    Extracts text content from an SRT file.

    The file is read as UTF-8 (a BOM is ignored, undecodable bytes are
    replaced). Blocks are separated by blank lines; the first two lines of a
    block (index and timestamp) are skipped and the rest is joined with
    spaces. ``[...]`` and ``<...>`` spans are removed.

    Args:
        filepath (str): Path to the SRT file.

    Returns:
        list: One stripped, non-empty text string per subtitle block.
    """
    with open(filepath, encoding="utf-8-sig", errors="replace") as f:
        content = f.read()

    text_lines = []
    # \s* lets "blank" separator lines that contain spaces still split.
    for block in re.split(r'\n\s*\n', content.strip()):
        lines = block.split('\n')
        if len(lines) < 3:
            continue

        text = ' '.join(lines[2:])
        text = re.sub(r'\[.*?\]|\<.*?\>', '', text).strip()
        if text:
            text_lines.append(text)

    return text_lines
300
+
301
def extract_season_episode(filename):
    """
    Pull the season and episode numbers out of a file name.

    Args:
        filename (str): Name containing an ``S<digits>E<digits>`` marker.

    Returns:
        tuple: ``(season_number, episode_number)`` as ints, or
        ``(None, None)`` when the marker is absent.
    """
    found = re.search(r'S(\d+)E(\d+)', filename)
    if found is None:
        return None, None
    season_str, episode_str = found.groups()
    return int(season_str), int(episode_str)
315
def process_srt_files(show_dir):
    """
    Collect the text of every SRT file below a directory.

    Args:
        show_dir (str): Root directory that is walked recursively.

    Returns:
        dict: Maps each SRT file path to the result of ``extract_srt_text``
        for that file.
    """
    collected = {}
    for root, _subdirs, names in os.walk(show_dir):
        for name in names:
            # The extension check is case-insensitive.
            if not name.lower().endswith(".srt"):
                continue
            srt_path = os.path.join(root, name)
            logger.info(f"Processing {srt_path}")
            collected[srt_path] = extract_srt_text(srt_path)
    return collected
334
def compare_and_rename_files(srt_files, reference_files, dry_run=False, min_confidence=0.1):
    """
    Compare the srt files with the reference files and rename the matching mkv files.

    The MKV file is expected two directory levels above its SRT file's name,
    i.e. in the parent of the directory holding the SRT, with the same stem.

    Args:
        srt_files (dict): A dictionary containing the srt files as keys and their contents as values.
        reference_files (dict): A dictionary containing the reference files as keys and their contents as values.
        dry_run (bool, optional): If True, the function will only log the renaming actions without actually renaming the files. Defaults to False.
        min_confidence (float, optional): Fraction of the reference's lines that must also occur in the srt file for a match. Defaults to 0.1.
    """
    logger.info(
        f"Comparing {len(srt_files)} srt files with {len(reference_files)} reference files"
    )
    for srt_path, srt_lines in srt_files.items():
        parent_dir = os.path.dirname(os.path.dirname(srt_path))
        stem = os.path.splitext(os.path.basename(srt_path))[0]
        mkv_file = os.path.join(parent_dir, stem + ".mkv")

        for reference, reference_lines in reference_files.items():
            # At least one shared line is required; otherwise a short or
            # empty reference (threshold 0) would match every file.
            required = max(1, int(len(reference_lines) * min_confidence))
            matching_lines = compare_text(reference_lines, srt_lines)
            if matching_lines < required:
                continue

            logger.info(f"Matching lines: {matching_lines}")
            logger.info(f"Found matching file: {mkv_file} ->{reference}")
            new_filename = os.path.join(parent_dir, reference)
            if os.path.exists(new_filename):
                logger.info(f"File {new_filename} already exists, skipping")
                continue

            if os.path.exists(mkv_file) and not dry_run:
                logger.info(f"Renaming {mkv_file} to {new_filename}")
                os.rename(mkv_file, new_filename)
                # The source is gone; no other reference can apply to it.
                break
366
+
367
def compare_text(text1, text2):
    """
    Compare two collections of text lines and return the number of matching lines.

    Each argument may be a flat list of strings or a list of lists of
    strings. Strings are always treated as whole lines; iterating them
    would compare single characters instead.

    Args:
        text1 (list): Text lines from the first source.
        text2 (list): Text lines from the second source.

    Returns:
        int: Number of distinct lines present in both sources.
    """
    def _lines(text):
        flat = []
        for item in text:
            if isinstance(item, str):
                flat.append(item)
            else:
                flat.extend(item)
        return flat

    return len(set(_lines(text1)).intersection(_lines(text2)))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mkv-episode-matcher
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -14,16 +14,18 @@ Classifier: Programming Language :: Python
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: Implementation :: CPython
16
16
  Classifier: Programming Language :: Python :: Implementation :: PyPy
17
- Requires-Python: >=3.10
17
+ Requires-Python: >=3.9
18
18
  Description-Content-Type: text/markdown
19
19
  Requires-Dist: configparser>=7.1.0
20
20
  Requires-Dist: ffmpeg>=1.4
21
21
  Requires-Dist: loguru>=0.7.2
22
- Requires-Dist: numpy>=2.1.3
22
+ Requires-Dist: openai-whisper>=20240930
23
23
  Requires-Dist: opensubtitlescom>=0.1.5
24
24
  Requires-Dist: pytesseract>=0.3.13
25
+ Requires-Dist: rapidfuzz>=3.10.1
25
26
  Requires-Dist: requests>=2.32.3
26
27
  Requires-Dist: tmdb-client>=0.0.1
28
+ Requires-Dist: wave>=0.0.2
27
29
 
28
30
  # MKV Episode Matcher
29
31
 
@@ -2,10 +2,12 @@ mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb
2
2
  mkv_episode_matcher/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
3
3
  mkv_episode_matcher/__main__.py,sha256=3ZcCUxeI7rUA-4oiCD2WXBiOFJAqLsVVWfZKN446FwQ,6792
4
4
  mkv_episode_matcher/config.py,sha256=zDDKBcsDt5fME9BRqiTi7yWKeast1pZh36BNYMvIBYM,2419
5
- mkv_episode_matcher/episode_matcher.py,sha256=YBbRL-NIIvBwKojQOHoDsE3EQYy9_hn1j-4CAuLwM78,9854
5
+ mkv_episode_matcher/episode_identification.py,sha256=nXv9giH7xHysxnfczpsDefb7DmRXzuz8pgmfBemqwBs,8381
6
+ mkv_episode_matcher/episode_matcher.py,sha256=tffbLz35Mpah0yQ6ASz1dg8tz27s8UrdSZunZ-GQLtY,4560
6
7
  mkv_episode_matcher/mkv_to_srt.py,sha256=4yxBHRVhgVby0UtQ2aTXGuoQpid8pkgjMIaHU6GCdzc,10857
8
+ mkv_episode_matcher/speech_to_text.py,sha256=-bnGvmtPCKyHFPEaXwIcEYTf_P13rNpAJA-2UFeRFrs,2806
7
9
  mkv_episode_matcher/tmdb_client.py,sha256=LbMCgjmp7sCbrQo_CDlpcnryKPz5S7inE24YY9Pyjk4,4172
8
- mkv_episode_matcher/utils.py,sha256=5YWpRbciIAlmhBxa-etGtMA6IabswE3CuefRVQDROz4,9526
10
+ mkv_episode_matcher/utils.py,sha256=hQmJNdTogGnN3qbN6sN1JUPvIe6RHU6ml3B41yZB8DQ,14147
9
11
  mkv_episode_matcher/libraries/pgs2srt/.gitignore,sha256=mt3uxWYZaFurMw_yGE258gWhtGKPVR7e3Ll4ALJpyj4,23
10
12
  mkv_episode_matcher/libraries/pgs2srt/README.md,sha256=olb25G17tj0kxPgp_LcH5I2QWXjgP1m8JFyjYRGz4UU,1374
11
13
  mkv_episode_matcher/libraries/pgs2srt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -16,8 +18,8 @@ mkv_episode_matcher/libraries/pgs2srt/requirements.txt,sha256=sg87dqWw_qpbwciw-M
16
18
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py,sha256=geT1LXdVd8yED9zoJ9K1XfP2JzGcM7u1SslHYrJI09o,10061
17
19
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py,sha256=GKtVy_Lxv-z27mkRG8pJF2znKWXwZTot7jL6kN-zIxM,10503
18
20
  mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py,sha256=AlJHUYXl85J95OzGRik-AHVfzDd7Q8BJCvD4Nr8kRIk,938598
19
- mkv_episode_matcher-0.2.0.dist-info/METADATA,sha256=aocARhBMBFQ5HRTgCCKa5p9pm-4Kw6SJxoxz31sg7HY,3710
20
- mkv_episode_matcher-0.2.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
21
- mkv_episode_matcher-0.2.0.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
22
- mkv_episode_matcher-0.2.0.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
23
- mkv_episode_matcher-0.2.0.dist-info/RECORD,,
21
+ mkv_episode_matcher-0.3.0.dist-info/METADATA,sha256=XAnfyhD0sBKRJqR_LwUdLD30wNNijVRj44girWFrl1w,3781
22
+ mkv_episode_matcher-0.3.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
23
+ mkv_episode_matcher-0.3.0.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
24
+ mkv_episode_matcher-0.3.0.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
25
+ mkv_episode_matcher-0.3.0.dist-info/RECORD,,