mkv-episode-matcher 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mkv-episode-matcher might be problematic. Click here for more details.

Files changed (65) hide show
  1. mkv_episode_matcher-0.3.0/.python-version +1 -0
  2. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/PKG-INFO +5 -3
  3. mkv_episode_matcher-0.3.0/docs/installation.md +81 -0
  4. mkv_episode_matcher-0.3.0/mkv_episode_matcher/episode_identification.py +208 -0
  5. mkv_episode_matcher-0.3.0/mkv_episode_matcher/episode_matcher.py +117 -0
  6. mkv_episode_matcher-0.3.0/mkv_episode_matcher/speech_to_text.py +90 -0
  7. mkv_episode_matcher-0.3.0/mkv_episode_matcher/utils.py +384 -0
  8. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher.egg-info/PKG-INFO +5 -3
  9. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher.egg-info/SOURCES.txt +2 -0
  10. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher.egg-info/requires.txt +3 -1
  11. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/pyproject.toml +6 -2
  12. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/setup.cfg +1 -1
  13. mkv_episode_matcher-0.3.0/uv.lock +1375 -0
  14. mkv_episode_matcher-0.2.0/.python-version +0 -1
  15. mkv_episode_matcher-0.2.0/docs/installation.md +0 -51
  16. mkv_episode_matcher-0.2.0/mkv_episode_matcher/episode_matcher.py +0 -274
  17. mkv_episode_matcher-0.2.0/mkv_episode_matcher/utils.py +0 -232
  18. mkv_episode_matcher-0.2.0/uv.lock +0 -679
  19. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/.coverage +0 -0
  20. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/.gitattributes +0 -0
  21. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/.github/funding.yml +0 -0
  22. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/.github/workflows/documentation.yml +0 -0
  23. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/.github/workflows/python-publish.yml +0 -0
  24. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/.gitignore +0 -0
  25. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/.gitmodules +0 -0
  26. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/.vscode/settings.json +0 -0
  27. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/README.md +0 -0
  28. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.1-py3-none-any.whl +0 -0
  29. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.1.tar.gz +0 -0
  30. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.2-py3-none-any.whl +0 -0
  31. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.2.tar.gz +0 -0
  32. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.3-py3-none-any.whl +0 -0
  33. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.3.tar.gz +0 -0
  34. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.4-py3-none-any.whl +0 -0
  35. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.4.tar.gz +0 -0
  36. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.5-py3-none-any.whl +0 -0
  37. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/dist/mkv_episode_matcher-0.1.5.tar.gz +0 -0
  38. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/docs/api/index.md +0 -0
  39. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/docs/cli.md +0 -0
  40. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/docs/configuration.md +0 -0
  41. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/docs/quickstart.md +0 -0
  42. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/docs/tips.md +0 -0
  43. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkdocs.yml +0 -0
  44. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/.gitattributes +0 -0
  45. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/__init__.py +0 -0
  46. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/__main__.py +0 -0
  47. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/config.py +0 -0
  48. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -0
  49. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -0
  50. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -0
  51. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -0
  52. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/README.md +0 -0
  53. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  54. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -0
  55. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -0
  56. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -0
  57. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -0
  58. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/mkv_to_srt.py +0 -0
  59. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher/tmdb_client.py +0 -0
  60. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher.egg-info/dependency_links.txt +0 -0
  61. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher.egg-info/entry_points.txt +0 -0
  62. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/mkv_episode_matcher.egg-info/top_level.txt +0 -0
  63. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/setup.py +0 -0
  64. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/tests/__init__.py +0 -0
  65. {mkv_episode_matcher-0.2.0 → mkv_episode_matcher-0.3.0}/tests/test_improvements.py +0 -0
@@ -0,0 +1 @@
1
+ 3.9
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mkv-episode-matcher
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
5
5
  Home-page: https://github.com/Jsakkos/mkv-episode-matcher
6
6
  Author: Jonathan Sakkos
@@ -14,16 +14,18 @@ Classifier: Programming Language :: Python
14
14
  Classifier: Programming Language :: Python :: 3.12
15
15
  Classifier: Programming Language :: Python :: Implementation :: CPython
16
16
  Classifier: Programming Language :: Python :: Implementation :: PyPy
17
- Requires-Python: >=3.10
17
+ Requires-Python: >=3.9
18
18
  Description-Content-Type: text/markdown
19
19
  Requires-Dist: configparser>=7.1.0
20
20
  Requires-Dist: ffmpeg>=1.4
21
21
  Requires-Dist: loguru>=0.7.2
22
- Requires-Dist: numpy>=2.1.3
22
+ Requires-Dist: openai-whisper>=20240930
23
23
  Requires-Dist: opensubtitlescom>=0.1.5
24
24
  Requires-Dist: pytesseract>=0.3.13
25
+ Requires-Dist: rapidfuzz>=3.10.1
25
26
  Requires-Dist: requests>=2.32.3
26
27
  Requires-Dist: tmdb-client>=0.0.1
28
+ Requires-Dist: wave>=0.0.2
27
29
 
28
30
  # MKV Episode Matcher
29
31
 
@@ -0,0 +1,81 @@
1
+ # Installation Guide
2
+
3
+ ## Basic Installation
4
+
5
+ Install MKV Episode Matcher using pip:
6
+
7
+ ```bash
8
+ pip install mkv-episode-matcher
9
+ ```
10
+
11
+ ## Installation Options
12
+
13
+ ### GPU Support
14
+
15
+ For GPU acceleration (recommended if you have a CUDA-capable GPU):
16
+
17
+ ```bash
18
+ pip install "mkv-episode-matcher"
19
+ ```
20
+ Find the appropriate CUDA version and upgrade Torch (e.g., for CUDA 12.4):
21
+ ```bash
22
+ pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
23
+ ```
24
+
25
+
26
+ ### Development Installation
27
+
28
+ For contributing or development:
29
+
30
+ ```bash
31
+ # Clone the repository
32
+ git clone https://github.com/Jsakkos/mkv-episode-matcher.git
33
+ cd mkv-episode-matcher
34
+
35
+ # Install UV
36
+ pip install uv
37
+
38
+ # Install with development dependencies
39
+ uv venv
40
+ uv pip install -e ".[dev]"
41
+ ```
42
+
43
+ ## API Keys Setup
44
+
45
+ 1. **TMDb API Key**
46
+ - Create an account at [TMDb](https://www.themoviedb.org/)
47
+ - Go to your account settings
48
+ - Request an API key
49
+
50
+ 2. **OpenSubtitles (Optional)**
51
+ - Register at [OpenSubtitles](https://www.opensubtitles.com/)
52
+ - Get your API key from the dashboard
53
+
54
+ ## System Requirements
55
+
56
+ ### For GPU Support
57
+ - CUDA-capable NVIDIA GPU
58
+ - CUDA Toolkit 12.1 or compatible version
59
+ - At least 4GB GPU memory recommended for Whisper speech recognition
60
+
61
+ ### For CPU-Only
62
+ - No special requirements beyond Python 3.9+
63
+
64
+ ## Verification
65
+
66
+ Verify your installation:
67
+
68
+ ```bash
69
+ mkv-match --version
70
+
71
+ # Check GPU availability (if installed with GPU support)
72
+ python -c "import torch; print(f'GPU available: {torch.cuda.is_available()}')"
73
+ ```
74
+
75
+ ## Troubleshooting
76
+
77
+ If you encounter any issues:
78
+ 1. Ensure you have the latest pip: `pip install --upgrade pip`
79
+ 2. For GPU installations, verify CUDA is properly installed
80
+ 3. Check the [compatibility matrix](https://pytorch.org/get-started/locally/) for PyTorch and CUDA versions
81
+ 4. If you encounter any other issues, please [open an issue](https://github.com/Jsakkos/mkv-episode-matcher/issues) on GitHub
@@ -0,0 +1,208 @@
1
+ # mkv_episode_matcher/episode_identification.py
2
+
3
+ import os
4
+ import glob
5
+ from pathlib import Path
6
+ from rapidfuzz import fuzz
7
+ from collections import defaultdict
8
+ import re
9
+ from loguru import logger
10
+ import json
11
+ import shutil
12
+
13
+ class EpisodeMatcher:
14
+ def __init__(self, cache_dir, min_confidence=0.6):
15
+ self.cache_dir = Path(cache_dir)
16
+ self.min_confidence = min_confidence
17
+ self.whisper_segments = None
18
+ self.series_name = None
19
+
20
+ def clean_text(self, text):
21
+ """Clean text by removing stage directions and normalizing repeated words."""
22
+ # Remove stage directions like [groans] and <i>SHIP:</i>
23
+ text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
24
+ # Remove repeated words with dashes (e.g., "Y-y-you" -> "you")
25
+ text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
26
+ # Remove multiple spaces
27
+ text = ' '.join(text.split())
28
+ return text.lower()
29
+
30
+ def chunk_score(self, whisper_chunk, ref_chunk):
31
+ """Calculate fuzzy match score between two chunks of text."""
32
+ whisper_clean = self.clean_text(whisper_chunk)
33
+ ref_clean = self.clean_text(ref_chunk)
34
+
35
+ # Use token sort ratio to handle word order differences
36
+ token_sort = fuzz.token_sort_ratio(whisper_clean, ref_clean)
37
+ # Use partial ratio to catch substring matches
38
+ partial = fuzz.partial_ratio(whisper_clean, ref_clean)
39
+
40
+ # Weight token sort more heavily but consider partial matches
41
+ return (token_sort * 0.7 + partial * 0.3) / 100.0
42
+
43
+ def identify_episode(self, video_file, temp_dir):
44
+ """Identify which episode matches this video file."""
45
+
46
+ # Get series name from parent directory
47
+ self.series_name = Path(video_file).parent.parent.name
48
+
49
+ # Load whisper transcript if not already processed
50
+ segments_file = Path(temp_dir) / f"{Path(video_file).stem}.segments.json"
51
+ if not segments_file.exists():
52
+ logger.error(f"No transcript found for {video_file}. Run speech recognition first.")
53
+ return None
54
+
55
+ with open(segments_file) as f:
56
+ self.whisper_segments = json.load(f)
57
+
58
+ # Get reference directory for this series
59
+ reference_dir = self.cache_dir / "data" / self.series_name
60
+ if not reference_dir.exists():
61
+ logger.error(f"No reference files found for {self.series_name}")
62
+ return None
63
+
64
+ # Match against reference files
65
+ match = self.match_all_references(reference_dir)
66
+
67
+ if match and match['confidence'] >= self.min_confidence:
68
+ # Extract season and episode from filename
69
+ match_file = Path(match['file'])
70
+ season_ep = re.search(r'S(\d+)E(\d+)', match_file.stem)
71
+ if season_ep:
72
+ season, episode = map(int, season_ep.groups())
73
+ return {
74
+ 'season': season,
75
+ 'episode': episode,
76
+ 'confidence': match['confidence'],
77
+ 'reference_file': str(match_file),
78
+ 'chunk_scores': match['chunk_scores']
79
+ }
80
+
81
+ return None
82
+
83
+ def match_all_references(self, reference_dir):
84
+ """Process all reference files and track matching scores."""
85
+ results = defaultdict(list)
86
+ best_match = None
87
+ best_confidence = 0
88
+
89
+ def process_chunks(ref_segments, filename):
90
+ nonlocal best_match, best_confidence
91
+
92
+ chunk_size = 300 # 5 minute chunks
93
+ whisper_chunks = defaultdict(list)
94
+ ref_chunks = defaultdict(list)
95
+
96
+ # Group segments into time chunks
97
+ for seg in self.whisper_segments:
98
+ chunk_idx = int(float(seg['start']) // chunk_size)
99
+ whisper_chunks[chunk_idx].append(seg['text'])
100
+
101
+ for seg in ref_segments:
102
+ chunk_idx = int(seg['start'] // chunk_size)
103
+ ref_chunks[chunk_idx].append(seg['text'])
104
+
105
+ # Score each chunk
106
+ for chunk_idx in whisper_chunks:
107
+ whisper_text = ' '.join(whisper_chunks[chunk_idx])
108
+
109
+ # Look for matching reference chunk and adjacent chunks
110
+ scores = []
111
+ for ref_idx in range(max(0, chunk_idx-1), chunk_idx+2):
112
+ if ref_idx in ref_chunks:
113
+ ref_text = ' '.join(ref_chunks[ref_idx])
114
+ score = self.chunk_score(whisper_text, ref_text)
115
+ scores.append(score)
116
+
117
+ if scores:
118
+ chunk_confidence = max(scores)
119
+ logger.info(f"File: {filename}, "
120
+ f"Time: {chunk_idx*chunk_size}-{(chunk_idx+1)*chunk_size}s, "
121
+ f"Confidence: {chunk_confidence:.2f}")
122
+
123
+ results[filename].append({
124
+ 'chunk_idx': chunk_idx,
125
+ 'confidence': chunk_confidence
126
+ })
127
+
128
+ # Early exit if we find a very good match
129
+ if chunk_confidence > self.min_confidence:
130
+ chunk_scores = results[filename]
131
+ confidence = sum(c['confidence'] * (0.9 ** c['chunk_idx'])
132
+ for c in chunk_scores) / len(chunk_scores)
133
+
134
+ if confidence > best_confidence:
135
+ best_confidence = confidence
136
+ best_match = {
137
+ 'file': filename,
138
+ 'confidence': confidence,
139
+ 'chunk_scores': chunk_scores
140
+ }
141
+ return True
142
+
143
+ return False
144
+
145
+ # Process each reference file
146
+ for ref_file in glob.glob(os.path.join(reference_dir, "*.srt")):
147
+ ref_segments = self.parse_srt_to_segments(ref_file)
148
+ filename = os.path.basename(ref_file)
149
+
150
+ if process_chunks(ref_segments, filename):
151
+ break
152
+
153
+ # If no early match found, find best overall match
154
+ if not best_match:
155
+ for filename, chunks in results.items():
156
+ # Weight earlier chunks more heavily
157
+ confidence = sum(c['confidence'] * (0.9 ** c['chunk_idx'])
158
+ for c in chunks) / len(chunks)
159
+
160
+ if confidence > best_confidence:
161
+ best_confidence = confidence
162
+ best_match = {
163
+ 'file': filename,
164
+ 'confidence': confidence,
165
+ 'chunk_scores': chunks
166
+ }
167
+
168
+ return best_match
169
+
170
+ def parse_srt_to_segments(self, srt_file):
171
+ """Parse SRT file into list of segments with start/end times and text."""
172
+ segments = []
173
+ current_segment = {}
174
+
175
+ with open(srt_file, 'r', encoding='utf-8') as f:
176
+ lines = f.readlines()
177
+
178
+ i = 0
179
+ while i < len(lines):
180
+ line = lines[i].strip()
181
+
182
+ if line.isdigit(): # Index
183
+ if current_segment:
184
+ segments.append(current_segment)
185
+ current_segment = {}
186
+
187
+ elif '-->' in line: # Timestamp
188
+ start, end = line.split(' --> ')
189
+ current_segment['start'] = self.timestr_to_seconds(start)
190
+ current_segment['end'] = self.timestr_to_seconds(end)
191
+
192
+ elif line: # Text
193
+ if 'text' in current_segment:
194
+ current_segment['text'] += ' ' + line
195
+ else:
196
+ current_segment['text'] = line
197
+
198
+ i += 1
199
+
200
+ if current_segment:
201
+ segments.append(current_segment)
202
+
203
+ return segments
204
+
205
+ def timestr_to_seconds(self, timestr):
206
+ """Convert SRT timestamp to seconds."""
207
+ h, m, s = timestr.replace(',','.').split(':')
208
+ return float(h) * 3600 + float(m) * 60 + float(s)
@@ -0,0 +1,117 @@
1
+ # mkv_episode_matcher/episode_matcher.py
2
+
3
+ from pathlib import Path
4
+ import shutil
5
+ import glob
6
+ import os
7
+ from loguru import logger
8
+
9
+ from mkv_episode_matcher.__main__ import CONFIG_FILE, CACHE_DIR
10
+ from mkv_episode_matcher.config import get_config
11
+ from mkv_episode_matcher.mkv_to_srt import convert_mkv_to_srt
12
+ from mkv_episode_matcher.tmdb_client import fetch_show_id
13
+ from mkv_episode_matcher.utils import (
14
+ check_filename,
15
+ clean_text,
16
+ cleanup_ocr_files,
17
+ get_subtitles,
18
+ process_reference_srt_files,
19
+ process_srt_files,
20
+ compare_and_rename_files,get_valid_seasons
21
+ )
22
+ from mkv_episode_matcher.speech_to_text import process_speech_to_text
23
+ from mkv_episode_matcher.episode_identification import EpisodeMatcher
24
+
25
def process_show(season=None, dry_run=False, get_subs=False):
    """Match and rename the show's MKV files, season by season.

    Each unprocessed ``.mkv`` file is first matched via speech recognition;
    files that cannot be matched that way are handed to the OCR-based
    subtitle comparison.

    Args:
        season: Season number to restrict processing to, or ``None`` for every
            season directory that contains ``.mkv`` files.
        dry_run: When true, log the intended renames without renaming, and
            leave the temporary files in place.
        get_subs: When true, download subtitles for the processed seasons.
    """
    config = get_config(CONFIG_FILE)
    show_dir = config.get("show_dir")

    # Initialize episode matcher
    matcher = EpisodeMatcher(CACHE_DIR)

    # Get valid season directories
    season_paths = get_valid_seasons(show_dir)
    if not season_paths:
        logger.warning("No seasons with .mkv files found")
        return

    if season is not None:
        season_path = os.path.join(show_dir, f"Season {season}")
        if season_path not in season_paths:
            logger.warning(f"Season {season} has no .mkv files to process")
            return
        season_paths = [season_path]

    # Process each season
    for season_path in season_paths:
        # Get MKV files that haven't been processed
        mkv_files = [f for f in glob.glob(os.path.join(season_path, "*.mkv"))
                     if not check_filename(f)]

        if not mkv_files:
            logger.info(f"No new files to process in {season_path}")
            continue

        # Create temp directories
        temp_dir = Path(season_path) / "temp"
        ocr_dir = Path(season_path) / "ocr"
        temp_dir.mkdir(exist_ok=True)
        ocr_dir.mkdir(exist_ok=True)

        try:
            unmatched_files = []

            # First pass: Try speech recognition matching
            for mkv_file in mkv_files:
                logger.info(f"Attempting speech recognition match for {mkv_file}")

                # Extract audio and run speech recognition.
                # identify_episode is called even if transcription failed: it
                # also sets matcher.series_name, which the OCR pass needs.
                process_speech_to_text(mkv_file, str(temp_dir))
                match = matcher.identify_episode(mkv_file, temp_dir)

                if not match or match['confidence'] < matcher.min_confidence:
                    logger.info(f"Speech recognition match failed for {mkv_file}, will try OCR")
                    unmatched_files.append(mkv_file)
                    continue

                new_name = f"{matcher.series_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
                new_path = os.path.join(season_path, new_name)

                logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
                            f"(confidence: {match['confidence']:.2f})")

                if dry_run:
                    continue
                if os.path.exists(new_path):
                    # Never overwrite a file that already carries this name.
                    logger.warning(f"Not renaming {mkv_file}: {new_path} already exists")
                    continue
                os.rename(mkv_file, new_path)

            # Second pass: Try OCR for unmatched files
            if unmatched_files:
                logger.info(f"Attempting OCR matching for {len(unmatched_files)} unmatched files")

                # Convert files to SRT using OCR
                convert_mkv_to_srt(season_path, unmatched_files)

                # Process OCR results
                reference_text_dict = process_reference_srt_files(matcher.series_name)
                srt_text_dict = process_srt_files(str(ocr_dir))

                # Compare and rename
                compare_and_rename_files(
                    srt_text_dict,
                    reference_text_dict,
                    dry_run=dry_run,
                    min_confidence=0.1  # Lower threshold for OCR
                )

            # Download subtitles if requested
            if get_subs:
                show_id = fetch_show_id(matcher.series_name)
                if show_id:
                    # The season number is the last word of the directory
                    # name; directories without a numeric last word are
                    # skipped instead of raising.
                    seasons = set()
                    for path in season_paths:
                        words = os.path.basename(os.path.normpath(path)).split()
                        if words and words[-1].isdigit():
                            seasons.add(int(words[-1]))
                    get_subtitles(show_id, seasons=seasons)

        finally:
            # Cleanup
            if not dry_run:
                shutil.rmtree(temp_dir)
                cleanup_ocr_files(show_dir)
@@ -0,0 +1,90 @@
1
+ # mkv_episode_matcher/speech_to_text.py
2
+
3
+ import os
4
+ import subprocess
5
+ from pathlib import Path
6
+ import whisper
7
+ import torch
8
+ from loguru import logger
9
+
10
def process_speech_to_text(mkv_file, output_dir):
    """
    Convert MKV file to transcript using Whisper.

    An existing transcript is reused without extracting audio or loading the
    model. A new transcript is written to a temporary file and moved into
    place only once complete, so a failed run never leaves a truncated
    transcript that a later run would reuse.

    Args:
        mkv_file (str): Path to MKV file
        output_dir (str): Directory to save transcript files

    Returns:
        str: Path to the ``<stem>.segments.json`` transcript, or None when
        audio extraction or transcription failed
    """
    import json

    segments_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.segments.json")
    if os.path.exists(segments_file):
        logger.info(f"Using existing transcript: {segments_file}")
        return segments_file

    # Extract audio if not already done
    wav_file = extract_audio(mkv_file, output_dir)
    if not wav_file:
        return None

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        logger.info("CUDA not available. Using CPU.")

    partial_file = f"{segments_file}.tmp"
    try:
        # Load model
        model = whisper.load_model("base", device=device)

        # Generate transcript
        result = model.transcribe(
            wav_file,
            task="transcribe",
            language="en",
        )

        # Save segments
        with open(partial_file, 'w', encoding='utf-8') as f:
            json.dump(result["segments"], f, indent=2)
        os.replace(partial_file, segments_file)

    except Exception as e:
        logger.error(f"Error during transcription: {e}")
        if os.path.exists(partial_file):
            os.remove(partial_file)
        return None

    logger.info(f"Transcript saved to {segments_file}")
    return segments_file
56
+
57
def extract_audio(mkv_file, output_dir):
    """
    Extract audio from MKV file using FFmpeg.

    The audio is written as 16 kHz mono 16-bit PCM. An existing WAV file is
    reused. On failure any partially written WAV file is removed so that a
    later call does not mistake it for a finished extraction.

    Args:
        mkv_file (str): Path to MKV file
        output_dir (str): Directory to save WAV file

    Returns:
        str: Path to extracted WAV file, or None when FFmpeg could not be
        started or exited with an error
    """
    wav_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.wav")

    if os.path.exists(wav_file):
        logger.info(f"Audio file {wav_file} already exists, skipping extraction")
        return wav_file

    logger.info(f"Extracting audio from {mkv_file}")
    cmd = [
        'ffmpeg',
        '-i', mkv_file,
        '-vn',  # Disable video
        '-acodec', 'pcm_s16le',  # Convert to PCM format
        '-ar', '16000',  # Set sample rate to 16kHz
        '-ac', '1',  # Convert to mono
        wav_file
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # FFmpeg reports the actual cause on stderr; include it in the log.
        details = e.stderr.decode('utf-8', errors='replace').strip() if e.stderr else ''
        logger.error(f"Error extracting audio: {e}" + (f"\n{details}" if details else ""))
        if os.path.exists(wav_file):
            os.remove(wav_file)
        return None
    except OSError as e:
        # Raised when the ffmpeg executable is missing or cannot be run.
        logger.error(f"Error extracting audio: could not run ffmpeg: {e}")
        return None

    logger.info(f"Audio extracted to {wav_file}")
    return wav_file