mkv-episode-matcher 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic.
- mkv_episode_matcher/__main__.py +23 -45
- mkv_episode_matcher/episode_identification.py +108 -61
- mkv_episode_matcher/episode_matcher.py +3 -6
- mkv_episode_matcher/utils.py +13 -5
- {mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/METADATA +1 -1
- {mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/RECORD +9 -10
- mkv_episode_matcher/speech_to_text.py +0 -96
- {mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/WHEEL +0 -0
- {mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/__main__.py
CHANGED
@@ -108,75 +108,53 @@ def main():
     )
     args = parser.parse_args()
     if args.check_gpu:
-        from mkv_episode_matcher.
+        from mkv_episode_matcher.utils import check_gpu_support
         check_gpu_support()
         return
     logger.debug(f"Command-line arguments: {args}")
-    open_subtitles_api_key = ""
-    open_subtitles_user_agent = ""
-    open_subtitles_username = ""
-    open_subtitles_password = ""
-    # Check if API key is provided via command-line argument
-    tmdb_api_key = args.tmdb_api_key
-
-    # If API key is not provided, try to get it from the cache
-    if not tmdb_api_key:
-        cached_config = get_config(CONFIG_FILE)
-        if cached_config:
-            tmdb_api_key = cached_config.get("tmdb_api_key")

-    #
+    # Load configuration once
+    config = get_config(CONFIG_FILE)
+
+    # Get TMDb API key
+    tmdb_api_key = args.tmdb_api_key or config.get("tmdb_api_key")
     if not tmdb_api_key:
         tmdb_api_key = input("Enter your TMDb API key: ")
-        # Cache the API key
-
     logger.debug(f"TMDb API Key: {tmdb_api_key}")
+
     logger.debug("Getting OpenSubtitles API key")
-
-
-
-
-
-            open_subtitles_password = cached_config.get("open_subtitles_password")
-    except:
-        pass
+    open_subtitles_api_key = config.get("open_subtitles_api_key")
+    open_subtitles_user_agent = config.get("open_subtitles_user_agent")
+    open_subtitles_username = config.get("open_subtitles_username")
+    open_subtitles_password = config.get("open_subtitles_password")
+
     if args.get_subs:
         if not open_subtitles_api_key:
             open_subtitles_api_key = input("Enter your OpenSubtitles API key: ")
-
         if not open_subtitles_user_agent:
             open_subtitles_user_agent = input("Enter your OpenSubtitles User Agent: ")
-
         if not open_subtitles_username:
             open_subtitles_username = input("Enter your OpenSubtitles Username: ")
-
         if not open_subtitles_password:
             open_subtitles_password = input("Enter your OpenSubtitles Password: ")
-
-    #
-    show_dir = args.show_dir
+
+    # Use config for show directory and tesseract path
+    show_dir = args.show_dir or config.get("show_dir")
+    if not show_dir:
+        show_dir = input("Enter the main directory of the show:")
+    logger.info(f"Show Directory: {show_dir}")
     if not show_dir:
-        show_dir =
-
-        # If show directory is not provided, prompt the user to input it
-        show_dir = input("Enter the main directory of the show:")
-        logger.info(f"Show Directory: {show_dir}")
-    # if the user does not provide a show directory, make the default show directory the current working directory
-    if not show_dir:
-        show_dir = os.getcwd()
+        show_dir = os.getcwd()
+
     if not args.tesseract_path:
-        tesseract_path =
-
+        tesseract_path = config.get("tesseract_path")
         if not tesseract_path:
-            tesseract_path = input(
-                r"Enter the path to the tesseract executable: ['C:\Program Files\Tesseract-OCR\tesseract.exe']"
-            )
-
+            tesseract_path = input(r"Enter the path to the tesseract executable: ['C:\Program Files\Tesseract-OCR\tesseract.exe']")
     else:
         tesseract_path = args.tesseract_path
     logger.debug(f"Teesseract Path: {tesseract_path}")
     logger.debug(f"Show Directory: {show_dir}")
-
+
     # Set the configuration
     set_config(
         tmdb_api_key,
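The rewritten main() loads the config file once and resolves every setting as CLI argument, then cached config, then interactive prompt. A minimal sketch of that precedence pattern (the helper name is illustrative, not part of the package):

def resolve_setting(cli_value, config, key, prompt):
    # Prefer the CLI argument, then the cached config entry, then ask the user.
    value = cli_value or config.get(key)
    if not value:
        value = input(prompt)
    return value

# e.g. tmdb_api_key = resolve_setting(args.tmdb_api_key, config, "tmdb_api_key", "Enter your TMDb API key: ")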
mkv_episode_matcher/episode_identification.py
CHANGED
@@ -18,7 +18,7 @@ class EpisodeMatcher:
         self.cache_dir = Path(cache_dir)
         self.min_confidence = min_confidence
         self.show_name = show_name
-        self.chunk_duration =
+        self.chunk_duration = 30
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
@@ -44,7 +44,9 @@ class EpisodeMatcher:
             '-ss', str(start_time),
             '-t', str(self.chunk_duration),
             '-i', mkv_file,
-            '-vn',
+            '-vn', # Disable video
+            '-sn', # Disable subtitles
+            '-dn', # Disable data streams
             '-acodec', 'pcm_s16le',
             '-ar', '16000',
             '-ac', '1',
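The stream-disabling flags added above keep video, subtitle, and data tracks out of the chunk that gets transcribed. A standalone sketch of an equivalent ffmpeg invocation (the function name and output path are illustrative; the flags and audio settings mirror the hunk):

import subprocess

def extract_audio_chunk(mkv_file, start_time, duration=30, out_path="chunk_0.wav"):
    # Extract a mono, 16 kHz, 16-bit PCM WAV chunk starting at start_time.
    cmd = [
        "ffmpeg", "-y",
        "-ss", str(start_time),
        "-t", str(duration),
        "-i", mkv_file,
        "-vn",  # disable video
        "-sn",  # disable subtitles
        "-dn",  # disable data streams
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        out_path,
    ]
    subprocess.run(cmd, check=True, capture_output=True)
    return out_path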
@@ -80,31 +82,73 @@ class EpisodeMatcher:
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
             return ''
-
-
-
-
-
-
-
-
-
-
+    def _try_match_with_model(self, video_file, model_name, max_duration, reference_files):
+        """
+        Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
+
+        Args:
+            video_file: Path to the video file
+            model_name: Name of the Whisper model to use
+            max_duration: Maximum duration in seconds to check
+            reference_files: List of reference subtitle files
+        """
+        # Use cached model
+        model = get_whisper_model(model_name, self.device)
+
+        # Calculate number of chunks to check (30 seconds each)
+        num_chunks = max_duration // self.chunk_duration
+
+        for chunk_idx in range(num_chunks):
+            start_time = chunk_idx * self.chunk_duration
+            logger.debug(f"Trying {model_name} model at {start_time} seconds")

-
+            audio_path = self.extract_audio_chunk(video_file, start_time)

-
-
+            result = model.transcribe(
+                audio_path,
+                task="transcribe",
+                language="en"
+            )

-
-
+            chunk_text = result["text"]
+            best_confidence = 0
+            best_match = None

-            #
+            # Compare with reference chunks
+            for ref_file in reference_files:
+                ref_text = self.load_reference_chunk(ref_file, chunk_idx)
+                confidence = self.chunk_score(chunk_text, ref_text)
+
+                if confidence > best_confidence:
+                    best_confidence = confidence
+                    best_match = ref_file
+
+                if confidence > self.min_confidence:
+                    season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
+                    if season_ep:
+                        season, episode = map(int, season_ep.groups())
+                        return {
+                            'season': season,
+                            'episode': episode,
+                            'confidence': best_confidence,
+                            'reference_file': str(best_match),
+                            'matched_at': start_time
+                        }
+
+            logger.debug(f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})")
+
+        return None
+
+    def identify_episode(self, video_file, temp_dir, season_number):
+        """Progressive episode identification with faster initial attempt."""
+        try:
+            # Get reference files first
+            reference_dir = self.cache_dir / "data" / self.show_name
             patterns = [
-            f"S{season_number:02d}E",
-            f"S{season_number}E",
-            f"{season_number:02d}x",
-            f"{season_number}x",
+                f"S{season_number:02d}E",
+                f"S{season_number}E",
+                f"{season_number:02d}x",
+                f"{season_number}x",
             ]

             reference_files = []
@@ -114,55 +158,43 @@ class EpisodeMatcher:
                           for p in patterns)]
             reference_files.extend(files)

-            # Remove duplicates while preserving order
             reference_files = list(dict.fromkeys(reference_files))

             if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
-
-            #
-
-
-
-
-
-
-
-
-
-
-
-
-            best_confidence = 0
-            best_match = None
-
-            # Compare with reference chunks
-            for ref_file in reference_files:
-                ref_text = self.load_reference_chunk(ref_file, chunk_idx)
-                confidence = self.chunk_score(chunk_text, ref_text)
-
-                if confidence > best_confidence:
-                    best_confidence = confidence
-                    best_match = ref_file
-
-                if confidence > self.min_confidence:
-                    season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
-                    if season_ep:
-                        season, episode = map(int, season_ep.groups())
-                        return {
-                            'season': season,
-                            'episode': episode,
-                            'confidence': best_confidence,
-                            'reference_file': str(best_match),
-                        }
+
+            # Try with tiny model first (fastest) - check first 2 minutes
+            logger.info("Attempting match with tiny model (first 2 minutes)...")
+            match = self._try_match_with_model(video_file, "tiny", 120, reference_files)
+            if match and match['confidence'] > 0.65: # Slightly lower threshold for tiny
+                logger.info(f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
+                return match
+
+            # If unsuccessful with tiny, try base model on first 3 minutes
+            logger.info("Tiny model match failed, trying base model (first 3 minutes)...")
+            match = self._try_match_with_model(video_file, "base", 180, reference_files)
+            if match and match['confidence'] > self.min_confidence:
+                logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
+                return match

+            # If still no match, try base model on up to 10 minutes
+            logger.info("No match in first 3 minutes, extending base model search to 10 minutes...")
+            match = self._try_match_with_model(video_file, "base", 600, reference_files)
+            if match:
+                logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
+                return match
+
+            logger.info("Speech recognition match failed")
             return None

         finally:
             # Cleanup temp files
             for file in self.temp_dir.glob("chunk_*.wav"):
-
+                try:
+                    file.unlink()
+                except Exception as e:
+                    logger.warning(f"Failed to delete temp file {file}: {e}")

 def detect_file_encoding(file_path):
     """
@@ -279,4 +311,19 @@ class SubtitleReader:
                 logger.warning(f"Error parsing subtitle block: {e}")
                 continue

-        return text_lines
+        return text_lines
+
+_whisper_models = {}
+
+def get_whisper_model(model_name="tiny", device=None):
+    """Cache whisper models to avoid reloading."""
+    global _whisper_models
+    if device is None:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    key = f"{model_name}_{device}"
+    if key not in _whisper_models:
+        _whisper_models[key] = whisper.load_model(model_name, device=device)
+        logger.info(f"Loaded {model_name} model on {device}")
+
+    return _whisper_models[key]
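Together, the module-level _whisper_models cache and _try_match_with_model let identify_episode escalate from a cheap first pass to progressively longer searches, while each Whisper model is loaded at most once per process. A rough sketch of that cascade (the wrapper function and the 0.7 default are illustrative; the model names, time windows, and 0.65 threshold come from the diff above):

def progressive_match(video_file, reference_files, try_match, min_confidence=0.7):
    # Each attempt is (model_name, max_duration_seconds, confidence_threshold).
    attempts = [
        ("tiny", 120, 0.65),            # fast first pass over the first 2 minutes
        ("base", 180, min_confidence),  # base model over the first 3 minutes
        ("base", 600, 0.0),             # last resort: scan up to 10 minutes
    ]
    for model_name, max_duration, threshold in attempts:
        match = try_match(video_file, model_name, max_duration, reference_files)
        if match and match["confidence"] > threshold:
            return match
    return None

# try_match would be a callable like matcher._try_match_with_model.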
mkv_episode_matcher/episode_matcher.py
CHANGED
@@ -19,7 +19,6 @@ from mkv_episode_matcher.utils import (
     process_srt_files,
     compare_and_rename_files,get_valid_seasons,rename_episode_file
 )
-from mkv_episode_matcher.speech_to_text import process_speech_to_text
 from mkv_episode_matcher.episode_identification import EpisodeMatcher

 def process_show(season=None, dry_run=False, get_subs=False):
@@ -32,7 +31,7 @@ def process_show(season=None, dry_run=False, get_subs=False):
     # Early check for reference files
     reference_dir = Path(CACHE_DIR) / "data" / show_name
     reference_files = list(reference_dir.glob("*.srt"))
-    if not reference_files:
+    if (not get_subs) and (not reference_files):
         logger.error(f"No reference subtitle files found in {reference_dir}")
         logger.info("Please download reference subtitles first")
         return
@@ -67,7 +66,7 @@ def process_show(season=None, dry_run=False, get_subs=False):
         if get_subs:
             show_id = fetch_show_id(matcher.show_name)
             if show_id:
-                get_subtitles(show_id, seasons={season_num})
+                get_subtitles(show_id, seasons={season_num}, config=config)

         unmatched_files = []
         for mkv_file in mkv_files:
@@ -76,8 +75,6 @@ def process_show(season=None, dry_run=False, get_subs=False):

             if match:
                 new_name = f"{matcher.show_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
-                new_path = os.path.join(season_path, new_name)
-
                 logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
                             f"(confidence: {match['confidence']:.2f})")

@@ -105,4 +102,4 @@ def process_show(season=None, dry_run=False, get_subs=False):
     finally:
         if not dry_run:
             shutil.rmtree(temp_dir)
-            cleanup_ocr_files(show_dir)
+            cleanup_ocr_files(show_dir)
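process_show now passes its already-loaded config into get_subtitles instead of letting the download path re-read the config file, and it no longer bails out early when subtitles are being fetched (get_subs) but no reference subtitles exist yet. A hedged sketch of the new calling pattern (import paths and the show name are assumptions based on module names elsewhere in this diff):

from mkv_episode_matcher.config import CONFIG_FILE, get_config
from mkv_episode_matcher.tmdb_client import fetch_show_id
from mkv_episode_matcher.utils import get_subtitles

config = get_config(CONFIG_FILE)        # loaded once by the caller
show_id = fetch_show_id("Example Show") # placeholder show name
if show_id:
    get_subtitles(show_id, seasons={1}, config=config)  # reuses the loaded config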
mkv_episode_matcher/utils.py
CHANGED
@@ -2,7 +2,7 @@
 import os
 import re
 import shutil
-
+import torch
 import requests
 from loguru import logger
 from opensubtitlescom import OpenSubtitles
@@ -121,16 +121,17 @@ def rename_episode_file(original_file_path, new_filename):
         logger.error(f"Failed to rename file: {e}")
         return None

-def get_subtitles(show_id, seasons: set[int]):
+def get_subtitles(show_id, seasons: set[int], config=None):
     """
     Retrieves and saves subtitles for a given TV show and seasons.

     Args:
         show_id (int): The ID of the TV show.
         seasons (Set[int]): A set of season numbers for which subtitles should be retrieved.
+        config (Config object, optional): Preloaded configuration.
     """
-
-
+    if config is None:
+        config = get_config(CONFIG_FILE)
     show_dir = config.get("show_dir")
     series_name = sanitize_filename(os.path.basename(show_dir))
     tmdb_api_key = config.get("tmdb_api_key")
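On the callee side, the config=None default keeps older call sites working: get_subtitles loads configuration itself only when the caller did not supply one. A minimal sketch of that lazy-default pattern (load_config_from_disk is a hypothetical stand-in for get_config(CONFIG_FILE)):

def fetch_resources(resource_id, config=None):
    # Load configuration only if the caller didn't pass a preloaded one.
    if config is None:
        config = load_config_from_disk()  # hypothetical loader
    return config.get("show_dir")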
@@ -388,4 +389,11 @@ def compare_text(text1, text2):

     # Compare the two lists of text lines
     matching_lines = set(flat_text1).intersection(flat_text2)
-    return len(matching_lines)
+    return len(matching_lines)
+
+def check_gpu_support():
+    logger.info('Checking GPU support...')
+    if torch.cuda.is_available():
+        logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
+    else:
+        logger.warning("CUDA not available. Using CPU. Refer to https://pytorch.org/get-started/locally/ for GPU support.")
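check_gpu_support now lives in utils (it previously sat in the deleted speech_to_text module), and the new import in __main__.py points at it. Exercising it directly, assuming the package is installed, is just:

from mkv_episode_matcher.utils import check_gpu_support

check_gpu_support()  # logs the CUDA device name, or warns that the CPU will be used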
{mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: mkv-episode-matcher
-Version: 0.
+Version: 0.5.0
 Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
 Home-page: https://github.com/Jsakkos/mkv-episode-matcher
 Author: Jonathan Sakkos
{mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/RECORD
RENAMED
@@ -1,14 +1,13 @@
 mkv_episode_matcher/.gitattributes,sha256=Gh2-F2vCM7SZ01pX23UT8pQcmauXWfF3gwyRSb6ZAFs,66
 mkv_episode_matcher/__init__.py,sha256=aNlpgTo1kHVrBcR3SH6wRmCgKu8KjNTki1ZvFfAud6s,240
-mkv_episode_matcher/__main__.py,sha256=
+mkv_episode_matcher/__main__.py,sha256=swYnLA2T8hvYMuNmK-EVRPBYzUdMxLP7pb8vxLuAnmc,6508
 mkv_episode_matcher/config.py,sha256=zDDKBcsDt5fME9BRqiTi7yWKeast1pZh36BNYMvIBYM,2419
-mkv_episode_matcher/episode_identification.py,sha256=
-mkv_episode_matcher/episode_matcher.py,sha256=
+mkv_episode_matcher/episode_identification.py,sha256=jpDWvb16YAHNUzn9fuiHNJ_TB9EYmNg1ahdp361zSf4,12671
+mkv_episode_matcher/episode_matcher.py,sha256=BjPdPQwEHJWx_EOqj_AjKTsEFumdWHGNh7ERP-gfJ2g,4204
 mkv_episode_matcher/mkv_to_srt.py,sha256=4yxBHRVhgVby0UtQ2aTXGuoQpid8pkgjMIaHU6GCdzc,10857
-mkv_episode_matcher/speech_to_text.py,sha256=wVDrFFR7oASGMyq5cfOWmInEIeU9b3MPCLs9EyJrOMw,3128
 mkv_episode_matcher/subtitle_utils.py,sha256=rYSbd393pKYQW0w4sXgals02WFGqMYYYkQHDbEkWF8c,2666
 mkv_episode_matcher/tmdb_client.py,sha256=LbMCgjmp7sCbrQo_CDlpcnryKPz5S7inE24YY9Pyjk4,4172
-mkv_episode_matcher/utils.py,sha256=
+mkv_episode_matcher/utils.py,sha256=bw2-cQsA4tdL9E1HNVTBuCkjXWDYR1And_1k2_BqdMg,14651
 mkv_episode_matcher/libraries/pgs2srt/.gitignore,sha256=mt3uxWYZaFurMw_yGE258gWhtGKPVR7e3Ll4ALJpyj4,23
 mkv_episode_matcher/libraries/pgs2srt/README.md,sha256=olb25G17tj0kxPgp_LcH5I2QWXjgP1m8JFyjYRGz4UU,1374
 mkv_episode_matcher/libraries/pgs2srt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -19,8 +18,8 @@ mkv_episode_matcher/libraries/pgs2srt/requirements.txt,sha256=sg87dqWw_qpbwciw-M
 mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py,sha256=geT1LXdVd8yED9zoJ9K1XfP2JzGcM7u1SslHYrJI09o,10061
 mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py,sha256=GKtVy_Lxv-z27mkRG8pJF2znKWXwZTot7jL6kN-zIxM,10503
 mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py,sha256=AlJHUYXl85J95OzGRik-AHVfzDd7Q8BJCvD4Nr8kRIk,938598
-mkv_episode_matcher-0.
-mkv_episode_matcher-0.
-mkv_episode_matcher-0.
-mkv_episode_matcher-0.
-mkv_episode_matcher-0.
+mkv_episode_matcher-0.5.0.dist-info/METADATA,sha256=3U2-ciHxqaP2BvtZ-awK5siPcCE9nFjopBID493NgBs,5579
+mkv_episode_matcher-0.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+mkv_episode_matcher-0.5.0.dist-info/entry_points.txt,sha256=IglJ43SuCZq2eQ3shMFILCkmQASJHnDCI3ogohW2Hn4,64
+mkv_episode_matcher-0.5.0.dist-info/top_level.txt,sha256=XRLbd93HUaedeWLtkyTvQjFcE5QcBRYa3V-CfHrq-OI,20
+mkv_episode_matcher-0.5.0.dist-info/RECORD,,
mkv_episode_matcher/speech_to_text.py
DELETED
@@ -1,96 +0,0 @@
-# mkv_episode_matcher/speech_to_text.py
-
-import os
-import subprocess
-from pathlib import Path
-import whisper
-import torch
-from loguru import logger
-
-def process_speech_to_text(mkv_file, output_dir):
-    """
-    Convert MKV file to transcript using Whisper.
-
-    Args:
-        mkv_file (str): Path to MKV file
-        output_dir (str): Directory to save transcript files
-    """
-    # Extract audio if not already done
-    wav_file = extract_audio(mkv_file, output_dir)
-    if not wav_file:
-        return None
-
-    # Load model
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    if device == "cuda":
-        logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
-    else:
-        logger.info("CUDA not available. Using CPU.")
-
-    model = whisper.load_model("base", device=device)
-
-    # Generate transcript
-    segments_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.segments.json")
-    if not os.path.exists(segments_file):
-        try:
-            result = model.transcribe(
-                wav_file,
-                task="transcribe",
-                language="en",
-            )
-
-            # Save segments
-            import json
-            with open(segments_file, 'w', encoding='utf-8') as f:
-                json.dump(result["segments"], f, indent=2)
-
-            logger.info(f"Transcript saved to {segments_file}")
-
-        except Exception as e:
-            logger.error(f"Error during transcription: {e}")
-            return None
-    else:
-        logger.info(f"Using existing transcript: {segments_file}")
-
-    return segments_file
-
-def extract_audio(mkv_file, output_dir):
-    """
-    Extract audio from MKV file using FFmpeg.
-
-    Args:
-        mkv_file (str): Path to MKV file
-        output_dir (str): Directory to save WAV file
-
-    Returns:
-        str: Path to extracted WAV file
-    """
-    wav_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.wav")
-
-    if not os.path.exists(wav_file):
-        logger.info(f"Extracting audio from {mkv_file}")
-        try:
-            cmd = [
-                'ffmpeg',
-                '-i', mkv_file,
-                '-vn', # Disable video
-                '-acodec', 'pcm_s16le', # Convert to PCM format
-                '-ar', '16000', # Set sample rate to 16kHz
-                '-ac', '1', # Convert to mono
-                wav_file
-            ]
-            subprocess.run(cmd, check=True, capture_output=True)
-            logger.info(f"Audio extracted to {wav_file}")
-        except subprocess.CalledProcessError as e:
-            logger.error(f"Error extracting audio: {e}")
-            return None
-    else:
-        logger.info(f"Audio file {wav_file} already exists, skipping extraction")
-
-    return wav_file
-def check_gpu_support():
-    logger.info('Checking GPU support...')
-    if torch.cuda.is_available():
-        logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
-    else:
-        logger.warning("CUDA not available. Using CPU. Refer to https://pytorch.org/get-started/locally/ for GPU support.")
{mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/WHEEL
RENAMED
File without changes
{mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/entry_points.txt
RENAMED
File without changes
{mkv_episode_matcher-0.4.1.dist-info → mkv_episode_matcher-0.5.0.dist-info}/top_level.txt
RENAMED
File without changes