mkv-episode-matcher 0.4.5__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic. Click here for more details.
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.coverage +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/PKG-INFO +1 -1
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/__main__.py +1 -1
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/episode_identification.py +108 -61
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/episode_matcher.py +0 -3
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/utils.py +9 -2
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/PKG-INFO +1 -1
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/SOURCES.txt +0 -1
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/setup.cfg +1 -1
- mkv_episode_matcher-0.4.5/mkv_episode_matcher/speech_to_text.py +0 -96
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.gitattributes +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/funding.yml +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/workflows/documentation.yml +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/workflows/python-publish.yml +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/workflows/tests.yml +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.gitignore +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.gitmodules +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.python-version +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.vscode/settings.json +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/README.md +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/api/index.md +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/cli.md +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/configuration.md +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/installation.md +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/quickstart.md +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/docs/tips.md +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkdocs.yml +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/.gitattributes +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/__init__.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/config.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/README.md +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/mkv_to_srt.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/subtitle_utils.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/tmdb_client.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/dependency_links.txt +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/entry_points.txt +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/requires.txt +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/top_level.txt +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/pyproject.toml +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/setup.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/tests/__init__.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/tests/test_main.py +0 -0
- {mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/uv.lock +0 -0
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: mkv-episode-matcher
|
|
3
|
-
Version: 0.4.5
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
5
5
|
Home-page: https://github.com/Jsakkos/mkv-episode-matcher
|
|
6
6
|
Author: Jonathan Sakkos
|
|
@@ -108,7 +108,7 @@ def main():
|
|
|
108
108
|
)
|
|
109
109
|
args = parser.parse_args()
|
|
110
110
|
if args.check_gpu:
|
|
111
|
-
from mkv_episode_matcher.speech_to_text import check_gpu_support
|
|
111
|
+
from mkv_episode_matcher.utils import check_gpu_support
|
|
112
112
|
check_gpu_support()
|
|
113
113
|
return
|
|
114
114
|
logger.debug(f"Command-line arguments: {args}")
|
|
@@ -18,7 +18,7 @@ class EpisodeMatcher:
|
|
|
18
18
|
self.cache_dir = Path(cache_dir)
|
|
19
19
|
self.min_confidence = min_confidence
|
|
20
20
|
self.show_name = show_name
|
|
21
|
-
self.chunk_duration =
|
|
21
|
+
self.chunk_duration = 30
|
|
22
22
|
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
23
23
|
self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
|
|
24
24
|
self.temp_dir.mkdir(exist_ok=True)
|
|
@@ -44,7 +44,9 @@ class EpisodeMatcher:
|
|
|
44
44
|
'-ss', str(start_time),
|
|
45
45
|
'-t', str(self.chunk_duration),
|
|
46
46
|
'-i', mkv_file,
|
|
47
|
-
'-vn',
|
|
47
|
+
'-vn', # Disable video
|
|
48
|
+
'-sn', # Disable subtitles
|
|
49
|
+
'-dn', # Disable data streams
|
|
48
50
|
'-acodec', 'pcm_s16le',
|
|
49
51
|
'-ar', '16000',
|
|
50
52
|
'-ac', '1',
|
|
@@ -80,31 +82,73 @@ class EpisodeMatcher:
|
|
|
80
82
|
except Exception as e:
|
|
81
83
|
logger.error(f"Error loading reference chunk from {srt_file}: {e}")
|
|
82
84
|
return ''
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
85
|
+
def _try_match_with_model(self, video_file, model_name, max_duration, reference_files):
|
|
86
|
+
"""
|
|
87
|
+
Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
video_file: Path to the video file
|
|
91
|
+
model_name: Name of the Whisper model to use
|
|
92
|
+
max_duration: Maximum duration in seconds to check
|
|
93
|
+
reference_files: List of reference subtitle files
|
|
94
|
+
"""
|
|
95
|
+
# Use cached model
|
|
96
|
+
model = get_whisper_model(model_name, self.device)
|
|
97
|
+
|
|
98
|
+
# Calculate number of chunks to check (30 seconds each)
|
|
99
|
+
num_chunks = max_duration // self.chunk_duration
|
|
100
|
+
|
|
101
|
+
for chunk_idx in range(num_chunks):
|
|
102
|
+
start_time = chunk_idx * self.chunk_duration
|
|
103
|
+
logger.debug(f"Trying {model_name} model at {start_time} seconds")
|
|
93
104
|
|
|
94
|
-
|
|
105
|
+
audio_path = self.extract_audio_chunk(video_file, start_time)
|
|
95
106
|
|
|
96
|
-
|
|
97
|
-
|
|
107
|
+
result = model.transcribe(
|
|
108
|
+
audio_path,
|
|
109
|
+
task="transcribe",
|
|
110
|
+
language="en"
|
|
111
|
+
)
|
|
98
112
|
|
|
99
|
-
|
|
100
|
-
|
|
113
|
+
chunk_text = result["text"]
|
|
114
|
+
best_confidence = 0
|
|
115
|
+
best_match = None
|
|
101
116
|
|
|
102
|
-
#
|
|
117
|
+
# Compare with reference chunks
|
|
118
|
+
for ref_file in reference_files:
|
|
119
|
+
ref_text = self.load_reference_chunk(ref_file, chunk_idx)
|
|
120
|
+
confidence = self.chunk_score(chunk_text, ref_text)
|
|
121
|
+
|
|
122
|
+
if confidence > best_confidence:
|
|
123
|
+
best_confidence = confidence
|
|
124
|
+
best_match = ref_file
|
|
125
|
+
|
|
126
|
+
if confidence > self.min_confidence:
|
|
127
|
+
season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
|
|
128
|
+
if season_ep:
|
|
129
|
+
season, episode = map(int, season_ep.groups())
|
|
130
|
+
return {
|
|
131
|
+
'season': season,
|
|
132
|
+
'episode': episode,
|
|
133
|
+
'confidence': best_confidence,
|
|
134
|
+
'reference_file': str(best_match),
|
|
135
|
+
'matched_at': start_time
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
logger.debug(f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})")
|
|
139
|
+
|
|
140
|
+
return None
|
|
141
|
+
|
|
142
|
+
def identify_episode(self, video_file, temp_dir, season_number):
|
|
143
|
+
"""Progressive episode identification with faster initial attempt."""
|
|
144
|
+
try:
|
|
145
|
+
# Get reference files first
|
|
146
|
+
reference_dir = self.cache_dir / "data" / self.show_name
|
|
103
147
|
patterns = [
|
|
104
|
-
f"S{season_number:02d}E",
|
|
105
|
-
f"S{season_number}E",
|
|
106
|
-
f"{season_number:02d}x",
|
|
107
|
-
f"{season_number}x",
|
|
148
|
+
f"S{season_number:02d}E",
|
|
149
|
+
f"S{season_number}E",
|
|
150
|
+
f"{season_number:02d}x",
|
|
151
|
+
f"{season_number}x",
|
|
108
152
|
]
|
|
109
153
|
|
|
110
154
|
reference_files = []
|
|
@@ -114,55 +158,43 @@ class EpisodeMatcher:
|
|
|
114
158
|
for p in patterns)]
|
|
115
159
|
reference_files.extend(files)
|
|
116
160
|
|
|
117
|
-
# Remove duplicates while preserving order
|
|
118
161
|
reference_files = list(dict.fromkeys(reference_files))
|
|
119
162
|
|
|
120
163
|
if not reference_files:
|
|
121
164
|
logger.error(f"No reference files found for season {season_number}")
|
|
122
165
|
return None
|
|
123
|
-
|
|
124
|
-
#
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
best_confidence = 0
|
|
138
|
-
best_match = None
|
|
139
|
-
|
|
140
|
-
# Compare with reference chunks
|
|
141
|
-
for ref_file in reference_files:
|
|
142
|
-
ref_text = self.load_reference_chunk(ref_file, chunk_idx)
|
|
143
|
-
confidence = self.chunk_score(chunk_text, ref_text)
|
|
144
|
-
|
|
145
|
-
if confidence > best_confidence:
|
|
146
|
-
best_confidence = confidence
|
|
147
|
-
best_match = ref_file
|
|
148
|
-
|
|
149
|
-
if confidence > self.min_confidence:
|
|
150
|
-
season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
|
|
151
|
-
if season_ep:
|
|
152
|
-
season, episode = map(int, season_ep.groups())
|
|
153
|
-
return {
|
|
154
|
-
'season': season,
|
|
155
|
-
'episode': episode,
|
|
156
|
-
'confidence': best_confidence,
|
|
157
|
-
'reference_file': str(best_match),
|
|
158
|
-
}
|
|
166
|
+
|
|
167
|
+
# Try with tiny model first (fastest) - check first 2 minutes
|
|
168
|
+
logger.info("Attempting match with tiny model (first 2 minutes)...")
|
|
169
|
+
match = self._try_match_with_model(video_file, "tiny", 120, reference_files)
|
|
170
|
+
if match and match['confidence'] > 0.65: # Slightly lower threshold for tiny
|
|
171
|
+
logger.info(f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
|
|
172
|
+
return match
|
|
173
|
+
|
|
174
|
+
# If unsuccessful with tiny, try base model on first 3 minutes
|
|
175
|
+
logger.info("Tiny model match failed, trying base model (first 3 minutes)...")
|
|
176
|
+
match = self._try_match_with_model(video_file, "base", 180, reference_files)
|
|
177
|
+
if match and match['confidence'] > self.min_confidence:
|
|
178
|
+
logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
|
|
179
|
+
return match
|
|
159
180
|
|
|
181
|
+
# If still no match, try base model on up to 10 minutes
|
|
182
|
+
logger.info("No match in first 3 minutes, extending base model search to 10 minutes...")
|
|
183
|
+
match = self._try_match_with_model(video_file, "base", 600, reference_files)
|
|
184
|
+
if match:
|
|
185
|
+
logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
|
|
186
|
+
return match
|
|
187
|
+
|
|
188
|
+
logger.info("Speech recognition match failed")
|
|
160
189
|
return None
|
|
161
190
|
|
|
162
191
|
finally:
|
|
163
192
|
# Cleanup temp files
|
|
164
193
|
for file in self.temp_dir.glob("chunk_*.wav"):
|
|
165
|
-
|
|
194
|
+
try:
|
|
195
|
+
file.unlink()
|
|
196
|
+
except Exception as e:
|
|
197
|
+
logger.warning(f"Failed to delete temp file {file}: {e}")
|
|
166
198
|
|
|
167
199
|
def detect_file_encoding(file_path):
|
|
168
200
|
"""
|
|
@@ -279,4 +311,19 @@ class SubtitleReader:
|
|
|
279
311
|
logger.warning(f"Error parsing subtitle block: {e}")
|
|
280
312
|
continue
|
|
281
313
|
|
|
282
|
-
return text_lines
|
|
314
|
+
return text_lines
|
|
315
|
+
|
|
316
|
+
_whisper_models = {}
|
|
317
|
+
|
|
318
|
+
def get_whisper_model(model_name="tiny", device=None):
|
|
319
|
+
"""Cache whisper models to avoid reloading."""
|
|
320
|
+
global _whisper_models
|
|
321
|
+
if device is None:
|
|
322
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
323
|
+
|
|
324
|
+
key = f"{model_name}_{device}"
|
|
325
|
+
if key not in _whisper_models:
|
|
326
|
+
_whisper_models[key] = whisper.load_model(model_name, device=device)
|
|
327
|
+
logger.info(f"Loaded {model_name} model on {device}")
|
|
328
|
+
|
|
329
|
+
return _whisper_models[key]
|
{mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/episode_matcher.py
RENAMED
|
@@ -19,7 +19,6 @@ from mkv_episode_matcher.utils import (
|
|
|
19
19
|
process_srt_files,
|
|
20
20
|
compare_and_rename_files,get_valid_seasons,rename_episode_file
|
|
21
21
|
)
|
|
22
|
-
from mkv_episode_matcher.speech_to_text import process_speech_to_text
|
|
23
22
|
from mkv_episode_matcher.episode_identification import EpisodeMatcher
|
|
24
23
|
|
|
25
24
|
def process_show(season=None, dry_run=False, get_subs=False):
|
|
@@ -76,8 +75,6 @@ def process_show(season=None, dry_run=False, get_subs=False):
|
|
|
76
75
|
|
|
77
76
|
if match:
|
|
78
77
|
new_name = f"{matcher.show_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
|
|
79
|
-
new_path = os.path.join(season_path, new_name)
|
|
80
|
-
|
|
81
78
|
logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
|
|
82
79
|
f"(confidence: {match['confidence']:.2f})")
|
|
83
80
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import os
|
|
3
3
|
import re
|
|
4
4
|
import shutil
|
|
5
|
-
|
|
5
|
+
import torch
|
|
6
6
|
import requests
|
|
7
7
|
from loguru import logger
|
|
8
8
|
from opensubtitlescom import OpenSubtitles
|
|
@@ -389,4 +389,11 @@ def compare_text(text1, text2):
|
|
|
389
389
|
|
|
390
390
|
# Compare the two lists of text lines
|
|
391
391
|
matching_lines = set(flat_text1).intersection(flat_text2)
|
|
392
|
-
return len(matching_lines)
|
|
392
|
+
return len(matching_lines)
|
|
393
|
+
|
|
394
|
+
def check_gpu_support():
|
|
395
|
+
logger.info('Checking GPU support...')
|
|
396
|
+
if torch.cuda.is_available():
|
|
397
|
+
logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
|
|
398
|
+
else:
|
|
399
|
+
logger.warning("CUDA not available. Using CPU. Refer to https://pytorch.org/get-started/locally/ for GPU support.")
|
{mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: mkv-episode-matcher
|
|
3
|
-
Version: 0.4.5
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
5
5
|
Home-page: https://github.com/Jsakkos/mkv-episode-matcher
|
|
6
6
|
Author: Jonathan Sakkos
|
{mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/SOURCES.txt
RENAMED
|
@@ -27,7 +27,6 @@ mkv_episode_matcher/config.py
|
|
|
27
27
|
mkv_episode_matcher/episode_identification.py
|
|
28
28
|
mkv_episode_matcher/episode_matcher.py
|
|
29
29
|
mkv_episode_matcher/mkv_to_srt.py
|
|
30
|
-
mkv_episode_matcher/speech_to_text.py
|
|
31
30
|
mkv_episode_matcher/subtitle_utils.py
|
|
32
31
|
mkv_episode_matcher/tmdb_client.py
|
|
33
32
|
mkv_episode_matcher/utils.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = mkv_episode_matcher
|
|
3
|
-
version = 0.4.5
|
|
3
|
+
version = 0.5.0
|
|
4
4
|
author = Jonathan Sakkos
|
|
5
5
|
author_email = jonathansakkos@gmail.com
|
|
6
6
|
description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
# mkv_episode_matcher/speech_to_text.py
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
import subprocess
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
import whisper
|
|
7
|
-
import torch
|
|
8
|
-
from loguru import logger
|
|
9
|
-
|
|
10
|
-
def process_speech_to_text(mkv_file, output_dir):
|
|
11
|
-
"""
|
|
12
|
-
Convert MKV file to transcript using Whisper.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
mkv_file (str): Path to MKV file
|
|
16
|
-
output_dir (str): Directory to save transcript files
|
|
17
|
-
"""
|
|
18
|
-
# Extract audio if not already done
|
|
19
|
-
wav_file = extract_audio(mkv_file, output_dir)
|
|
20
|
-
if not wav_file:
|
|
21
|
-
return None
|
|
22
|
-
|
|
23
|
-
# Load model
|
|
24
|
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
25
|
-
if device == "cuda":
|
|
26
|
-
logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
|
|
27
|
-
else:
|
|
28
|
-
logger.info("CUDA not available. Using CPU.")
|
|
29
|
-
|
|
30
|
-
model = whisper.load_model("base", device=device)
|
|
31
|
-
|
|
32
|
-
# Generate transcript
|
|
33
|
-
segments_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.segments.json")
|
|
34
|
-
if not os.path.exists(segments_file):
|
|
35
|
-
try:
|
|
36
|
-
result = model.transcribe(
|
|
37
|
-
wav_file,
|
|
38
|
-
task="transcribe",
|
|
39
|
-
language="en",
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
# Save segments
|
|
43
|
-
import json
|
|
44
|
-
with open(segments_file, 'w', encoding='utf-8') as f:
|
|
45
|
-
json.dump(result["segments"], f, indent=2)
|
|
46
|
-
|
|
47
|
-
logger.info(f"Transcript saved to {segments_file}")
|
|
48
|
-
|
|
49
|
-
except Exception as e:
|
|
50
|
-
logger.error(f"Error during transcription: {e}")
|
|
51
|
-
return None
|
|
52
|
-
else:
|
|
53
|
-
logger.info(f"Using existing transcript: {segments_file}")
|
|
54
|
-
|
|
55
|
-
return segments_file
|
|
56
|
-
|
|
57
|
-
def extract_audio(mkv_file, output_dir):
|
|
58
|
-
"""
|
|
59
|
-
Extract audio from MKV file using FFmpeg.
|
|
60
|
-
|
|
61
|
-
Args:
|
|
62
|
-
mkv_file (str): Path to MKV file
|
|
63
|
-
output_dir (str): Directory to save WAV file
|
|
64
|
-
|
|
65
|
-
Returns:
|
|
66
|
-
str: Path to extracted WAV file
|
|
67
|
-
"""
|
|
68
|
-
wav_file = os.path.join(output_dir, f"{Path(mkv_file).stem}.wav")
|
|
69
|
-
|
|
70
|
-
if not os.path.exists(wav_file):
|
|
71
|
-
logger.info(f"Extracting audio from {mkv_file}")
|
|
72
|
-
try:
|
|
73
|
-
cmd = [
|
|
74
|
-
'ffmpeg',
|
|
75
|
-
'-i', mkv_file,
|
|
76
|
-
'-vn', # Disable video
|
|
77
|
-
'-acodec', 'pcm_s16le', # Convert to PCM format
|
|
78
|
-
'-ar', '16000', # Set sample rate to 16kHz
|
|
79
|
-
'-ac', '1', # Convert to mono
|
|
80
|
-
wav_file
|
|
81
|
-
]
|
|
82
|
-
subprocess.run(cmd, check=True, capture_output=True)
|
|
83
|
-
logger.info(f"Audio extracted to {wav_file}")
|
|
84
|
-
except subprocess.CalledProcessError as e:
|
|
85
|
-
logger.error(f"Error extracting audio: {e}")
|
|
86
|
-
return None
|
|
87
|
-
else:
|
|
88
|
-
logger.info(f"Audio file {wav_file} already exists, skipping extraction")
|
|
89
|
-
|
|
90
|
-
return wav_file
|
|
91
|
-
def check_gpu_support():
|
|
92
|
-
logger.info('Checking GPU support...')
|
|
93
|
-
if torch.cuda.is_available():
|
|
94
|
-
logger.info(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
|
|
95
|
-
else:
|
|
96
|
-
logger.warning("CUDA not available. Using CPU. Refer to https://pytorch.org/get-started/locally/ for GPU support.")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/.github/workflows/python-publish.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher/subtitle_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/requires.txt
RENAMED
|
File without changes
|
{mkv_episode_matcher-0.4.5 → mkv_episode_matcher-0.5.0}/mkv_episode_matcher.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|