mkv-episode-matcher 0.3.1__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic. Click here for more details.
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.coverage +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/PKG-INFO +1 -1
- mkv_episode_matcher-0.3.3/mkv_episode_matcher/episode_identification.py +150 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/episode_matcher.py +17 -35
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/utils.py +7 -10
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/PKG-INFO +1 -1
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/SOURCES.txt +2 -1
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/setup.cfg +1 -1
- mkv_episode_matcher-0.3.3/tests/test_main.py +137 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/uv.lock +1 -1
- mkv_episode_matcher-0.3.1/mkv_episode_matcher/episode_identification.py +0 -208
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.gitattributes +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.github/funding.yml +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.github/workflows/documentation.yml +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.github/workflows/python-publish.yml +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.gitignore +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.gitmodules +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.python-version +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.vscode/settings.json +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/README.md +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/docs/api/index.md +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/docs/cli.md +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/docs/configuration.md +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/docs/installation.md +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/docs/quickstart.md +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/docs/tips.md +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkdocs.yml +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/.gitattributes +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/__init__.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/__main__.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/config.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/README.md +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/mkv_to_srt.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/speech_to_text.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/tmdb_client.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/dependency_links.txt +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/entry_points.txt +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/requires.txt +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/top_level.txt +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/pyproject.toml +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/setup.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/tests/__init__.py +0 -0
- {mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/tests/test_improvements.py +0 -0
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: mkv-episode-matcher
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
5
5
|
Home-page: https://github.com/Jsakkos/mkv-episode-matcher
|
|
6
6
|
Author: Jonathan Sakkos
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import torch
|
|
7
|
+
from rapidfuzz import fuzz
|
|
8
|
+
from loguru import logger
|
|
9
|
+
import whisper
|
|
10
|
+
import numpy as np
|
|
11
|
+
import re
|
|
12
|
+
class EpisodeMatcher:
    """Match an MKV file to an episode by comparing speech with reference subtitles.

    Audio is cut into fixed-length chunks with ffmpeg, transcribed with
    Whisper, and fuzzy-matched against the time-aligned text of each reference
    ``.srt`` file found under ``<cache_dir>/data/<show_name>``.
    """

    def __init__(self, cache_dir, show_name, min_confidence=0.6):
        """Store matching settings and make sure the chunk directory exists.

        Args:
            cache_dir: Directory whose ``data/<show_name>`` sub-folder holds
                the reference ``.srt`` files.
            show_name: Name of the show; used as the reference sub-folder name.
            min_confidence: Score (0-1) a chunk must exceed to count as a match.
        """
        self.cache_dir = Path(cache_dir)
        self.min_confidence = min_confidence
        self.show_name = show_name
        self.chunk_duration = 300  # 5 minutes
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
        self.temp_dir.mkdir(exist_ok=True)
        # Whisper model, loaded lazily on the first identify_episode() call.
        self._model = None

    def clean_text(self, text):
        """Normalize text for fuzzy comparison.

        Lower-cases the text, removes ``[...]`` and ``<...>`` spans, collapses
        stuttered prefixes ("y-y-you" -> "you") and squeezes whitespace.

        Args:
            text: Raw transcript or subtitle text.

        Returns:
            The cleaned, single-spaced, lower-case string.
        """
        text = text.lower().strip()
        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
        # Collapse the whole run of repeated "x-" prefixes at the start of a
        # word. Anchoring on a word boundary keeps ordinary hyphenated words
        # (e.g. "co-op") untouched.
        text = re.sub(r'\b([A-Za-z])(?:-\1)+', r'\1', text)
        return ' '.join(text.split())

    def chunk_score(self, whisper_chunk, ref_chunk):
        """Score how well a transcribed chunk matches a reference chunk.

        Args:
            whisper_chunk: Text transcribed from the audio chunk.
            ref_chunk: Reference subtitle text for the same time window.

        Returns:
            A float in ``[0, 1]``: 70% token-sort ratio plus 30% partial ratio.
            ``0.0`` when either side is empty after cleaning.
        """
        whisper_clean = self.clean_text(whisper_chunk)
        ref_clean = self.clean_text(ref_chunk)
        # An empty side carries no evidence; never let it score as a match,
        # whatever the fuzzy library returns for empty strings.
        if not whisper_clean or not ref_clean:
            return 0.0
        return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
                fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0

    def extract_audio_chunk(self, mkv_file, start_time):
        """Extract a chunk of audio from MKV file.

        Runs ffmpeg to write ``chunk_duration`` seconds of 16 kHz mono PCM
        audio starting at ``start_time``. An existing chunk file with the same
        start time is reused instead of being re-extracted.

        Args:
            mkv_file: Path of the source video.
            start_time: Offset in seconds at which the chunk begins.

        Returns:
            The chunk's path as a string (returned even if ffmpeg failed; the
            failure is logged).
        """
        chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
        if not chunk_path.exists():
            cmd = [
                'ffmpeg',
                '-ss', str(start_time),
                '-t', str(self.chunk_duration),
                '-i', mkv_file,
                '-vn',
                '-acodec', 'pcm_s16le',
                '-ar', '16000',
                '-ac', '1',
                str(chunk_path)
            ]
            result = subprocess.run(cmd, capture_output=True)
            if result.returncode != 0:
                logger.error(
                    f"ffmpeg exited with code {result.returncode} while "
                    f"extracting audio at {start_time}s from {mkv_file}"
                )
        return str(chunk_path)

    def load_reference_chunk(self, srt_file, chunk_idx):
        """Load reference subtitles for a specific time chunk.

        A cue belongs to the chunk when its *end* time lies within
        ``[chunk_idx * chunk_duration, (chunk_idx + 1) * chunk_duration]``.
        Malformed cues are skipped.

        Args:
            srt_file: Path of the reference ``.srt`` file.
            chunk_idx: Zero-based index of the time chunk.

        Returns:
            The text of all matching cues joined by single spaces
            (empty string if none match).
        """
        chunk_start = chunk_idx * self.chunk_duration
        chunk_end = chunk_start + self.chunk_duration
        text_lines = []

        # errors='replace' keeps a subtitle file with stray non-UTF-8 bytes
        # from aborting the whole identification run.
        with open(srt_file, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read().strip()

        # Cues are separated by blank lines; tolerate extra or
        # whitespace-only separator lines.
        for block in re.split(r'\n\s*\n', content):
            lines = block.split('\n')
            if len(lines) < 3 or '-->' not in lines[1]:  # Skip malformed blocks
                continue

            try:
                # Only the first token after the arrow is the end time;
                # anything following it on the line is ignored.
                end_time = lines[1].split('-->')[1].split()[0]
                hours, minutes, seconds = map(
                    float, end_time.replace(',', '.').split(':')
                )
            except (IndexError, ValueError):
                continue

            total_seconds = hours * 3600 + minutes * 60 + seconds
            if chunk_start <= total_seconds <= chunk_end:
                text_lines.append(' '.join(lines[2:]))

        return ' '.join(text_lines)

    def _load_model(self):
        """Return the Whisper model, loading it once and reusing it afterwards."""
        model = getattr(self, "_model", None)
        if model is None:
            model = whisper.load_model("base", device=self.device)
            self._model = model
        return model

    def _remove_chunk_files(self):
        """Delete extracted ``chunk_*.wav`` files from the chunk directory (best effort)."""
        for chunk_file in self.temp_dir.glob("chunk_*.wav"):
            try:
                chunk_file.unlink()
            except OSError as exc:
                logger.warning(f"Could not remove temporary chunk {chunk_file}: {exc}")

    def identify_episode(self, video_file, temp_dir, season_number):
        """Identify which episode of a season a video file contains.

        Transcribes up to the first three chunks of the video and compares
        each with the same time window of every reference subtitle of the
        season. The best-scoring reference of a chunk is accepted as soon as
        its score exceeds ``min_confidence``.

        Args:
            video_file: Path of the MKV file to identify.
            temp_dir: Not used by this implementation (chunks are written to
                ``self.temp_dir``); kept so existing callers keep working.
            season_number: Season whose ``SxxE..`` reference files are searched.

        Returns:
            A dict with ``season``, ``episode``, ``confidence`` and
            ``reference_file`` on success, otherwise ``None``.

        Raises:
            subprocess.CalledProcessError: If ffprobe fails on ``video_file``.
        """
        # Chunk files are named by start time only, so leftovers from an
        # interrupted run on another video would otherwise be reused.
        self._remove_chunk_files()
        try:
            # Get video duration
            duration = float(subprocess.check_output([
                'ffprobe', '-v', 'error',
                '-show_entries', 'format=duration',
                '-of', 'default=noprint_wrappers=1:nokey=1',
                video_file
            ]).decode())

            total_chunks = int(np.ceil(duration / self.chunk_duration))

            # Get season-specific reference files (sorted so ties are
            # resolved deterministically).
            reference_dir = self.cache_dir / "data" / self.show_name
            season_pattern = f"S{season_number:02d}E"
            reference_files = sorted(
                f for f in reference_dir.glob("*.srt")
                if season_pattern in f.name
            )

            if not reference_files:
                logger.error(f"No reference files found for season {season_number}")
                return None

            # Load the model only once there is something to compare against.
            model = self._load_model()

            # Process chunks until match found
            for chunk_idx in range(min(3, total_chunks)):  # Only try first 3 chunks
                start_time = chunk_idx * self.chunk_duration
                audio_path = self.extract_audio_chunk(video_file, start_time)

                # Transcribe chunk
                result = model.transcribe(
                    audio_path,
                    task="transcribe",
                    language="en"
                )
                chunk_text = result["text"]

                best_confidence = 0
                best_match = None

                # Score every reference before deciding, so the decision is
                # made on the best candidate rather than on whichever file
                # happened to be scored first or last.
                for ref_file in reference_files:
                    ref_text = self.load_reference_chunk(ref_file, chunk_idx)
                    confidence = self.chunk_score(chunk_text, ref_text)

                    if confidence > best_confidence:
                        best_confidence = confidence
                        best_match = ref_file

                if best_match is not None and best_confidence > self.min_confidence:
                    season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
                    if season_ep:
                        season, episode = map(int, season_ep.groups())
                        return {
                            'season': season,
                            'episode': episode,
                            'confidence': best_confidence,
                            'reference_file': str(best_match),
                        }

            return None

        finally:
            # Cleanup temp files
            self._remove_chunk_files()
|
{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/episode_matcher.py
RENAMED
|
@@ -5,7 +5,7 @@ import shutil
|
|
|
5
5
|
import glob
|
|
6
6
|
import os
|
|
7
7
|
from loguru import logger
|
|
8
|
-
|
|
8
|
+
import re
|
|
9
9
|
from mkv_episode_matcher.__main__ import CONFIG_FILE, CACHE_DIR
|
|
10
10
|
from mkv_episode_matcher.config import get_config
|
|
11
11
|
from mkv_episode_matcher.mkv_to_srt import convert_mkv_to_srt
|
|
@@ -17,20 +17,18 @@ from mkv_episode_matcher.utils import (
|
|
|
17
17
|
get_subtitles,
|
|
18
18
|
process_reference_srt_files,
|
|
19
19
|
process_srt_files,
|
|
20
|
-
compare_and_rename_files,get_valid_seasons
|
|
20
|
+
compare_and_rename_files,get_valid_seasons,rename_episode_file
|
|
21
21
|
)
|
|
22
22
|
from mkv_episode_matcher.speech_to_text import process_speech_to_text
|
|
23
23
|
from mkv_episode_matcher.episode_identification import EpisodeMatcher
|
|
24
24
|
|
|
25
25
|
def process_show(season=None, dry_run=False, get_subs=False):
|
|
26
|
-
"""Process the show using
|
|
26
|
+
"""Process the show using streaming speech recognition with OCR fallback."""
|
|
27
27
|
config = get_config(CONFIG_FILE)
|
|
28
28
|
show_dir = config.get("show_dir")
|
|
29
|
+
show_name = clean_text(os.path.basename(show_dir))
|
|
30
|
+
matcher = EpisodeMatcher(CACHE_DIR, show_name)
|
|
29
31
|
|
|
30
|
-
# Initialize episode matcher
|
|
31
|
-
matcher = EpisodeMatcher(CACHE_DIR)
|
|
32
|
-
|
|
33
|
-
# Get valid season directories
|
|
34
32
|
season_paths = get_valid_seasons(show_dir)
|
|
35
33
|
if not season_paths:
|
|
36
34
|
logger.warning(f"No seasons with .mkv files found")
|
|
@@ -43,9 +41,7 @@ def process_show(season=None, dry_run=False, get_subs=False):
|
|
|
43
41
|
return
|
|
44
42
|
season_paths = [season_path]
|
|
45
43
|
|
|
46
|
-
# Process each season
|
|
47
44
|
for season_path in season_paths:
|
|
48
|
-
# Get MKV files that haven't been processed
|
|
49
45
|
mkv_files = [f for f in glob.glob(os.path.join(season_path, "*.mkv"))
|
|
50
46
|
if not check_filename(f)]
|
|
51
47
|
|
|
@@ -53,66 +49,52 @@ def process_show(season=None, dry_run=False, get_subs=False):
|
|
|
53
49
|
logger.info(f"No new files to process in {season_path}")
|
|
54
50
|
continue
|
|
55
51
|
|
|
56
|
-
|
|
52
|
+
season_num = int(re.search(r'Season (\d+)', season_path).group(1))
|
|
57
53
|
temp_dir = Path(season_path) / "temp"
|
|
58
54
|
ocr_dir = Path(season_path) / "ocr"
|
|
59
55
|
temp_dir.mkdir(exist_ok=True)
|
|
60
56
|
ocr_dir.mkdir(exist_ok=True)
|
|
61
57
|
|
|
62
58
|
try:
|
|
63
|
-
# Download subtitles if requested
|
|
64
59
|
if get_subs:
|
|
65
|
-
show_id = fetch_show_id(matcher.
|
|
60
|
+
show_id = fetch_show_id(matcher.show_name)
|
|
66
61
|
if show_id:
|
|
67
|
-
seasons
|
|
68
|
-
|
|
62
|
+
get_subtitles(show_id, seasons={season_num})
|
|
63
|
+
|
|
69
64
|
unmatched_files = []
|
|
70
|
-
|
|
71
|
-
# First pass: Try speech recognition matching
|
|
72
65
|
for mkv_file in mkv_files:
|
|
73
66
|
logger.info(f"Attempting speech recognition match for {mkv_file}")
|
|
67
|
+
match = matcher.identify_episode(mkv_file, temp_dir, season_num)
|
|
74
68
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
match = matcher.identify_episode(mkv_file, temp_dir)
|
|
78
|
-
|
|
79
|
-
if match and match['confidence'] >= matcher.min_confidence:
|
|
80
|
-
# Rename the file
|
|
81
|
-
new_name = f"{matcher.series_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
|
|
69
|
+
if match:
|
|
70
|
+
new_name = f"{matcher.show_name} - S{match['season']:02d}E{match['episode']:02d}.mkv"
|
|
82
71
|
new_path = os.path.join(season_path, new_name)
|
|
83
72
|
|
|
84
73
|
logger.info(f"Speech matched {os.path.basename(mkv_file)} to {new_name} "
|
|
85
74
|
f"(confidence: {match['confidence']:.2f})")
|
|
86
75
|
|
|
87
76
|
if not dry_run:
|
|
88
|
-
|
|
77
|
+
logger.info(f"Renaming {mkv_file} to {new_name}")
|
|
78
|
+
rename_episode_file(mkv_file, new_name)
|
|
89
79
|
else:
|
|
90
|
-
logger.info(f"Speech recognition match failed for {mkv_file},
|
|
80
|
+
logger.info(f"Speech recognition match failed for {mkv_file}, trying OCR")
|
|
91
81
|
unmatched_files.append(mkv_file)
|
|
92
82
|
|
|
93
|
-
#
|
|
83
|
+
# OCR fallback for unmatched files
|
|
94
84
|
if unmatched_files:
|
|
95
85
|
logger.info(f"Attempting OCR matching for {len(unmatched_files)} unmatched files")
|
|
96
|
-
|
|
97
|
-
# Convert files to SRT using OCR
|
|
98
86
|
convert_mkv_to_srt(season_path, unmatched_files)
|
|
99
87
|
|
|
100
|
-
|
|
101
|
-
reference_text_dict = process_reference_srt_files(matcher.series_name)
|
|
88
|
+
reference_text_dict = process_reference_srt_files(matcher.show_name)
|
|
102
89
|
srt_text_dict = process_srt_files(str(ocr_dir))
|
|
103
90
|
|
|
104
|
-
# Compare and rename
|
|
105
91
|
compare_and_rename_files(
|
|
106
92
|
srt_text_dict,
|
|
107
93
|
reference_text_dict,
|
|
108
94
|
dry_run=dry_run,
|
|
109
|
-
min_confidence=0.1 # Lower threshold for OCR
|
|
110
95
|
)
|
|
111
|
-
|
|
112
|
-
|
|
113
96
|
|
|
114
97
|
finally:
|
|
115
|
-
# Cleanup
|
|
116
98
|
if not dry_run:
|
|
117
99
|
shutil.rmtree(temp_dir)
|
|
118
100
|
cleanup_ocr_files(show_dir)
|
|
@@ -117,8 +117,10 @@ def rename_episode_file(original_file_path, new_filename):
|
|
|
117
117
|
except OSError as e:
|
|
118
118
|
logger.error(f"Failed to rename file: {e}")
|
|
119
119
|
return None
|
|
120
|
-
|
|
121
|
-
|
|
120
|
+
except FileExistsError as e:
|
|
121
|
+
logger.error(f"Failed to rename file: {e}")
|
|
122
|
+
return None
|
|
123
|
+
|
|
122
124
|
def get_subtitles(show_id, seasons: set[int]):
|
|
123
125
|
"""
|
|
124
126
|
Retrieves and saves subtitles for a given TV show and seasons.
|
|
@@ -233,9 +235,7 @@ def clean_text(text):
|
|
|
233
235
|
cleaned_text = re.sub(r"\[.*?\]|\(.*?\)|\{.*?\}", "", text)
|
|
234
236
|
# Strip leading/trailing whitespace
|
|
235
237
|
return cleaned_text.strip()
|
|
236
|
-
# mkv_episode_matcher/utils.py
|
|
237
238
|
|
|
238
|
-
# Add this to your existing utils.py, keeping all other functions
|
|
239
239
|
|
|
240
240
|
def process_reference_srt_files(series_name):
|
|
241
241
|
"""
|
|
@@ -357,12 +357,9 @@ def compare_and_rename_files(srt_files, reference_files, dry_run=False):
|
|
|
357
357
|
logger.info(f"Matching lines: {matching_lines}")
|
|
358
358
|
logger.info(f"Found matching file: {mkv_file} ->{reference}")
|
|
359
359
|
new_filename = os.path.join(parent_dir, reference)
|
|
360
|
-
if not
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
os.rename(mkv_file, new_filename)
|
|
364
|
-
else:
|
|
365
|
-
logger.info(f"File {new_filename} already exists, skipping")
|
|
360
|
+
if not dry_run:
|
|
361
|
+
logger.info(f"Renaming {mkv_file} to {new_filename}")
|
|
362
|
+
rename_episode_file(mkv_file, new_filename)
|
|
366
363
|
|
|
367
364
|
def compare_text(text1, text2):
|
|
368
365
|
"""
|
{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: mkv-episode-matcher
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
5
5
|
Home-page: https://github.com/Jsakkos/mkv-episode-matcher
|
|
6
6
|
Author: Jonathan Sakkos
|
{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/SOURCES.txt
RENAMED
|
@@ -46,4 +46,5 @@ mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py
|
|
|
46
46
|
mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py
|
|
47
47
|
mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py
|
|
48
48
|
tests/__init__.py
|
|
49
|
-
tests/test_improvements.py
|
|
49
|
+
tests/test_improvements.py
|
|
50
|
+
tests/test_main.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = mkv_episode_matcher
|
|
3
|
-
version = 0.3.1
|
|
3
|
+
version = 0.3.3
|
|
4
4
|
author = Jonathan Sakkos
|
|
5
5
|
author_email = jonathansakkos@gmail.com
|
|
6
6
|
description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from unittest.mock import Mock, patch, mock_open
|
|
6
|
+
from mkv_episode_matcher.episode_matcher import process_show
|
|
7
|
+
from mkv_episode_matcher.utils import (
|
|
8
|
+
get_valid_seasons,
|
|
9
|
+
check_filename,
|
|
10
|
+
rename_episode_file,
|
|
11
|
+
clean_text,
|
|
12
|
+
extract_season_episode
|
|
13
|
+
)
|
|
14
|
+
from mkv_episode_matcher.episode_identification import EpisodeMatcher
|
|
15
|
+
from mkv_episode_matcher.config import get_config, set_config
|
|
16
|
+
|
|
17
|
+
@pytest.fixture
def temp_show_dir(tmp_path):
    """Build a throwaway show tree: ``Test Show/Season 1`` holding two empty MKV files.

    Returns the path of the show directory.
    """
    show_root = tmp_path / "Test Show"
    show_root.mkdir()
    first_season = show_root / "Season 1"
    first_season.mkdir()
    # Empty placeholder files are enough; only their names and extension matter here.
    for filename in ("episode1.mkv", "episode2.mkv"):
        (first_season / filename).touch()
    return show_root
|
|
26
|
+
|
|
27
|
+
@pytest.fixture
def mock_config():
    """Return a configuration mapping filled with placeholder test values."""
    placeholder_settings = {
        "tmdb_api_key": "test_key",
        "show_dir": "/test/path",
        "max_threads": 4,
        "open_subtitles_api_key": "test_key",
        "open_subtitles_user_agent": "test_agent",
        "open_subtitles_username": "test_user",
        "open_subtitles_password": "test_pass",
        "tesseract_path": "/test/tesseract",
    }
    return placeholder_settings
|
|
39
|
+
|
|
40
|
+
class TestUtilities:
    """Checks for the helper functions imported from ``mkv_episode_matcher.utils``."""

    def test_get_valid_seasons(self, temp_show_dir):
        """Exactly one season is found, and it is the ``Season 1`` folder."""
        found = get_valid_seasons(str(temp_show_dir))
        assert len(found) == 1
        expected_season = str(temp_show_dir / "Season 1")
        assert expected_season in found

    def test_check_filename(self):
        """An ``SxxEyy`` name is recognised; an arbitrary name is not."""
        already_named = check_filename("Show - S01E02.mkv")
        assert already_named == True
        unnamed = check_filename("random_file.mkv")
        assert unnamed == False

    def test_rename_episode_file(self, temp_show_dir):
        """Renaming returns a path whose final component is the requested name."""
        target_name = "Show - S01E01.mkv"
        source_path = temp_show_dir / "Season 1" / "episode1.mkv"
        renamed = rename_episode_file(str(source_path), target_name)
        assert renamed is not None
        assert Path(renamed).name == target_name

    def test_clean_text(self):
        """Bracketed, parenthesised and braced spans are stripped from the text."""
        raw = "Test [action] (note) {tag}"
        assert clean_text(raw) == "Test"

    def test_extract_season_episode(self):
        """Season and episode numbers are parsed out of an ``SxxEyy`` file name."""
        season, episode = extract_season_episode("Show - S01E02.mkv")
        assert (season, episode) == (1, 2)
|
|
66
|
+
|
|
67
|
+
class TestConfiguration:
    """Write/read checks for ``set_config`` and ``get_config``."""

    @staticmethod
    def _write_config(target, values):
        """Save *values* to *target* via ``set_config``, passing arguments positionally."""
        set_config(
            values["tmdb_api_key"],
            values["open_subtitles_api_key"],
            values["open_subtitles_user_agent"],
            values["open_subtitles_username"],
            values["open_subtitles_password"],
            values["show_dir"],
            str(target),
            values["tesseract_path"],
        )

    def test_set_config(self, tmp_path, mock_config):
        """Writing the configuration creates the target file."""
        ini_path = tmp_path / "config.ini"
        self._write_config(ini_path, mock_config)
        assert ini_path.exists()

    def test_get_config(self, tmp_path, mock_config):
        """Values written with ``set_config`` are read back by ``get_config``."""
        ini_path = tmp_path / "config.ini"
        self._write_config(ini_path, mock_config)
        loaded = get_config(str(ini_path))
        for key in ("tmdb_api_key", "show_dir"):
            assert loaded[key] == mock_config[key]
|
|
97
|
+
|
|
98
|
+
class TestEpisodeMatcher:
    """Unit tests for the text-cleaning, scoring and audio-extraction helpers of ``EpisodeMatcher``."""

    @pytest.fixture
    def matcher(self, tmp_path):
        """Provide a matcher whose cache and chunk directories live under ``tmp_path``."""
        instance = EpisodeMatcher(tmp_path, "Test Show")
        # extract_audio_chunk() skips ffmpeg when the chunk file already
        # exists, so keep chunks out of the shared system temp directory where
        # a leftover chunk_0.wav would make the extraction test fail.
        instance.temp_dir = tmp_path / "whisper_chunks"
        instance.temp_dir.mkdir(exist_ok=True)
        return instance

    def test_clean_text(self, matcher):
        """Cue markers are removed (not kept as words), text is lower-cased and re-spaced."""
        cleaned = matcher.clean_text("Test [action] <tag>   DIALOGUE ")
        assert cleaned == "test dialogue"

    def test_clean_text_stutter(self, matcher):
        """A stuttered prefix is shortened and the word itself is preserved."""
        cleaned = matcher.clean_text("T-t-test")
        assert cleaned.endswith("test")
        assert "t-t-" not in cleaned

    def test_chunk_score(self, matcher):
        """The combined fuzzy score is normalised to the 0-1 range."""
        score = matcher.chunk_score("Test dialogue", "test dialog")
        assert 0 <= score <= 1

    @patch('subprocess.run')
    def test_extract_audio_chunk(self, mock_run, matcher, tmp_path):
        """ffmpeg is invoked and the returned chunk path lies in the matcher's temp dir."""
        mkv_file = tmp_path / "test.mkv"
        mkv_file.touch()
        chunk = matcher.extract_audio_chunk(str(mkv_file), 0)
        assert isinstance(chunk, str)
        assert Path(chunk).parent == matcher.temp_dir
        assert mock_run.called
|
|
118
|
+
|
|
119
|
+
class TestProcessShow:
    """Smoke tests for ``process_show`` with configuration loading and season discovery patched."""

    @pytest.fixture
    def mock_config_data(self, mock_config):
        """Expose the module-level ``mock_config`` values under the name the tests request.

        Inside the tests the name ``mock_config`` is taken by the patched
        ``get_config`` mock, so the configuration dict needs its own fixture
        name; without this fixture both tests fail at setup.
        """
        return mock_config

    @patch('mkv_episode_matcher.episode_matcher.get_valid_seasons')
    @patch('mkv_episode_matcher.episode_matcher.get_config')
    def test_process_show_no_seasons(self, mock_config, mock_seasons, mock_config_data):
        """With no season directories, season discovery is queried exactly once."""
        mock_seasons.return_value = []
        mock_config.return_value = mock_config_data
        process_show()
        mock_seasons.assert_called_once()

    @patch('mkv_episode_matcher.episode_matcher.get_valid_seasons')
    @patch('mkv_episode_matcher.episode_matcher.get_config')
    def test_process_show_with_season(self, mock_config, mock_seasons, temp_show_dir, mock_config_data):
        """With one season directory, season discovery is queried exactly once."""
        # NOTE(review): only config loading and season discovery are patched;
        # whatever process_show() does afterwards runs for real - confirm this
        # is intended, or patch the matcher/OCR steps as well.
        mock_seasons.return_value = [str(temp_show_dir / "Season 1")]
        mock_config.return_value = mock_config_data
        process_show(season=1)
        mock_seasons.assert_called_once()
|
|
135
|
+
|
|
136
|
+
if __name__ == '__main__':
    # Hand pytest's exit code back to the shell so a failing run is visible
    # to whatever launched this script.
    raise SystemExit(pytest.main(['-v']))
|
|
@@ -1,208 +0,0 @@
|
|
|
1
|
-
# mkv_episode_matcher/episode_identification.py
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
import glob
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from rapidfuzz import fuzz
|
|
7
|
-
from collections import defaultdict
|
|
8
|
-
import re
|
|
9
|
-
from loguru import logger
|
|
10
|
-
import json
|
|
11
|
-
import shutil
|
|
12
|
-
|
|
13
|
-
class EpisodeMatcher:
|
|
14
|
-
def __init__(self, cache_dir, min_confidence=0.6):
|
|
15
|
-
self.cache_dir = Path(cache_dir)
|
|
16
|
-
self.min_confidence = min_confidence
|
|
17
|
-
self.whisper_segments = None
|
|
18
|
-
self.series_name = None
|
|
19
|
-
|
|
20
|
-
def clean_text(self, text):
|
|
21
|
-
"""Clean text by removing stage directions and normalizing repeated words."""
|
|
22
|
-
# Remove stage directions like [groans] and <i>SHIP:</i>
|
|
23
|
-
text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
|
|
24
|
-
# Remove repeated words with dashes (e.g., "Y-y-you" -> "you")
|
|
25
|
-
text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
|
|
26
|
-
# Remove multiple spaces
|
|
27
|
-
text = ' '.join(text.split())
|
|
28
|
-
return text.lower()
|
|
29
|
-
|
|
30
|
-
def chunk_score(self, whisper_chunk, ref_chunk):
|
|
31
|
-
"""Calculate fuzzy match score between two chunks of text."""
|
|
32
|
-
whisper_clean = self.clean_text(whisper_chunk)
|
|
33
|
-
ref_clean = self.clean_text(ref_chunk)
|
|
34
|
-
|
|
35
|
-
# Use token sort ratio to handle word order differences
|
|
36
|
-
token_sort = fuzz.token_sort_ratio(whisper_clean, ref_clean)
|
|
37
|
-
# Use partial ratio to catch substring matches
|
|
38
|
-
partial = fuzz.partial_ratio(whisper_clean, ref_clean)
|
|
39
|
-
|
|
40
|
-
# Weight token sort more heavily but consider partial matches
|
|
41
|
-
return (token_sort * 0.7 + partial * 0.3) / 100.0
|
|
42
|
-
|
|
43
|
-
def identify_episode(self, video_file, temp_dir):
|
|
44
|
-
"""Identify which episode matches this video file."""
|
|
45
|
-
|
|
46
|
-
# Get series name from parent directory
|
|
47
|
-
self.series_name = Path(video_file).parent.parent.name
|
|
48
|
-
|
|
49
|
-
# Load whisper transcript if not already processed
|
|
50
|
-
segments_file = Path(temp_dir) / f"{Path(video_file).stem}.segments.json"
|
|
51
|
-
if not segments_file.exists():
|
|
52
|
-
logger.error(f"No transcript found for {video_file}. Run speech recognition first.")
|
|
53
|
-
return None
|
|
54
|
-
|
|
55
|
-
with open(segments_file) as f:
|
|
56
|
-
self.whisper_segments = json.load(f)
|
|
57
|
-
|
|
58
|
-
# Get reference directory for this series
|
|
59
|
-
reference_dir = self.cache_dir / "data" / self.series_name
|
|
60
|
-
if not reference_dir.exists():
|
|
61
|
-
logger.error(f"No reference files found for {self.series_name}")
|
|
62
|
-
return None
|
|
63
|
-
|
|
64
|
-
# Match against reference files
|
|
65
|
-
match = self.match_all_references(reference_dir)
|
|
66
|
-
|
|
67
|
-
if match and match['confidence'] >= self.min_confidence:
|
|
68
|
-
# Extract season and episode from filename
|
|
69
|
-
match_file = Path(match['file'])
|
|
70
|
-
season_ep = re.search(r'S(\d+)E(\d+)', match_file.stem)
|
|
71
|
-
if season_ep:
|
|
72
|
-
season, episode = map(int, season_ep.groups())
|
|
73
|
-
return {
|
|
74
|
-
'season': season,
|
|
75
|
-
'episode': episode,
|
|
76
|
-
'confidence': match['confidence'],
|
|
77
|
-
'reference_file': str(match_file),
|
|
78
|
-
'chunk_scores': match['chunk_scores']
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
return None
|
|
82
|
-
|
|
83
|
-
def match_all_references(self, reference_dir):
|
|
84
|
-
"""Process all reference files and track matching scores."""
|
|
85
|
-
results = defaultdict(list)
|
|
86
|
-
best_match = None
|
|
87
|
-
best_confidence = 0
|
|
88
|
-
|
|
89
|
-
def process_chunks(ref_segments, filename):
|
|
90
|
-
nonlocal best_match, best_confidence
|
|
91
|
-
|
|
92
|
-
chunk_size = 300 # 5 minute chunks
|
|
93
|
-
whisper_chunks = defaultdict(list)
|
|
94
|
-
ref_chunks = defaultdict(list)
|
|
95
|
-
|
|
96
|
-
# Group segments into time chunks
|
|
97
|
-
for seg in self.whisper_segments:
|
|
98
|
-
chunk_idx = int(float(seg['start']) // chunk_size)
|
|
99
|
-
whisper_chunks[chunk_idx].append(seg['text'])
|
|
100
|
-
|
|
101
|
-
for seg in ref_segments:
|
|
102
|
-
chunk_idx = int(seg['start'] // chunk_size)
|
|
103
|
-
ref_chunks[chunk_idx].append(seg['text'])
|
|
104
|
-
|
|
105
|
-
# Score each chunk
|
|
106
|
-
for chunk_idx in whisper_chunks:
|
|
107
|
-
whisper_text = ' '.join(whisper_chunks[chunk_idx])
|
|
108
|
-
|
|
109
|
-
# Look for matching reference chunk and adjacent chunks
|
|
110
|
-
scores = []
|
|
111
|
-
for ref_idx in range(max(0, chunk_idx-1), chunk_idx+2):
|
|
112
|
-
if ref_idx in ref_chunks:
|
|
113
|
-
ref_text = ' '.join(ref_chunks[ref_idx])
|
|
114
|
-
score = self.chunk_score(whisper_text, ref_text)
|
|
115
|
-
scores.append(score)
|
|
116
|
-
|
|
117
|
-
if scores:
|
|
118
|
-
chunk_confidence = max(scores)
|
|
119
|
-
logger.info(f"File: {filename}, "
|
|
120
|
-
f"Time: {chunk_idx*chunk_size}-{(chunk_idx+1)*chunk_size}s, "
|
|
121
|
-
f"Confidence: {chunk_confidence:.2f}")
|
|
122
|
-
|
|
123
|
-
results[filename].append({
|
|
124
|
-
'chunk_idx': chunk_idx,
|
|
125
|
-
'confidence': chunk_confidence
|
|
126
|
-
})
|
|
127
|
-
|
|
128
|
-
# Early exit if we find a very good match
|
|
129
|
-
if chunk_confidence > self.min_confidence:
|
|
130
|
-
chunk_scores = results[filename]
|
|
131
|
-
confidence = sum(c['confidence'] * (0.9 ** c['chunk_idx'])
|
|
132
|
-
for c in chunk_scores) / len(chunk_scores)
|
|
133
|
-
|
|
134
|
-
if confidence > best_confidence:
|
|
135
|
-
best_confidence = confidence
|
|
136
|
-
best_match = {
|
|
137
|
-
'file': filename,
|
|
138
|
-
'confidence': confidence,
|
|
139
|
-
'chunk_scores': chunk_scores
|
|
140
|
-
}
|
|
141
|
-
return True
|
|
142
|
-
|
|
143
|
-
return False
|
|
144
|
-
|
|
145
|
-
# Process each reference file
|
|
146
|
-
for ref_file in glob.glob(os.path.join(reference_dir, "*.srt")):
|
|
147
|
-
ref_segments = self.parse_srt_to_segments(ref_file)
|
|
148
|
-
filename = os.path.basename(ref_file)
|
|
149
|
-
|
|
150
|
-
if process_chunks(ref_segments, filename):
|
|
151
|
-
break
|
|
152
|
-
|
|
153
|
-
# If no early match found, find best overall match
|
|
154
|
-
if not best_match:
|
|
155
|
-
for filename, chunks in results.items():
|
|
156
|
-
# Weight earlier chunks more heavily
|
|
157
|
-
confidence = sum(c['confidence'] * (0.9 ** c['chunk_idx'])
|
|
158
|
-
for c in chunks) / len(chunks)
|
|
159
|
-
|
|
160
|
-
if confidence > best_confidence:
|
|
161
|
-
best_confidence = confidence
|
|
162
|
-
best_match = {
|
|
163
|
-
'file': filename,
|
|
164
|
-
'confidence': confidence,
|
|
165
|
-
'chunk_scores': chunks
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
return best_match
|
|
169
|
-
|
|
170
|
-
def parse_srt_to_segments(self, srt_file):
|
|
171
|
-
"""Parse SRT file into list of segments with start/end times and text."""
|
|
172
|
-
segments = []
|
|
173
|
-
current_segment = {}
|
|
174
|
-
|
|
175
|
-
with open(srt_file, 'r', encoding='utf-8') as f:
|
|
176
|
-
lines = f.readlines()
|
|
177
|
-
|
|
178
|
-
i = 0
|
|
179
|
-
while i < len(lines):
|
|
180
|
-
line = lines[i].strip()
|
|
181
|
-
|
|
182
|
-
if line.isdigit(): # Index
|
|
183
|
-
if current_segment:
|
|
184
|
-
segments.append(current_segment)
|
|
185
|
-
current_segment = {}
|
|
186
|
-
|
|
187
|
-
elif '-->' in line: # Timestamp
|
|
188
|
-
start, end = line.split(' --> ')
|
|
189
|
-
current_segment['start'] = self.timestr_to_seconds(start)
|
|
190
|
-
current_segment['end'] = self.timestr_to_seconds(end)
|
|
191
|
-
|
|
192
|
-
elif line: # Text
|
|
193
|
-
if 'text' in current_segment:
|
|
194
|
-
current_segment['text'] += ' ' + line
|
|
195
|
-
else:
|
|
196
|
-
current_segment['text'] = line
|
|
197
|
-
|
|
198
|
-
i += 1
|
|
199
|
-
|
|
200
|
-
if current_segment:
|
|
201
|
-
segments.append(current_segment)
|
|
202
|
-
|
|
203
|
-
return segments
|
|
204
|
-
|
|
205
|
-
def timestr_to_seconds(self, timestr):
|
|
206
|
-
"""Convert SRT timestamp to seconds."""
|
|
207
|
-
h, m, s = timestr.replace(',','.').split(':')
|
|
208
|
-
return float(h) * 3600 + float(m) * 60 + float(s)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/.github/workflows/python-publish.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher/speech_to_text.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/requires.txt
RENAMED
|
File without changes
|
{mkv_episode_matcher-0.3.1 → mkv_episode_matcher-0.3.3}/mkv_episode_matcher.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|