mkv-episode-matcher 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mkv_episode_matcher/__init__.py +8 -0
- mkv_episode_matcher/__main__.py +2 -177
- mkv_episode_matcher/asr_models.py +506 -0
- mkv_episode_matcher/cli.py +558 -0
- mkv_episode_matcher/core/config_manager.py +100 -0
- mkv_episode_matcher/core/engine.py +577 -0
- mkv_episode_matcher/core/matcher.py +214 -0
- mkv_episode_matcher/core/models.py +91 -0
- mkv_episode_matcher/core/providers/asr.py +85 -0
- mkv_episode_matcher/core/providers/subtitles.py +341 -0
- mkv_episode_matcher/core/utils.py +148 -0
- mkv_episode_matcher/episode_identification.py +550 -118
- mkv_episode_matcher/subtitle_utils.py +82 -0
- mkv_episode_matcher/tmdb_client.py +56 -14
- mkv_episode_matcher/ui/flet_app.py +708 -0
- mkv_episode_matcher/utils.py +262 -139
- mkv_episode_matcher-1.0.0.dist-info/METADATA +242 -0
- mkv_episode_matcher-1.0.0.dist-info/RECORD +23 -0
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/WHEEL +1 -1
- mkv_episode_matcher-1.0.0.dist-info/licenses/LICENSE +21 -0
- mkv_episode_matcher/config.py +0 -82
- mkv_episode_matcher/episode_matcher.py +0 -100
- mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
- mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
- mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
- mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
- mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
- mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
- mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
- mkv_episode_matcher/mkv_to_srt.py +0 -302
- mkv_episode_matcher/speech_to_text.py +0 -90
- mkv_episode_matcher-0.3.3.dist-info/METADATA +0 -125
- mkv_episode_matcher-0.3.3.dist-info/RECORD +0 -25
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/entry_points.txt +0 -0
- {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
from collections import Counter
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from loguru import logger
|
|
6
|
+
|
|
7
|
+
from mkv_episode_matcher.core.models import MatchCandidate, MatchResult, SubtitleFile
|
|
8
|
+
from mkv_episode_matcher.core.providers.asr import ASRProvider
|
|
9
|
+
from mkv_episode_matcher.core.utils import (
|
|
10
|
+
SubtitleReader,
|
|
11
|
+
clean_text,
|
|
12
|
+
extract_audio_chunk,
|
|
13
|
+
get_video_duration,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MultiSegmentMatcher:
    """Match a video to an episode by transcribing audio at several checkpoints.

    Audio chunks are extracted at fixed fractions of the video's duration,
    transcribed with the ASR provider, and scored against the same time window
    of every reference subtitle. Per-chunk candidates are combined by voting,
    with an early exit when a single chunk yields an unambiguous, very
    high-confidence match.
    """

    def __init__(self, asr_provider: ASRProvider, temp_dir: Path | None = None):
        """Store the ASR provider and make sure the chunk directory exists.

        Args:
            asr_provider: Provider used both for transcription and for scoring
                a transcription against reference text.
            temp_dir: Directory for temporary ``.wav`` chunks. When omitted,
                ``<system temp dir>/mkv_matcher_chunks`` is used.
        """
        self.asr = asr_provider
        self.temp_dir = temp_dir or Path(tempfile.gettempdir()) / "mkv_matcher_chunks"
        self.temp_dir.mkdir(exist_ok=True, parents=True)
        # Length of each extracted chunk; it is added to start_time, which the
        # log messages report in seconds.
        self.chunk_duration = 30
        # A reference must score strictly above this to become a candidate.
        self.min_confidence = 0.6

    def _process_chunk(
        self, video_path: Path, start_time: float, reference_subs: list[SubtitleFile]
    ) -> list[MatchCandidate]:
        """Process a single chunk: Extract -> Transcribe -> Match against all subs.

        Args:
            video_path: Video to extract the audio chunk from.
            start_time: Offset at which the chunk starts.
            reference_subs: Reference subtitles to score against. ``content``
                is filled in (cached) on each entry the first time it is read.

        Returns:
            One candidate per reference whose score exceeds
            ``self.min_confidence``; an empty list if the transcription is too
            short, nothing scores high enough, or any step raises.
        """
        chunk_path = self.temp_dir / f"{video_path.stem}_{start_time}.wav"
        try:
            extract_audio_chunk(video_path, start_time, self.chunk_duration, chunk_path)
            transcription = self.asr.transcribe(chunk_path)

            # Clean transcription
            clean_trans = clean_text(transcription)
            if len(clean_trans) < 10:
                logger.debug(f"Transcription too short at {start_time}s: {clean_trans}")
                return []

            end_time = start_time + self.chunk_duration
            candidates: list[MatchCandidate] = []
            for sub in reference_subs:
                # MatchCandidate requires episode info. Without this guard a
                # single reference lacking it would raise below and the broad
                # handler would throw away every candidate for this chunk.
                if sub.episode_info is None:
                    logger.debug(f"Skipping reference without episode info: {sub.path}")
                    continue

                # Subtitle text is read once and cached on the SubtitleFile so
                # later chunks reuse it.
                if not sub.content:
                    sub.content = SubtitleReader.read_srt_file(sub.path)

                ref_text = clean_text(
                    " ".join(
                        SubtitleReader.extract_subtitle_chunk(
                            sub.content, start_time, end_time
                        )
                    )
                )
                if not ref_text:
                    continue

                score = self.asr.calculate_match_score(clean_trans, ref_text)
                if score > self.min_confidence:
                    candidates.append(
                        MatchCandidate(
                            episode_info=sub.episode_info,
                            confidence=score,
                            reference_file=sub.path,
                        )
                    )

            return candidates

        except Exception as e:
            # Best effort: a failing chunk must not abort the whole match.
            logger.error(f"Error processing chunk at {start_time}: {e}")
            return []
        finally:
            # Cleanup is best effort too; an exception raised here would
            # otherwise replace the value being returned above.
            try:
                chunk_path.unlink(missing_ok=True)
            except OSError as e:
                logger.warning(f"Could not remove temporary chunk {chunk_path}: {e}")

    def match(
        self, video_path: Path, reference_subs: list[SubtitleFile]
    ) -> MatchResult | None:
        """Identify the episode contained in ``video_path``.

        Up to six checkpoints are tried sequentially: the primary ones at 15%,
        50% and 85% of the duration, then fallbacks at 25%, 35%, 65% and 75%.
        Checkpoints closer than 10 to the end of the video are dropped.

        Args:
            video_path: Video file to identify.
            reference_subs: Candidate reference subtitles.

        Returns:
            A ``MatchResult`` for either an early definitive match or the
            voting winner, or ``None`` when the video is shorter than 60 or no
            chunk produced a candidate.
        """
        duration = get_video_duration(video_path)
        if duration < 60:
            logger.warning(f"Video too short: {duration}s")
            return None

        # Strategy: 3 primary checkpoints with fallbacks for empty segments.
        # Avoid intro (0-120s usually).
        primary_checkpoints = [duration * 0.15, duration * 0.50, duration * 0.85]
        fallback_checkpoints = [
            duration * 0.25,
            duration * 0.35,
            duration * 0.65,
            duration * 0.75,
        ]

        latest_start = duration - 10
        primary = [t for t in primary_checkpoints if t < latest_start]
        fallback = [t for t in fallback_checkpoints if t < latest_start]
        # Limit total attempts to prevent excessive processing.
        checkpoints = (primary + fallback)[:6]
        # Number of leading entries that are primary checkpoints. On short
        # videos the 85% checkpoint can be filtered out, so this is not
        # always 3.
        primary_count = len(primary)

        # Chunks are processed sequentially: ASR may be GPU bound and VRAM
        # contention makes in-process parallelism risky.
        all_candidates: list[MatchCandidate] = []
        successful_segments = 0
        empty_segments = 0

        for i, t in enumerate(checkpoints):
            logger.info(f"Checking segment {i + 1}/{len(checkpoints)} at {t:.1f}s")

            candidates = self._process_chunk(video_path, t, reference_subs)

            if not candidates:
                empty_segments += 1
                logger.debug(f"Empty transcription at {t:.1f}s (segment {i + 1})")
                # Keep going while still in the primary checkpoints or while
                # nothing has matched yet; otherwise stop spending fallbacks.
                if i < primary_count or successful_segments == 0:
                    continue
                break

            successful_segments += 1
            candidates.sort(key=lambda x: x.confidence, reverse=True)
            top_match = candidates[0]

            logger.debug(
                f"Top match at {t}s: {top_match.episode_info.s_e_format} ({top_match.confidence:.2f})"
            )

            # Early exit on a very high score, but never from the very first
            # segment (which might be intro).
            if i > 0 and top_match.confidence > 0.92:
                if len(candidates) > 1 and candidates[1].confidence > 0.8:
                    logger.debug("Ambiguous high score, continuing...")
                else:
                    logger.info("Found definitive match, skipping remaining chunks.")
                    return MatchResult(
                        episode_info=top_match.episode_info,
                        confidence=top_match.confidence,
                        matched_file=video_path,
                        matched_time=t,
                        chunk_index=i,
                        model_name="unknown",
                        original_file=video_path,
                    )

            all_candidates.extend(candidates)

        logger.info(
            f"Processed {successful_segments} successful segments, {empty_segments} empty segments"
        )

        # Voting logic
        if not all_candidates:
            return None

        # Group by episode ID (SxxExx): count votes and accumulate confidence.
        vote_counter: Counter[str] = Counter()
        score_sum: dict[str, float] = {}
        for c in all_candidates:
            key = c.episode_info.s_e_format
            vote_counter[key] += 1
            score_sum[key] = score_sum.get(key, 0.0) + c.confidence

        # Most votes wins; ties go to the higher summed confidence, and exact
        # ties keep the episode that was seen first (max returns the first
        # maximal element).
        best_ep = max(vote_counter, key=lambda ep: (vote_counter[ep], score_sum[ep]))

        # Report the winning episode through its highest-confidence candidate.
        best_candidate = max(
            (c for c in all_candidates if c.episode_info.s_e_format == best_ep),
            key=lambda c: c.confidence,
        )

        return MatchResult(
            episode_info=best_candidate.episode_info,
            confidence=best_candidate.confidence,
            matched_file=video_path,
            matched_time=0,  # a consensus has no single source chunk
            chunk_index=-1,  # Consensus
            model_name="consensus",
            original_file=video_path,
        )
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field, field_validator
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EpisodeInfo(BaseModel):
    """Identifies a single episode of a series."""

    series_name: str
    season: int
    episode: int
    title: str | None = None  # optional

    @property
    def s_e_format(self) -> str:
        """Return the ``SxxExx`` tag, each number zero-padded to two digits."""
        season_tag = f"S{self.season:02d}"
        episode_tag = f"E{self.episode:02d}"
        return season_tag + episode_tag
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SubtitleFile(BaseModel):
    """A subtitle file on disk, optionally tied to the episode it belongs to."""

    # Location of the subtitle file.
    path: Path
    # Language tag; defaults to "en".
    language: str = "en"
    # Episode this subtitle describes, when known.
    episode_info: EpisodeInfo | None = None
    # Loaded content (optional).
    content: str | None = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class AudioChunk(BaseModel):
    """An audio chunk extracted from a video."""

    # File holding the extracted audio.
    path: Path
    # Offset of the chunk within the source.
    start_time: float
    # Length of the chunk.
    duration: float
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class MatchResult(BaseModel):
    """Outcome of a successful match between a video and an episode."""

    # Episode the video was matched to.
    episode_info: EpisodeInfo
    # Score of the match.
    confidence: float
    # File that was matched.
    matched_file: Path
    # Offset at which the deciding chunk was taken.
    matched_time: float
    # Index of the deciding chunk; defaults to 0.
    chunk_index: int = 0
    # Label of the model (or strategy) that produced the match.
    model_name: str
    # Store original filename for display.
    original_file: Path | None = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class FailedMatch(BaseModel):
    """Record of a file that could not be matched."""

    # File that failed to match.
    original_file: Path
    # Explanation of the failure.
    reason: str
    # Score associated with the failure; defaults to 0.0.
    confidence: float = 0.0
    # Series and season context, when known.
    series_name: str | None = None
    season: int | None = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class MatchCandidate(BaseModel):
    """One possible episode match produced from a single chunk."""

    # Episode the candidate points at.
    episode_info: EpisodeInfo
    # Score for this candidate.
    confidence: float
    # Reference file the score was computed against.
    reference_file: Path
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class Config(BaseModel):
    """Global configuration model."""

    tmdb_api_key: str | None = None
    show_dir: Path | None = None
    # Defaults to ~/.mkv-episode-matcher/cache, resolved when the model is built.
    cache_dir: Path = Field(
        default_factory=lambda: Path.home() / ".mkv-episode-matcher" / "cache"
    )
    min_confidence: float = 0.7

    # OpenSubtitles settings
    open_subtitles_api_key: str | None = None
    open_subtitles_username: str | None = None
    open_subtitles_password: str | None = None
    open_subtitles_user_agent: str = "Oz 1.0.0"

    # Provider settings
    asr_provider: Literal["parakeet"] = "parakeet"
    sub_provider: Literal["opensubtitles", "local"] = "opensubtitles"

    @field_validator("show_dir")
    @classmethod
    def validate_show_dir(cls, v: Path | None) -> Path | None:
        """Reject a ``show_dir`` that is set but does not exist on disk.

        Raises:
            ValueError: If ``v`` is a path that does not exist.
        """
        # Compare against None explicitly: a Path is always truthy, so a
        # truthiness test only ever distinguished the "not set" case.
        if v is not None and not v.exists():
            raise ValueError(f"Show directory does not exist: {v}")
        return v
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from loguru import logger
|
|
6
|
+
|
|
7
|
+
from mkv_episode_matcher.asr_models import (
|
|
8
|
+
ASRModel as _NativeASRModel,
|
|
9
|
+
)
|
|
10
|
+
from mkv_episode_matcher.asr_models import (
|
|
11
|
+
create_asr_model as _create_native_model,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ASRProvider(abc.ABC):
    """Interface every speech-recognition provider has to implement."""

    @abc.abstractmethod
    def load(self):
        """Prepare/load the underlying model."""

    @abc.abstractmethod
    def transcribe(self, audio_path: Path) -> str:
        """Return the text transcribed from the audio file at ``audio_path``."""

    @abc.abstractmethod
    def calculate_match_score(self, transcription: str, reference: str) -> float:
        """Return a similarity score between ``transcription`` and ``reference``."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class NativeASRProvider(ASRProvider):
    """Adapter exposing the package's native ASR models as an ``ASRProvider``.

    The wrapped model is created immediately, but its ``load()`` is deferred
    until the provider is first loaded or asked to transcribe.
    """

    def __init__(self, model_config: dict[str, Any]):
        """Create the native model described by ``model_config`` (not loaded yet)."""
        self._model: _NativeASRModel = _create_native_model(model_config)
        self._loaded = False

    def _ensure_loaded(self):
        """Load the wrapped model the first time it is needed; no-op afterwards."""
        if self._loaded:
            return
        logger.info(f"Loading ASR model: {self._model.model_name}")
        self._model.load()
        self._loaded = True

    def load(self):
        """Load the wrapped model if that has not happened yet."""
        self._ensure_loaded()

    def transcribe(self, audio_path: Path) -> str:
        """Transcribe ``audio_path`` and return the result's ``"text"`` entry.

        An empty string is returned when the result has no ``"text"`` key.
        """
        self._ensure_loaded()
        outcome = self._model.transcribe(audio_path)
        return outcome.get("text", "")  # type: ignore

    def calculate_match_score(self, transcription: str, reference: str) -> float:
        """Delegate scoring to the wrapped model."""
        return self._model.calculate_match_score(transcription, reference)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Providers already built by get_asr_provider, keyed by (type, name, device).
factory_memo = {}


def get_asr_provider(
    model_type: str = "parakeet",
    model_name: str | None = None,
    device: str | None = None,
) -> ASRProvider:
    """Return the provider for the given settings, creating it on first use.

    Providers are memoized in ``factory_memo``, so repeated calls with the same
    (type, resolved name, device) return the same instance.

    Args:
        model_type: Model family, e.g. ``"parakeet"`` or ``"whisper"``.
        model_name: Specific model name; a per-family default is picked when
            it is not given.
        device: Optional device string, passed to the model config only when
            it is truthy.
    """
    # Pick a default model name for the known families.
    if not model_name:
        if model_type in ("whisper", "faster-whisper"):
            model_name = "base.en"
        elif "parakeet" in model_type:
            model_name = "nvidia/parakeet-ctc-0.6b"

    cache_key = (model_type, model_name, device)
    try:
        return factory_memo[cache_key]
    except KeyError:
        pass  # not built yet; create it below

    settings = {"type": model_type, "name": model_name}
    if device:
        settings["device"] = device

    provider = factory_memo[cache_key] = NativeASRProvider(settings)
    return provider
|