mkv-episode-matcher 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mkv_episode_matcher/__init__.py +8 -0
  2. mkv_episode_matcher/__main__.py +2 -177
  3. mkv_episode_matcher/asr_models.py +506 -0
  4. mkv_episode_matcher/cli.py +558 -0
  5. mkv_episode_matcher/core/config_manager.py +100 -0
  6. mkv_episode_matcher/core/engine.py +577 -0
  7. mkv_episode_matcher/core/matcher.py +214 -0
  8. mkv_episode_matcher/core/models.py +91 -0
  9. mkv_episode_matcher/core/providers/asr.py +85 -0
  10. mkv_episode_matcher/core/providers/subtitles.py +341 -0
  11. mkv_episode_matcher/core/utils.py +148 -0
  12. mkv_episode_matcher/episode_identification.py +550 -118
  13. mkv_episode_matcher/subtitle_utils.py +82 -0
  14. mkv_episode_matcher/tmdb_client.py +56 -14
  15. mkv_episode_matcher/ui/flet_app.py +708 -0
  16. mkv_episode_matcher/utils.py +262 -139
  17. mkv_episode_matcher-1.0.0.dist-info/METADATA +242 -0
  18. mkv_episode_matcher-1.0.0.dist-info/RECORD +23 -0
  19. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/WHEEL +1 -1
  20. mkv_episode_matcher-1.0.0.dist-info/licenses/LICENSE +21 -0
  21. mkv_episode_matcher/config.py +0 -82
  22. mkv_episode_matcher/episode_matcher.py +0 -100
  23. mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
  24. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
  25. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
  26. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
  27. mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
  28. mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  29. mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
  30. mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
  31. mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
  32. mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
  33. mkv_episode_matcher/mkv_to_srt.py +0 -302
  34. mkv_episode_matcher/speech_to_text.py +0 -90
  35. mkv_episode_matcher-0.3.3.dist-info/METADATA +0 -125
  36. mkv_episode_matcher-0.3.3.dist-info/RECORD +0 -25
  37. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/entry_points.txt +0 -0
  38. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,214 @@
1
+ import tempfile
2
+ from collections import Counter
3
+ from pathlib import Path
4
+
5
+ from loguru import logger
6
+
7
+ from mkv_episode_matcher.core.models import MatchCandidate, MatchResult, SubtitleFile
8
+ from mkv_episode_matcher.core.providers.asr import ASRProvider
9
+ from mkv_episode_matcher.core.utils import (
10
+ SubtitleReader,
11
+ clean_text,
12
+ extract_audio_chunk,
13
+ get_video_duration,
14
+ )
15
+
16
+
17
class MultiSegmentMatcher:
    """Identify the episode of a video by sampling several audio chunks.

    Each sampled chunk is extracted to a temporary WAV file, transcribed by
    the ASR provider and scored against the same time window of every
    reference subtitle. Candidates from all chunks are combined by vote.
    """

    def __init__(self, asr_provider: ASRProvider, temp_dir: Path | None = None):
        """Store the ASR provider and make sure the chunk directory exists.

        Args:
            asr_provider: Provider used for transcription and text scoring.
            temp_dir: Directory for temporary audio chunks. Defaults to
                ``<system temp dir>/mkv_matcher_chunks``.
        """
        self.asr = asr_provider
        self.temp_dir = temp_dir or Path(tempfile.gettempdir()) / "mkv_matcher_chunks"
        self.temp_dir.mkdir(exist_ok=True, parents=True)
        # Length (seconds) of each audio chunk and of the subtitle window
        # it is compared against.
        self.chunk_duration = 30
        # A candidate is only kept when its score is strictly above this.
        self.min_confidence = 0.6

    def _process_chunk(
        self, video_path: Path, start_time: float, reference_subs: list[SubtitleFile]
    ) -> list[MatchCandidate]:
        """Process a single chunk: Extract -> Transcribe -> Match against all subs.

        Args:
            video_path: Video to sample.
            start_time: Offset in seconds at which the chunk starts.
            reference_subs: Subtitles to compare against. ``content`` is
                filled in on first use and reused for later chunks.

        Returns:
            One candidate per reference subtitle whose score exceeds
            ``self.min_confidence``; an empty list when the transcription is
            too short, nothing matches, or any step raises.
        """
        chunk_path = self.temp_dir / f"{video_path.stem}_{start_time}.wav"
        try:
            extract_audio_chunk(video_path, start_time, self.chunk_duration, chunk_path)
            transcription = self.asr.transcribe(chunk_path)

            # Clean transcription
            clean_trans = clean_text(transcription)
            if len(clean_trans) < 10:
                logger.debug(f"Transcription too short at {start_time}s: {clean_trans}")
                return []

            end_time = start_time + self.chunk_duration
            candidates = []
            for sub in reference_subs:
                # MatchCandidate requires episode info. Building one without
                # it raises, and the broad handler below would then throw
                # away every candidate of this chunk, so skip such
                # references up front.
                if sub.episode_info is None:
                    logger.debug(f"Skipping reference without episode info: {sub.path}")
                    continue

                # The subtitle text is read once and cached on the model so
                # later chunks do not touch the file again.
                if not sub.content:
                    sub.content = SubtitleReader.read_srt_file(sub.path)

                ref_text = clean_text(
                    " ".join(
                        SubtitleReader.extract_subtitle_chunk(
                            sub.content, start_time, end_time
                        )
                    )
                )
                if not ref_text:
                    continue

                score = self.asr.calculate_match_score(clean_trans, ref_text)
                if score > self.min_confidence:
                    candidates.append(
                        MatchCandidate(
                            episode_info=sub.episode_info,
                            confidence=score,
                            reference_file=sub.path,
                        )
                    )

            return candidates

        except Exception as e:
            # Best effort: a failing chunk must not abort the whole match.
            logger.error(f"Error processing chunk at {start_time}: {e}")
            return []
        finally:
            # Always remove the temporary audio file.
            if chunk_path.exists():
                chunk_path.unlink()

    @staticmethod
    def _pick_consensus(candidates: list[MatchCandidate]) -> MatchCandidate | None:
        """Return the best candidate of the episode that collected most votes.

        Every candidate is one vote for its ``SxxExx`` key. Ties on votes are
        broken by the summed confidence (equivalent to the average when the
        vote counts are equal); a remaining tie keeps the episode seen first.
        """
        if not candidates:
            return None

        votes = Counter()
        score_sum: dict[str, float] = {}
        for cand in candidates:
            key = cand.episode_info.s_e_format
            votes[key] += 1
            score_sum[key] = score_sum.get(key, 0.0) + cand.confidence

        # max() returns the first maximal key, matching insertion order.
        best_ep = max(votes, key=lambda k: (votes[k], score_sum[k]))
        return max(
            (c for c in candidates if c.episode_info.s_e_format == best_ep),
            key=lambda c: c.confidence,
        )

    def match(
        self, video_path: Path, reference_subs: list[SubtitleFile]
    ) -> MatchResult | None:
        """Match a video against reference subtitles.

        Args:
            video_path: Video file to identify.
            reference_subs: Candidate subtitles, one per possible episode.

        Returns:
            A ``MatchResult`` for a definitive single-chunk match
            (``chunk_index`` is the chunk position) or for the vote winner
            (``chunk_index=-1``, ``model_name="consensus"``); ``None`` when
            the video is shorter than 60 seconds or nothing matched.
        """
        duration = get_video_duration(video_path)
        if duration < 60:
            logger.warning(f"Video too short: {duration}s")
            return None

        # Strategy: 3 primary checkpoints with fallbacks for empty segments.
        # Avoid intro (0-120s usually).
        # Primary checkpoints: 15% (after intro), 50% (middle), 85% (end).
        latest_start = duration - 10
        primary_checkpoints = [
            t
            for t in (duration * 0.15, duration * 0.50, duration * 0.85)
            if t < latest_start
        ]
        # Fallback checkpoints for when primary segments fail.
        fallback_checkpoints = [
            t
            for t in (
                duration * 0.25,
                duration * 0.35,
                duration * 0.65,
                duration * 0.75,
            )
            if t < latest_start
        ]
        # A primary checkpoint can be filtered out on very short videos, so
        # count the survivors instead of assuming there are always three.
        primary_count = len(primary_checkpoints)

        # Limit total attempts to prevent excessive processing.
        checkpoints = (primary_checkpoints + fallback_checkpoints)[:6]

        # Chunks are processed sequentially: ASR may be GPU bound and VRAM
        # contention makes in-process parallelism risky.
        all_candidates: list[MatchCandidate] = []
        successful_segments = 0
        empty_segments = 0

        for i, t in enumerate(checkpoints):
            logger.info(f"Checking segment {i + 1}/{len(checkpoints)} at {t:.1f}s")

            candidates = self._process_chunk(video_path, t, reference_subs)

            if not candidates:
                # Also reached when a transcription exists but no reference
                # scored above the threshold.
                empty_segments += 1
                logger.debug(f"Empty transcription at {t:.1f}s (segment {i + 1})")
                # Stop only once we are past the primary checkpoints and
                # already hold at least one successful segment.
                if i >= primary_count and successful_segments > 0:
                    break
                continue

            successful_segments += 1
            # Sort candidates by score
            candidates.sort(key=lambda x: x.confidence, reverse=True)
            top_match = candidates[0]

            logger.debug(
                f"Top match at {t}s: {top_match.episode_info.s_e_format} ({top_match.confidence:.2f})"
            )

            # FAIL FAST: an extremely confident, unambiguous match that is
            # not from the very first segment (which might be intro).
            if i > 0 and top_match.confidence > 0.92:
                if len(candidates) > 1 and candidates[1].confidence > 0.8:
                    logger.debug("Ambiguous high score, continuing...")
                else:
                    logger.info("Found definitive match, skipping remaining chunks.")
                    return MatchResult(
                        episode_info=top_match.episode_info,
                        confidence=top_match.confidence,
                        matched_file=video_path,
                        matched_time=t,
                        chunk_index=i,
                        model_name="unknown",
                        original_file=video_path,
                    )

            all_candidates.extend(candidates)

        logger.info(
            f"Processed {successful_segments} successful segments, {empty_segments} empty segments"
        )

        # Voting logic
        best_candidate = self._pick_consensus(all_candidates)
        if best_candidate is None:
            return None

        return MatchResult(
            episode_info=best_candidate.episode_info,
            confidence=best_candidate.confidence,
            matched_file=video_path,
            matched_time=0,
            chunk_index=-1,  # Consensus
            model_name="consensus",
            original_file=video_path,
        )
@@ -0,0 +1,91 @@
1
+ from pathlib import Path
2
+ from typing import Literal
3
+
4
+ from pydantic import BaseModel, Field, field_validator
5
+
6
+
7
class EpisodeInfo(BaseModel):
    """Data model for episode information."""

    # Name of the series.
    series_name: str
    # Season and episode numbers; rendered zero-padded by ``s_e_format``.
    season: int
    episode: int
    # Episode title, when known.
    title: str | None = None

    @property
    def s_e_format(self) -> str:
        """Return the ``SxxExx`` tag, each number padded to at least two digits."""
        return "S{:02d}E{:02d}".format(self.season, self.episode)
18
+
19
+
20
class SubtitleFile(BaseModel):
    """Data model for a subtitle file."""

    # Location of the subtitle file.
    path: Path
    # Language tag; defaults to "en".
    language: str = "en"
    # Episode this subtitle belongs to; None when not (yet) known.
    episode_info: EpisodeInfo | None = None
    # Loaded subtitle text (optional); None until something reads the file.
    content: str | None = None
27
+
28
+
29
class AudioChunk(BaseModel):
    """Data model for an extracted audio chunk."""

    # File holding the extracted audio.
    path: Path
    # Offset and length of the chunk (presumably seconds — confirm with the
    # extraction helper).
    start_time: float
    duration: float
35
+
36
+
37
class MatchResult(BaseModel):
    """Data model for a matching result."""

    # Episode the file was matched to and the score of that match.
    episode_info: EpisodeInfo
    confidence: float
    # File the match refers to.
    matched_file: Path
    # Time offset of the chunk that produced the match.
    matched_time: float
    # Index of the chunk that produced the match; defaults to 0.
    chunk_index: int = 0
    # Name of the model (or strategy) that produced the result.
    model_name: str
    # Store original filename for display.
    original_file: Path | None = None
47
+
48
+
49
class FailedMatch(BaseModel):
    """Data model for a failed match."""

    # File that could not be matched and a description of why.
    original_file: Path
    reason: str
    # Score recorded with the failure; defaults to 0.0.
    confidence: float = 0.0
    # Optional series/season context for the failure.
    series_name: str | None = None
    season: int | None = None
57
+
58
+
59
class MatchCandidate(BaseModel):
    """A candidate match from a single chunk."""

    # Episode proposed by this candidate (required, never None).
    episode_info: EpisodeInfo
    # Similarity score of the chunk against the reference.
    confidence: float
    # Reference subtitle file the chunk was compared with.
    reference_file: Path
65
+
66
+
67
class Config(BaseModel):
    """Global configuration model."""

    # TMDb API key, if configured.
    tmdb_api_key: str | None = None
    # Root directory of the shows to process; validated to exist when given.
    show_dir: Path | None = None
    # Cache location; defaults to ~/.mkv-episode-matcher/cache.
    cache_dir: Path = Field(
        default_factory=lambda: Path.home() / ".mkv-episode-matcher" / "cache"
    )
    # Minimum confidence setting.
    min_confidence: float = 0.7

    # OpenSubtitles settings
    open_subtitles_api_key: str | None = None
    open_subtitles_username: str | None = None
    open_subtitles_password: str | None = None
    open_subtitles_user_agent: str = "Oz 1.0.0"

    # Provider settings
    asr_provider: Literal["parakeet"] = "parakeet"
    sub_provider: Literal["opensubtitles", "local"] = "opensubtitles"

    @field_validator("show_dir")
    @classmethod
    def validate_show_dir(cls, v: Path | None) -> Path | None:
        """Reject a show directory that does not exist on disk.

        Args:
            v: The parsed ``show_dir`` value, or ``None`` when unset.

        Returns:
            The value unchanged.

        Raises:
            ValueError: If a path was given but does not exist.
        """
        # Explicit None check: a Path is always truthy, so this is the
        # actual condition being tested.
        if v is not None and not v.exists():
            raise ValueError(f"Show directory does not exist: {v}")
        return v
@@ -0,0 +1,85 @@
1
+ import abc
2
+ from pathlib import Path
3
+ from typing import Any
4
+
5
+ from loguru import logger
6
+
7
+ from mkv_episode_matcher.asr_models import (
8
+ ASRModel as _NativeASRModel,
9
+ )
10
+ from mkv_episode_matcher.asr_models import (
11
+ create_asr_model as _create_native_model,
12
+ )
13
+
14
+
15
class ASRProvider(abc.ABC):
    """Abstract base class for ASR providers."""

    @abc.abstractmethod
    def load(self):
        """Prepare/Load the model so that it is ready for use."""

    @abc.abstractmethod
    def transcribe(self, audio_path: Path) -> str:
        """Return the text transcription of the audio file at *audio_path*."""

    @abc.abstractmethod
    def calculate_match_score(self, transcription: str, reference: str) -> float:
        """Return a similarity score between *transcription* and *reference*."""
32
+
33
+
34
class NativeASRProvider(ASRProvider):
    """Wrapper around the existing native ASR models (Whisper, Parakeet)."""

    def __init__(self, model_config: dict[str, Any]):
        """Create the native model from *model_config*; loading is deferred."""
        self._loaded = False
        self._model: _NativeASRModel = _create_native_model(model_config)

    def _ensure_loaded(self):
        """Load the underlying model on first use; later calls do nothing."""
        if self._loaded:
            return
        logger.info(f"Loading ASR model: {self._model.model_name}")
        self._model.load()
        # Only flagged after load() returned, so a failed load is retried.
        self._loaded = True

    def load(self):
        """Load the model now instead of waiting for the first transcription."""
        self._ensure_loaded()

    def transcribe(self, audio_path: Path) -> str:
        """Transcribe *audio_path* and return the ``"text"`` entry of the result.

        An empty string is returned when the result has no ``"text"`` key.
        """
        self._ensure_loaded()
        native_result = self._model.transcribe(audio_path)
        return native_result.get("text", "")  # type: ignore

    def calculate_match_score(self, transcription: str, reference: str) -> float:
        """Delegate similarity scoring to the wrapped native model."""
        scorer = self._model.calculate_match_score
        return scorer(transcription, reference)
57
+
58
+
59
# Providers already created, keyed by (model_type, model_name, device).
factory_memo = {}


def get_asr_provider(
    model_type: str = "parakeet",
    model_name: str | None = None,
    device: str | None = None,
) -> ASRProvider:
    """Factory to get or create an ASR provider.

    Providers are memoized per ``(model_type, model_name, device)``, so
    repeated calls with the same arguments return the same instance.

    Args:
        model_type: Model family, e.g. ``"parakeet"`` or ``"whisper"``.
        model_name: Specific model; a per-type default is used when empty.
        device: Optional device passed through to the model configuration.
    """
    # Fill in a default model name for the known model types.
    if not model_name:
        if model_type in ("whisper", "faster-whisper"):
            model_name = "base.en"
        elif "parakeet" in model_type:
            model_name = "nvidia/parakeet-ctc-0.6b"

    memo_key = (model_type, model_name, device)
    try:
        return factory_memo[memo_key]
    except KeyError:
        pass

    # "device" is only included when one was requested.
    settings = {"type": model_type, "name": model_name}
    if device:
        settings["device"] = device

    factory_memo[memo_key] = created = NativeASRProvider(settings)
    return created