mkv-episode-matcher 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mkv_episode_matcher/__init__.py +8 -0
  2. mkv_episode_matcher/__main__.py +2 -177
  3. mkv_episode_matcher/asr_models.py +506 -0
  4. mkv_episode_matcher/cli.py +558 -0
  5. mkv_episode_matcher/core/config_manager.py +100 -0
  6. mkv_episode_matcher/core/engine.py +577 -0
  7. mkv_episode_matcher/core/matcher.py +214 -0
  8. mkv_episode_matcher/core/models.py +91 -0
  9. mkv_episode_matcher/core/providers/asr.py +85 -0
  10. mkv_episode_matcher/core/providers/subtitles.py +341 -0
  11. mkv_episode_matcher/core/utils.py +148 -0
  12. mkv_episode_matcher/episode_identification.py +550 -118
  13. mkv_episode_matcher/subtitle_utils.py +82 -0
  14. mkv_episode_matcher/tmdb_client.py +56 -14
  15. mkv_episode_matcher/ui/flet_app.py +708 -0
  16. mkv_episode_matcher/utils.py +262 -139
  17. mkv_episode_matcher-1.0.0.dist-info/METADATA +242 -0
  18. mkv_episode_matcher-1.0.0.dist-info/RECORD +23 -0
  19. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/WHEEL +1 -1
  20. mkv_episode_matcher-1.0.0.dist-info/licenses/LICENSE +21 -0
  21. mkv_episode_matcher/config.py +0 -82
  22. mkv_episode_matcher/episode_matcher.py +0 -100
  23. mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
  24. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
  25. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
  26. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
  27. mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
  28. mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  29. mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
  30. mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
  31. mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
  32. mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
  33. mkv_episode_matcher/mkv_to_srt.py +0 -302
  34. mkv_episode_matcher/speech_to_text.py +0 -90
  35. mkv_episode_matcher-0.3.3.dist-info/METADATA +0 -125
  36. mkv_episode_matcher-0.3.3.dist-info/RECORD +0 -25
  37. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/entry_points.txt +0 -0
  38. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/core/providers/subtitles.py
@@ -0,0 +1,341 @@
+ import abc
+ import re
+ import shutil
+ import time
+ from functools import wraps
+ from pathlib import Path
+ from typing import Any, Callable, TypeVar
+
+ from loguru import logger
+ from opensubtitlescom import OpenSubtitles
+
+ F = TypeVar("F", bound=Callable[..., Any])
+
+ from mkv_episode_matcher.core.config_manager import get_config_manager
+ from mkv_episode_matcher.core.models import EpisodeInfo, SubtitleFile
+
+
+ def retry_with_backoff(
+     max_retries: int = 3,
+     base_delay: float = 1.0,
+     max_delay: float = 60.0,
+     backoff_factor: float = 2.0,
+ ) -> Callable[[F], F]:
+     """Decorator for retrying operations with exponential backoff."""
+
+     def decorator(func: F) -> F:
+         @wraps(func)
+         def wrapper(*args: Any, **kwargs: Any) -> Any:
+             last_exception = None
+             delay = base_delay
+
+             for attempt in range(max_retries + 1):
+                 try:
+                     return func(*args, **kwargs)
+                 except Exception as e:
+                     last_exception = e
+                     if attempt == max_retries:
+                         logger.error(
+                             f"Max retries ({max_retries}) exceeded for {func.__name__}: {e}"
+                         )
+                         raise e
+
+                     logger.warning(
+                         f"Attempt {attempt + 1}/{max_retries + 1} failed for {func.__name__}: {e}, retrying in {delay:.1f}s..."
+                     )
+                     time.sleep(delay)
+                     delay = min(delay * backoff_factor, max_delay)
+
+             raise last_exception
+
+         return wrapper # type: ignore
+
+     return decorator
+
+
+ def parse_season_episode(filename: str) -> EpisodeInfo | None:
+     """Parse season and episode from filename using regex."""
+     # S01E01
+     match = re.search(r"[Ss](\d{1,2})[Ee](\d{1,2})", filename)
+     if match:
+         return EpisodeInfo(
+             series_name="", # Placeholder
+             season=int(match.group(1)),
+             episode=int(match.group(2)),
+         )
+     # 1x01
+     match = re.search(r"(\d{1,2})x(\d{1,2})", filename)
+     if match:
+         return EpisodeInfo(
+             series_name="", season=int(match.group(1)), episode=int(match.group(2))
+         )
+     return None
+
+
+ class SubtitleProvider(abc.ABC):
+     @abc.abstractmethod
+     def get_subtitles(
+         self, show_name: str, season: int, video_files: list[Path] = None
+     ) -> list[SubtitleFile]:
+         pass
+
+
+ class LocalSubtitleProvider(SubtitleProvider):
+     """Provider that scans a local directory for subtitle files."""
+
+     def __init__(self, cache_dir: Path):
+         self.cache_dir = cache_dir / "data"
+
+     def get_subtitles(
+         self, show_name: str, season: int, video_files: list[Path] = None
+     ) -> list[SubtitleFile]:
+         """Get all subtitle files for a specific show and season."""
+         show_dir = self.cache_dir / show_name
+         if not show_dir.exists():
+             # logger.warning(f"No subtitle cache found at {show_dir}")
+             return []
+
+         subtitles = []
+         # Case insensitive glob
+         files = list(show_dir.glob("*.srt")) + list(show_dir.glob("*.SRT"))
+
+         for f in files:
+             info = parse_season_episode(f.name)
+             if info:
+                 if info.season == season:
+                     info.series_name = show_name
+                     subtitles.append(SubtitleFile(path=f, episode_info=info))
+
+         # Deduplicate by path
+         seen = set()
+         unique_subs = []
+         for sub in subtitles:
+             if sub.path not in seen:
+                 seen.add(sub.path)
+                 unique_subs.append(sub)
+
+         return unique_subs
+
+
+ class OpenSubtitlesProvider(SubtitleProvider):
+     """Provider that downloads subtitles using OpenSubtitles.com."""
+
+     def __init__(self):
+         cm = get_config_manager()
+         self.config = cm.load()
+         self.client = None
+         self.network_timeout = 30 # seconds
+         self._authenticate()
+
+     def _authenticate(self):
+         if not self.config.open_subtitles_api_key:
+             logger.warning("OpenSubtitles API key not configured")
+             return
+
+         try:
+             self.client = OpenSubtitles(
+                 self.config.open_subtitles_user_agent,
+                 self.config.open_subtitles_api_key,
+             )
+             if (
+                 self.config.open_subtitles_username
+                 and self.config.open_subtitles_password
+             ):
+                 self.client.login(
+                     self.config.open_subtitles_username,
+                     self.config.open_subtitles_password,
+                 )
+                 logger.debug("Logged in to OpenSubtitles")
+             else:
+                 logger.debug("Initialized OpenSubtitles (no login)")
+         except Exception as e:
+             logger.error(f"Failed to initialize OpenSubtitles: {e}")
+             self.client = None
+
+     @retry_with_backoff(max_retries=3, base_delay=1.0)
+     def _search_with_retry(self, query: str, languages: str = "en"):
+         """Search for subtitles with retry logic."""
+         if not self.client:
+             raise RuntimeError("OpenSubtitles client not initialized")
+
+         import signal
+
+         def timeout_handler(signum, frame):
+             raise TimeoutError(
+                 f"Search operation timed out after {self.network_timeout}s"
+             )
+
+         # Set timeout for search operation (Unix-like systems only)
+         if hasattr(signal, "SIGALRM"):
+             signal.signal(signal.SIGALRM, timeout_handler)
+             signal.alarm(self.network_timeout)
+
+         try:
+             return self.client.search(query=query, languages=languages)
+         finally:
+             if hasattr(signal, "SIGALRM"):
+                 signal.alarm(0) # Cancel the alarm
+
+     @retry_with_backoff(max_retries=2, base_delay=0.5)
+     def _download_with_retry(self, subtitle):
+         """Download subtitle file with retry logic."""
+         if not self.client:
+             raise RuntimeError("OpenSubtitles client not initialized")
+
+         import signal
+
+         def timeout_handler(signum, frame):
+             raise TimeoutError(
+                 f"Download operation timed out after {self.network_timeout}s"
+             )
+
+         # Set timeout for download operation (Unix-like systems only)
+         if hasattr(signal, "SIGALRM"):
+             signal.signal(signal.SIGALRM, timeout_handler)
+             signal.alarm(self.network_timeout)
+
+         try:
+             return self.client.download_and_save(subtitle)
+         finally:
+             if hasattr(signal, "SIGALRM"):
+                 signal.alarm(0) # Cancel the alarm
+
+     def get_subtitles(
+         self, show_name: str, season: int, video_files: list[Path] = None
+     ) -> list[SubtitleFile]:
+         """Get subtitles for a show/season by downloading them."""
+         if not self.client:
+             logger.error("OpenSubtitles client not available")
+             return []
+
+         # We need video files to do specific searching usually, but if we just want to bulk match
+         # we might want to search by query.
+         # However, the engine usually passes a list of video files for the season.
+
+         # If we have video files, we can try to find subs for them specifically?
+         # Or just search for "Show Name S01" to get a bunch?
+         # OpenSubtitles API allows searching by query "Show Name S01".
+
+         logger.info(f"Searching OpenSubtitles for {show_name} S{season:02d}")
+
+         # Prepare cache directory
+         cache_dir = self.config.cache_dir / "data" / show_name
+         cache_dir.mkdir(parents=True, exist_ok=True)
+
+         downloaded_subtitles = []
+
+         try:
+             # Search by query with retry logic
+             query = f"{show_name} S{season:02d}"
+             response = self._search_with_retry(query)
+
+             if not response.data:
+                 logger.warning(f"No subtitles found for query: {query}")
+                 return []
+
+             logger.info(f"Found {len(response.data)} potential subtitles")
+
+             # Limit downloads to a reasonable number or try to match specifically?
+             # For now, let's download unique episodes for this season.
+
+             seen_episodes = set()
+
+             for subtitle in response.data:
+                 # Use API provided metadata first
+                 api_season = getattr(subtitle, "season_number", None)
+                 api_episode = getattr(subtitle, "episode_number", None)
+
+                 # Get filename from files list or top level
+                 sub_filename = subtitle.file_name
+                 if not sub_filename and subtitle.files:
+                     # files is a list of dicts based on debug output
+                     if isinstance(subtitle.files[0], dict):
+                         sub_filename = subtitle.files[0].get("file_name", "")
+                     else:
+                         # Fallback if it somehow changes to object
+                         sub_filename = getattr(subtitle.files[0], "file_name", "")
+
+                 # Check match
+                 if api_season and api_episode:
+                     if api_season != season:
+                         continue
+                     ep_num = api_episode
+                 else:
+                     # Fallback to parsing filename
+                     info = parse_season_episode(sub_filename or "")
+                     if not info or info.season != season:
+                         continue
+                     ep_num = info.episode
+
+                 if ep_num in seen_episodes:
+                     continue
+
+                 # Download with retry
+                 try:
+                     logger.info(f"Downloading subtitle for S{season:02d}E{ep_num:02d}")
+                     srt_file = self._download_with_retry(subtitle)
+
+                     # Move to cache
+                     target_name = f"{show_name} - S{season:02d}E{ep_num:02d}.srt"
+                     target_path = cache_dir / target_name
+
+                     shutil.move(srt_file, target_path)
+
+                     downloaded_subtitles.append(
+                         SubtitleFile(
+                             path=target_path,
+                             language="en",
+                             episode_info=EpisodeInfo(
+                                 series_name=show_name, season=season, episode=ep_num
+                             ),
+                         )
+                     )
+                     seen_episodes.add(ep_num)
+
+                 except Exception as e:
+                     logger.error(f"Failed to download/save subtitle: {e}")
+
+             return downloaded_subtitles
+
+         except Exception as e:
+             logger.error(f"OpenSubtitles search failed: {e}")
+             return []
+
+
+ class CompositeSubtitleProvider(SubtitleProvider):
+     def __init__(self, providers: list[SubtitleProvider]):
+         self.providers = providers
+
+     def get_subtitles(
+         self, show_name: str, season: int, video_files: list[Path] = None
+     ) -> list[SubtitleFile]:
+         results = []
+
+         # Try each provider in order, but prioritize cached results
+         for i, provider in enumerate(self.providers):
+             provider_results = provider.get_subtitles(show_name, season, video_files)
+
+             # If this is the local provider and we have results, prefer them
+             if isinstance(provider, LocalSubtitleProvider) and provider_results:
+                 logger.info(
+                     f"Found {len(provider_results)} cached subtitles for {show_name} S{season:02d}"
+                 )
+                 results.extend(provider_results)
+                 # Return early if we have enough cached subtitles
+                 if (
+                     len(provider_results) >= 3
+                 ): # Arbitrary threshold for "enough" episodes
+                     logger.info("Using cached subtitles, skipping download")
+                     return results
+             else:
+                 # For non-local providers, only use if we don't have cached results
+                 if not results:
+                     logger.info(f"No cached subtitles found, trying provider {i + 1}")
+                     results.extend(provider_results)
+                 else:
+                     logger.info(
+                         "Skipping additional providers since cached subtitles are available"
+                     )
+                     break
+
+         return results
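Taken together, these classes form a small provider chain: the composite asks the local cache first and only falls back to OpenSubtitles when fewer than three cached episodes are found. Below is a minimal usage sketch, not part of the package; the cache location and show name are made up, the real wiring lives in mkv_episode_matcher/core/engine.py (not shown in this hunk), and OpenSubtitlesProvider additionally expects an API key in the saved config.

# Hypothetical wiring, for illustration only -- the actual composition is done
# by the engine (core/engine.py); the cache root below is an assumption.
from pathlib import Path

from mkv_episode_matcher.core.providers.subtitles import (
    CompositeSubtitleProvider,
    LocalSubtitleProvider,
    OpenSubtitlesProvider,
)

cache_dir = Path.home() / ".mkv-episode-matcher"  # assumed cache root

provider = CompositeSubtitleProvider(
    providers=[
        LocalSubtitleProvider(cache_dir),  # checked first: reuses <cache>/data/<Show>/*.srt
        OpenSubtitlesProvider(),           # used when the cache is thin; needs a configured API key
    ]
)

# Returns SubtitleFile objects for season 1; video_files appears unused by both
# providers in this hunk, so None is fine here.
subs = provider.get_subtitles("Some Show", season=1, video_files=None)
for sub in subs:
    print(sub.episode_info.season, sub.episode_info.episode, sub.path)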
mkv_episode_matcher/core/utils.py
@@ -0,0 +1,148 @@
+ import re
+ from pathlib import Path
+
+ import chardet
+ from loguru import logger
+
+
+ def detect_file_encoding(file_path: Path) -> str:
+     """Detect the encoding of a file using chardet."""
+     try:
+         with open(file_path, "rb") as f:
+             raw_data = f.read(min(1024 * 1024, file_path.stat().st_size))
+             result = chardet.detect(raw_data)
+             encoding = result["encoding"]
+             return encoding if encoding else "utf-8"
+     except Exception as e:
+         logger.warning(f"Error detecting encoding for {file_path}: {e}")
+         return "utf-8"
+
+
+ def read_file_with_fallback(file_path: Path, encodings: list[str] | None = None) -> str:
+     """Read a file trying multiple encodings."""
+     if encodings is None:
+         detected = detect_file_encoding(file_path)
+         encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+
+     errors = []
+     for encoding in encodings:
+         try:
+             with open(file_path, encoding=encoding) as f:
+                 return f.read()
+         except UnicodeDecodeError as e:
+             errors.append(f"{encoding}: {str(e)}")
+             continue
+
+     raise ValueError(f"Failed to read {file_path} with any encoding. Errors: {errors}")
+
+
+ class SubtitleReader:
+     """Helper class for reading and parsing subtitle files."""
+
+     @staticmethod
+     def parse_timestamp(timestamp: str) -> float:
+         """Parse SRT timestamp into seconds."""
+         hours, minutes, seconds = timestamp.replace(",", ".").split(":")
+         return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
+
+     @staticmethod
+     def read_srt_file(file_path: Path) -> str:
+         return read_file_with_fallback(file_path)
+
+     @staticmethod
+     def extract_subtitle_chunk(
+         content: str, start_time: float, end_time: float
+     ) -> list[str]:
+         """Extract subtitle text for a specific time window."""
+         text_lines = []
+         for block in content.strip().split("\n\n"):
+             lines = block.split("\n")
+             if len(lines) < 3 or "-->" not in lines[1]:
+                 continue
+             try:
+                 timestamp = lines[1]
+                 time_parts = timestamp.split(" --> ")
+                 s_stamp = SubtitleReader.parse_timestamp(time_parts[0].strip())
+                 e_stamp = SubtitleReader.parse_timestamp(time_parts[1].strip())
+
+                 if e_stamp >= start_time and s_stamp <= end_time:
+                     text_lines.append(" ".join(lines[2:]))
+             except (IndexError, ValueError):
+                 continue
+         return text_lines
+
+
+ def clean_text(text: str) -> str:
+     """Clean and normalize text for matching."""
+     text = text.lower().strip()
+     text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
+     text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
+     return " ".join(text.split())
+
+
+ import subprocess
+
+
+ def get_video_duration(video_file: Path) -> float:
+     """Get video duration using ffprobe."""
+     try:
+         result = subprocess.run(
+             [
+                 "ffprobe",
+                 "-v",
+                 "error",
+                 "-show_entries",
+                 "format=duration",
+                 "-of",
+                 "default=noprint_wrappers=1:nokey=1",
+                 str(video_file),
+             ],
+             capture_output=True,
+             text=True,
+             timeout=10,
+         )
+
+         if result.returncode != 0:
+             raise RuntimeError(f"ffprobe error: {result.stderr}")
+
+         return float(result.stdout.strip())
+     except Exception as e:
+         logger.error(f"Failed to get duration for {video_file}: {e}")
+         return 0.0
+
+
+ def extract_audio_chunk(
+     video_file: Path, start_time: float, duration: float, output_path: Path
+ ) -> Path:
+     """Extract audio chunk using ffmpeg."""
+     cmd = [
+         "ffmpeg",
+         "-ss",
+         str(start_time),
+         "-t",
+         str(duration),
+         "-i",
+         str(video_file),
+         "-vn",
+         "-sn",
+         "-dn",
+         "-acodec",
+         "pcm_s16le",
+         "-ar",
+         "16000",
+         "-ac",
+         "1",
+         "-y",
+         str(output_path),
+     ]
+     try:
+         subprocess.run(cmd, capture_output=True, check=True, timeout=30)
+         if not output_path.exists() or output_path.stat().st_size < 1024:
+             raise RuntimeError("Output file too small or missing")
+         return output_path
+     except subprocess.CalledProcessError as e:
+         logger.error(f"FFmpeg failed: {e.stderr}")
+         raise
+     except Exception as e:
+         logger.error(f"Extraction failed: {e}")
+         raise
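These utilities look like the plumbing for the audio-vs-subtitle matching step: read a cached SRT whatever its encoding, pull the dialogue that overlaps a time window, and cut the matching 16 kHz mono PCM chunk for an ASR model. A rough sketch of how they might be combined follows; the paths are hypothetical, ffprobe/ffmpeg must be on PATH, and the actual orchestration lives elsewhere in core/ (matcher/engine), not in this hunk.

# Illustrative only -- file locations are assumptions, not part of the package.
from pathlib import Path

from mkv_episode_matcher.core.utils import (
    SubtitleReader,
    clean_text,
    extract_audio_chunk,
    get_video_duration,
)

srt_path = Path("cache/data/Some Show/Some Show - S01E01.srt")  # assumed cache layout
mkv_path = Path("/media/tv/Some Show/unknown_episode.mkv")      # assumed media file

# Reference dialogue overlapping a 30-second window of the episode.
content = SubtitleReader.read_srt_file(srt_path)
window = SubtitleReader.extract_subtitle_chunk(content, start_time=300.0, end_time=330.0)
reference = [clean_text(line) for line in window]

# Matching audio window as 16 kHz mono WAV, ready for an ASR model.
if get_video_duration(mkv_path) > 330.0:
    wav = extract_audio_chunk(
        mkv_path, start_time=300.0, duration=30.0, output_path=Path("/tmp/chunk.wav")
    )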