mkv-episode-matcher 0.3.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mkv_episode_matcher/__init__.py +8 -0
  2. mkv_episode_matcher/__main__.py +2 -177
  3. mkv_episode_matcher/asr_models.py +506 -0
  4. mkv_episode_matcher/cli.py +558 -0
  5. mkv_episode_matcher/core/config_manager.py +100 -0
  6. mkv_episode_matcher/core/engine.py +577 -0
  7. mkv_episode_matcher/core/matcher.py +214 -0
  8. mkv_episode_matcher/core/models.py +91 -0
  9. mkv_episode_matcher/core/providers/asr.py +85 -0
  10. mkv_episode_matcher/core/providers/subtitles.py +341 -0
  11. mkv_episode_matcher/core/utils.py +148 -0
  12. mkv_episode_matcher/episode_identification.py +550 -118
  13. mkv_episode_matcher/subtitle_utils.py +82 -0
  14. mkv_episode_matcher/tmdb_client.py +56 -14
  15. mkv_episode_matcher/ui/flet_app.py +708 -0
  16. mkv_episode_matcher/utils.py +262 -139
  17. mkv_episode_matcher-1.0.0.dist-info/METADATA +242 -0
  18. mkv_episode_matcher-1.0.0.dist-info/RECORD +23 -0
  19. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/WHEEL +1 -1
  20. mkv_episode_matcher-1.0.0.dist-info/licenses/LICENSE +21 -0
  21. mkv_episode_matcher/config.py +0 -82
  22. mkv_episode_matcher/episode_matcher.py +0 -100
  23. mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
  24. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
  25. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
  26. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
  27. mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
  28. mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  29. mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
  30. mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
  31. mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
  32. mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
  33. mkv_episode_matcher/mkv_to_srt.py +0 -302
  34. mkv_episode_matcher/speech_to_text.py +0 -90
  35. mkv_episode_matcher-0.3.3.dist-info/METADATA +0 -125
  36. mkv_episode_matcher-0.3.3.dist-info/RECORD +0 -25
  37. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/entry_points.txt +0 -0
  38. {mkv_episode_matcher-0.3.3.dist-info → mkv_episode_matcher-1.0.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/core/engine.py (new file)
@@ -0,0 +1,577 @@
"""
Enhanced MKV Episode Matcher Engine V2

This module provides the core matching engine with:
- Optimized Parakeet ASR singleton
- Enhanced caching system
- Automatic subtitle acquisition
- Progress tracking and rich output
- Multiple use case support
"""

import hashlib
import json
import re
from collections.abc import Generator
from pathlib import Path
from typing import Any

from loguru import logger
from rich.console import Console
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    TextColumn,
    TimeRemainingColumn,
)

from mkv_episode_matcher.core.matcher import MultiSegmentMatcher
from mkv_episode_matcher.core.models import Config, MatchResult
from mkv_episode_matcher.core.providers.asr import get_asr_provider
from mkv_episode_matcher.core.providers.subtitles import (
    CompositeSubtitleProvider,
    LocalSubtitleProvider,
    OpenSubtitlesProvider,
)


class CacheManager:
    """Enhanced caching system with memory bounds and LRU eviction."""

    def __init__(self, cache_dir: Path, max_memory_mb: int = 512, max_items: int = 100):
        self.cache_dir = cache_dir
        self.memory_cache = {}
        self.access_order = {}  # Track access times for LRU
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.max_memory_bytes = max_memory_mb * 1024 * 1024
        self.max_items = max_items
        self.current_memory = 0

    def _estimate_size(self, value) -> int:
        """Estimate memory usage of cached object."""
        import sys

        # Check container types first: every object has __sizeof__, so a
        # hasattr(value, "__sizeof__") guard would make the branches below
        # unreachable and undercount containers.
        if isinstance(value, (str, bytes)):
            return len(value) * (4 if isinstance(value, str) else 1)
        elif isinstance(value, dict):
            return sum(
                self._estimate_size(k) + self._estimate_size(v)
                for k, v in value.items()
            )
        elif isinstance(value, (list, tuple)):
            return sum(self._estimate_size(item) for item in value)
        else:
            return sys.getsizeof(value)

    def _evict_lru(self):
        """Evict least recently used items until under limits."""
        if not self.access_order:
            return

        # Sort by access time (oldest first)
        sorted_items = sorted(self.access_order.items(), key=lambda x: x[1])

        while (
            len(self.memory_cache) > self.max_items
            or self.current_memory > self.max_memory_bytes
        ) and sorted_items:
            key_to_remove = sorted_items.pop(0)[0]
            if key_to_remove in self.memory_cache:
                value = self.memory_cache[key_to_remove]
                self.current_memory -= self._estimate_size(value)
                del self.memory_cache[key_to_remove]
                del self.access_order[key_to_remove]

    def get(self, key: str) -> Any | None:
        """Get item from memory cache with LRU tracking."""
        import time

        if key in self.memory_cache:
            self.access_order[key] = time.time()
            return self.memory_cache[key]
        return None

    def set(self, key: str, value: Any, ttl: int = 3600) -> None:
        """Set item in memory cache with bounds checking."""
        import time

        value_size = self._estimate_size(value)

        # Don't cache items that are too large
        if value_size > self.max_memory_bytes * 0.5:
            logger.warning(f"Item too large to cache: {value_size} bytes")
            return

        # Update existing item
        if key in self.memory_cache:
            old_size = self._estimate_size(self.memory_cache[key])
            self.current_memory -= old_size

        self.memory_cache[key] = value
        self.current_memory += value_size
        self.access_order[key] = time.time()

        # Evict if necessary
        self._evict_lru()

    def clear(self) -> None:
        """Clear all cached items."""
        self.memory_cache.clear()
        self.access_order.clear()
        self.current_memory = 0

    def get_stats(self) -> dict:
        """Get cache statistics."""
        return {
            "items": len(self.memory_cache),
            "memory_mb": self.current_memory / (1024 * 1024),
            "max_memory_mb": self.max_memory_bytes / (1024 * 1024),
            "max_items": self.max_items,
        }

    def get_file_hash(self, file_path: Path) -> str:
        """Generate hash for file caching."""
        stat = file_path.stat()
        return hashlib.md5(
            f"{file_path}_{stat.st_mtime}_{stat.st_size}".encode()
        ).hexdigest()

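
# Editor's sketch (not part of the packaged file): a minimal illustration of
# the LRU bounds above, using hypothetical keys and a deliberately tiny budget.
_cache = CacheManager(Path("/tmp/mkv-cache"), max_memory_mb=1, max_items=2)
_cache.set("a", "x" * 1000)
_cache.set("b", "y" * 1000)
_cache.set("c", "z" * 1000)  # third item exceeds max_items; "a" (oldest) is evicted
assert _cache.get("a") is None and _cache.get("b") is not None
print(_cache.get_stats())  # e.g. {'items': 2, 'memory_mb': 0.0076, ...}
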

class MatchEngineV2:
    """Enhanced matching engine with optimized performance and workflow."""

    def __init__(self, config: Config):
        self.config = config
        self.console = Console()

        # Initialize ASR provider (singleton pattern for Parakeet optimization)
        logger.info(f"Initializing ASR provider: {config.asr_provider}")
        self.asr = get_asr_provider(config.asr_provider)
        # Pre-load the model to avoid repeated loading delays
        self.asr.load()
        logger.success("ASR provider loaded and ready")

        # Initialize cache manager for enhanced caching
        self.cache = CacheManager(config.cache_dir)

        # Initialize subtitle providers with fallback chain
        self._init_subtitle_providers()

        # Initialize matcher
        self.matcher = MultiSegmentMatcher(self.asr)

    def _init_subtitle_providers(self):
        """Initialize subtitle providers with fallback chain."""
        providers = []

        # Always include local provider first
        providers.append(LocalSubtitleProvider(self.config.cache_dir))

        # Add OpenSubtitles provider if enabled
        if self.config.sub_provider == "opensubtitles":
            providers.append(OpenSubtitlesProvider())
            logger.info("OpenSubtitles provider enabled")

        self.subtitle_provider = CompositeSubtitleProvider(providers)

    def scan_for_mkv(
        self, path: Path, recursive: bool = True
    ) -> Generator[Path, None, None]:
        """Scan for MKV files with optional recursive search."""
        if path.is_file() and path.suffix.lower() == ".mkv":
            yield path
        elif path.is_dir():
            if recursive:
                yield from path.rglob("*.mkv")
            else:
                yield from path.glob("*.mkv")

    def _detect_context(self, video_file: Path) -> tuple[str | None, int | None]:
        """Detect show name and season from file path using multiple heuristics."""
        show_name = None
        season = None

        # Heuristic 1: Standard folder structure (Show/Season X/file.mkv)
        try:
            parent_name = video_file.parent.name
            if "season" in parent_name.lower():
                # Extract season number
                s_match = re.search(r"(\d+)", parent_name)
                if s_match:
                    season = int(s_match.group(1))
                    show_name = video_file.parent.parent.name
        except Exception:
            pass

        # Heuristic 2: Season folder with S## pattern
        if not season:
            try:
                parent_name = video_file.parent.name
                s_match = re.search(r"[Ss](\d{1,2})", parent_name)
                if s_match:
                    season = int(s_match.group(1))
                    show_name = video_file.parent.parent.name
            except Exception:
                pass

        # Heuristic 3: Show directory structure (if config.show_dir is set)
        if not show_name and self.config.show_dir:
            try:
                if str(self.config.show_dir) in str(video_file):
                    rel = video_file.relative_to(self.config.show_dir)
                    if len(rel.parts) >= 2:  # show/season/file.mkv
                        show_name = rel.parts[0]
                        season_part = rel.parts[1]
                        if "season" in season_part.lower():
                            s_match = re.search(r"(\d+)", season_part)
                            if s_match:
                                season = int(s_match.group(1))
            except Exception:
                pass

        # Heuristic 4: Extract from filename itself
        if not season:
            filename = video_file.stem
            # Look for S##E## or ##x## patterns
            patterns = [
                r"[Ss](\d{1,2})[Ee]\d{1,2}",  # S01E01
                r"(\d{1,2})x\d{1,2}",  # 1x01
                r"Season[\s\.]*(\d{1,2})",  # Season 1
            ]
            for pattern in patterns:
                match = re.search(pattern, filename)
                if match:
                    season = int(match.group(1))
                    break

        # Clean show name
        if show_name:
            show_name = re.sub(r"[^\w\s-]", "", show_name).strip()

        return show_name, season

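    # Editor's sketch (hypothetical paths): how the heuristics above resolve
    # context for typical library layouts:
    #   Foo/Season 2/ep.mkv            -> heuristic 1 -> ("Foo", 2)
    #   Foo/S02/ep.mkv                 -> heuristic 2 -> ("Foo", 2)
    #   misc/Foo Season 3 finale.mkv   -> heuristic 4 -> season 3, show name stays None
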
    def _is_already_processed(self, video_file: Path) -> bool:
        """Check if file is already processed (has SXXEXX format in filename)."""
        filename = video_file.stem
        # Check for S##E## pattern
        if re.search(r"[Ss]\d{1,2}[Ee]\d{1,2}", filename):
            return True
        # Check for ##x## pattern
        if re.search(r"\d{1,2}x\d{1,2}", filename):
            return True
        return False

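    # Editor's sketch: filenames the check above treats as already processed:
    #   "Foo - S01E04.mkv" -> True, "Foo 1x04.mkv" -> True, "episode4.mkv" -> False
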
    def _group_files_by_series(
        self, files: list[Path], season_override: int | None
    ) -> dict[tuple[str, int], list[Path]]:
        """Group files by series and season for batch processing."""
        groups = {}
        skipped = []

        for video_file in files:
            # Check if file is already processed
            if self._is_already_processed(video_file):
                logger.info(f"Skipping already processed file: {video_file.name}")
                skipped.append(video_file)
                continue

            show_name, season = self._detect_context(video_file)

            if season_override:
                season = season_override

            if not show_name or not season:
                logger.warning(f"Could not determine context for {video_file.name}")
                continue

            key = (show_name, season)
            if key not in groups:
                groups[key] = []
            groups[key].append(video_file)

        if skipped:
            logger.info(f"Skipped {len(skipped)} already processed files")

        return groups

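    # Editor's sketch (hypothetical files): the resulting group shape, keyed by
    # (show_name, season):
    #   {("Foo", 1): [Path("Foo/Season 1/a.mkv"), Path("Foo/Season 1/b.mkv")],
    #    ("Bar", 2): [Path("Bar/S02/c.mkv")]}
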
    def _get_subtitles_with_fallback(
        self, show_name: str, season: int, video_files: list[Path] | None = None
    ):
        """Get subtitles with fallback chain (local -> subliminal)."""
        # Try to get from cache first
        cache_key = f"subtitles_{show_name}_{season}"
        cached_subs = self.cache.get(cache_key)
        if cached_subs:
            logger.debug(f"Using cached subtitles for {show_name} S{season:02d}")
            return cached_subs

        # Get subtitles from providers (pass video files for Subliminal)
        logger.info(f"Fetching subtitles for {show_name} S{season:02d}")
        subs = self.subtitle_provider.get_subtitles(show_name, season, video_files)

        # Cache results
        if subs:
            self.cache.set(cache_key, subs)
            logger.success(
                f"Found {len(subs)} subtitle files for {show_name} S{season:02d}"
            )
        else:
            logger.warning(f"No subtitles found for {show_name} S{season:02d}")

        return subs

    def _perform_rename(
        self, match: MatchResult, output_dir: Path | None = None
    ) -> Path | None:
        """Perform file rename with enhanced logic and output directory support."""
        original_path = match.matched_file

        # Generate new filename
        title_part = (
            f" - {match.episode_info.title}" if match.episode_info.title else ""
        )
        new_filename = f"{match.episode_info.series_name} - {match.episode_info.s_e_format}{title_part}{original_path.suffix}"

        # Clean filename (strip characters invalid on common filesystems)
        new_filename = re.sub(r'[<>:"/\\|?*]', "", new_filename).strip()

        # Determine output path
        if output_dir:
            output_dir.mkdir(parents=True, exist_ok=True)
            new_path = output_dir / new_filename
        else:
            new_path = original_path.parent / new_filename

        if new_path == original_path:
            logger.debug("File already named correctly")
            return new_path

        try:
            if new_path.exists():
                logger.warning(f"Destination exists: {new_path}")
                return None

            if output_dir:
                # Copy to output directory
                import shutil

                shutil.copy2(original_path, new_path)
            else:
                # Rename in place
                original_path.rename(new_path)

            logger.success(
                f"{'Copied' if output_dir else 'Renamed'} to: {new_filename}"
            )
            match.matched_file = new_path
            return new_path

        except Exception as e:
            logger.error(
                f"Failed to {'copy' if output_dir else 'rename'} {original_path.name}: {e}"
            )
            return None

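    # Editor's sketch: the naming scheme above with hypothetical values.
    # series_name "Foo", s_e_format "S01E04", title "Pilot" assemble to
    #   "Foo - S01E04 - Pilot.mkv"
    # and re.sub(r'[<>:"/\\|?*]', "", ...) then strips characters that are
    # invalid on common filesystems (e.g. a colon inside an episode title).
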
    def process_path(
        self,
        path: Path,
        season_override: int | None = None,
        recursive: bool = True,
        dry_run: bool = False,
        output_dir: Path | None = None,
        json_output: bool = False,
        confidence_threshold: float | None = None,
        progress_callback=None,
    ) -> tuple[list[MatchResult], list]:
        """
        Process path for MKV files with enhanced workflow and progress tracking.

        Args:
            path: Path to file or directory to process
            season_override: Force specific season number
            recursive: Whether to search recursively in directories
            dry_run: If True, don't actually rename files
            output_dir: Directory to copy renamed files to (instead of renaming in place)
            json_output: If True, suppress rich console output for JSON mode
            confidence_threshold: Minimum confidence score for matches
            progress_callback: Optional callable invoked as callback(files_processed, total_files)

        Returns:
            Tuple of (successful matches, failed matches)
        """
        if confidence_threshold is None:
            confidence_threshold = self.config.min_confidence

        results = []
        failures = []
        files = list(self.scan_for_mkv(path, recursive))

        if not files:
            if not json_output:
                self.console.print(f"[yellow]No MKV files found in {path}[/yellow]")
            return [], []  # match the declared tuple return type

        # Group files by series for batch processing
        file_groups = self._group_files_by_series(files, season_override)

        if not json_output:
            self.console.print(
                f"[blue]Found {len(files)} MKV files in {len(file_groups)} series/seasons[/blue]"
            )

        # Track total files for progress callback
        total_files = len(files)
        files_processed = 0

        with Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            TimeRemainingColumn(),
            console=self.console,
            disable=json_output,
        ) as progress:
            main_task = progress.add_task("Processing files...", total=total_files)

            for group_info, group_files in file_groups.items():
                show_name, season = group_info

                progress.update(
                    main_task, description=f"Processing {show_name} S{season:02d}"
                )

                # Get subtitles for this series/season (pass video files for Subliminal)
                subs = self._get_subtitles_with_fallback(show_name, season, group_files)

                if not subs:
                    if not json_output:
                        self.console.print(
                            f"[yellow]No subtitles found for {show_name} S{season:02d} - skipping {len(group_files)} files[/yellow]"
                        )
                    # Mark all files in group as failed
                    from mkv_episode_matcher.core.models import FailedMatch

                    for f in group_files:
                        failures.append(
                            FailedMatch(
                                original_file=f,
                                reason=f"No subtitles found for {show_name} S{season:02d}",
                                series_name=show_name,
                                season=season,
                            )
                        )
                        files_processed += 1
                        # Call progress callback if provided
                        if progress_callback:
                            progress_callback(files_processed, total_files)
                    progress.advance(main_task, len(group_files))
                    continue

                # Process files in this group
                for video_file in group_files:
                    try:
                        match = self.matcher.match(video_file, subs)
                        if match and match.confidence >= confidence_threshold:
                            match.episode_info.series_name = show_name
                            results.append(match)

                            if not json_output:
                                self.console.print(
                                    f"[green]SUCCESS[/green] {video_file.name} -> "
                                    f"{match.episode_info.s_e_format} "
                                    f"(Confidence: {match.confidence:.2f})"
                                )

                            # Perform rename if not dry run
                            if not dry_run:
                                logger.debug(f"Attempting to rename {video_file.name}")
                                renamed_path = self._perform_rename(match, output_dir)
                                if renamed_path:
                                    match.matched_file = renamed_path
                                    logger.info(
                                        f"File successfully renamed: {video_file.name} -> {renamed_path.name}"
                                    )
                                else:
                                    logger.warning(
                                        f"Failed to rename {video_file.name}"
                                    )
                            else:
                                logger.debug(
                                    f"Dry run mode - skipping rename for {video_file.name}"
                                )

                        else:
                            if not json_output:
                                conf_str = (
                                    f" (conf: {match.confidence:.2f})" if match else ""
                                )
                                self.console.print(
                                    f"[red]FAILED[/red] {video_file.name} - No match{conf_str}"
                                )

                            from mkv_episode_matcher.core.models import FailedMatch

                            failures.append(
                                FailedMatch(
                                    original_file=video_file,
                                    reason=f"Low confidence match{f' ({match.confidence:.2f})' if match else ''} or no match found",
                                    confidence=match.confidence if match else 0.0,
                                    series_name=show_name,
                                    season=season,
                                )
                            )

                    except Exception as e:
                        logger.error(f"Error processing {video_file}: {e}")
                        if not json_output:
                            self.console.print(
                                f"[red]ERROR[/red] {video_file.name} - Error: {e}"
                            )

                    # Update progress after each file
                    files_processed += 1
                    if progress_callback:
                        # Call the progress callback once per file to avoid overwhelming the UI
                        progress_callback(files_processed, total_files)
                    progress.advance(main_task)

        return results, failures

    def export_results(self, results: list[MatchResult], format: str = "json") -> str:
        """Export results in various formats for automation."""
        if format == "json":
            export_data = []
            for result in results:
                export_data.append({
                    "original_file": str(result.matched_file),
                    "series_name": result.episode_info.series_name,
                    "season": result.episode_info.season,
                    "episode": result.episode_info.episode,
                    "episode_format": result.episode_info.s_e_format,
                    "title": result.episode_info.title,
                    "confidence": result.confidence,
                    "model_name": result.model_name,
                })
            return json.dumps(export_data, indent=2)
        else:
            raise ValueError(f"Unsupported export format: {format}")

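    # Editor's sketch: shape of the JSON emitted above (values are hypothetical):
    #   [
    #     {
    #       "original_file": "/tv/Foo/Season 1/Foo - S01E04 - Pilot.mkv",
    #       "series_name": "Foo",
    #       "season": 1,
    #       "episode": 4,
    #       "episode_format": "S01E04",
    #       "title": "Pilot",
    #       "confidence": 0.91,
    #       "model_name": "parakeet"
    #     }
    #   ]
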
    def process_single_file(self, file_path: Path, **kwargs) -> MatchResult | None:
        """Process a single MKV file - convenience method."""
        matches, _ = self.process_path(file_path, **kwargs)
        return matches[0] if matches else None

    def process_library(self, library_path: Path, **kwargs) -> list[MatchResult]:
        """Process entire library - convenience method."""
        matches, _ = self.process_path(library_path, recursive=True, **kwargs)
        return matches


# Alias for backward compatibility
MatchEngine = MatchEngineV2
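
# Editor's sketch: one plausible way to drive the engine end to end. The
# Config constructor lives in mkv_episode_matcher/core/models.py and is not
# shown in this hunk, so its arguments are left open here.
#
#   from mkv_episode_matcher.core.models import Config
#
#   config = Config(...)                   # fill in per models.py
#   engine = MatchEngine(config)           # loads the ASR model once, up front
#   matches, failures = engine.process_path(
#       Path("/tv/Foo"), dry_run=True, confidence_threshold=0.7
#   )
#   print(engine.export_results(matches))  # machine-readable JSON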