mkv-episode-matcher 0.3.4__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mkv-episode-matcher might be problematic. Click here for more details.
- mkv_episode_matcher-0.3.6/.coverage +0 -0
- mkv_episode_matcher-0.3.6/.github/workflows/tests.yml +40 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/PKG-INFO +30 -28
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/README.md +28 -26
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/docs/cli.md +9 -27
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/docs/quickstart.md +15 -18
- mkv_episode_matcher-0.3.6/mkv_episode_matcher/episode_identification.py +282 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/episode_matcher.py +8 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/utils.py +14 -4
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/PKG-INFO +30 -28
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/SOURCES.txt +2 -1
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/pyproject.toml +1 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/setup.cfg +1 -1
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/tests/test_main.py +48 -18
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/uv.lock +12 -1
- mkv_episode_matcher-0.3.4/mkv_episode_matcher/episode_identification.py +0 -150
- mkv_episode_matcher-0.3.4/tests/test_improvements.py +0 -59
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.gitattributes +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.github/funding.yml +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.github/workflows/documentation.yml +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.github/workflows/python-publish.yml +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.gitignore +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.gitmodules +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.python-version +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.vscode/settings.json +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/docs/api/index.md +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/docs/configuration.md +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/docs/installation.md +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/docs/tips.md +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkdocs.yml +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/.gitattributes +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/__init__.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/__main__.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/config.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/README.md +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/mkv_to_srt.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/speech_to_text.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/subtitle_utils.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/tmdb_client.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/dependency_links.txt +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/entry_points.txt +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/requires.txt +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/top_level.txt +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/setup.py +0 -0
- {mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/tests/__init__.py +0 -0
|
Binary file
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, master]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main, master]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version:
|
|
15
|
+
- "3.9"
|
|
16
|
+
- "3.10"
|
|
17
|
+
- "3.11"
|
|
18
|
+
- "3.12"
|
|
19
|
+
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v4
|
|
22
|
+
|
|
23
|
+
- name: Install uv and set the python version
|
|
24
|
+
uses: astral-sh/setup-uv@v4
|
|
25
|
+
with:
|
|
26
|
+
python-version: ${{ matrix.python-version }}
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: |
|
|
30
|
+
uv venv
|
|
31
|
+
uv pip install -e .
|
|
32
|
+
|
|
33
|
+
- name: Run tests with pytest and coverage
|
|
34
|
+
run: |
|
|
35
|
+
# pytest-cov only measures coverage when --cov is given; --cov-branch and
# --cov-report on their own leave nothing for the Codecov upload step
# (NOTE(review): unless --cov is already supplied via pytest addopts - confirm).
uv run --dev pytest --cov --cov-branch --cov-report=xml
|
|
36
|
+
|
|
37
|
+
- name: Upload coverage reports to Codecov
|
|
38
|
+
uses: codecov/codecov-action@v5
|
|
39
|
+
with:
|
|
40
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: mkv-episode-matcher
|
|
3
|
-
Version: 0.3.4
|
|
3
|
+
Version: 0.3.6
|
|
4
4
|
Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
5
5
|
Home-page: https://github.com/Jsakkos/mkv-episode-matcher
|
|
6
6
|
Author: Jonathan Sakkos
|
|
@@ -51,46 +51,31 @@ Automatically match and rename your MKV TV episodes using The Movie Database (TM
|
|
|
51
51
|
- ✨ **Bulk Processing**: Handle entire seasons at once
|
|
52
52
|
- 🧪 **Dry Run Mode**: Test changes before applying
|
|
53
53
|
|
|
54
|
+
## Prerequisites
|
|
55
|
+
|
|
56
|
+
- Python 3.9 or higher
|
|
57
|
+
- [FFmpeg](https://ffmpeg.org/download.html) installed and available in system PATH
|
|
58
|
+
- [Tesseract OCR](https://github.com/UB-Mannheim/tesseract/wiki) installed (required for image-based subtitle processing)
|
|
59
|
+
- TMDb API key
|
|
60
|
+
- OpenSubtitles account (optional, for subtitle downloads)
|
|
61
|
+
|
|
54
62
|
## Quick Start
|
|
55
63
|
|
|
56
64
|
1. Install the package:
|
|
57
65
|
```bash
|
|
58
66
|
pip install mkv-episode-matcher
|
|
59
67
|
```
|
|
68
|
+
2. Download .srt subtitle files to ~/.mkv-episode-matcher/cache/data/Show Name/
|
|
60
69
|
|
|
61
|
-
|
|
70
|
+
3. Run on your show directory:
|
|
62
71
|
```bash
|
|
63
|
-
mkv-match --show-dir "path/to/your/show"
|
|
72
|
+
mkv-match --show-dir "path/to/your/show"
|
|
64
73
|
```
|
|
65
74
|
|
|
66
|
-
## Requirements
|
|
67
|
-
|
|
68
|
-
- Python 3.8 or higher
|
|
69
|
-
- TMDb API key
|
|
70
|
-
- OpenSubtitles account (optional, for subtitle downloads)
|
|
71
|
-
|
|
72
75
|
## Documentation
|
|
73
76
|
|
|
74
77
|
Full documentation is available at [https://jsakkos.github.io/mkv-episode-matcher/](https://jsakkos.github.io/mkv-episode-matcher/)
|
|
75
78
|
|
|
76
|
-
## Basic Usage
|
|
77
|
-
|
|
78
|
-
```python
|
|
79
|
-
from mkv_episode_matcher import process_show
|
|
80
|
-
|
|
81
|
-
# Process all seasons
|
|
82
|
-
process_show()
|
|
83
|
-
|
|
84
|
-
# Process specific season
|
|
85
|
-
process_show(season=1)
|
|
86
|
-
|
|
87
|
-
# Test run without making changes
|
|
88
|
-
process_show(season=1, dry_run=True)
|
|
89
|
-
|
|
90
|
-
# Process and download subtitles
|
|
91
|
-
process_show(get_subs=True)
|
|
92
|
-
```
|
|
93
|
-
|
|
94
79
|
## Directory Structure
|
|
95
80
|
|
|
96
81
|
MKV Episode Matcher expects your TV shows to be organized as follows:
|
|
@@ -105,6 +90,23 @@ Show Name/
|
|
|
105
90
|
│ └── episode2.mkv
|
|
106
91
|
```
|
|
107
92
|
|
|
93
|
+
## Reference Subtitle File Structure
|
|
94
|
+
|
|
95
|
+
Subtitle files that are not automatically downloaded using the `--get-subs` flag should be named as follows:
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
~/.mkv-episode-matcher/cache/data/Show Name/
|
|
100
|
+
├── Show Name - S01E01.srt
|
|
101
|
+
├── Show Name - S01E02.srt
|
|
102
|
+
└── ...
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
On Windows, the cache directory is located at `C:\Users\{username}\.mkv-episode-matcher\cache\data\`
|
|
106
|
+
|
|
107
|
+
Reference subtitle files should follow this naming pattern:
|
|
108
|
+
`{show_name} - S{season:02d}E{episode:02d}.srt`
|
|
109
|
+
|
|
108
110
|
## Contributing
|
|
109
111
|
|
|
110
112
|
1. Fork the repository
|
|
@@ -22,46 +22,31 @@ Automatically match and rename your MKV TV episodes using The Movie Database (TM
|
|
|
22
22
|
- ✨ **Bulk Processing**: Handle entire seasons at once
|
|
23
23
|
- 🧪 **Dry Run Mode**: Test changes before applying
|
|
24
24
|
|
|
25
|
+
## Prerequisites
|
|
26
|
+
|
|
27
|
+
- Python 3.9 or higher
|
|
28
|
+
- [FFmpeg](https://ffmpeg.org/download.html) installed and available in system PATH
|
|
29
|
+
- [Tesseract OCR](https://github.com/UB-Mannheim/tesseract/wiki) installed (required for image-based subtitle processing)
|
|
30
|
+
- TMDb API key
|
|
31
|
+
- OpenSubtitles account (optional, for subtitle downloads)
|
|
32
|
+
|
|
25
33
|
## Quick Start
|
|
26
34
|
|
|
27
35
|
1. Install the package:
|
|
28
36
|
```bash
|
|
29
37
|
pip install mkv-episode-matcher
|
|
30
38
|
```
|
|
39
|
+
2. Download .srt subtitle files to ~/.mkv-episode-matcher/cache/data/Show Name/
|
|
31
40
|
|
|
32
|
-
|
|
41
|
+
3. Run on your show directory:
|
|
33
42
|
```bash
|
|
34
|
-
mkv-match --show-dir "path/to/your/show"
|
|
43
|
+
mkv-match --show-dir "path/to/your/show"
|
|
35
44
|
```
|
|
36
45
|
|
|
37
|
-
## Requirements
|
|
38
|
-
|
|
39
|
-
- Python 3.8 or higher
|
|
40
|
-
- TMDb API key
|
|
41
|
-
- OpenSubtitles account (optional, for subtitle downloads)
|
|
42
|
-
|
|
43
46
|
## Documentation
|
|
44
47
|
|
|
45
48
|
Full documentation is available at [https://jsakkos.github.io/mkv-episode-matcher/](https://jsakkos.github.io/mkv-episode-matcher/)
|
|
46
49
|
|
|
47
|
-
## Basic Usage
|
|
48
|
-
|
|
49
|
-
```python
|
|
50
|
-
from mkv_episode_matcher import process_show
|
|
51
|
-
|
|
52
|
-
# Process all seasons
|
|
53
|
-
process_show()
|
|
54
|
-
|
|
55
|
-
# Process specific season
|
|
56
|
-
process_show(season=1)
|
|
57
|
-
|
|
58
|
-
# Test run without making changes
|
|
59
|
-
process_show(season=1, dry_run=True)
|
|
60
|
-
|
|
61
|
-
# Process and download subtitles
|
|
62
|
-
process_show(get_subs=True)
|
|
63
|
-
```
|
|
64
|
-
|
|
65
50
|
## Directory Structure
|
|
66
51
|
|
|
67
52
|
MKV Episode Matcher expects your TV shows to be organized as follows:
|
|
@@ -76,6 +61,23 @@ Show Name/
|
|
|
76
61
|
│ └── episode2.mkv
|
|
77
62
|
```
|
|
78
63
|
|
|
64
|
+
## Reference Subtitle File Structure
|
|
65
|
+
|
|
66
|
+
Subtitle files that are not automatically downloaded using the `--get-subs` flag should be named as follows:
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
~/.mkv-episode-matcher/cache/data/Show Name/
|
|
71
|
+
├── Show Name - S01E01.srt
|
|
72
|
+
├── Show Name - S01E02.srt
|
|
73
|
+
└── ...
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
On Windows, the cache directory is located at `C:\Users\{username}\.mkv-episode-matcher\cache\data\`
|
|
77
|
+
|
|
78
|
+
Reference subtitle files should follow this naming pattern:
|
|
79
|
+
`{show_name} - S{season:02d}E{episode:02d}.srt`
|
|
80
|
+
|
|
79
81
|
## Contributing
|
|
80
82
|
|
|
81
83
|
1. Fork the repository
|
|
@@ -16,14 +16,14 @@ mkv-match --show-dir "/path/to/show" --season 1
|
|
|
16
16
|
|
|
17
17
|
## Command Options
|
|
18
18
|
|
|
19
|
-
| Option
|
|
20
|
-
|
|
21
|
-
| `--show-dir`
|
|
22
|
-
| `--season`
|
|
23
|
-
| `--dry-run`
|
|
24
|
-
| `--get-subs`
|
|
25
|
-
| `--tmdb-api-key`
|
|
26
|
-
| `--tesseract-path` | Path to Tesseract
|
|
19
|
+
| Option | Description | Default |
|
|
20
|
+
| ------------------ | --------------------------- | ---------- |
|
|
21
|
+
| `--show-dir` | Show directory path | None |
|
|
22
|
+
| `--season` | Season number to process | None (all) |
|
|
23
|
+
| `--dry-run` | Test without making changes | False |
|
|
24
|
+
| `--get-subs` | Download subtitles | False |
|
|
25
|
+
| `--tmdb-api-key` | TMDb API key | None |
|
|
26
|
+
| `--tesseract-path` | Path to Tesseract | None |
|
|
27
27
|
|
|
28
28
|
## Examples
|
|
29
29
|
|
|
@@ -55,25 +55,6 @@ mkv-match \
|
|
|
55
55
|
--dry-run true
|
|
56
56
|
```
|
|
57
57
|
|
|
58
|
-
## Environment Variables
|
|
59
|
-
|
|
60
|
-
Alternative to command line options:
|
|
61
|
-
|
|
62
|
-
```bash
|
|
63
|
-
export TMDB_API_KEY="your_key"
|
|
64
|
-
export SHOW_DIR="/path/to/shows"
|
|
65
|
-
mkv-match
|
|
66
|
-
```
|
|
67
|
-
|
|
68
|
-
## Exit Codes
|
|
69
|
-
|
|
70
|
-
| Code | Meaning |
|
|
71
|
-
|------|---------|
|
|
72
|
-
| 0 | Success |
|
|
73
|
-
| 1 | General error |
|
|
74
|
-
| 2 | Configuration error |
|
|
75
|
-
| 3 | API error |
|
|
76
|
-
|
|
77
58
|
## Logging
|
|
78
59
|
|
|
79
60
|
Logs are stored in:
|
|
@@ -89,3 +70,4 @@ Logs are stored in:
|
|
|
89
70
|
2. Use dry-run first to test
|
|
90
71
|
3. Check logs for details
|
|
91
72
|
4. Use full paths for reliability
|
|
73
|
+
5. Avoid using a trailing slash in paths
|
|
@@ -42,34 +42,31 @@ Show Name/
|
|
|
42
42
|
│ └── episode2.mkv
|
|
43
43
|
```
|
|
44
44
|
|
|
45
|
-
##
|
|
45
|
+
## Reference Subtitle File Structure
|
|
46
46
|
|
|
47
|
-
|
|
48
|
-
from mkv_episode_matcher import process_show
|
|
47
|
+
Subtitle files that are not automatically downloaded using the `--get-subs` flag should be named as follows:
|
|
49
48
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
# Test run
|
|
57
|
-
process_show(season=1, dry_run=True)
|
|
58
|
-
|
|
59
|
-
# With subtitles
|
|
60
|
-
process_show(season=1, get_subs=True)
|
|
49
|
+
```plaintext
|
|
50
|
+
~/.mkv-episode-matcher/cache/data/Show Name/
|
|
51
|
+
├── Show Name - S01E01.srt
|
|
52
|
+
├── Show Name - S01E02.srt
|
|
53
|
+
└── ...
|
|
61
54
|
```
|
|
62
55
|
|
|
63
56
|
## Configuration
|
|
64
57
|
|
|
65
|
-
|
|
58
|
+
The configuration file is automatically generated at `~/.mkv-episode-matcher/config.ini`:
|
|
66
59
|
|
|
67
60
|
```ini
|
|
68
61
|
[Config]
|
|
69
|
-
tmdb_api_key =
|
|
70
|
-
|
|
71
|
-
show_dir = /path/to/shows
|
|
62
|
+
tmdb_api_key = your_tmdb_api_key
|
|
63
|
+
show_dir = /path/to/show
|
|
72
64
|
max_threads = 4
|
|
65
|
+
open_subtitles_api_key = your_opensubs_key
|
|
66
|
+
open_subtitles_user_agent = your_user_agent
|
|
67
|
+
open_subtitles_username = your_username
|
|
68
|
+
open_subtitles_password = your_password
|
|
69
|
+
tesseract_path = C:\Program Files\Tesseract-OCR\tesseract.exe
|
|
73
70
|
```
|
|
74
71
|
|
|
75
72
|
## Next Steps
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
import tempfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import torch
|
|
7
|
+
from rapidfuzz import fuzz
|
|
8
|
+
from loguru import logger
|
|
9
|
+
import whisper
|
|
10
|
+
import numpy as np
|
|
11
|
+
import re
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
import chardet
|
|
14
|
+
from loguru import logger
|
|
15
|
+
|
|
16
|
+
class EpisodeMatcher:
    """Identify the episode in a video by matching its speech to reference subtitles.

    Audio is cut from the video in fixed-length chunks with ffmpeg, transcribed
    with Whisper, and fuzzy-compared against the same time window of every
    reference ``.srt`` file for the requested season found under
    ``<cache_dir>/data/<show_name>``.
    """

    def __init__(self, cache_dir, show_name, min_confidence=0.6):
        """
        Args:
            cache_dir (str or Path): Cache root containing ``data/<show_name>``
            show_name (str): Name of the reference subtitle directory
            min_confidence (float): Score (0-1) a match must exceed to be accepted
        """
        self.cache_dir = Path(cache_dir)
        self.min_confidence = min_confidence
        self.show_name = show_name
        self.chunk_duration = 300  # 5 minutes
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
        self.temp_dir.mkdir(exist_ok=True)

    def clean_text(self, text):
        """Lower-case text, drop [bracketed]/<tagged> spans and collapse whitespace."""
        text = text.lower().strip()
        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
        # Collapse a stuttered letter such as "h-hello" into "hello".
        text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
        return ' '.join(text.split())

    def chunk_score(self, whisper_chunk, ref_chunk):
        """
        Score how well a transcribed chunk matches a reference chunk.

        Args:
            whisper_chunk (str): Text transcribed from the audio chunk
            ref_chunk (str): Reference subtitle text for the same time window

        Returns:
            float: Similarity between 0.0 and 1.0
        """
        whisper_clean = self.clean_text(whisper_chunk)
        ref_clean = self.clean_text(ref_chunk)
        # An empty side carries no evidence (silence, or a reference file that
        # failed to load), so never let the fuzzy matcher score it as a match.
        if not whisper_clean or not ref_clean:
            return 0.0
        return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
                fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0

    def extract_audio_chunk(self, mkv_file, start_time):
        """
        Extract a chunk of audio from MKV file.

        Args:
            mkv_file (str or Path): Video file to read
            start_time (int or float): Chunk start offset in seconds

        Returns:
            str: Path of the 16 kHz mono WAV chunk inside ``self.temp_dir``
        """
        chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
        if not chunk_path.exists():
            cmd = [
                'ffmpeg',
                '-ss', str(start_time),
                '-t', str(self.chunk_duration),
                '-i', str(mkv_file),
                '-vn',
                '-acodec', 'pcm_s16le',
                '-ar', '16000',
                '-ac', '1',
                str(chunk_path)
            ]
            result = subprocess.run(cmd, capture_output=True)
            if result.returncode != 0:
                # Surface ffmpeg's own message instead of failing later with an
                # unexplained missing-file error during transcription.
                stderr = result.stderr.decode(errors='replace').strip()
                logger.error(
                    f"ffmpeg failed to extract audio from {mkv_file} "
                    f"(exit code {result.returncode}): {stderr}"
                )
        return str(chunk_path)

    def load_reference_chunk(self, srt_file, chunk_idx):
        """
        Load reference subtitles for a specific time chunk with robust encoding handling.

        Args:
            srt_file (str or Path): Path to the SRT file
            chunk_idx (int): Index of the chunk to load

        Returns:
            str: Combined text from the subtitle chunk ('' if the file cannot be read)
        """
        chunk_start = chunk_idx * self.chunk_duration
        chunk_end = chunk_start + self.chunk_duration

        try:
            # Read the file content using our robust reader
            reader = SubtitleReader()
            content = reader.read_srt_file(srt_file)

            # Extract subtitles for the time chunk
            text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)

            return ' '.join(text_lines)

        except Exception as e:
            logger.error(f"Error loading reference chunk from {srt_file}: {e}")
            return ''

    def _season_reference_files(self, season_number):
        """Return the reference .srt files for one season (SxxEyy or NxYY names), sorted by name."""
        reference_dir = self.cache_dir / "data" / self.show_name
        season = int(season_number)
        # Accept zero-padded or plain season numbers. The look-behind stops
        # season 1 from matching inside "11x04" or a resolution like "1920x1080".
        season_pattern = re.compile(
            rf"S0*{season}E\d+|(?<!\d)0*{season}x\d+", re.IGNORECASE
        )
        matches = [f for f in reference_dir.glob("*.srt")
                   if season_pattern.search(f.name)]
        # Sorted so that ties between equally scored files resolve the same way on every run.
        return sorted(matches, key=lambda f: f.name)

    @staticmethod
    def _parse_season_episode(stem):
        """Return (season, episode) parsed from an SxxEyy or NxYY file stem, or None."""
        match = (re.search(r'S(\d+)E(\d+)', stem, re.IGNORECASE)
                 or re.search(r'(?<!\d)(\d+)x(\d+)', stem, re.IGNORECASE))
        if match is None:
            return None
        return int(match.group(1)), int(match.group(2))

    def identify_episode(self, video_file, temp_dir, season_number):
        """
        Identify which episode of a season a video file contains.

        Only the first three chunks of the video are tried; the first chunk
        whose best-scoring reference file exceeds ``min_confidence`` decides.

        Args:
            video_file (str or Path): Video file to identify
            temp_dir: Unused; kept for backward compatibility with callers
            season_number (int): Season whose reference subtitles are compared

        Returns:
            dict or None: ``season``, ``episode``, ``confidence`` and
            ``reference_file`` of the match, or None if nothing matched
        """
        try:
            # Get video duration
            duration = float(subprocess.check_output([
                'ffprobe', '-v', 'error',
                '-show_entries', 'format=duration',
                '-of', 'default=noprint_wrappers=1:nokey=1',
                str(video_file)
            ]).decode())

            total_chunks = int(np.ceil(duration / self.chunk_duration))

            # Check the references before paying for the model load.
            reference_files = self._season_reference_files(season_number)
            if not reference_files:
                logger.error(f"No reference files found for season {season_number}")
                return None

            # Load Whisper model
            model = whisper.load_model("base", device=self.device)

            # Process chunks until match found
            for chunk_idx in range(min(3, total_chunks)):  # Only try first 3 chunks
                start_time = chunk_idx * self.chunk_duration
                audio_path = self.extract_audio_chunk(video_file, start_time)

                # Transcribe chunk
                result = model.transcribe(
                    audio_path,
                    task="transcribe",
                    language="en"
                )

                chunk_text = result["text"]
                best_confidence = 0
                best_match = None

                # Compare with reference chunks
                for ref_file in reference_files:
                    ref_text = self.load_reference_chunk(ref_file, chunk_idx)
                    confidence = self.chunk_score(chunk_text, ref_text)

                    if confidence > best_confidence:
                        best_confidence = confidence
                        best_match = ref_file

                # Judge the best score of this chunk, not the score of whichever
                # reference file happened to be compared last.
                if best_match is not None and best_confidence > self.min_confidence:
                    parsed = self._parse_season_episode(best_match.stem)
                    if parsed:
                        season, episode = parsed
                        return {
                            'season': season,
                            'episode': episode,
                            'confidence': best_confidence,
                            'reference_file': str(best_match),
                        }
                    logger.warning(
                        f"Could not parse season/episode from reference file {best_match}"
                    )

            return None

        finally:
            # Cleanup temp files; a file that cannot be removed must not mask
            # the result (or the original exception).
            for file in self.temp_dir.glob("chunk_*.wav"):
                try:
                    file.unlink()
                except OSError as e:
                    logger.warning(f"Could not remove temporary file {file}: {e}")
|
|
166
|
+
|
|
167
|
+
def detect_file_encoding(file_path):
    """
    Guess the text encoding of a file with chardet.

    Args:
        file_path (str or Path): File to inspect

    Returns:
        str: The encoding chardet reports, or 'utf-8' when chardet reports
        none or anything goes wrong while reading/detecting
    """
    fallback = 'utf-8'
    try:
        with open(file_path, 'rb') as handle:
            detection = chardet.detect(handle.read())
        encoding, confidence = detection['encoding'], detection['confidence']
        logger.debug(f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}")
    except Exception as e:
        # Detection is best-effort: log and let the caller try the fallback.
        logger.warning(f"Error detecting encoding for {file_path}: {e}")
        return fallback
    return encoding or fallback
|
|
189
|
+
|
|
190
|
+
def read_file_with_fallback(file_path, encodings=None):
    """
    Read a file trying multiple encodings in order of preference.

    Args:
        file_path (str or Path): Path to the file
        encodings (list): List of encodings to try, defaults to the detected
            encoding followed by common subtitle encodings

    Returns:
        str: File contents

    Raises:
        ValueError: If file cannot be read with any encoding
    """
    import codecs  # local import: only needed to validate encoding names

    if encodings is None:
        # First try detected encoding, then fallback to common subtitle encodings.
        # cp1252 comes before latin-1 because latin-1 accepts every byte
        # sequence, so nothing listed after it could ever be reached.
        detected = detect_file_encoding(file_path)
        encodings = [detected, 'utf-8', 'cp1252', 'latin-1']

    file_path = Path(file_path)
    errors = []
    tried = set()

    for encoding in encodings:
        # A detector may report an encoding Python has no codec for (or None);
        # record that as a failed attempt instead of crashing with LookupError.
        try:
            canonical = codecs.lookup(encoding).name
        except (LookupError, TypeError) as e:
            errors.append(f"{encoding}: {str(e)}")
            continue

        # Skip aliases of a codec that was already attempted.
        if canonical in tried:
            continue
        tried.add(canonical)

        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError as e:
            errors.append(f"{encoding}: {str(e)}")
            continue

        logger.debug(f"Successfully read {file_path} using {encoding} encoding")
        return content

    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(errors)
    logger.error(error_msg)
    raise ValueError(error_msg)
|
|
225
|
+
|
|
226
|
+
class SubtitleReader:
    """Helper class for reading and parsing subtitle files."""

    @staticmethod
    def parse_timestamp(timestamp):
        """
        Parse SRT timestamp into seconds.

        Args:
            timestamp (str): Timestamp such as ``00:01:02,500``

        Returns:
            float: Offset in seconds

        Raises:
            ValueError: If the timestamp is not ``hours:minutes:seconds``
        """
        hours, minutes, seconds = timestamp.strip().replace(',', '.').split(':')
        return float(hours) * 3600 + float(minutes) * 60 + float(seconds)

    @staticmethod
    def read_srt_file(file_path):
        """
        Read an SRT file and return its contents with robust encoding handling.

        Args:
            file_path (str or Path): Path to the SRT file

        Returns:
            str: Contents of the SRT file
        """
        return read_file_with_fallback(file_path)

    @staticmethod
    def extract_subtitle_chunk(content, start_time, end_time):
        """
        Extract subtitle text for a specific time window.

        A cue belongs to the window when its end timestamp lies within
        ``[start_time, end_time]``.

        Args:
            content (str): Full SRT file content
            start_time (float): Chunk start time in seconds
            end_time (float): Chunk end time in seconds

        Returns:
            list: List of subtitle texts within the time window
        """
        text_lines = []

        # Normalise line endings, then split on any run of blank lines (even
        # ones holding stray whitespace); a plain '\n\n' split silently
        # dropped the cue that followed an extra blank line.
        normalized = content.replace('\r\n', '\n').replace('\r', '\n').strip()

        for block in re.split(r'\n\s*\n', normalized):
            lines = block.strip().split('\n')
            if len(lines) < 3 or '-->' not in lines[1]:
                continue

            try:
                timestamp = lines[1]
                text = ' '.join(lines[2:])

                # Take the first token after the arrow so missing spaces or
                # trailing position settings ("X1:10 ...") do not break parsing.
                end_stamp = timestamp.split('-->')[1].split()[0]
                total_seconds = SubtitleReader.parse_timestamp(end_stamp)

                if start_time <= total_seconds <= end_time:
                    text_lines.append(text)

            except (IndexError, ValueError) as e:
                logger.warning(f"Error parsing subtitle block: {e}")
                continue

        return text_lines
|
{mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/episode_matcher.py
RENAMED
|
@@ -29,6 +29,14 @@ def process_show(season=None, dry_run=False, get_subs=False):
|
|
|
29
29
|
show_name = clean_text(os.path.basename(show_dir))
|
|
30
30
|
matcher = EpisodeMatcher(CACHE_DIR, show_name)
|
|
31
31
|
|
|
32
|
+
# Early check for reference files
|
|
33
|
+
reference_dir = Path(CACHE_DIR) / "data" / show_name
|
|
34
|
+
reference_files = list(reference_dir.glob("*.srt"))
|
|
35
|
+
if not reference_files:
|
|
36
|
+
logger.error(f"No reference subtitle files found in {reference_dir}")
|
|
37
|
+
logger.info("Please download reference subtitles first")
|
|
38
|
+
return
|
|
39
|
+
|
|
32
40
|
season_paths = get_valid_seasons(show_dir)
|
|
33
41
|
if not season_paths:
|
|
34
42
|
logger.warning(f"No seasons with .mkv files found")
|
|
@@ -300,7 +300,7 @@ def extract_srt_text(filepath):
|
|
|
300
300
|
|
|
301
301
|
def extract_season_episode(filename):
|
|
302
302
|
"""
|
|
303
|
-
Extract season and episode numbers from filename.
|
|
303
|
+
Extract season and episode numbers from filename with support for multiple formats.
|
|
304
304
|
|
|
305
305
|
Args:
|
|
306
306
|
filename (str): Filename to parse
|
|
@@ -308,10 +308,20 @@ def extract_season_episode(filename):
|
|
|
308
308
|
Returns:
|
|
309
309
|
tuple: (season_number, episode_number)
|
|
310
310
|
"""
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
311
|
+
# List of patterns to try
|
|
312
|
+
patterns = [
|
|
313
|
+
r'S(\d+)E(\d+)', # S01E01
|
|
314
|
+
r'(\d+)x(\d+)', # 1x01 or 01x01
|
|
315
|
+
r'Season\s*(\d+).*?(\d+)' # Season 1 - 01
|
|
316
|
+
]
|
|
317
|
+
|
|
318
|
+
for pattern in patterns:
|
|
319
|
+
match = re.search(pattern, filename, re.IGNORECASE)
|
|
320
|
+
if match:
|
|
321
|
+
return int(match.group(1)), int(match.group(2))
|
|
322
|
+
|
|
314
323
|
return None, None
|
|
324
|
+
|
|
315
325
|
def process_srt_files(show_dir):
|
|
316
326
|
"""
|
|
317
327
|
Process all SRT files in the given directory and its subdirectories.
|
{mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: mkv-episode-matcher
|
|
3
|
-
Version: 0.3.4
|
|
3
|
+
Version: 0.3.6
|
|
4
4
|
Summary: The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
5
5
|
Home-page: https://github.com/Jsakkos/mkv-episode-matcher
|
|
6
6
|
Author: Jonathan Sakkos
|
|
@@ -51,46 +51,31 @@ Automatically match and rename your MKV TV episodes using The Movie Database (TM
|
|
|
51
51
|
- ✨ **Bulk Processing**: Handle entire seasons at once
|
|
52
52
|
- 🧪 **Dry Run Mode**: Test changes before applying
|
|
53
53
|
|
|
54
|
+
## Prerequisites
|
|
55
|
+
|
|
56
|
+
- Python 3.9 or higher
|
|
57
|
+
- [FFmpeg](https://ffmpeg.org/download.html) installed and available in system PATH
|
|
58
|
+
- [Tesseract OCR](https://github.com/UB-Mannheim/tesseract/wiki) installed (required for image-based subtitle processing)
|
|
59
|
+
- TMDb API key
|
|
60
|
+
- OpenSubtitles account (optional, for subtitle downloads)
|
|
61
|
+
|
|
54
62
|
## Quick Start
|
|
55
63
|
|
|
56
64
|
1. Install the package:
|
|
57
65
|
```bash
|
|
58
66
|
pip install mkv-episode-matcher
|
|
59
67
|
```
|
|
68
|
+
2. Download .srt subtitle files to ~/.mkv-episode-matcher/cache/data/Show Name/
|
|
60
69
|
|
|
61
|
-
|
|
70
|
+
3. Run on your show directory:
|
|
62
71
|
```bash
|
|
63
|
-
mkv-match --show-dir "path/to/your/show"
|
|
72
|
+
mkv-match --show-dir "path/to/your/show"
|
|
64
73
|
```
|
|
65
74
|
|
|
66
|
-
## Requirements
|
|
67
|
-
|
|
68
|
-
- Python 3.8 or higher
|
|
69
|
-
- TMDb API key
|
|
70
|
-
- OpenSubtitles account (optional, for subtitle downloads)
|
|
71
|
-
|
|
72
75
|
## Documentation
|
|
73
76
|
|
|
74
77
|
Full documentation is available at [https://jsakkos.github.io/mkv-episode-matcher/](https://jsakkos.github.io/mkv-episode-matcher/)
|
|
75
78
|
|
|
76
|
-
## Basic Usage
|
|
77
|
-
|
|
78
|
-
```python
|
|
79
|
-
from mkv_episode_matcher import process_show
|
|
80
|
-
|
|
81
|
-
# Process all seasons
|
|
82
|
-
process_show()
|
|
83
|
-
|
|
84
|
-
# Process specific season
|
|
85
|
-
process_show(season=1)
|
|
86
|
-
|
|
87
|
-
# Test run without making changes
|
|
88
|
-
process_show(season=1, dry_run=True)
|
|
89
|
-
|
|
90
|
-
# Process and download subtitles
|
|
91
|
-
process_show(get_subs=True)
|
|
92
|
-
```
|
|
93
|
-
|
|
94
79
|
## Directory Structure
|
|
95
80
|
|
|
96
81
|
MKV Episode Matcher expects your TV shows to be organized as follows:
|
|
@@ -105,6 +90,23 @@ Show Name/
|
|
|
105
90
|
│ └── episode2.mkv
|
|
106
91
|
```
|
|
107
92
|
|
|
93
|
+
## Reference Subtitle File Structure
|
|
94
|
+
|
|
95
|
+
Subtitle files that are not automatically downloaded using the `--get-subs` flag should be named as follows:
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
~/.mkv-episode-matcher/cache/data/Show Name/
|
|
100
|
+
├── Show Name - S01E01.srt
|
|
101
|
+
├── Show Name - S01E02.srt
|
|
102
|
+
└── ...
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
On Windows, the cache directory is located at `C:\Users\{username}\.mkv-episode-matcher\cache\data\`
|
|
106
|
+
|
|
107
|
+
Reference subtitle files should follow this naming pattern:
|
|
108
|
+
`{show_name} - S{season:02d}E{episode:02d}.srt`
|
|
109
|
+
|
|
108
110
|
## Contributing
|
|
109
111
|
|
|
110
112
|
1. Fork the repository
|
{mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/SOURCES.txt
RENAMED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
.coverage
|
|
1
2
|
.gitattributes
|
|
2
3
|
.gitignore
|
|
3
4
|
.gitmodules
|
|
@@ -11,6 +12,7 @@ uv.lock
|
|
|
11
12
|
.github/funding.yml
|
|
12
13
|
.github/workflows/documentation.yml
|
|
13
14
|
.github/workflows/python-publish.yml
|
|
15
|
+
.github/workflows/tests.yml
|
|
14
16
|
.vscode/settings.json
|
|
15
17
|
docs/cli.md
|
|
16
18
|
docs/configuration.md
|
|
@@ -46,5 +48,4 @@ mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py
|
|
|
46
48
|
mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py
|
|
47
49
|
mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py
|
|
48
50
|
tests/__init__.py
|
|
49
|
-
tests/test_improvements.py
|
|
50
51
|
tests/test_main.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = mkv_episode_matcher
|
|
3
|
-
version = 0.3.
|
|
3
|
+
version = 0.3.6
|
|
4
4
|
author = Jonathan Sakkos
|
|
5
5
|
author_email = jonathansakkos@gmail.com
|
|
6
6
|
description = The MKV Episode Matcher is a tool for identifying TV series episodes from MKV files and renaming the files accordingly.
|
|
@@ -13,6 +13,31 @@ from mkv_episode_matcher.utils import (
|
|
|
13
13
|
)
|
|
14
14
|
from mkv_episode_matcher.episode_identification import EpisodeMatcher
|
|
15
15
|
from mkv_episode_matcher.config import get_config, set_config
|
|
16
|
+
from unittest.mock import Mock, patch
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# @pytest.fixture
|
|
20
|
+
# def mock_config():
|
|
21
|
+
# return {
|
|
22
|
+
# "tmdb_api_key": "test_key",
|
|
23
|
+
# "show_dir": "/test/path",
|
|
24
|
+
# "max_threads": 4,
|
|
25
|
+
# "tesseract_path": "/usr/bin/tesseract",
|
|
26
|
+
# }
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.fixture
|
|
30
|
+
def mock_episode_data():
|
|
31
|
+
return {
|
|
32
|
+
"name": "Test Episode",
|
|
33
|
+
"season_number": 1,
|
|
34
|
+
"episode_number": 1,
|
|
35
|
+
"overview": "Test overview",
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
@pytest.fixture
|
|
39
|
+
def mock_seasons():
|
|
40
|
+
return ["/test/path/Season 1"]
|
|
16
41
|
|
|
17
42
|
@pytest.fixture
|
|
18
43
|
def temp_show_dir(tmp_path):
|
|
@@ -101,8 +126,8 @@ class TestEpisodeMatcher:
|
|
|
101
126
|
return EpisodeMatcher(tmp_path, "Test Show")
|
|
102
127
|
|
|
103
128
|
def test_clean_text(self, matcher):
|
|
104
|
-
text = "Test [action]
|
|
105
|
-
assert matcher.clean_text(text) == "test action
|
|
129
|
+
text = "Test [action] T-t-test"
|
|
130
|
+
assert matcher.clean_text(text) == "test action test"
|
|
106
131
|
|
|
107
132
|
def test_chunk_score(self, matcher):
|
|
108
133
|
score = matcher.chunk_score("Test dialogue", "test dialog")
|
|
@@ -116,22 +141,27 @@ class TestEpisodeMatcher:
|
|
|
116
141
|
assert isinstance(chunk, str)
|
|
117
142
|
assert mock_run.called
|
|
118
143
|
|
|
119
|
-
class
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
@patch(
|
|
130
|
-
def
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
144
|
+
class TestEpisodeMatcher:
|
|
145
|
+
def test_extract_season_episode(self):
|
|
146
|
+
from mkv_episode_matcher.utils import extract_season_episode
|
|
147
|
+
|
|
148
|
+
# Test valid filename
|
|
149
|
+
assert extract_season_episode("Show - S01E02.mkv") == (1, 2)
|
|
150
|
+
|
|
151
|
+
# Test invalid filename
|
|
152
|
+
assert extract_season_episode("invalid.mkv") == (None, None)
|
|
153
|
+
|
|
154
|
+
@patch("mkv_episode_matcher.tmdb_client.requests.get")
|
|
155
|
+
def test_fetch_show_id(self, mock_get):
|
|
156
|
+
from mkv_episode_matcher.tmdb_client import fetch_show_id
|
|
157
|
+
|
|
158
|
+
mock_response = Mock()
|
|
159
|
+
mock_response.status_code = 200
|
|
160
|
+
mock_response.json.return_value = {"results": [{"id": 12345}]}
|
|
161
|
+
mock_get.return_value = mock_response
|
|
162
|
+
|
|
163
|
+
assert fetch_show_id("Test Show") == "12345"
|
|
164
|
+
|
|
135
165
|
|
|
136
166
|
if __name__ == '__main__':
|
|
137
167
|
pytest.main(['-v'])
|
|
@@ -24,6 +24,15 @@ wheels = [
|
|
|
24
24
|
{ url = "https://files.pythonhosted.org/packages/12/90/3c9ff0512038035f59d279fddeb79f5f1eccd8859f06d6163c58798b9487/certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8", size = 167321 },
|
|
25
25
|
]
|
|
26
26
|
|
|
27
|
+
[[package]]
|
|
28
|
+
name = "chardet"
|
|
29
|
+
version = "5.2.0"
|
|
30
|
+
source = { registry = "https://pypi.org/simple" }
|
|
31
|
+
sdist = { url = "https://files.pythonhosted.org/packages/f3/0d/f7b6ab21ec75897ed80c17d79b15951a719226b9fababf1e40ea74d69079/chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7", size = 2069618 }
|
|
32
|
+
wheels = [
|
|
33
|
+
{ url = "https://files.pythonhosted.org/packages/38/6f/f5fbc992a329ee4e0f288c1fe0e2ad9485ed064cac731ed2fe47dcc38cbf/chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970", size = 199385 },
|
|
34
|
+
]
|
|
35
|
+
|
|
27
36
|
[[package]]
|
|
28
37
|
name = "charset-normalizer"
|
|
29
38
|
version = "3.4.0"
|
|
@@ -374,7 +383,7 @@ wheels = [
|
|
|
374
383
|
|
|
375
384
|
[[package]]
|
|
376
385
|
name = "mkv-episode-matcher"
|
|
377
|
-
version = "0.3.
|
|
386
|
+
version = "0.3.5.post1.dev0+g1beb2cf.d20250112"
|
|
378
387
|
source = { editable = "." }
|
|
379
388
|
dependencies = [
|
|
380
389
|
{ name = "configparser" },
|
|
@@ -391,6 +400,7 @@ dependencies = [
|
|
|
391
400
|
|
|
392
401
|
[package.dev-dependencies]
|
|
393
402
|
dev = [
|
|
403
|
+
{ name = "chardet" },
|
|
394
404
|
{ name = "pytest" },
|
|
395
405
|
{ name = "pytest-cov" },
|
|
396
406
|
{ name = "ruff" },
|
|
@@ -412,6 +422,7 @@ requires-dist = [
|
|
|
412
422
|
|
|
413
423
|
[package.metadata.requires-dev]
|
|
414
424
|
dev = [
|
|
425
|
+
{ name = "chardet", specifier = ">=5.2.0" },
|
|
415
426
|
{ name = "pytest", specifier = ">=8.3.3" },
|
|
416
427
|
{ name = "pytest-cov", specifier = ">=6.0.0" },
|
|
417
428
|
{ name = "ruff", specifier = ">=0.8.0" },
|
|
@@ -1,150 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import subprocess
|
|
4
|
-
import tempfile
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
import torch
|
|
7
|
-
from rapidfuzz import fuzz
|
|
8
|
-
from loguru import logger
|
|
9
|
-
import whisper
|
|
10
|
-
import numpy as np
|
|
11
|
-
import re
|
|
12
|
-
class EpisodeMatcher:
|
|
13
|
-
def __init__(self, cache_dir, show_name, min_confidence=0.6):
|
|
14
|
-
self.cache_dir = Path(cache_dir)
|
|
15
|
-
self.min_confidence = min_confidence
|
|
16
|
-
self.show_name = show_name
|
|
17
|
-
self.chunk_duration = 300 # 5 minutes
|
|
18
|
-
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
19
|
-
self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
|
|
20
|
-
self.temp_dir.mkdir(exist_ok=True)
|
|
21
|
-
|
|
22
|
-
def clean_text(self, text):
|
|
23
|
-
text = text.lower().strip()
|
|
24
|
-
text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
|
|
25
|
-
text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
|
|
26
|
-
return ' '.join(text.split())
|
|
27
|
-
|
|
28
|
-
def chunk_score(self, whisper_chunk, ref_chunk):
|
|
29
|
-
whisper_clean = self.clean_text(whisper_chunk)
|
|
30
|
-
ref_clean = self.clean_text(ref_chunk)
|
|
31
|
-
return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
|
|
32
|
-
fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0
|
|
33
|
-
|
|
34
|
-
def extract_audio_chunk(self, mkv_file, start_time):
|
|
35
|
-
"""Extract a chunk of audio from MKV file."""
|
|
36
|
-
chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
|
|
37
|
-
if not chunk_path.exists():
|
|
38
|
-
cmd = [
|
|
39
|
-
'ffmpeg',
|
|
40
|
-
'-ss', str(start_time),
|
|
41
|
-
'-t', str(self.chunk_duration),
|
|
42
|
-
'-i', mkv_file,
|
|
43
|
-
'-vn',
|
|
44
|
-
'-acodec', 'pcm_s16le',
|
|
45
|
-
'-ar', '16000',
|
|
46
|
-
'-ac', '1',
|
|
47
|
-
str(chunk_path)
|
|
48
|
-
]
|
|
49
|
-
subprocess.run(cmd, capture_output=True)
|
|
50
|
-
return str(chunk_path)
|
|
51
|
-
|
|
52
|
-
def load_reference_chunk(self, srt_file, chunk_idx):
|
|
53
|
-
"""Load reference subtitles for a specific time chunk."""
|
|
54
|
-
chunk_start = chunk_idx * self.chunk_duration
|
|
55
|
-
chunk_end = chunk_start + self.chunk_duration
|
|
56
|
-
text_lines = []
|
|
57
|
-
|
|
58
|
-
with open(srt_file, 'r', encoding='utf-8') as f:
|
|
59
|
-
content = f.read().strip()
|
|
60
|
-
|
|
61
|
-
for block in content.split('\n\n'):
|
|
62
|
-
lines = block.split('\n')
|
|
63
|
-
if len(lines) < 3 or '-->' not in lines[1]: # Skip malformed blocks
|
|
64
|
-
continue
|
|
65
|
-
|
|
66
|
-
try:
|
|
67
|
-
timestamp = lines[1]
|
|
68
|
-
text = ' '.join(lines[2:])
|
|
69
|
-
|
|
70
|
-
end_time = timestamp.split(' --> ')[1].strip()
|
|
71
|
-
hours, minutes, seconds = map(float, end_time.replace(',','.').split(':'))
|
|
72
|
-
total_seconds = hours * 3600 + minutes * 60 + seconds
|
|
73
|
-
|
|
74
|
-
if chunk_start <= total_seconds <= chunk_end:
|
|
75
|
-
text_lines.append(text)
|
|
76
|
-
|
|
77
|
-
except (IndexError, ValueError):
|
|
78
|
-
continue
|
|
79
|
-
|
|
80
|
-
return ' '.join(text_lines)
|
|
81
|
-
|
|
82
|
-
def identify_episode(self, video_file, temp_dir, season_number):
|
|
83
|
-
try:
|
|
84
|
-
# Get video duration
|
|
85
|
-
duration = float(subprocess.check_output([
|
|
86
|
-
'ffprobe', '-v', 'error',
|
|
87
|
-
'-show_entries', 'format=duration',
|
|
88
|
-
'-of', 'default=noprint_wrappers=1:nokey=1',
|
|
89
|
-
video_file
|
|
90
|
-
]).decode())
|
|
91
|
-
|
|
92
|
-
total_chunks = int(np.ceil(duration / self.chunk_duration))
|
|
93
|
-
|
|
94
|
-
# Load Whisper model
|
|
95
|
-
model = whisper.load_model("base", device=self.device)
|
|
96
|
-
|
|
97
|
-
# Get season-specific reference files
|
|
98
|
-
reference_dir = self.cache_dir / "data" / self.show_name
|
|
99
|
-
season_pattern = f"S{season_number:02d}E"
|
|
100
|
-
reference_files = [
|
|
101
|
-
f for f in reference_dir.glob("*.srt")
|
|
102
|
-
if season_pattern in f.name
|
|
103
|
-
]
|
|
104
|
-
|
|
105
|
-
if not reference_files:
|
|
106
|
-
logger.error(f"No reference files found for season {season_number}")
|
|
107
|
-
return None
|
|
108
|
-
|
|
109
|
-
# Process chunks until match found
|
|
110
|
-
for chunk_idx in range(min(3, total_chunks)): # Only try first 3 chunks
|
|
111
|
-
start_time = chunk_idx * self.chunk_duration
|
|
112
|
-
audio_path = self.extract_audio_chunk(video_file, start_time)
|
|
113
|
-
|
|
114
|
-
# Transcribe chunk
|
|
115
|
-
result = model.transcribe(
|
|
116
|
-
audio_path,
|
|
117
|
-
task="transcribe",
|
|
118
|
-
language="en"
|
|
119
|
-
)
|
|
120
|
-
|
|
121
|
-
chunk_text = result["text"]
|
|
122
|
-
best_confidence = 0
|
|
123
|
-
best_match = None
|
|
124
|
-
|
|
125
|
-
# Compare with reference chunks
|
|
126
|
-
for ref_file in reference_files:
|
|
127
|
-
ref_text = self.load_reference_chunk(ref_file, chunk_idx)
|
|
128
|
-
confidence = self.chunk_score(chunk_text, ref_text)
|
|
129
|
-
|
|
130
|
-
if confidence > best_confidence:
|
|
131
|
-
best_confidence = confidence
|
|
132
|
-
best_match = ref_file
|
|
133
|
-
|
|
134
|
-
if confidence > self.min_confidence:
|
|
135
|
-
season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
|
|
136
|
-
if season_ep:
|
|
137
|
-
season, episode = map(int, season_ep.groups())
|
|
138
|
-
return {
|
|
139
|
-
'season': season,
|
|
140
|
-
'episode': episode,
|
|
141
|
-
'confidence': best_confidence,
|
|
142
|
-
'reference_file': str(best_match),
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
return None
|
|
146
|
-
|
|
147
|
-
finally:
|
|
148
|
-
# Cleanup temp files
|
|
149
|
-
for file in self.temp_dir.glob("chunk_*.wav"):
|
|
150
|
-
file.unlink()
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
from unittest.mock import Mock, patch
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
@pytest.fixture
|
|
7
|
-
def mock_config():
|
|
8
|
-
return {
|
|
9
|
-
"tmdb_api_key": "test_key",
|
|
10
|
-
"show_dir": "/test/path",
|
|
11
|
-
"max_threads": 4,
|
|
12
|
-
"tesseract_path": "/usr/bin/tesseract",
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@pytest.fixture
|
|
17
|
-
def mock_episode_data():
|
|
18
|
-
return {
|
|
19
|
-
"name": "Test Episode",
|
|
20
|
-
"season_number": 1,
|
|
21
|
-
"episode_number": 1,
|
|
22
|
-
"overview": "Test overview",
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class TestEpisodeMatcher:
|
|
27
|
-
def test_extract_season_episode(self):
|
|
28
|
-
from mkv_episode_matcher.episode_matcher import extract_season_episode
|
|
29
|
-
|
|
30
|
-
# Test valid filename
|
|
31
|
-
assert extract_season_episode("Show - S01E02.mkv") == (1, 2)
|
|
32
|
-
|
|
33
|
-
# Test invalid filename
|
|
34
|
-
assert extract_season_episode("invalid.mkv") == (None, None)
|
|
35
|
-
|
|
36
|
-
@patch("mkv_episode_matcher.tmdb_client.requests.get")
|
|
37
|
-
def test_fetch_show_id(self, mock_get):
|
|
38
|
-
from mkv_episode_matcher.tmdb_client import fetch_show_id
|
|
39
|
-
|
|
40
|
-
mock_response = Mock()
|
|
41
|
-
mock_response.status_code = 200
|
|
42
|
-
mock_response.json.return_value = {"results": [{"id": 12345}]}
|
|
43
|
-
mock_get.return_value = mock_response
|
|
44
|
-
|
|
45
|
-
assert fetch_show_id("Test Show") == "12345"
|
|
46
|
-
|
|
47
|
-
@patch("mkv_episode_matcher.utils.OpenSubtitles")
|
|
48
|
-
def test_get_subtitles(self, mock_subtitles):
|
|
49
|
-
from mkv_episode_matcher.utils import get_subtitles
|
|
50
|
-
|
|
51
|
-
# Test subtitle download
|
|
52
|
-
mock_subtitles.return_value.search.return_value.data = [
|
|
53
|
-
{"file_name": "Test.Show.S01E01.srt"}
|
|
54
|
-
]
|
|
55
|
-
|
|
56
|
-
with patch("pathlib.Path.exists", return_value=False):
|
|
57
|
-
get_subtitles(12345, {1})
|
|
58
|
-
|
|
59
|
-
mock_subtitles.return_value.download_and_save.assert_called_once()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/.github/workflows/python-publish.yml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/speech_to_text.py
RENAMED
|
File without changes
|
{mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher/subtitle_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/requires.txt
RENAMED
|
File without changes
|
{mkv_episode_matcher-0.3.4 → mkv_episode_matcher-0.3.6}/mkv_episode_matcher.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|