mkv-episode-matcher 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (24)
  1. mkv_episode_matcher/__init__.py +2 -2
  2. mkv_episode_matcher/__main__.py +13 -28
  3. mkv_episode_matcher/config.py +0 -3
  4. mkv_episode_matcher/episode_identification.py +163 -124
  5. mkv_episode_matcher/episode_matcher.py +19 -39
  6. mkv_episode_matcher/subtitle_utils.py +26 -25
  7. mkv_episode_matcher/utils.py +56 -56
  8. {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.6.0.dist-info}/METADATA +7 -13
  9. mkv_episode_matcher-0.6.0.dist-info/RECORD +14 -0
  10. {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.6.0.dist-info}/WHEEL +1 -1
  11. mkv_episode_matcher/libraries/pgs2srt/.gitignore +0 -2
  12. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/SubZero.py +0 -321
  13. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/dictionaries/data.py +0 -16700
  14. mkv_episode_matcher/libraries/pgs2srt/Libraries/SubZero/post_processing.py +0 -260
  15. mkv_episode_matcher/libraries/pgs2srt/README.md +0 -26
  16. mkv_episode_matcher/libraries/pgs2srt/__init__.py +0 -0
  17. mkv_episode_matcher/libraries/pgs2srt/imagemaker.py +0 -89
  18. mkv_episode_matcher/libraries/pgs2srt/pgs2srt.py +0 -150
  19. mkv_episode_matcher/libraries/pgs2srt/pgsreader.py +0 -225
  20. mkv_episode_matcher/libraries/pgs2srt/requirements.txt +0 -4
  21. mkv_episode_matcher/mkv_to_srt.py +0 -302
  22. mkv_episode_matcher-0.5.0.dist-info/RECORD +0 -25
  23. {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.6.0.dist-info}/entry_points.txt +0 -0
  24. {mkv_episode_matcher-0.5.0.dist-info → mkv_episode_matcher-0.6.0.dist-info}/top_level.txt +0 -0
mkv_episode_matcher/__init__.py
@@ -1,9 +1,9 @@
 """MKV Episode Matcher package."""
-from importlib.metadata import version, PackageNotFoundError
+
+from importlib.metadata import PackageNotFoundError, version
 
 try:
     __version__ = version("mkv-episode-matcher")
 except PackageNotFoundError:
     # package is not installed
     __version__ = "unknown"
-
mkv_episode_matcher/__main__.py
@@ -1,9 +1,9 @@
 # __main__.py
 import argparse
 import os
-import sys
 
 from loguru import logger
+
 from mkv_episode_matcher import __version__
 from mkv_episode_matcher.config import get_config, set_config
 
@@ -34,7 +34,7 @@ if not os.path.exists(log_dir):
 logger.add(
     os.path.join(log_dir, "stdout.log"),
     format="{time} {level} {message}",
-    level="DEBUG",
+    level="INFO",
     rotation="10 MB",
 )
 
@@ -56,7 +56,6 @@ def main():
     --season: The season number to be processed. If not provided, all seasons will be processed.
     --dry-run: A boolean flag indicating whether to perform a dry run (i.e., not rename any files). If not provided, the function will rename files.
     --get-subs: A boolean flag indicating whether to download subtitles for the show. If not provided, the function will not download subtitles.
-    --tesseract-path: The path to the tesseract executable. If not provided, the function will try to get it from the cache or prompt the user to input it.
 
     The function logs its progress to two separate log files: one for standard output and one for errors.
     """
@@ -67,7 +66,7 @@ def main():
         "--version",
         action="version",
         version=f"%(prog)s {__version__}",
-        help="Show the version number and exit"
+        help="Show the version number and exit",
     )
     parser.add_argument("--tmdb-api-key", help="TMDb API key")
     parser.add_argument("--show-dir", help="Main directory of the show")
@@ -92,13 +91,6 @@ def main():
         nargs="?",
         help="Download subtitles for the show (default: None)",
     )
-    parser.add_argument(
-        "--tesseract-path",
-        type=str,
-        default=None,
-        nargs="?",
-        help="Path to the tesseract executable (default: None)",
-    )
     parser.add_argument(
         "--check-gpu",
         type=bool,
@@ -109,6 +101,7 @@ def main():
     args = parser.parse_args()
     if args.check_gpu:
         from mkv_episode_matcher.utils import check_gpu_support
+
         check_gpu_support()
         return
     logger.debug(f"Command-line arguments: {args}")
@@ -118,17 +111,17 @@ def main():
 
     # Get TMDb API key
     tmdb_api_key = args.tmdb_api_key or config.get("tmdb_api_key")
-    if not tmdb_api_key:
-        tmdb_api_key = input("Enter your TMDb API key: ")
-    logger.debug(f"TMDb API Key: {tmdb_api_key}")
-
+
     logger.debug("Getting OpenSubtitles API key")
     open_subtitles_api_key = config.get("open_subtitles_api_key")
     open_subtitles_user_agent = config.get("open_subtitles_user_agent")
     open_subtitles_username = config.get("open_subtitles_username")
     open_subtitles_password = config.get("open_subtitles_password")
-
+
     if args.get_subs:
+        if not tmdb_api_key:
+            tmdb_api_key = input("Enter your TMDb API key: ")
+        logger.debug(f"TMDb API Key: {tmdb_api_key}")
         if not open_subtitles_api_key:
             open_subtitles_api_key = input("Enter your OpenSubtitles API key: ")
         if not open_subtitles_user_agent:
@@ -137,24 +130,17 @@ def main():
             open_subtitles_username = input("Enter your OpenSubtitles Username: ")
         if not open_subtitles_password:
             open_subtitles_password = input("Enter your OpenSubtitles Password: ")
-
-    # Use config for show directory and tesseract path
+
+    # Use config for show directory
     show_dir = args.show_dir or config.get("show_dir")
     if not show_dir:
         show_dir = input("Enter the main directory of the show:")
         logger.info(f"Show Directory: {show_dir}")
     if not show_dir:
         show_dir = os.getcwd()
-
-    if not args.tesseract_path:
-        tesseract_path = config.get("tesseract_path")
-        if not tesseract_path:
-            tesseract_path = input(r"Enter the path to the tesseract executable: ['C:\Program Files\Tesseract-OCR\tesseract.exe']")
-    else:
-        tesseract_path = args.tesseract_path
-    logger.debug(f"Teesseract Path: {tesseract_path}")
+
     logger.debug(f"Show Directory: {show_dir}")
-
+
     # Set the configuration
     set_config(
         tmdb_api_key,
@@ -164,7 +150,6 @@ def main():
         open_subtitles_password,
         show_dir,
         CONFIG_FILE,
-        tesseract_path=tesseract_path,
     )
     logger.info("Configuration set")
 
mkv_episode_matcher/config.py
@@ -27,7 +27,6 @@ def set_config(
     open_subtitles_password,
     show_dir,
     file,
-    tesseract_path=None,
 ):
     """
     Sets the configuration values and writes them to a file.
@@ -40,7 +39,6 @@ def set_config(
         open_subtitles_password (str): The password for OpenSubtitles.
         show_dir (str): The directory where the TV show episodes are located.
         file (str): The path to the configuration file.
-        tesseract_path (str, optional): The path to the Tesseract OCR executable.
 
     Returns:
         None
@@ -54,7 +52,6 @@ def set_config(
         "open_subtitles_user_agent": str(open_subtitles_user_agent),
         "open_subtitles_username": str(open_subtitles_username),
         "open_subtitles_password": str(open_subtitles_password),
-        "tesseract_path": str(tesseract_path),
     }
     logger.info(
         f"Setting config with API:{tmdb_api_key}, show_dir: {show_dir}, and max_threads: {MAX_THREADS}"
mkv_episode_matcher/episode_identification.py
@@ -1,17 +1,15 @@
-import json
-import os
+import re
 import subprocess
 import tempfile
 from pathlib import Path
+
+import chardet
+import numpy as np
 import torch
-from rapidfuzz import fuzz
-from loguru import logger
 import whisper
-import numpy as np
-import re
-from pathlib import Path
-import chardet
 from loguru import logger
+from rapidfuzz import fuzz
+
 
 class EpisodeMatcher:
     def __init__(self, cache_dir, show_name, min_confidence=0.6):
@@ -22,35 +20,43 @@ class EpisodeMatcher:
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.temp_dir = Path(tempfile.gettempdir()) / "whisper_chunks"
         self.temp_dir.mkdir(exist_ok=True)
-
+
     def clean_text(self, text):
         text = text.lower().strip()
-        text = re.sub(r'\[.*?\]|\<.*?\>', '', text)
-        text = re.sub(r'([A-Za-z])-\1+', r'\1', text)
-        return ' '.join(text.split())
+        text = re.sub(r"\[.*?\]|\<.*?\>", "", text)
+        text = re.sub(r"([A-Za-z])-\1+", r"\1", text)
+        return " ".join(text.split())
 
     def chunk_score(self, whisper_chunk, ref_chunk):
         whisper_clean = self.clean_text(whisper_chunk)
         ref_clean = self.clean_text(ref_chunk)
-        return (fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7 +
-                fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3) / 100.0
+        return (
+            fuzz.token_sort_ratio(whisper_clean, ref_clean) * 0.7
+            + fuzz.partial_ratio(whisper_clean, ref_clean) * 0.3
+        ) / 100.0
 
     def extract_audio_chunk(self, mkv_file, start_time):
         """Extract a chunk of audio from MKV file."""
         chunk_path = self.temp_dir / f"chunk_{start_time}.wav"
         if not chunk_path.exists():
             cmd = [
-                'ffmpeg',
-                '-ss', str(start_time),
-                '-t', str(self.chunk_duration),
-                '-i', mkv_file,
-                '-vn',  # Disable video
-                '-sn',  # Disable subtitles
-                '-dn',  # Disable data streams
-                '-acodec', 'pcm_s16le',
-                '-ar', '16000',
-                '-ac', '1',
-                str(chunk_path)
+                "ffmpeg",
+                "-ss",
+                str(start_time),
+                "-t",
+                str(self.chunk_duration),
+                "-i",
+                mkv_file,
+                "-vn",  # Disable video
+                "-sn",  # Disable subtitles
+                "-dn",  # Disable data streams
+                "-acodec",
+                "pcm_s16le",
+                "-ar",
+                "16000",
+                "-ac",
+                "1",
+                str(chunk_path),
             ]
             subprocess.run(cmd, capture_output=True)
         return str(chunk_path)
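
For context on chunk_score in the hunk above: it blends two rapidfuzz similarity measures into one 0-1 confidence value. A minimal standalone sketch of the same weighting (the sample strings are hypothetical):

    from rapidfuzz import fuzz

    def blended_score(whisper_text: str, ref_text: str) -> float:
        # Same weighting as chunk_score: word-order-insensitive similarity (token_sort_ratio)
        # carries most of the weight, substring similarity (partial_ratio) refines it.
        # Both rapidfuzz ratios return 0-100, so dividing by 100 yields a 0-1 score.
        return (
            fuzz.token_sort_ratio(whisper_text, ref_text) * 0.7
            + fuzz.partial_ratio(whisper_text, ref_text) * 0.3
        ) / 100.0

    # Hypothetical noisy transcription vs. a cleaned reference subtitle line.
    score = blended_score("i cant believe you did that", "i can't believe you did that")
    print(f"{score:.2f}")  # closer to 1.0 means a closer match

In the package itself both strings are first normalized by clean_text before being scored.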
@@ -58,34 +64,37 @@ class EpisodeMatcher:
     def load_reference_chunk(self, srt_file, chunk_idx):
         """
         Load reference subtitles for a specific time chunk with robust encoding handling.
-
+
         Args:
             srt_file (str or Path): Path to the SRT file
             chunk_idx (int): Index of the chunk to load
-
+
         Returns:
             str: Combined text from the subtitle chunk
         """
         chunk_start = chunk_idx * self.chunk_duration
         chunk_end = chunk_start + self.chunk_duration
-
+
         try:
             # Read the file content using our robust reader
             reader = SubtitleReader()
             content = reader.read_srt_file(srt_file)
-
+
             # Extract subtitles for the time chunk
             text_lines = reader.extract_subtitle_chunk(content, chunk_start, chunk_end)
-
-            return ' '.join(text_lines)
-
+
+            return " ".join(text_lines)
+
         except Exception as e:
             logger.error(f"Error loading reference chunk from {srt_file}: {e}")
-            return ''
-    def _try_match_with_model(self, video_file, model_name, max_duration, reference_files):
+            return ""
+
+    def _try_match_with_model(
+        self, video_file, model_name, max_duration, reference_files
+    ):
         """
         Attempt to match using specified model, checking multiple 30-second chunks up to max_duration.
-
+
         Args:
             video_file: Path to the video file
             model_name: Name of the Whisper model to use
@@ -94,49 +103,47 @@ class EpisodeMatcher:
         """
         # Use cached model
         model = get_whisper_model(model_name, self.device)
-
+
         # Calculate number of chunks to check (30 seconds each)
         num_chunks = max_duration // self.chunk_duration
-
+
         for chunk_idx in range(num_chunks):
             start_time = chunk_idx * self.chunk_duration
             logger.debug(f"Trying {model_name} model at {start_time} seconds")
-
+
             audio_path = self.extract_audio_chunk(video_file, start_time)
-
-            result = model.transcribe(
-                audio_path,
-                task="transcribe",
-                language="en"
-            )
-
+
+            result = model.transcribe(audio_path, task="transcribe", language="en")
+
             chunk_text = result["text"]
             best_confidence = 0
             best_match = None
-
+
             # Compare with reference chunks
             for ref_file in reference_files:
                 ref_text = self.load_reference_chunk(ref_file, chunk_idx)
                 confidence = self.chunk_score(chunk_text, ref_text)
-
+
                 if confidence > best_confidence:
                     best_confidence = confidence
                     best_match = ref_file
-
+
                 if confidence > self.min_confidence:
-                    season_ep = re.search(r'S(\d+)E(\d+)', best_match.stem)
+                    season_ep = re.search(r"S(\d+)E(\d+)", best_match.stem)
                     if season_ep:
                         season, episode = map(int, season_ep.groups())
                         return {
-                            'season': season,
-                            'episode': episode,
-                            'confidence': best_confidence,
-                            'reference_file': str(best_match),
-                            'matched_at': start_time
+                            "season": season,
+                            "episode": episode,
+                            "confidence": best_confidence,
+                            "reference_file": str(best_match),
+                            "matched_at": start_time,
                         }
-
-            logger.debug(f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})")
-
+
+            logger.info(
+                f"No match found at {start_time} seconds (best confidence: {best_confidence:.2f})"
+            )
+
         return None
 
     def identify_episode(self, video_file, temp_dir, season_number):
@@ -150,44 +157,67 @@ class EpisodeMatcher:
                 f"{season_number:02d}x",
                 f"{season_number}x",
             ]
-
+
             reference_files = []
-            for pattern in patterns:
-                files = [f for f in reference_dir.glob("*.srt")
-                         if any(re.search(f"{p}\\d+", f.name, re.IGNORECASE)
-                                for p in patterns)]
+            # TODO Figure our why patterns is not being used
+            for _pattern in patterns:
+                files = [
+                    f
+                    for f in reference_dir.glob("*.srt")
+                    if any(
+                        re.search(f"{p}\\d+", f.name, re.IGNORECASE) for p in patterns
+                    )
+                ]
                 reference_files.extend(files)
-
+
             reference_files = list(dict.fromkeys(reference_files))
-
+
            if not reference_files:
                 logger.error(f"No reference files found for season {season_number}")
                 return None
+            duration = float(
+                subprocess.check_output([
+                    "ffprobe",
+                    "-v",
+                    "error",
+                    "-show_entries",
+                    "format=duration",
+                    "-of",
+                    "default=noprint_wrappers=1:nokey=1",
+                    video_file,
+                ]).decode()
+            )
 
-            # Try with tiny model first (fastest) - check first 2 minutes
-            logger.info("Attempting match with tiny model (first 2 minutes)...")
-            match = self._try_match_with_model(video_file, "tiny", 120, reference_files)
-            if match and match['confidence'] > 0.65: # Slightly lower threshold for tiny
-                logger.info(f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
-                return match
-
-            # If unsuccessful with tiny, try base model on first 3 minutes
-            logger.info("Tiny model match failed, trying base model (first 3 minutes)...")
-            match = self._try_match_with_model(video_file, "base", 180, reference_files)
-            if match and match['confidence'] > self.min_confidence:
-                logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
+            duration = int(np.ceil(duration))
+            # Try with tiny model first (fastest)
+            logger.info("Attempting match with tiny model...")
+            match = self._try_match_with_model(
+                video_file, "tiny", duration, reference_files
+            )
+            if (
+                match and match["confidence"] > 0.65
+            ):  # Slightly lower threshold for tiny
+                logger.info(
+                    f"Successfully matched with tiny model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                )
                 return match
-
-            # If still no match, try base model on up to 10 minutes
-            logger.info("No match in first 3 minutes, extending base model search to 10 minutes...")
-            match = self._try_match_with_model(video_file, "base", 600, reference_files)
+
+            # If no match, try base model
+            logger.info(
+                "No match in first 3 minutes, extending base model search to 10 minutes..."
+            )
+            match = self._try_match_with_model(
+                video_file, "base", duration, reference_files
+            )
             if match:
-                logger.info(f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})")
+                logger.info(
+                    f"Successfully matched with base model at {match['matched_at']}s (confidence: {match['confidence']:.2f})"
+                )
                 return match
-
+
             logger.info("Speech recognition match failed")
             return None
-
+
         finally:
             # Cleanup temp files
             for file in self.temp_dir.glob("chunk_*.wav"):
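
For reference, the duration lookup added in the hunk above is a plain ffprobe call followed by a ceiling to whole seconds; a minimal sketch of the same probe outside the class (the file name is hypothetical):

    import subprocess

    import numpy as np

    def probe_duration_seconds(video_file: str) -> int:
        # Same ffprobe flags as in the diff: print only the container duration, suppress other output.
        out = subprocess.check_output([
            "ffprobe",
            "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            video_file,
        ])
        # Round up to whole seconds, as identify_episode now does before chunking.
        return int(np.ceil(float(out.decode())))

    # Hypothetical usage:
    # print(probe_duration_seconds("Some.Show.S01E05.mkv"))

The resulting value is passed as max_duration to _try_match_with_model, so both the tiny and base passes can scan the whole file in 30-second chunks.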
@@ -196,134 +226,143 @@ class EpisodeMatcher:
                 except Exception as e:
                     logger.warning(f"Failed to delete temp file {file}: {e}")
 
+
 def detect_file_encoding(file_path):
     """
     Detect the encoding of a file using chardet.
-
+
     Args:
         file_path (str or Path): Path to the file
-
+
     Returns:
         str: Detected encoding, defaults to 'utf-8' if detection fails
     """
     try:
-        with open(file_path, 'rb') as f:
+        with open(file_path, "rb") as f:
             raw_data = f.read()
         result = chardet.detect(raw_data)
-        encoding = result['encoding']
-        confidence = result['confidence']
-
-        logger.debug(f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}")
-        return encoding if encoding else 'utf-8'
+        encoding = result["encoding"]
+        confidence = result["confidence"]
+
+        logger.debug(
+            f"Detected encoding {encoding} with {confidence:.2%} confidence for {file_path}"
+        )
+        return encoding if encoding else "utf-8"
     except Exception as e:
         logger.warning(f"Error detecting encoding for {file_path}: {e}")
-        return 'utf-8'
+        return "utf-8"
+
 
 def read_file_with_fallback(file_path, encodings=None):
     """
     Read a file trying multiple encodings in order of preference.
-
+
     Args:
         file_path (str or Path): Path to the file
         encodings (list): List of encodings to try, defaults to common subtitle encodings
-
+
     Returns:
         str: File contents
-
+
     Raises:
         ValueError: If file cannot be read with any encoding
     """
     if encodings is None:
         # First try detected encoding, then fallback to common subtitle encodings
         detected = detect_file_encoding(file_path)
-        encodings = [detected, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-
+        encodings = [detected, "utf-8", "latin-1", "cp1252", "iso-8859-1"]
+
     file_path = Path(file_path)
     errors = []
-
+
     for encoding in encodings:
         try:
-            with open(file_path, 'r', encoding=encoding) as f:
+            with open(file_path, encoding=encoding) as f:
                 content = f.read()
                 logger.debug(f"Successfully read {file_path} using {encoding} encoding")
                 return content
         except UnicodeDecodeError as e:
            errors.append(f"{encoding}: {str(e)}")
            continue
-
-    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(errors)
+
+    error_msg = f"Failed to read {file_path} with any encoding. Errors:\n" + "\n".join(
+        errors
+    )
     logger.error(error_msg)
     raise ValueError(error_msg)
 
+
 class SubtitleReader:
     """Helper class for reading and parsing subtitle files."""
-
+
     @staticmethod
     def parse_timestamp(timestamp):
         """Parse SRT timestamp into seconds."""
-        hours, minutes, seconds = timestamp.replace(',', '.').split(':')
+        hours, minutes, seconds = timestamp.replace(",", ".").split(":")
         return float(hours) * 3600 + float(minutes) * 60 + float(seconds)
-
+
     @staticmethod
     def read_srt_file(file_path):
         """
         Read an SRT file and return its contents with robust encoding handling.
-
+
         Args:
             file_path (str or Path): Path to the SRT file
-
+
         Returns:
             str: Contents of the SRT file
         """
         return read_file_with_fallback(file_path)
-
+
     @staticmethod
     def extract_subtitle_chunk(content, start_time, end_time):
         """
         Extract subtitle text for a specific time window.
-
+
         Args:
             content (str): Full SRT file content
             start_time (float): Chunk start time in seconds
             end_time (float): Chunk end time in seconds
-
+
         Returns:
             list: List of subtitle texts within the time window
         """
         text_lines = []
-
-        for block in content.strip().split('\n\n'):
-            lines = block.split('\n')
-            if len(lines) < 3 or '-->' not in lines[1]:
+
+        for block in content.strip().split("\n\n"):
+            lines = block.split("\n")
+            if len(lines) < 3 or "-->" not in lines[1]:
                 continue
-
+
             try:
                 timestamp = lines[1]
-                text = ' '.join(lines[2:])
-
-                end_stamp = timestamp.split(' --> ')[1].strip()
+                text = " ".join(lines[2:])
+
+                end_stamp = timestamp.split(" --> ")[1].strip()
                 total_seconds = SubtitleReader.parse_timestamp(end_stamp)
-
+
                 if start_time <= total_seconds <= end_time:
                     text_lines.append(text)
-
+
             except (IndexError, ValueError) as e:
                 logger.warning(f"Error parsing subtitle block: {e}")
                 continue
-
+
         return text_lines
-
+
+
 _whisper_models = {}
 
+
 def get_whisper_model(model_name="tiny", device=None):
     """Cache whisper models to avoid reloading."""
     global _whisper_models
     if device is None:
         device = "cuda" if torch.cuda.is_available() else "cpu"
-
+
     key = f"{model_name}_{device}"
     if key not in _whisper_models:
         _whisper_models[key] = whisper.load_model(model_name, device=device)
         logger.info(f"Loaded {model_name} model on {device}")
-
-    return _whisper_models[key]
+
+    return _whisper_models[key]
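
As a usage note on the module-level cache above: repeated calls with the same model name and device return the already-loaded Whisper model instead of reloading it. A small sketch (the chunk path is hypothetical):

    from mkv_episode_matcher.episode_identification import get_whisper_model

    # First call loads the model (CUDA if available, otherwise CPU) and caches it.
    model = get_whisper_model("tiny")
    result = model.transcribe("chunk_0.wav", task="transcribe", language="en")
    print(result["text"])

    # A second call with the same name/device resolves to the same cached instance.
    assert get_whisper_model("tiny") is model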