GameSentenceMiner 2.8.25__py3-none-any.whl → 2.8.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
GameSentenceMiner/anki.py CHANGED
@@ -8,7 +8,7 @@ from datetime import datetime, timedelta
  from requests import post

  from GameSentenceMiner import obs, util, notification, ffmpeg
- from GameSentenceMiner.ai.ai_prompting import GeminiAI, get_ai_prompt_result
+ from GameSentenceMiner.ai.ai_prompting import get_ai_prompt_result
  from GameSentenceMiner.configuration import *
  from GameSentenceMiner.configuration import get_config
  from GameSentenceMiner.model import AnkiCard
@@ -28,7 +28,7 @@ card_queue = []


  def update_anki_card(last_note: AnkiCard, note=None, audio_path='', video_path='', tango='', reuse_audio=False,
-                      should_update_audio=True, ss_time=0, game_line=None, selected_lines=None):
+                      should_update_audio=True, ss_time=0, game_line=None, selected_lines=None, prev_ss_timing=0):
      global audio_in_anki, screenshot_in_anki, prev_screenshot_in_anki
      update_audio = should_update_audio and (get_config().anki.sentence_audio_field and not
                     last_note.get_field(get_config().anki.sentence_audio_field) or get_config().anki.overwrite_audio)
@@ -45,7 +45,11 @@ def update_anki_card(last_note: AnkiCard, note=None, audio_path='', video_path='
      if get_config().paths.remove_screenshot:
          os.remove(screenshot)
      if get_config().anki.previous_image_field:
-         prev_screenshot = ffmpeg.get_screenshot(video_path, ffmpeg.get_screenshot_time(video_path, selected_lines[0].prev if selected_lines else game_line.prev))
+         try:
+             prev_screenshot = ffmpeg.get_screenshot(video_path, prev_ss_timing)
+         except Exception as e:
+             logger.error(f"Error getting previous screenshot based on VAD, Falling back to previous logic: {e}")
+             prev_screenshot = ffmpeg.get_screenshot(video_path, ffmpeg.get_screenshot_time(video_path, selected_lines[0].prev if selected_lines else game_line.prev))
          prev_screenshot_in_anki = store_media_file(prev_screenshot)
          if get_config().paths.remove_screenshot:
              os.remove(prev_screenshot)
@@ -50,7 +50,7 @@ def get_screenshot_for_line(video_file, game_line):
      return get_screenshot(video_file, get_screenshot_time(video_file, game_line))


- def get_screenshot_time(video_path, game_line, default_beginning=False, vad_beginning=None, vad_end=None, doing_multi_line=False):
+ def get_screenshot_time(video_path, game_line, default_beginning=False, vad_result=None, doing_multi_line=False):
      if game_line:
          line_time = game_line.time
      else:
@@ -68,19 +68,19 @@ def get_screenshot_time(video_path, game_line, default_beginning=False, vad_begi
      screenshot_offset = get_config().screenshot.seconds_after_line

      # Calculate screenshot time from the beginning by adding the offset
-     if vad_beginning and vad_end and not doing_multi_line:
+     if vad_result and vad_result.success and not doing_multi_line:
          logger.debug("Using VAD to determine screenshot time")
-         screenshot_time_from_beginning = line_timestamp_in_video + vad_end - screenshot_offset
+         screenshot_time_from_beginning = line_timestamp_in_video + vad_result.end - 0.1
      elif get_config().screenshot.screenshot_timing_setting == "beginning":
          logger.debug("Using beginning of line for screenshot")
          screenshot_time_from_beginning = line_timestamp_in_video + screenshot_offset
      elif get_config().screenshot.screenshot_timing_setting == "middle":
          if game_line.next:
              logger.debug("Finding time between lines for screenshot")
-             screenshot_time_from_beginning = line_timestamp_in_video + ((game_line.next.time - game_line.time).total_seconds() / 2)
+             screenshot_time_from_beginning = line_timestamp_in_video + ((game_line.next.time - game_line.time).total_seconds() / 2) + screenshot_offset
          else:
              logger.debug("Using end of line for screenshot")
-             screenshot_time_from_beginning = file_length - screenshot_offset
+             screenshot_time_from_beginning = file_length - abs(screenshot_offset)
      elif get_config().screenshot.screenshot_timing_setting == "end":
          logger.debug("Using end of line for screenshot")
          if game_line.next:
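
For reference, the arithmetic of the new VAD branch above, with made-up numbers (only the vad_result.end - 0.1 rule comes from the hunk):

# Hypothetical values illustrating the VAD branch of get_screenshot_time.
line_timestamp_in_video = 12.0   # seconds into the replay where the mined line starts
vad_end = 3.2                    # end of detected speech relative to that point (vad_result.end)

# The screenshot is now taken 0.1 s before detected speech ends,
# rather than at line start plus seconds_after_line.
screenshot_time_from_beginning = line_timestamp_in_video + vad_end - 0.1
print(screenshot_time_from_beginning)  # 15.1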
GameSentenceMiner/gsm.py CHANGED
@@ -1,5 +1,7 @@
  import asyncio

+ from GameSentenceMiner.vad.result import VADResult
+
  try:
      import os.path
      import signal
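
The imported VADResult class is not shown in this diff; judging from how it is constructed (VADResult(False, 0, 0)) and consumed (.success, .end) in the hunks below, it is presumably shaped roughly like this sketch (an assumption, not the packaged implementation):

from dataclasses import dataclass

@dataclass
class VADResult:
    success: bool   # whether voice activity was detected
    start: float    # seconds from the start of the trimmed audio to the first speech
    end: float      # seconds from the start of the trimmed audio to the last speech

no_speech = VADResult(False, 0, 0)   # fallback used when audio processing is skipped
print(no_speech.success)             # False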
@@ -134,32 +136,33 @@ class VideoToAudioHandler(FileSystemEventHandler):

              if get_config().anki.sentence_audio_field and get_config().audio.enabled:
                  logger.debug("Attempting to get audio from video")
-                 final_audio_output, should_update_audio, vad_trimmed_audio, vad_beginning, vad_end = VideoToAudioHandler.get_audio(
+                 final_audio_output, vad_result, vad_trimmed_audio = VideoToAudioHandler.get_audio(
                      start_line,
                      line_cutoff,
                      video_path,
                      anki_card_creation_time)
              else:
                  final_audio_output = ""
-                 should_update_audio = False
+                 vad_result = VADResult(False, 0, 0)
                  vad_trimmed_audio = ""
-                 vad_beginning = 0
-                 vad_end = 0
                  if not get_config().audio.enabled:
                      logger.info("Audio is disabled in config, skipping audio processing!")
                  elif not get_config().anki.sentence_audio_field:
                      logger.info("No SentenceAudio Field in config, skipping audio processing!")

-             ss_timing = ffmpeg.get_screenshot_time(video_path, mined_line, vad_beginning=vad_beginning, vad_end=vad_end, doing_multi_line=bool(selected_lines))
+             ss_timing = ffmpeg.get_screenshot_time(video_path, mined_line, vad_result=vad_result, doing_multi_line=bool(selected_lines))
+             if get_config().anki.previous_image_field:
+                 prev_ss_timing = ffmpeg.get_screenshot_time(video_path, mined_line.prev, vad_result=VideoToAudioHandler.get_audio(mined_line.prev, mined_line.time, video_path, anki_card_creation_time=anki_card_creation_time, timing_only=True) ,doing_multi_line=bool(selected_lines))

              if get_config().anki.update_anki and last_note:
                  anki.update_anki_card(last_note, note, audio_path=final_audio_output, video_path=video_path,
                                        tango=tango,
-                                       should_update_audio=should_update_audio,
+                                       should_update_audio=vad_result.success,
                                        ss_time=ss_timing,
                                        game_line=start_line,
-                                       selected_lines=selected_lines)
-             elif get_config().features.notify_on_update and should_update_audio:
+                                       selected_lines=selected_lines,
+                                       prev_ss_timing=prev_ss_timing)
+             elif get_config().features.notify_on_update and vad_result.success:
                  notification.send_audio_generated_notification(vad_trimmed_audio)
          except Exception as e:
              logger.error(f"Failed Processing and/or adding to Anki: Reason {e}")
@@ -173,7 +176,7 @@ class VideoToAudioHandler(FileSystemEventHandler):


      @staticmethod
-     def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False):
+     def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False, timing_only=False):
          trimmed_audio = get_audio_and_trim(video_path, game_line, next_line_time, anki_card_creation_time)
          if temporary:
              return trimmed_audio
@@ -181,23 +184,23 @@ class VideoToAudioHandler(FileSystemEventHandler):
              f"{os.path.abspath(configuration.get_temporary_directory())}/{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}")
          final_audio_output = make_unique_file_name(os.path.join(get_config().paths.audio_destination,
                                                                  f"{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}"))
-         should_update_audio = True
-         vad_beginning, vad_end = 0, 0
+         result = VADResult(False, 0, 0)
          if get_config().vad.do_vad_postprocessing:
-             should_update_audio, vad_beginning, vad_end = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio)
-             if not should_update_audio:
-                 should_update_audio, vad_beginning, vad_end = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio,
+             result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio)
+             if not result.success:
+                 result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio,
                                             vad_trimmed_audio)
-             if not should_update_audio and get_config().vad.add_audio_on_no_results:
+             if not result.success and get_config().vad.add_audio_on_no_results:
                  logger.info("No voice activity detected, using full audio.")
                  vad_trimmed_audio = trimmed_audio
-                 should_update_audio = True
+         if timing_only:
+             return result
          if get_config().audio.ffmpeg_reencode_options and os.path.exists(vad_trimmed_audio):
              ffmpeg.reencode_file_with_user_config(vad_trimmed_audio, final_audio_output,
                                                    get_config().audio.ffmpeg_reencode_options)
          elif os.path.exists(vad_trimmed_audio):
              shutil.move(vad_trimmed_audio, final_audio_output)
-         return final_audio_output, should_update_audio, vad_trimmed_audio, vad_beginning, vad_end
+         return final_audio_output, result, vad_trimmed_audio


  def do_vad_processing(model, trimmed_audio, vad_trimmed_audio, second_pass=False):
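
A self-contained sketch of the refactored get_audio() control flow: VAD runs with one retry, and callers that only need timings (the prev_ss_timing lookup above) return early with just the VADResult. The run_vad callback and return values here are illustrative stand-ins, not the package's API:

from collections import namedtuple

VADResult = namedtuple("VADResult", "success start end")

def get_audio_sketch(trimmed_audio, run_vad, timing_only=False):
    result = run_vad(trimmed_audio)          # stand-in for do_vad_processing(...)
    if not result.success:
        result = run_vad(trimmed_audio)      # second pass, as in the hunk above
    if timing_only:
        return result                        # timing-only callers skip re-encoding entirely
    # ...the real method re-encodes or moves the trimmed audio here...
    return "final_audio.opus", result, trimmed_audio

print(get_audio_sketch("line.wav", lambda _: VADResult(True, 0.2, 2.8), timing_only=True))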
@@ -207,7 +207,6 @@ rectangles = None

  def do_second_ocr(ocr1_text, rectangle_index, time, img):
      global twopassocr, ocr2, last_ocr1_results, last_ocr2_results
-     last_result = ([], -1)
      try:
          orig_text, text = run.process_and_write_results(img, None, None, None, None,
                                                          engine=ocr2)
@@ -218,6 +217,7 @@ def do_second_ocr(ocr1_text, rectangle_index, time, img):
          img.save(os.path.join(get_temporary_directory(), "last_successful_ocr.png"))
          last_ocr2_results[rectangle_index] = text
          send_result(text, time)
+         img.close()
      except json.JSONDecodeError:
          print("Invalid JSON received.")
      except Exception as e:
@@ -4,16 +4,26 @@ import argparse
  import textwrap
  import urllib.request

+ def str2bool(value):
+     if value.lower() == 'true':
+         return True
+     elif value.lower() == 'false':
+         return False
+     else:
+         raise argparse.ArgumentTypeError('Boolean value expected.')
+
  parser = argparse.ArgumentParser(prog='owocr', description=textwrap.dedent('''\
      Runs OCR in the background.
      It can read images copied to the system clipboard or placed in a directory, images sent via a websocket or a Unix domain socket, or directly capture a screen (or a portion of it) or a window.
-     Recognized texts can be either saved to system clipboard, appended to a text file or sent via a websocket.
+     Recognized text can be either saved to system clipboard, appended to a text file or sent via a websocket.
  '''))

  parser.add_argument('-r', '--read_from', type=str, default=argparse.SUPPRESS,
-                     help='Specifies where to read input images from. Can be either "clipboard", "websocket", "unixsocket" (on macOS/Linux), "screencapture", or a path to a directory.')
+                     help='Where to read input images from. Can be either "clipboard", "websocket", "unixsocket" (on macOS/Linux), "screencapture", or a path to a directory.')
+ parser.add_argument('-rs', '--read_from_secondary', type=str, default=argparse.SUPPRESS,
+                     help="Optional secondary source to read input images from. Same options as read_from, but they can't both be directory paths.")
  parser.add_argument('-w', '--write_to', type=str, default=argparse.SUPPRESS,
-                     help='Specifies where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.')
+                     help='Where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.')
  parser.add_argument('-e', '--engine', type=str, default=argparse.SUPPRESS,
                      help='OCR engine to use. Available: "mangaocr", "glens", "glensweb", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".')
  parser.add_argument('-p', '--pause_at_startup', action='store_true', default=argparse.SUPPRESS,
@@ -23,21 +33,21 @@ parser.add_argument('-i', '--ignore_flag', action='store_true', default=argparse
  parser.add_argument('-d', '--delete_images', action='store_true', default=argparse.SUPPRESS,
                      help='Delete image files after processing when reading from a directory.')
  parser.add_argument('-n', '--notifications', action='store_true', default=argparse.SUPPRESS,
-                     help='Show an operating system notification with the detected text.')
+                     help='Show an operating system notification with the detected text. Will be ignored when reading with screen capture, unless screen_capture_combo is set.')
  parser.add_argument('-a', '--auto_pause', type=float, default=argparse.SUPPRESS,
-                     help='Automatically pause the program after the specified amount of seconds since the last successful text recognition. Will be ignored when reading with screen capture. 0 to disable.')
+                     help='Automatically pause the program after the specified amount of seconds since the last successful text recognition. Will be ignored when reading with screen capture, unless screen_capture_combo is set. 0 to disable.')
  parser.add_argument('-cp', '--combo_pause', type=str, default=argparse.SUPPRESS,
-                     help='Specifies a combo to wait on for pausing the program. As an example: "<ctrl>+<shift>+p". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
+                     help='Combo to wait on for pausing the program. As an example: "<ctrl>+<shift>+p". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
  parser.add_argument('-cs', '--combo_engine_switch', type=str, default=argparse.SUPPRESS,
-                     help='Specifies a combo to wait on for switching the OCR engine. As an example: "<ctrl>+<shift>+a". To be used with combo_pause. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
+                     help='Combo to wait on for switching the OCR engine. As an example: "<ctrl>+<shift>+a". To be used with combo_pause. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
  parser.add_argument('-sa', '--screen_capture_area', type=str, default=argparse.SUPPRESS,
-                     help='Specifies area to target when reading with screen capture. Can be either empty (automatic selector), a set of coordinates (x,y,width,height), "screen_N" (captures a whole screen, where N is the screen number starting from 1) or a window name (the first matching window title will be used).')
+                     help='Area to target when reading with screen capture. Can be either empty (automatic selector), a set of coordinates (x,y,width,height), "screen_N" (captures a whole screen, where N is the screen number starting from 1) or a window name (the first matching window title will be used).')
  parser.add_argument('-sd', '--screen_capture_delay_secs', type=float, default=argparse.SUPPRESS,
-                     help='Specifies the delay (in seconds) between screenshots when reading with screen capture.')
- parser.add_argument('-sw', '--screen_capture_only_active_windows', action='store_true', default=argparse.SUPPRESS,
-                     help="When reading with screen capture and screen_capture_area is a window name, specifies whether to only target the window while it's active.")
+                     help='Delay (in seconds) between screenshots when reading with screen capture.')
+ parser.add_argument('-sw', '--screen_capture_only_active_windows', type=str2bool, default=argparse.SUPPRESS,
+                     help="When reading with screen capture and screen_capture_area is a window name, only target the window while it's active.")
  parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
-                     help='When reading with screen capture, specifies a combo to wait on for taking a screenshot instead of using the delay. As an example: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
+                     help='When reading with screen capture, combo to wait on for taking a screenshot instead of using the delay. As an example: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')

  class Config:
@@ -47,6 +57,7 @@ class Config:
      __engine_config = {}
      __default_config = {
          'read_from': 'clipboard',
+         'read_from_secondary': '',
          'write_to': 'clipboard',
          'engine': '',
          'pause_at_startup': False,
@@ -64,7 +75,8 @@ class Config:
          'screen_capture_area': '',
          'screen_capture_delay_secs': 3,
          'screen_capture_only_active_windows': True,
-         'screen_capture_combo': ''
+         'screen_capture_combo': '',
+         'screen_capture_old_macos_api': False
      }

      def __parse(self, value):
@@ -96,6 +96,22 @@ def post_process(text):
      return text


+ def input_to_pil_image(img):
+     if isinstance(img, Image.Image):
+         pil_image = img
+     elif isinstance(img, (bytes, bytearray)):
+         pil_image = Image.open(io.BytesIO(img))
+     elif isinstance(img, Path):
+         try:
+             pil_image = Image.open(img)
+             pil_image.load()
+         except (UnidentifiedImageError, OSError) as e:
+             return None
+     else:
+         raise ValueError(f'img must be a path, PIL.Image or bytes object, instead got: {img}')
+     return pil_image
+
+
  def pil_image_to_bytes(img, img_format='png', png_compression=6, jpeg_quality=80, optimize=False):
      if img_format == 'png' and optimized_png_encode and not optimize:
          raw_data = img.convert('RGBA').tobytes()
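
Every engine below now funnels its input through this helper, so __call__ accepts a PIL image, raw bytes, or a pathlib.Path and returns a (success, text) tuple. A short sketch of that calling convention (the in-memory PNG stands in for a real screenshot):

import io
from pathlib import Path
from PIL import Image

# A tiny in-memory PNG standing in for a captured frame.
buf = io.BytesIO()
Image.new('RGB', (32, 32), 'white').save(buf, format='PNG')
png_bytes = buf.getvalue()

# Any of these forms is accepted by the refactored engines:
candidates = [Image.open(io.BytesIO(png_bytes)), png_bytes, Path('screenshot.png')]

# engine = MangaOcr()               # or GoogleLens(), Bing(), ... from this module
# success, text = engine(candidates[0])
# if not success:
#     print('OCR failed:', text)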
@@ -157,15 +173,14 @@ class MangaOcr:
          self.available = True
          logger.info('Manga OCR ready')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          x = (True, self.model(img))
+
+         # img.close()
          return x

  class GoogleVision:
@@ -188,13 +203,10 @@ class GoogleVision:
          except:
              logger.warning('Error parsing Google credentials, Google Vision will not work!')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          image_bytes = self._preprocess(img)
          image = vision.Image(content=image_bytes)
@@ -207,6 +219,8 @@ class GoogleVision:
          texts = response.text_annotations
          res = texts[0].description if len(texts) > 0 else ''
          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -225,13 +239,10 @@ class GoogleLens:
          self.available = True
          logger.info('Google Lens ready')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          request = LensOverlayServerRequest()

@@ -298,6 +309,8 @@ class GoogleLens:
              res += '\n'

          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -305,7 +318,9 @@ class GoogleLens:
          aspect_ratio = img.width / img.height
          new_w = int(sqrt(3000000 * aspect_ratio))
          new_h = int(new_w / aspect_ratio)
-         img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+         img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+         # img.close()
+         img = img_resized

          return (pil_image_to_bytes(img), img.width, img.height)
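
The _preprocess change only moves the resized copy into a separate variable so the original handle could eventually be closed (the img.close() calls are left commented out); the target size is unchanged: roughly 3 megapixels at the source aspect ratio. A quick numeric check of that formula (example dimensions are arbitrary):

from math import sqrt

width, height = 1920, 1080                 # example input size
aspect_ratio = width / height
new_w = int(sqrt(3000000 * aspect_ratio))
new_h = int(new_w / aspect_ratio)

print(new_w, new_h, new_w * new_h)         # 2309 1298 2997082 (about 3 MP)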
 
@@ -323,13 +338,10 @@ class GoogleLensWeb:
          self.available = True
          logger.info('Google Lens (web) ready')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          url = 'https://lens.google.com/v3/upload'
          files = {'encoded_image': ('image.png', self._preprocess(img), 'image/png')}
@@ -393,6 +405,8 @@ class GoogleLensWeb:
              res += '\n'

          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -400,7 +414,9 @@ class GoogleLensWeb:
          aspect_ratio = img.width / img.height
          new_w = int(sqrt(3000000 * aspect_ratio))
          new_h = int(new_w / aspect_ratio)
-         img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+         img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+         # img.close()
+         img = img_resized

          return pil_image_to_bytes(img)

@@ -415,13 +431,10 @@ class Bing:
          self.available = True
          logger.info('Bing ready')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          img_bytes = self._preprocess(img)
          if not img_bytes:
@@ -515,6 +528,8 @@ class Bing:
              res += line['text'] + '\n'

          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -526,7 +541,9 @@ class Bing:
          resize_factor = max(max_pixel_size / img.width, max_pixel_size / img.height)
          new_w = int(img.width * resize_factor)
          new_h = int(img.height * resize_factor)
-         img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+         img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+         # img.close()
+         img = img_resized

          img_bytes, _ = limit_image_size(img, max_byte_size)

@@ -550,13 +567,10 @@ class AppleVision:
          self.available = True
          logger.info('Apple Vision ready')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          with objc.autorelease_pool():
              req = Vision.VNRecognizeTextRequest.alloc().init()
@@ -579,6 +593,7 @@ class AppleVision:
          else:
              x = (False, 'Unknown error!')

+         # img.close()
          return x

      def _preprocess(self, img):
@@ -631,13 +646,10 @@ class AppleLiveText:
          self.available = True
          logger.info('Apple Live Text ready')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          with objc.autorelease_pool():
              analyzer = self.VKCImageAnalyzer.alloc().init()
@@ -691,13 +703,10 @@ class WinRTOCR:
          except:
              logger.warning('Error reading URL from config, WinRT OCR will not work!')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          if sys.platform == 'win32':
              res = winocr.recognize_pil_sync(img, lang='ja')['text']
@@ -716,6 +725,8 @@ class WinRTOCR:
              res = res.json()['text']

          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -749,13 +760,10 @@ class OneOCR:
          except:
              logger.warning('Error reading URL from config, OneOCR will not work!')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          if sys.platform == 'win32':
              try:
@@ -779,6 +787,8 @@ class OneOCR:
              res = res.json()['text']

          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -802,13 +812,10 @@ class AzureImageAnalysis:
          except:
              logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          try:
              read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
@@ -826,6 +833,8 @@ class AzureImageAnalysis:
              return (False, 'Unknown error!')

          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -833,7 +842,9 @@ class AzureImageAnalysis:
          resize_factor = max(50 / img.width, 50 / img.height)
          new_w = int(img.width * resize_factor)
          new_h = int(img.height * resize_factor)
-         img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+         img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+         # img.close()
+         img = img_resized

          return pil_image_to_bytes(img)

@@ -853,13 +864,10 @@ class EasyOCR:
          self.available = True
          logger.info('EasyOCR ready')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          res = ''
          read_result = self.model.readtext(self._preprocess(img), detail=0)
@@ -867,6 +875,8 @@ class EasyOCR:
              res += text + '\n'

          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -900,13 +910,10 @@ class RapidOCR:
          self.available = True
          logger.info('RapidOCR ready')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          res = ''
          read_results, elapsed = self.model(self._preprocess(img))
@@ -915,6 +922,8 @@ class RapidOCR:
              res += read_result[1] + '\n'

          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):
@@ -935,13 +944,10 @@ class OCRSpace:
          except:
              logger.warning('Error reading API key from config, OCRSpace will not work!')

-     def __call__(self, img_or_path):
-         if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
-             img = Image.open(img_or_path)
-         elif isinstance(img_or_path, Image.Image):
-             img = img_or_path
-         else:
-             raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+     def __call__(self, img):
+         img = input_to_pil_image(img)
+         if not img:
+             return (False, 'Invalid image provided')

          img_bytes, img_extension = self._preprocess(img)
          if not img_bytes:
@@ -972,6 +978,8 @@ class OCRSpace:

          res = res['ParsedResults'][0]['ParsedText']
          x = (True, res)
+
+         # img.close()
          return x

      def _preprocess(self, img):