GameSentenceMiner 2.8.26__py3-none-any.whl → 2.8.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/anki.py +7 -3
- GameSentenceMiner/ffmpeg.py +3 -3
- GameSentenceMiner/gsm.py +20 -17
- GameSentenceMiner/ocr/owocr_helper.py +1 -1
- GameSentenceMiner/owocr/owocr/config.py +25 -13
- GameSentenceMiner/owocr/owocr/ocr.py +103 -95
- GameSentenceMiner/owocr/owocr/run.py +602 -598
- GameSentenceMiner/owocr/owocr/screen_coordinate_picker.py +3 -2
- GameSentenceMiner/vad/result.py +8 -0
- GameSentenceMiner/vad/silero_trim.py +3 -2
- GameSentenceMiner/vad/vosk_helper.py +3 -2
- GameSentenceMiner/vad/whisper_helper.py +3 -2
- {gamesentenceminer-2.8.26.dist-info → gamesentenceminer-2.8.27.dist-info}/METADATA +1 -1
- {gamesentenceminer-2.8.26.dist-info → gamesentenceminer-2.8.27.dist-info}/RECORD +18 -17
- {gamesentenceminer-2.8.26.dist-info → gamesentenceminer-2.8.27.dist-info}/WHEEL +0 -0
- {gamesentenceminer-2.8.26.dist-info → gamesentenceminer-2.8.27.dist-info}/entry_points.txt +0 -0
- {gamesentenceminer-2.8.26.dist-info → gamesentenceminer-2.8.27.dist-info}/licenses/LICENSE +0 -0
- {gamesentenceminer-2.8.26.dist-info → gamesentenceminer-2.8.27.dist-info}/top_level.txt +0 -0
GameSentenceMiner/anki.py
CHANGED
@@ -8,7 +8,7 @@ from datetime import datetime, timedelta
 from requests import post
 
 from GameSentenceMiner import obs, util, notification, ffmpeg
-from GameSentenceMiner.ai.ai_prompting import …
+from GameSentenceMiner.ai.ai_prompting import get_ai_prompt_result
 from GameSentenceMiner.configuration import *
 from GameSentenceMiner.configuration import get_config
 from GameSentenceMiner.model import AnkiCard
@@ -28,7 +28,7 @@ card_queue = []
 
 
 def update_anki_card(last_note: AnkiCard, note=None, audio_path='', video_path='', tango='', reuse_audio=False,
-                     should_update_audio=True, ss_time=0, game_line=None, selected_lines=None):
+                     should_update_audio=True, ss_time=0, game_line=None, selected_lines=None, prev_ss_timing=0):
     global audio_in_anki, screenshot_in_anki, prev_screenshot_in_anki
     update_audio = should_update_audio and (get_config().anki.sentence_audio_field and not
         last_note.get_field(get_config().anki.sentence_audio_field) or get_config().anki.overwrite_audio)
@@ -45,7 +45,11 @@ def update_anki_card(last_note: AnkiCard, note=None, audio_path='', video_path='
         if get_config().paths.remove_screenshot:
             os.remove(screenshot)
     if get_config().anki.previous_image_field:
-        …
+        try:
+            prev_screenshot = ffmpeg.get_screenshot(video_path, prev_ss_timing)
+        except Exception as e:
+            logger.error(f"Error getting previous screenshot based on VAD, Falling back to previous logic: {e}")
+            prev_screenshot = ffmpeg.get_screenshot(video_path, ffmpeg.get_screenshot_time(video_path, selected_lines[0].prev if selected_lines else game_line.prev))
         prev_screenshot_in_anki = store_media_file(prev_screenshot)
         if get_config().paths.remove_screenshot:
             os.remove(prev_screenshot)
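The new `prev_ss_timing` parameter carries a VAD-derived timestamp for the previous-sentence screenshot, and the old line-based lookup survives as the exception fallback. A minimal sketch of that control flow (the wrapper name `previous_screenshot` is hypothetical; `ffmpeg.get_screenshot` and `ffmpeg.get_screenshot_time` are the module's real helpers):

```python
def previous_screenshot(video_path, prev_ss_timing, game_line, selected_lines=None):
    # Hypothetical wrapper illustrating the new branch above.
    try:
        # Preferred: timestamp precomputed from VAD for the previous line
        return ffmpeg.get_screenshot(video_path, prev_ss_timing)
    except Exception:
        # Fallback: derive the timestamp from the previous GameLine, as before
        prev_line = selected_lines[0].prev if selected_lines else game_line.prev
        return ffmpeg.get_screenshot(video_path, ffmpeg.get_screenshot_time(video_path, prev_line))
```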
GameSentenceMiner/ffmpeg.py
CHANGED
@@ -50,7 +50,7 @@ def get_screenshot_for_line(video_file, game_line):
     return get_screenshot(video_file, get_screenshot_time(video_file, game_line))
 
 
-def get_screenshot_time(video_path, game_line, default_beginning=False, …
+def get_screenshot_time(video_path, game_line, default_beginning=False, vad_result=None, doing_multi_line=False):
     if game_line:
         line_time = game_line.time
     else:
@@ -68,9 +68,9 @@ def get_screenshot_time(video_path, game_line, default_beginning=False, vad_begi…
     screenshot_offset = get_config().screenshot.seconds_after_line
 
     # Calculate screenshot time from the beginning by adding the offset
-    if …
+    if vad_result and vad_result.success and not doing_multi_line:
         logger.debug("Using VAD to determine screenshot time")
-        screenshot_time_from_beginning = line_timestamp_in_video + …
+        screenshot_time_from_beginning = line_timestamp_in_video + vad_result.end - 0.1
     elif get_config().screenshot.screenshot_timing_setting == "beginning":
         logger.debug("Using beginning of line for screenshot")
         screenshot_time_from_beginning = line_timestamp_in_video + screenshot_offset
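Net effect of the new branch: when VAD succeeded and only a single line is being mined, the screenshot is taken 0.1 s before the detected end of speech instead of at a fixed offset from the line start. A worked example with made-up numbers:

```python
line_timestamp_in_video = 12.0  # the mined line starts 12 s into the recording
vad_end = 3.4                   # vad_result.end: speech ends 3.4 s into the trimmed audio

# vad_result.success and not doing_multi_line -> use VAD timing:
screenshot_time_from_beginning = line_timestamp_in_video + vad_end - 0.1  # 15.3 s
```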
GameSentenceMiner/gsm.py
CHANGED
@@ -1,5 +1,7 @@
 import asyncio
 
+from GameSentenceMiner.vad.result import VADResult
+
 try:
     import os.path
     import signal
@@ -134,32 +136,33 @@ class VideoToAudioHandler(FileSystemEventHandler):
 
             if get_config().anki.sentence_audio_field and get_config().audio.enabled:
                 logger.debug("Attempting to get audio from video")
-                final_audio_output, …
+                final_audio_output, vad_result, vad_trimmed_audio = VideoToAudioHandler.get_audio(
                     start_line,
                     line_cutoff,
                     video_path,
                     anki_card_creation_time)
             else:
                 final_audio_output = ""
-                …
+                vad_result = VADResult(False, 0, 0)
                 vad_trimmed_audio = ""
-                vad_beginning = 0
-                vad_end = 0
             if not get_config().audio.enabled:
                 logger.info("Audio is disabled in config, skipping audio processing!")
             elif not get_config().anki.sentence_audio_field:
                 logger.info("No SentenceAudio Field in config, skipping audio processing!")
 
-            ss_timing = ffmpeg.get_screenshot_time(video_path, mined_line, …
+            ss_timing = ffmpeg.get_screenshot_time(video_path, mined_line, vad_result=vad_result, doing_multi_line=bool(selected_lines))
+            if get_config().anki.previous_image_field:
+                prev_ss_timing = ffmpeg.get_screenshot_time(video_path, mined_line.prev, vad_result=VideoToAudioHandler.get_audio(mined_line.prev, mined_line.time, video_path, anki_card_creation_time=anki_card_creation_time, timing_only=True) ,doing_multi_line=bool(selected_lines))
 
             if get_config().anki.update_anki and last_note:
                 anki.update_anki_card(last_note, note, audio_path=final_audio_output, video_path=video_path,
                                       tango=tango,
-                                      should_update_audio=…
+                                      should_update_audio=vad_result.success,
                                       ss_time=ss_timing,
                                       game_line=start_line,
-                                      selected_lines=selected_lines
-                                      …
+                                      selected_lines=selected_lines,
+                                      prev_ss_timing=prev_ss_timing)
+            elif get_config().features.notify_on_update and vad_result.success:
                 notification.send_audio_generated_notification(vad_trimmed_audio)
         except Exception as e:
             logger.error(f"Failed Processing and/or adding to Anki: Reason {e}")
@@ -173,7 +176,7 @@ class VideoToAudioHandler(FileSystemEventHandler):
 
 
     @staticmethod
-    def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False):
+    def get_audio(game_line, next_line_time, video_path, anki_card_creation_time=None, temporary=False, timing_only=False):
        trimmed_audio = get_audio_and_trim(video_path, game_line, next_line_time, anki_card_creation_time)
        if temporary:
            return trimmed_audio
@@ -181,23 +184,23 @@ class VideoToAudioHandler(FileSystemEventHandler):
             f"{os.path.abspath(configuration.get_temporary_directory())}/{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}")
         final_audio_output = make_unique_file_name(os.path.join(get_config().paths.audio_destination,
                                                                 f"{obs.get_current_game(sanitize=True)}.{get_config().audio.extension}"))
-        …
-        vad_beginning, vad_end = 0, 0
+        result = VADResult(False, 0, 0)
         if get_config().vad.do_vad_postprocessing:
-            …
-            if not …
-            …
+            result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio, vad_trimmed_audio)
+            if not result.success:
+                result = do_vad_processing(get_config().vad.selected_vad_model, trimmed_audio,
                                            vad_trimmed_audio)
-            if not …
+            if not result.success and get_config().vad.add_audio_on_no_results:
                 logger.info("No voice activity detected, using full audio.")
                 vad_trimmed_audio = trimmed_audio
-        …
+        if timing_only:
+            return result
         if get_config().audio.ffmpeg_reencode_options and os.path.exists(vad_trimmed_audio):
             ffmpeg.reencode_file_with_user_config(vad_trimmed_audio, final_audio_output,
                                                   get_config().audio.ffmpeg_reencode_options)
         elif os.path.exists(vad_trimmed_audio):
             shutil.move(vad_trimmed_audio, final_audio_output)
-        return final_audio_output, …
+        return final_audio_output, result, vad_trimmed_audio
 
 
 def do_vad_processing(model, trimmed_audio, vad_trimmed_audio, second_pass=False):
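`VADResult` comes from the new `GameSentenceMiner/vad/result.py` (+8 lines, not included in this excerpt). Judging from its use here, constructed as `VADResult(False, 0, 0)` and read through `vad_result.success` and `vad_result.end`, a plausible shape is the dataclass below; only `success` and `end` are confirmed by this diff, and the middle field name `start` is a guess:

```python
from dataclasses import dataclass

@dataclass
class VADResult:
    success: bool  # whether voice activity was detected
    start: float   # assumed: speech start within the trimmed audio, in seconds
    end: float     # speech end within the trimmed audio, in seconds
```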
GameSentenceMiner/ocr/owocr_helper.py
CHANGED
@@ -207,7 +207,6 @@ rectangles = None
 
 def do_second_ocr(ocr1_text, rectangle_index, time, img):
     global twopassocr, ocr2, last_ocr1_results, last_ocr2_results
-    last_result = ([], -1)
     try:
         orig_text, text = run.process_and_write_results(img, None, None, None, None,
                                                         engine=ocr2)
@@ -218,6 +217,7 @@ def do_second_ocr(ocr1_text, rectangle_index, time, img):
         img.save(os.path.join(get_temporary_directory(), "last_successful_ocr.png"))
         last_ocr2_results[rectangle_index] = text
         send_result(text, time)
+        img.close()
     except json.JSONDecodeError:
         print("Invalid JSON received.")
     except Exception as e:
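Note that the added `img.close()` only runs on the success path; if `process_and_write_results` or the result handling raises, the image is left to the garbage collector. If that ever matters, a `try`/`finally` variant would close it unconditionally. A sketch, not what the package does:

```python
try:
    orig_text, text = run.process_and_write_results(img, None, None, None, None,
                                                    engine=ocr2)
    # ... result handling ...
finally:
    img.close()  # closes even when OCR or result handling raises
```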
GameSentenceMiner/owocr/owocr/config.py
CHANGED
@@ -4,16 +4,26 @@ import argparse
 import textwrap
 import urllib.request
 
+def str2bool(value):
+    if value.lower() == 'true':
+        return True
+    elif value.lower() == 'false':
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
 parser = argparse.ArgumentParser(prog='owocr', description=textwrap.dedent('''\
     Runs OCR in the background.
     It can read images copied to the system clipboard or placed in a directory, images sent via a websocket or a Unix domain socket, or directly capture a screen (or a portion of it) or a window.
-    Recognized …
+    Recognized text can be either saved to system clipboard, appended to a text file or sent via a websocket.
 '''))
 
 parser.add_argument('-r', '--read_from', type=str, default=argparse.SUPPRESS,
-                    help='…
+                    help='Where to read input images from. Can be either "clipboard", "websocket", "unixsocket" (on macOS/Linux), "screencapture", or a path to a directory.')
+parser.add_argument('-rs', '--read_from_secondary', type=str, default=argparse.SUPPRESS,
+                    help="Optional secondary source to read input images from. Same options as read_from, but they can't both be directory paths.")
 parser.add_argument('-w', '--write_to', type=str, default=argparse.SUPPRESS,
-                    help='…
+                    help='Where to save recognized texts to. Can be either "clipboard", "websocket", or a path to a text file.')
 parser.add_argument('-e', '--engine', type=str, default=argparse.SUPPRESS,
                     help='OCR engine to use. Available: "mangaocr", "glens", "glensweb", "bing", "gvision", "avision", "alivetext", "azure", "winrtocr", "oneocr", "easyocr", "rapidocr", "ocrspace".')
 parser.add_argument('-p', '--pause_at_startup', action='store_true', default=argparse.SUPPRESS,
@@ -23,21 +33,21 @@ parser.add_argument('-i', '--ignore_flag', action='store_true', default=argparse…
 parser.add_argument('-d', '--delete_images', action='store_true', default=argparse.SUPPRESS,
                     help='Delete image files after processing when reading from a directory.')
 parser.add_argument('-n', '--notifications', action='store_true', default=argparse.SUPPRESS,
-                    help='Show an operating system notification with the detected text.')
+                    help='Show an operating system notification with the detected text. Will be ignored when reading with screen capture, unless screen_capture_combo is set.')
 parser.add_argument('-a', '--auto_pause', type=float, default=argparse.SUPPRESS,
-                    help='Automatically pause the program after the specified amount of seconds since the last successful text recognition. Will be ignored when reading with screen capture. 0 to disable.')
+                    help='Automatically pause the program after the specified amount of seconds since the last successful text recognition. Will be ignored when reading with screen capture, unless screen_capture_combo is set. 0 to disable.')
 parser.add_argument('-cp', '--combo_pause', type=str, default=argparse.SUPPRESS,
-                    help='…
+                    help='Combo to wait on for pausing the program. As an example: "<ctrl>+<shift>+p". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 parser.add_argument('-cs', '--combo_engine_switch', type=str, default=argparse.SUPPRESS,
-                    help='…
+                    help='Combo to wait on for switching the OCR engine. As an example: "<ctrl>+<shift>+a". To be used with combo_pause. The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 parser.add_argument('-sa', '--screen_capture_area', type=str, default=argparse.SUPPRESS,
-                    help='…
+                    help='Area to target when reading with screen capture. Can be either empty (automatic selector), a set of coordinates (x,y,width,height), "screen_N" (captures a whole screen, where N is the screen number starting from 1) or a window name (the first matching window title will be used).')
 parser.add_argument('-sd', '--screen_capture_delay_secs', type=float, default=argparse.SUPPRESS,
-                    help='…
-parser.add_argument('-sw', '--screen_capture_only_active_windows', …
-                    help="When reading with screen capture and screen_capture_area is a window name, …
+                    help='Delay (in seconds) between screenshots when reading with screen capture.')
+parser.add_argument('-sw', '--screen_capture_only_active_windows', type=str2bool, default=argparse.SUPPRESS,
+                    help="When reading with screen capture and screen_capture_area is a window name, only target the window while it's active.")
 parser.add_argument('-sc', '--screen_capture_combo', type=str, default=argparse.SUPPRESS,
-                    help='When reading with screen capture, …
+                    help='When reading with screen capture, combo to wait on for taking a screenshot instead of using the delay. As an example: "<ctrl>+<shift>+s". The list of keys can be found here: https://pynput.readthedocs.io/en/latest/keyboard.html#pynput.keyboard.Key')
 
 class Config:
     has_config = False
@@ -47,6 +57,7 @@ class Config:
     __engine_config = {}
     __default_config = {
         'read_from': 'clipboard',
+        'read_from_secondary': '',
         'write_to': 'clipboard',
         'engine': '',
         'pause_at_startup': False,
@@ -64,7 +75,8 @@ class Config:
         'screen_capture_area': '',
         'screen_capture_delay_secs': 3,
         'screen_capture_only_active_windows': True,
-        'screen_capture_combo': ''
+        'screen_capture_combo': '',
+        'screen_capture_old_macos_api': False
     }
 
     def __parse(self, value):
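`str2bool` exists because `screen_capture_only_active_windows` defaults to True: a plain `action='store_true'` flag could never switch it off from the command line, while an explicit true/false value can go either way. Roughly:

```python
# With type=str2bool the option takes an explicit value:
#   owocr -sw false   -> screen_capture_only_active_windows = False
#   owocr -sw true    -> screen_capture_only_active_windows = True
#   owocr -sw maybe   -> error: Boolean value expected.
args = parser.parse_args(['-sw', 'false'])
assert args.screen_capture_only_active_windows is False
```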
GameSentenceMiner/owocr/owocr/ocr.py
CHANGED
@@ -96,6 +96,22 @@ def post_process(text):
     return text
 
 
+def input_to_pil_image(img):
+    if isinstance(img, Image.Image):
+        pil_image = img
+    elif isinstance(img, (bytes, bytearray)):
+        pil_image = Image.open(io.BytesIO(img))
+    elif isinstance(img, Path):
+        try:
+            pil_image = Image.open(img)
+            pil_image.load()
+        except (UnidentifiedImageError, OSError) as e:
+            return None
+    else:
+        raise ValueError(f'img must be a path, PIL.Image or bytes object, instead got: {img}')
+    return pil_image
+
+
 def pil_image_to_bytes(img, img_format='png', png_compression=6, jpeg_quality=80, optimize=False):
     if img_format == 'png' and optimized_png_encode and not optimize:
         raw_data = img.convert('RGBA').tobytes()
@@ -157,15 +173,14 @@ class MangaOcr:
         self.available = True
         logger.info('Manga OCR ready')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         x = (True, self.model(img))
+
+        # img.close()
         return x
 
 class GoogleVision:
@@ -188,13 +203,10 @@ class GoogleVision:
         except:
             logger.warning('Error parsing Google credentials, Google Vision will not work!')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         image_bytes = self._preprocess(img)
         image = vision.Image(content=image_bytes)
@@ -207,6 +219,8 @@ class GoogleVision:
         texts = response.text_annotations
         res = texts[0].description if len(texts) > 0 else ''
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -225,13 +239,10 @@ class GoogleLens:
         self.available = True
         logger.info('Google Lens ready')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         request = LensOverlayServerRequest()
 
@@ -298,6 +309,8 @@ class GoogleLens:
             res += '\n'
 
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -305,7 +318,9 @@ class GoogleLens:
         aspect_ratio = img.width / img.height
         new_w = int(sqrt(3000000 * aspect_ratio))
         new_h = int(new_w / aspect_ratio)
-        …
+        img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        # img.close()
+        img = img_resized
 
         return (pil_image_to_bytes(img), img.width, img.height)
 
@@ -323,13 +338,10 @@ class GoogleLensWeb:
         self.available = True
         logger.info('Google Lens (web) ready')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         url = 'https://lens.google.com/v3/upload'
         files = {'encoded_image': ('image.png', self._preprocess(img), 'image/png')}
@@ -393,6 +405,8 @@ class GoogleLensWeb:
             res += '\n'
 
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -400,7 +414,9 @@ class GoogleLensWeb:
         aspect_ratio = img.width / img.height
         new_w = int(sqrt(3000000 * aspect_ratio))
         new_h = int(new_w / aspect_ratio)
-        …
+        img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        # img.close()
+        img = img_resized
 
         return pil_image_to_bytes(img)
 
@@ -415,13 +431,10 @@ class Bing:
         self.available = True
         logger.info('Bing ready')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         img_bytes = self._preprocess(img)
         if not img_bytes:
@@ -515,6 +528,8 @@ class Bing:
             res += line['text'] + '\n'
 
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -526,7 +541,9 @@ class Bing:
         resize_factor = max(max_pixel_size / img.width, max_pixel_size / img.height)
         new_w = int(img.width * resize_factor)
         new_h = int(img.height * resize_factor)
-        …
+        img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        # img.close()
+        img = img_resized
 
         img_bytes, _ = limit_image_size(img, max_byte_size)
 
@@ -550,13 +567,10 @@ class AppleVision:
         self.available = True
         logger.info('Apple Vision ready')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         with objc.autorelease_pool():
             req = Vision.VNRecognizeTextRequest.alloc().init()
@@ -579,6 +593,7 @@ class AppleVision:
         else:
             x = (False, 'Unknown error!')
 
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -631,13 +646,10 @@ class AppleLiveText:
         self.available = True
         logger.info('Apple Live Text ready')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         with objc.autorelease_pool():
             analyzer = self.VKCImageAnalyzer.alloc().init()
@@ -691,13 +703,10 @@ class WinRTOCR:
         except:
             logger.warning('Error reading URL from config, WinRT OCR will not work!')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         if sys.platform == 'win32':
             res = winocr.recognize_pil_sync(img, lang='ja')['text']
@@ -716,6 +725,8 @@ class WinRTOCR:
             res = res.json()['text']
 
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -749,13 +760,10 @@ class OneOCR:
         except:
             logger.warning('Error reading URL from config, OneOCR will not work!')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         if sys.platform == 'win32':
             try:
@@ -779,6 +787,8 @@ class OneOCR:
             res = res.json()['text']
 
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -802,13 +812,10 @@ class AzureImageAnalysis:
         except:
             logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         try:
             read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
@@ -826,6 +833,8 @@ class AzureImageAnalysis:
             return (False, 'Unknown error!')
 
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -833,7 +842,9 @@ class AzureImageAnalysis:
         resize_factor = max(50 / img.width, 50 / img.height)
         new_w = int(img.width * resize_factor)
         new_h = int(img.height * resize_factor)
-        …
+        img_resized = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        # img.close()
+        img = img_resized
 
         return pil_image_to_bytes(img)
 
@@ -853,13 +864,10 @@ class EasyOCR:
         self.available = True
         logger.info('EasyOCR ready')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         res = ''
         read_result = self.model.readtext(self._preprocess(img), detail=0)
@@ -867,6 +875,8 @@ class EasyOCR:
             res += text + '\n'
 
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -900,13 +910,10 @@ class RapidOCR:
         self.available = True
         logger.info('RapidOCR ready')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         res = ''
         read_results, elapsed = self.model(self._preprocess(img))
@@ -915,6 +922,8 @@ class RapidOCR:
             res += read_result[1] + '\n'
 
         x = (True, res)
+
+        # img.close()
         return x
 
     def _preprocess(self, img):
@@ -935,13 +944,10 @@ class OCRSpace:
         except:
             logger.warning('Error reading API key from config, OCRSpace will not work!')
 
-    def __call__(self, …
-        …
-        …
-        …
-            img = img_or_path
-        else:
-            raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
+    def __call__(self, img):
+        img = input_to_pil_image(img)
+        if not img:
+            return (False, 'Invalid image provided')
 
         img_bytes, img_extension = self._preprocess(img)
         if not img_bytes:
@@ -972,6 +978,8 @@ class OCRSpace:
 
         res = res['ParsedResults'][0]['ParsedText']
         x = (True, res)
+
+        # img.close()
        return x
 
     def _preprocess(self, img):
|