GameSentenceMiner 2.14.9__py3-none-any.whl → 2.14.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. GameSentenceMiner/ai/__init__.py +0 -0
  2. GameSentenceMiner/ai/ai_prompting.py +473 -0
  3. GameSentenceMiner/ocr/__init__.py +0 -0
  4. GameSentenceMiner/ocr/gsm_ocr_config.py +174 -0
  5. GameSentenceMiner/ocr/ocrconfig.py +129 -0
  6. GameSentenceMiner/ocr/owocr_area_selector.py +629 -0
  7. GameSentenceMiner/ocr/owocr_helper.py +638 -0
  8. GameSentenceMiner/ocr/ss_picker.py +140 -0
  9. GameSentenceMiner/owocr/owocr/__init__.py +1 -0
  10. GameSentenceMiner/owocr/owocr/__main__.py +9 -0
  11. GameSentenceMiner/owocr/owocr/config.py +148 -0
  12. GameSentenceMiner/owocr/owocr/lens_betterproto.py +1238 -0
  13. GameSentenceMiner/owocr/owocr/ocr.py +1690 -0
  14. GameSentenceMiner/owocr/owocr/run.py +1818 -0
  15. GameSentenceMiner/owocr/owocr/screen_coordinate_picker.py +109 -0
  16. GameSentenceMiner/tools/__init__.py +0 -0
  17. GameSentenceMiner/tools/audio_offset_selector.py +215 -0
  18. GameSentenceMiner/tools/ss_selector.py +135 -0
  19. GameSentenceMiner/tools/window_transparency.py +214 -0
  20. GameSentenceMiner/util/__init__.py +0 -0
  21. GameSentenceMiner/util/communication/__init__.py +22 -0
  22. GameSentenceMiner/util/communication/send.py +7 -0
  23. GameSentenceMiner/util/communication/websocket.py +94 -0
  24. GameSentenceMiner/util/configuration.py +1199 -0
  25. GameSentenceMiner/util/db.py +408 -0
  26. GameSentenceMiner/util/downloader/Untitled_json.py +472 -0
  27. GameSentenceMiner/util/downloader/__init__.py +0 -0
  28. GameSentenceMiner/util/downloader/download_tools.py +194 -0
  29. GameSentenceMiner/util/downloader/oneocr_dl.py +250 -0
  30. GameSentenceMiner/util/electron_config.py +259 -0
  31. GameSentenceMiner/util/ffmpeg.py +571 -0
  32. GameSentenceMiner/util/get_overlay_coords.py +366 -0
  33. GameSentenceMiner/util/gsm_utils.py +323 -0
  34. GameSentenceMiner/util/model.py +206 -0
  35. GameSentenceMiner/util/notification.py +157 -0
  36. GameSentenceMiner/util/text_log.py +214 -0
  37. GameSentenceMiner/util/win10toast/__init__.py +154 -0
  38. GameSentenceMiner/util/win10toast/__main__.py +22 -0
  39. GameSentenceMiner/web/__init__.py +0 -0
  40. GameSentenceMiner/web/service.py +132 -0
  41. GameSentenceMiner/web/static/__init__.py +0 -0
  42. GameSentenceMiner/web/static/apple-touch-icon.png +0 -0
  43. GameSentenceMiner/web/static/favicon-96x96.png +0 -0
  44. GameSentenceMiner/web/static/favicon.ico +0 -0
  45. GameSentenceMiner/web/static/favicon.svg +3 -0
  46. GameSentenceMiner/web/static/site.webmanifest +21 -0
  47. GameSentenceMiner/web/static/style.css +292 -0
  48. GameSentenceMiner/web/static/web-app-manifest-192x192.png +0 -0
  49. GameSentenceMiner/web/static/web-app-manifest-512x512.png +0 -0
  50. GameSentenceMiner/web/templates/__init__.py +0 -0
  51. GameSentenceMiner/web/templates/index.html +50 -0
  52. GameSentenceMiner/web/templates/text_replacements.html +238 -0
  53. GameSentenceMiner/web/templates/utility.html +483 -0
  54. GameSentenceMiner/web/texthooking_page.py +584 -0
  55. GameSentenceMiner/wip/__init___.py +0 -0
  56. {gamesentenceminer-2.14.9.dist-info → gamesentenceminer-2.14.11.dist-info}/METADATA +1 -1
  57. gamesentenceminer-2.14.11.dist-info/RECORD +79 -0
  58. gamesentenceminer-2.14.9.dist-info/RECORD +0 -24
  59. {gamesentenceminer-2.14.9.dist-info → gamesentenceminer-2.14.11.dist-info}/WHEEL +0 -0
  60. {gamesentenceminer-2.14.9.dist-info → gamesentenceminer-2.14.11.dist-info}/entry_points.txt +0 -0
  61. {gamesentenceminer-2.14.9.dist-info → gamesentenceminer-2.14.11.dist-info}/licenses/LICENSE +0 -0
  62. {gamesentenceminer-2.14.9.dist-info → gamesentenceminer-2.14.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1690 @@
1
+ import re
2
+ import os
3
+ import io
4
+ import time
5
+ from pathlib import Path
6
+ import sys
7
+ import platform
8
+ import logging
9
+ from math import sqrt, floor
10
+ import json
11
+ import base64
12
+ from urllib.parse import urlparse, parse_qs
13
+
14
+ import numpy as np
15
+ import rapidfuzz.fuzz
16
+ from PIL import Image, UnidentifiedImageError
17
+ from loguru import logger
18
+ import requests
19
+
20
+
21
+ try:
22
+ from GameSentenceMiner.util.electron_config import get_ocr_language, get_furigana_filter_sensitivity
23
+ from GameSentenceMiner.util.configuration import CommonLanguages
24
+ except ImportError:
25
+ pass
26
+
27
+ # from GameSentenceMiner.util.configuration import get_temporary_directory
28
+
29
+ try:
30
+ from manga_ocr import MangaOcr as MOCR
31
+ except ImportError:
32
+ pass
33
+
34
+ try:
35
+ import Vision
36
+ import objc
37
+ from AppKit import NSData, NSImage, NSBundle
38
+ from CoreFoundation import CFRunLoopRunInMode, kCFRunLoopDefaultMode, CFRunLoopStop, CFRunLoopGetCurrent
39
+ except ImportError:
40
+ pass
41
+
42
+ try:
43
+ from google.cloud import vision
44
+ from google.oauth2 import service_account
45
+ from google.api_core.exceptions import ServiceUnavailable
46
+ except ImportError:
47
+ pass
48
+
49
+ try:
50
+ from azure.ai.vision.imageanalysis import ImageAnalysisClient
51
+ from azure.ai.vision.imageanalysis.models import VisualFeatures
52
+ from azure.core.credentials import AzureKeyCredential
53
+ from azure.core.exceptions import ServiceRequestError
54
+ except ImportError:
55
+ pass
56
+
57
+ try:
58
+ import easyocr
59
+ except ImportError:
60
+ pass
61
+
62
+ try:
63
+ from rapidocr_onnxruntime import RapidOCR as ROCR
64
+ import urllib.request
65
+ except ImportError:
66
+ pass
67
+
68
+ try:
69
+ import winocr
70
+ except ImportError:
71
+ pass
72
+
73
+ try:
74
+ try:
75
+ if os.path.exists(os.path.expanduser('~/.config/oneocr/oneocr.dll')):
76
+ import oneocr
77
+ except Exception as e:
78
+ oneocr = None
79
+ logger.warning(f'Failed to import OneOCR: {e}', exc_info=True)
80
+ except ImportError:
81
+ pass
82
+
83
+ try:
84
+ import pyjson5
85
+ except ImportError:
86
+ pass
87
+
88
+ try:
89
+ import betterproto
90
+ from GameSentenceMiner.owocr.owocr.lens_betterproto import *
91
+ import random
92
+ except ImportError:
93
+ pass
94
+
95
+ try:
96
+ import fpng_py
97
+ optimized_png_encode = True
98
+ except:
99
+ optimized_png_encode = False
100
+
101
+
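Note: the guarded imports above make every OCR backend optional; each engine class later checks sys.modules to see whether its dependency actually loaded. A minimal sketch of that availability check (easyocr is just an example module name):

import sys

if 'easyocr' not in sys.modules:
    print('easyocr is not installed, so the EasyOCR engine defined below stays unavailable')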
102
+ def empty_post_process(text):
103
+ return text
104
+
105
+
106
+ def post_process(text, keep_blank_lines=False):
107
+ import jaconv
108
+ if keep_blank_lines:
109
+ text = '\n'.join([''.join(i.split()) for i in text.splitlines()])
110
+ else:
111
+ text = ''.join([''.join(i.split()) for i in text.splitlines()])
112
+ text = text.replace('…', '...')
113
+ text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text)
114
+ text = jaconv.h2z(text, ascii=True, digit=True)
115
+ return text
116
+
117
+
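A usage sketch for post_process, assuming the jaconv dependency it imports is installed: intra-line whitespace is stripped, lines are joined (unless keep_blank_lines is set), '…' becomes '...', and half-width ASCII letters/digits are converted to full-width.

sample = 'これは テスト…\nABC 123'
print(post_process(sample))                         # lines joined, spaces removed, ASCII made full-width
print(post_process(sample, keep_blank_lines=True))  # same cleanup, but the line break is preserved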
118
+ def input_to_pil_image(img):
119
+ is_path = False
120
+ if isinstance(img, Image.Image):
121
+ pil_image = img
122
+ elif isinstance(img, (bytes, bytearray)):
123
+ pil_image = Image.open(io.BytesIO(img))
124
+ elif isinstance(img, Path):
125
+ is_path = True
126
+ try:
127
+ pil_image = Image.open(img)
128
+ pil_image.load()
129
+ except (UnidentifiedImageError, OSError) as e:
130
+ return None, is_path
131
+ else:
132
+ raise ValueError(f'img must be a path, PIL.Image or bytes object, instead got: {img}')
133
+ return pil_image, is_path
134
+
135
+
136
+ def pil_image_to_bytes(img, img_format='png', png_compression=6, jpeg_quality=80, optimize=False):
137
+ if img_format == 'png' and optimized_png_encode and not optimize:
138
+ raw_data = img.convert('RGBA').tobytes()
139
+ image_bytes = fpng_py.fpng_encode_image_to_memory(raw_data, img.width, img.height)
140
+ else:
141
+ image_bytes = io.BytesIO()
142
+ if img_format == 'jpeg':
143
+ img = img.convert('RGB')
144
+ img.save(image_bytes, format=img_format, compress_level=png_compression, quality=jpeg_quality, optimize=optimize, subsampling=0)
145
+ image_bytes = image_bytes.getvalue()
146
+ return image_bytes
147
+
148
+
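A short sketch of the two encoding paths in pil_image_to_bytes: PNG uses the fpng_py fast path when that import succeeded (and optimize is off), while JPEG always goes through Pillow with an RGB conversion first.

img = Image.new('RGB', (64, 64), 'white')
png_bytes = pil_image_to_bytes(img)                            # fpng fast path if available, else Pillow PNG
jpeg_bytes = pil_image_to_bytes(img, 'jpeg', jpeg_quality=70)  # always Pillow; converts to RGB before saving
print(len(png_bytes), len(jpeg_bytes))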
149
+ def pil_image_to_numpy_array(img):
150
+ return np.array(img.convert('RGBA'))
151
+
152
+
153
+ def limit_image_size(img, max_size):
154
+ img_bytes = pil_image_to_bytes(img)
155
+ if len(img_bytes) <= max_size:
156
+ return img_bytes, 'png'
157
+
158
+ scaling_factor = 0.60 if any(x > 2000 for x in img.size) else 0.75
159
+ new_w = int(img.width * scaling_factor)
160
+ new_h = int(img.height * scaling_factor)
161
+ resized_img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
162
+ resized_img_bytes = pil_image_to_bytes(resized_img)
163
+ if len(resized_img_bytes) <= max_size:
164
+ return resized_img_bytes, 'png'
165
+
166
+ for _ in range(2):
167
+ jpeg_quality = 80
168
+ while jpeg_quality >= 60:
169
+ img_bytes = pil_image_to_bytes(img, 'jpeg', jpeg_quality=jpeg_quality, optimize=True)
170
+ if len(img_bytes) <= max_size:
171
+ return img_bytes, 'jpeg'
172
+ jpeg_quality -= 5
173
+ img = resized_img
174
+
175
+ return False, ''
176
+
177
+
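A hedged usage sketch for limit_image_size: it tries PNG first, then a single downscale, then a JPEG quality ladder from 80 down to 60 on both the original and resized image, and returns (False, '') if nothing fits. The 1,000,000-byte cap below is just an illustrative value.

big = Image.new('RGB', (3000, 2000), 'white')
data, fmt = limit_image_size(big, 1_000_000)
if data:
    print(f'encoded as {fmt}: {len(data)} bytes')
else:
    print('image could not be reduced below the size limit')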
178
+ def get_regex(lang):
179
+ if lang == "ja":
180
+ return re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
181
+ elif lang == "zh":
182
+ return re.compile(r'[\u4E00-\u9FFF]')
183
+ elif lang == "ko":
184
+ return re.compile(r'[\uAC00-\uD7AF]')
185
+ elif lang == "ar":
186
+ return re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
187
+ elif lang == "ru":
188
+ return re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]')
189
+ elif lang == "el":
190
+ return re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]')
191
+ elif lang == "he":
192
+ return re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
193
+ elif lang == "th":
194
+ return re.compile(r'[\u0E00-\u0E7F]')
195
+ else:
196
+ return re.compile(
197
+ r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')
198
+
199
+
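The per-language character classes from get_regex are used downstream (for example by OneOCR and Google Lens) to keep only lines containing script from the target language. A small sketch:

jp = get_regex('ja')
print(bool(jp.search('これは日本語のセリフです')))  # True: contains kana/kanji
print(bool(jp.search('HP 100/100')))                # False: pure ASCII/UI text would be filtered out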
200
+ class MangaOcr:
201
+ name = 'mangaocr'
202
+ readable_name = 'Manga OCR'
203
+ key = 'm'
204
+ available = False
205
+
206
+ def __init__(self, config={'pretrained_model_name_or_path':'kha-white/manga-ocr-base','force_cpu': False}, lang='ja'):
207
+ if 'manga_ocr' not in sys.modules:
208
+ logger.warning('manga-ocr not available, Manga OCR will not work!')
209
+ else:
210
+ logger.disable('manga_ocr')
211
+ logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings
212
+ from manga_ocr import ocr
213
+ ocr.post_process = empty_post_process
214
+ logger.info(f'Loading Manga OCR model')
215
+ self.model = MOCR(config['pretrained_model_name_or_path'], config['force_cpu'])
216
+ self.available = True
217
+ logger.info('Manga OCR ready')
218
+
219
+ def __call__(self, img, furigana_filter_sensitivity=0):
220
+ img, is_path = input_to_pil_image(img)
221
+ if not img:
222
+ return (False, 'Invalid image provided')
223
+
224
+ x = (True, self.model(img))
225
+
226
+ # img.close()
227
+ return x
228
+
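All engines in this module follow the same call contract: construct, check .available, then call the instance with an image to get a (success, text) pair. A sketch using MangaOcr (this assumes the optional manga-ocr package is installed and its model downloaded; the blank test image is only a placeholder):

engine = MangaOcr()
if engine.available:
    ok, text = engine(Image.new('RGB', (200, 50), 'white'))
    if ok:
        print(text)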
229
+ class GoogleVision:
230
+ name = 'gvision'
231
+ readable_name = 'Google Vision'
232
+ key = 'g'
233
+ available = False
234
+
235
+ def __init__(self, lang='ja'):
236
+ if 'google.cloud' not in sys.modules:
237
+ logger.warning('google-cloud-vision not available, Google Vision will not work!')
238
+ else:
239
+ logger.info(f'Parsing Google credentials')
240
+ google_credentials_file = os.path.join(os.path.expanduser('~'),'.config','google_vision.json')
241
+ try:
242
+ google_credentials = service_account.Credentials.from_service_account_file(google_credentials_file)
243
+ self.client = vision.ImageAnnotatorClient(credentials=google_credentials)
244
+ self.available = True
245
+ logger.info('Google Vision ready')
246
+ except:
247
+ logger.warning('Error parsing Google credentials, Google Vision will not work!')
248
+
249
+ def __call__(self, img, furigana_filter_sensitivity=0):
250
+ img, is_path = input_to_pil_image(img)
251
+ if not img:
252
+ return (False, 'Invalid image provided')
253
+
254
+ image_bytes = self._preprocess(img)
255
+ image = vision.Image(content=image_bytes)
256
+ try:
257
+ response = self.client.text_detection(image=image)
258
+ except ServiceUnavailable:
259
+ return (False, 'Connection error!')
260
+ except:
261
+ return (False, 'Unknown error!')
262
+ texts = response.text_annotations
263
+ res = texts[0].description if len(texts) > 0 else ''
264
+ x = (True, res)
265
+
266
+ # img.close()
267
+ return x
268
+
269
+ def _preprocess(self, img):
270
+ return pil_image_to_bytes(img)
271
+
272
+ class GoogleLens:
273
+ name = 'glens'
274
+ readable_name = 'Google Lens'
275
+ key = 'l'
276
+ available = False
277
+
278
+ def __init__(self, lang='ja'):
279
+ import regex
280
+ self.regex = get_regex(lang)
281
+ self.initial_lang = lang
282
+ self.punctuation_regex = regex.compile(r'[\p{P}\p{S}]')
283
+ if 'betterproto' not in sys.modules:
284
+ logger.warning('betterproto not available, Google Lens will not work!')
285
+ else:
286
+ self.available = True
287
+ logger.info('Google Lens ready')
288
+
289
+ def __call__(self, img, furigana_filter_sensitivity=0, return_coords=False):
290
+ if furigana_filter_sensitivity is not None:
291
+ furigana_filter_sensitivity = get_furigana_filter_sensitivity()
292
+ else:
293
+ furigana_filter_sensitivity = 0
294
+ lang = get_ocr_language()
295
+ img, is_path = input_to_pil_image(img)
296
+ if lang != self.initial_lang:
297
+ self.initial_lang = lang
298
+ self.regex = get_regex(lang)
299
+ if not img:
300
+ return (False, 'Invalid image provided')
301
+
302
+ request = LensOverlayServerRequest()
303
+
304
+ request.objects_request.request_context.request_id.uuid = random.randint(0, 2**64 - 1)
305
+ request.objects_request.request_context.request_id.sequence_id = 0
306
+ request.objects_request.request_context.request_id.image_sequence_id = 0
307
+ request.objects_request.request_context.request_id.analytics_id = random.randbytes(16)
308
+ request.objects_request.request_context.request_id.routing_info = LensOverlayRoutingInfo()
309
+
310
+ request.objects_request.request_context.client_context.platform = Platform.WEB
311
+ request.objects_request.request_context.client_context.surface = Surface.CHROMIUM
312
+
313
+ request.objects_request.request_context.client_context.locale_context.language = 'ja'
314
+ request.objects_request.request_context.client_context.locale_context.region = 'Asia/Tokyo'
315
+ request.objects_request.request_context.client_context.locale_context.time_zone = '' # not set by chromium
316
+
317
+ request.objects_request.request_context.client_context.app_id = '' # not set by chromium
318
+
319
+ filter = AppliedFilter()
320
+ filter.filter_type = LensOverlayFilterType.AUTO_FILTER
321
+ request.objects_request.request_context.client_context.client_filters.filter.append(filter)
322
+
323
+ image_data = self._preprocess(img)
324
+ request.objects_request.image_data.payload.image_bytes = image_data[0]
325
+ request.objects_request.image_data.image_metadata.width = image_data[1]
326
+ request.objects_request.image_data.image_metadata.height = image_data[2]
327
+
328
+ payload = request.SerializeToString()
329
+
330
+ headers = {
331
+ 'Host': 'lensfrontend-pa.googleapis.com',
332
+ 'Connection': 'keep-alive',
333
+ 'Content-Type': 'application/x-protobuf',
334
+ 'X-Goog-Api-Key': 'AIzaSyDr2UxVnv_U85AbhhY8XSHSIavUW0DC-sY',
335
+ 'Sec-Fetch-Site': 'none',
336
+ 'Sec-Fetch-Mode': 'no-cors',
337
+ 'Sec-Fetch-Dest': 'empty',
338
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
339
+ 'Accept-Encoding': 'gzip, deflate, br, zstd',
340
+ 'Accept-Language': 'ja-JP;q=0.6,ja;q=0.5'
341
+ }
342
+
343
+ try:
344
+ res = requests.post('https://lensfrontend-pa.googleapis.com/v1/crupload', data=payload, headers=headers, timeout=5)
345
+ except requests.exceptions.Timeout:
346
+ return (False, 'Request timeout!')
347
+ except requests.exceptions.ConnectionError:
348
+ return (False, 'Connection error!')
349
+
350
+ if res.status_code != 200:
351
+ return (False, 'Unknown error!')
352
+
353
+ response_proto = LensOverlayServerResponse().FromString(res.content)
354
+ response_dict = response_proto.to_dict(betterproto.Casing.SNAKE)
355
+
356
+ if os.path.exists(r"C:\Users\Beangate\GSM\Electron App\test"):
357
+ with open(os.path.join(r"C:\Users\Beangate\GSM\Electron App\test", 'glens_response.json'), 'w', encoding='utf-8') as f:
358
+ json.dump(response_dict, f, indent=4, ensure_ascii=False)
359
+ res = ''
360
+ text = response_dict['objects_response']['text']
361
+ skipped = []
362
+ previous_line = None
363
+ if 'text_layout' in text:
364
+ for paragraph in text['text_layout']['paragraphs']:
365
+ if previous_line:
366
+ prev_bbox = previous_line['geometry']['bounding_box']
367
+ curr_bbox = paragraph['geometry']['bounding_box']
368
+ vertical_space = abs(curr_bbox['center_y'] - prev_bbox['center_y']) * img.height
369
+ prev_height = prev_bbox['height'] * img.height
370
+ current_height = curr_bbox['height'] * img.height
371
+ avg_height = (prev_height + current_height) / 2
372
+ # If vertical space is close to previous line's height, add a blank line
373
+ # logger.info(f"Vertical space: {vertical_space}, Average height: {avg_height}")
374
+ # logger.info(avg_height * 2)
375
+ if vertical_space > avg_height * 2:
376
+ res += 'BLANK_LINE'
377
+ for line in paragraph['lines']:
378
+ if furigana_filter_sensitivity:
379
+ for word in line['words']:
380
+ if not self.punctuation_regex.findall(word['plain_text']):
381
+ continue
382
+ if 'geometry' not in word:
383
+ res += word['plain_text'] + word['text_separator']
384
+ continue
385
+ word_width = word['geometry']['bounding_box']['width'] * img.width
386
+ word_height = word['geometry']['bounding_box']['height'] * img.height
387
+ if word_width > furigana_filter_sensitivity and word_height > furigana_filter_sensitivity:
388
+ res += word['plain_text'] + word['text_separator']
389
+ else:
390
+ skipped.extend(word['plain_text'])
391
+ continue
392
+ else:
393
+ for word in line['words']:
394
+ res += word['plain_text'] + word['text_separator']
395
+
396
+ previous_line = paragraph
397
+ res += '\n'
398
+ # logger.info(
399
+ # f"Skipped {len(skipped)} chars due to furigana filter sensitivity: {furigana_filter_sensitivity}")
400
+ # widths = []
401
+ # heights = []
402
+ # if 'text_layout' in text:
403
+ # paragraphs = text['text_layout']['paragraphs']
404
+ # for paragraph in paragraphs:
405
+ # for line in paragraph['lines']:
406
+ # for word in line['words']:
407
+ # if self.kana_kanji_regex.search(word['plain_text']) is None:
408
+ # continue
409
+ # widths.append(word['geometry']['bounding_box']['width'])
410
+ # heights.append(word['geometry']['bounding_box']['height'])
411
+ #
412
+ # max_width = max(sorted(widths)[:-max(1, len(widths) // 10)]) if len(widths) > 1 else 0
413
+ # max_height = max(sorted(heights)[:-max(1, len(heights) // 10)]) if len(heights) > 1 else 0
414
+ #
415
+ # required_width = max_width * furigana_filter_sensitivity
416
+ # required_height = max_height * furigana_filter_sensitivity
417
+ #
418
+ # if 'text_layout' in text:
419
+ # paragraphs = text['text_layout']['paragraphs']
420
+ # for paragraph in paragraphs:
421
+ # for line in paragraph['lines']:
422
+ # if furigana_filter_sensitivity == 0 or line['geometry']['bounding_box']['width'] > required_width or line['geometry']['bounding_box']['height'] > required_height:
423
+ # for word in line['words']:
424
+ # res += word['plain_text'] + word['text_separator']
425
+ # else:
426
+ # continue
427
+ # res += '\n'
428
+ # else:
429
+ # if 'text_layout' in text:
430
+ # paragraphs = text['text_layout']['paragraphs']
431
+ # for paragraph in paragraphs:
432
+ # for line in paragraph['lines']:
433
+ # for word in line['words']:
434
+ # res += word['plain_text'] + word['text_separator']
435
+ # else:
436
+ # continue
437
+ # res += '\n'
438
+
439
+ if return_coords:
440
+ x = (True, res, response_dict)
441
+ else:
442
+ x = (True, res)
443
+
444
+ if skipped:
445
+ logger.info(f"Skipped {len(skipped)} chars due to furigana filter sensitivity: {furigana_filter_sensitivity}")
446
+ logger.debug(f"Skipped chars: {''.join(skipped)}")
447
+
448
+ # img.close()
449
+ return x
450
+
451
+ def _preprocess(self, img):
452
+ if img.width * img.height > 3000000:
453
+ aspect_ratio = img.width / img.height
454
+ new_w = int(sqrt(3000000 * aspect_ratio))
455
+ new_h = int(new_w / aspect_ratio)
456
+ img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
457
+
458
+ return (pil_image_to_bytes(img), img.width, img.height)
459
+
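A sketch of the furigana filter used in the Lens word loop above: the bounding-box fractions are scaled to pixel sizes, and a word is kept only when both its width and height exceed the sensitivity threshold. The helper and sample numbers below are illustrative, not part of the module.

def keeps_word(bbox, img_w, img_h, sensitivity):
    # bbox width/height are fractions of the full image, as returned by Lens
    return (bbox['width'] * img_w > sensitivity) and (bbox['height'] * img_h > sensitivity)

print(keeps_word({'width': 0.010, 'height': 0.012}, 1920, 1080, 20))  # False: ~19x13 px, likely furigana
print(keeps_word({'width': 0.050, 'height': 0.030}, 1920, 1080, 20))  # True: ~96x32 px body text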
460
+ class GoogleLensWeb:
461
+ name = 'glensweb'
462
+ readable_name = 'Google Lens (web)'
463
+ key = 'k'
464
+ available = False
465
+
466
+ def __init__(self, lang='ja'):
467
+ if 'pyjson5' not in sys.modules:
468
+ logger.warning('pyjson5 not available, Google Lens (web) will not work!')
469
+ else:
470
+ self.requests_session = requests.Session()
471
+ self.available = True
472
+ logger.info('Google Lens (web) ready')
473
+
474
+ def __call__(self, img, furigana_filter_sensitivity=0):
475
+ img, is_path = input_to_pil_image(img)
476
+ if not img:
477
+ return (False, 'Invalid image provided')
478
+
479
+ url = 'https://lens.google.com/v3/upload'
480
+ files = {'encoded_image': ('image.png', self._preprocess(img), 'image/png')}
481
+ headers = {
482
+ 'Host': 'lens.google.com',
483
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
484
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
485
+ 'Accept-Language': 'ja-JP;q=0.6,ja;q=0.5',
486
+ 'Accept-Encoding': 'gzip, deflate, br, zstd',
487
+ 'Referer': 'https://www.google.com/',
488
+ 'Origin': 'https://www.google.com',
489
+ 'Alt-Used': 'lens.google.com',
490
+ 'Connection': 'keep-alive',
491
+ 'Upgrade-Insecure-Requests': '1',
492
+ 'Sec-Fetch-Dest': 'document',
493
+ 'Sec-Fetch-Mode': 'navigate',
494
+ 'Sec-Fetch-Site': 'same-site',
495
+ 'Priority': 'u=0, i',
496
+ 'TE': 'trailers'
497
+ }
498
+ cookies = {'SOCS': 'CAESEwgDEgk0ODE3Nzk3MjQaAmVuIAEaBgiA_LyaBg'}
499
+
500
+ try:
501
+ res = self.requests_session.post(url, files=files, headers=headers, cookies=cookies, timeout=5, allow_redirects=False)
502
+ except requests.exceptions.Timeout:
503
+ return (False, 'Request timeout!')
504
+ except requests.exceptions.ConnectionError:
505
+ return (False, 'Connection error!')
506
+
507
+ if res.status_code != 303:
508
+ return (False, 'Unknown error!')
509
+
510
+ redirect_url = res.headers.get('Location')
511
+ if not redirect_url:
512
+ return (False, 'Error getting redirect URL!')
513
+
514
+ parsed_url = urlparse(redirect_url)
515
+ query_params = parse_qs(parsed_url.query)
516
+
517
+ if ('vsrid' not in query_params) or ('gsessionid' not in query_params):
518
+ return (False, 'Unknown error!')
519
+
520
+ try:
521
+ res = self.requests_session.get(f"https://lens.google.com/qfmetadata?vsrid={query_params['vsrid'][0]}&gsessionid={query_params['gsessionid'][0]}", timeout=5)
522
+ except requests.exceptions.Timeout:
523
+ return (False, 'Request timeout!')
524
+ except requests.exceptions.ConnectionError:
525
+ return (False, 'Connection error!')
526
+
527
+ if (len(res.text.splitlines()) != 3):
528
+ return (False, 'Unknown error!')
529
+
530
+ lens_object = pyjson5.loads(res.text.splitlines()[2])
531
+
532
+ res = ''
533
+ text = lens_object[0][2][0][0]
534
+ for paragraph in text:
535
+ for line in paragraph[1]:
536
+ for word in line[0]:
537
+ res += word[1] + word[2]
538
+ res += '\n'
539
+
540
+ x = (True, res)
541
+
542
+ # img.close()
543
+ return x
544
+
545
+ def _preprocess(self, img):
546
+ if img.width * img.height > 3000000:
547
+ aspect_ratio = img.width / img.height
548
+ new_w = int(sqrt(3000000 * aspect_ratio))
549
+ new_h = int(new_w / aspect_ratio)
550
+ img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
551
+
552
+ return pil_image_to_bytes(img)
553
+
554
+ class Bing:
555
+ name = 'bing'
556
+ readable_name = 'Bing'
557
+ key = 'b'
558
+ available = False
559
+
560
+ def __init__(self, lang='ja'):
561
+ self.requests_session = requests.Session()
562
+ self.available = True
563
+ logger.info('Bing ready')
564
+
565
+ def __call__(self, img, furigana_filter_sensitivity=0):
566
+ img, is_path = input_to_pil_image(img)
567
+ if not img:
568
+ return (False, 'Invalid image provided')
569
+
570
+ img_bytes = self._preprocess(img)
571
+ if not img_bytes:
572
+ return (False, 'Image is too big!')
573
+
574
+ upload_url = 'https://www.bing.com/images/search?view=detailv2&iss=sbiupload'
575
+ upload_headers = {
576
+ 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
577
+ 'accept-language': 'ja-JP;q=0.6,ja;q=0.5',
578
+ 'cache-control': 'max-age=0',
579
+ 'origin': 'https://www.bing.com',
580
+ 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
581
+ }
582
+ files = {
583
+ 'imgurl': (None, ''),
584
+ 'cbir': (None, 'sbi'),
585
+ 'imageBin': (None, img_bytes)
586
+ }
587
+
588
+ for _ in range(2):
589
+ api_host = urlparse(upload_url).netloc
590
+ try:
591
+ res = self.requests_session.post(upload_url, headers=upload_headers, files=files, timeout=5, allow_redirects=False)
592
+ except requests.exceptions.Timeout:
593
+ return (False, 'Request timeout!')
594
+ except requests.exceptions.ConnectionError:
595
+ return (False, 'Connection error!')
596
+
597
+ if res.status_code != 302:
598
+ return (False, 'Unknown error!')
599
+
600
+ redirect_url = res.headers.get('Location')
601
+ if not redirect_url:
602
+ return (False, 'Error getting redirect URL!')
603
+ if not redirect_url.startswith('https://'):
604
+ break
605
+ upload_url = redirect_url
606
+
607
+ parsed_url = urlparse(redirect_url)
608
+ query_params = parse_qs(parsed_url.query)
609
+
610
+ image_insights_token = query_params.get('insightsToken')
611
+ if not image_insights_token:
612
+ return (False, 'Error getting token!')
613
+ image_insights_token = image_insights_token[0]
614
+
615
+ api_url = f'https://{api_host}/images/api/custom/knowledge'
616
+ api_headers = {
617
+ 'accept': '*/*',
618
+ 'accept-language': 'ja-JP;q=0.6,ja;q=0.5',
619
+ 'origin': 'https://www.bing.com',
620
+ 'referer': f'https://www.bing.com/images/search?view=detailV2&insightstoken={image_insights_token}',
621
+ 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
622
+ }
623
+ api_data_json = {
624
+ 'imageInfo': {'imageInsightsToken': image_insights_token, 'source': 'Url'},
625
+ 'knowledgeRequest': {'invokedSkills': ['OCR'], 'index': 1}
626
+ }
627
+ files = {
628
+ 'knowledgeRequest': (None, json.dumps(api_data_json), 'application/json')
629
+ }
630
+
631
+ try:
632
+ res = self.requests_session.post(api_url, headers=api_headers, files=files, timeout=5)
633
+ except requests.exceptions.Timeout:
634
+ return (False, 'Request timeout!')
635
+ except requests.exceptions.ConnectionError:
636
+ return (False, 'Connection error!')
637
+
638
+ if res.status_code != 200:
639
+ return (False, 'Unknown error!')
640
+
641
+ data = res.json()
642
+
643
+ res = ''
644
+ text_tag = None
645
+ for tag in data['tags']:
646
+ if tag.get('displayName') == '##TextRecognition':
647
+ text_tag = tag
648
+ break
649
+ if text_tag:
650
+ text_action = None
651
+ for action in text_tag['actions']:
652
+ if action.get('_type') == 'ImageKnowledge/TextRecognitionAction':
653
+ text_action = action
654
+ break
655
+ if text_action:
656
+ regions = text_action['data'].get('regions', [])
657
+ for region in regions:
658
+ for line in region.get('lines', []):
659
+ res += line['text'] + '\n'
660
+
661
+ x = (True, res)
662
+
663
+ # img.close()
664
+ return x
665
+
666
+ def _preprocess(self, img):
667
+ max_pixel_size = 4000
668
+ max_byte_size = 767772
669
+ res = None
670
+
671
+ if any(x > max_pixel_size for x in img.size):
672
+ resize_factor = max(max_pixel_size / img.width, max_pixel_size / img.height)
673
+ new_w = int(img.width * resize_factor)
674
+ new_h = int(img.height * resize_factor)
675
+ img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
676
+
677
+ img_bytes, _ = limit_image_size(img, max_byte_size)
678
+
679
+ if img_bytes:
680
+ res = base64.b64encode(img_bytes).decode('utf-8')
681
+
682
+ return res
683
+
684
+ class AppleVision:
685
+ name = 'avision'
686
+ readable_name = 'Apple Vision'
687
+ key = 'a'
688
+ available = False
689
+
690
+ def __init__(self, lang='ja'):
691
+ if sys.platform != 'darwin':
692
+ logger.warning('Apple Vision is not supported on non-macOS platforms!')
693
+ elif int(platform.mac_ver()[0].split('.')[0]) < 13:
694
+ logger.warning('Apple Vision is not supported on macOS older than Ventura/13.0!')
695
+ else:
696
+ self.available = True
697
+ logger.info('Apple Vision ready')
698
+
699
+ def __call__(self, img, furigana_filter_sensitivity=0):
700
+ img, is_path = input_to_pil_image(img)
701
+ if not img:
702
+ return (False, 'Invalid image provided')
703
+
704
+ with objc.autorelease_pool():
705
+ req = Vision.VNRecognizeTextRequest.alloc().init()
706
+
707
+ req.setRevision_(Vision.VNRecognizeTextRequestRevision3)
708
+ req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
709
+ req.setUsesLanguageCorrection_(True)
710
+ req.setRecognitionLanguages_(['ja','en'])
711
+
712
+ handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(
713
+ self._preprocess(img), None
714
+ )
715
+
716
+ success = handler.performRequests_error_([req], None)
717
+ res = ''
718
+ if success[0]:
719
+ for result in req.results():
720
+ res += result.text() + '\n'
721
+ x = (True, res)
722
+ else:
723
+ x = (False, 'Unknown error!')
724
+
725
+ # img.close()
726
+ return x
727
+
728
+ def _preprocess(self, img):
729
+ return pil_image_to_bytes(img, 'tiff')
730
+
731
+
732
+ class AppleLiveText:
733
+ name = 'alivetext'
734
+ readable_name = 'Apple Live Text'
735
+ key = 'd'
736
+ available = False
737
+
738
+ def __init__(self, lang='ja'):
739
+ if sys.platform != 'darwin':
740
+ logger.warning('Apple Live Text is not supported on non-macOS platforms!')
741
+ elif int(platform.mac_ver()[0].split('.')[0]) < 13:
742
+ logger.warning('Apple Live Text is not supported on macOS older than Ventura/13.0!')
743
+ else:
744
+ app_info = NSBundle.mainBundle().infoDictionary()
745
+ app_info['LSBackgroundOnly'] = '1'
746
+ self.VKCImageAnalyzer = objc.lookUpClass('VKCImageAnalyzer')
747
+ self.VKCImageAnalyzerRequest = objc.lookUpClass('VKCImageAnalyzerRequest')
748
+ objc.registerMetaDataForSelector(
749
+ b'VKCImageAnalyzer',
750
+ b'processRequest:progressHandler:completionHandler:',
751
+ {
752
+ 'arguments': {
753
+ 3: {
754
+ 'callable': {
755
+ 'retval': {'type': b'v'},
756
+ 'arguments': {
757
+ 0: {'type': b'^v'},
758
+ 1: {'type': b'd'},
759
+ }
760
+ }
761
+ },
762
+ 4: {
763
+ 'callable': {
764
+ 'retval': {'type': b'v'},
765
+ 'arguments': {
766
+ 0: {'type': b'^v'},
767
+ 1: {'type': b'@'},
768
+ 2: {'type': b'@'},
769
+ }
770
+ }
771
+ }
772
+ }
773
+ }
774
+ )
775
+ self.available = True
776
+ logger.info('Apple Live Text ready')
777
+
778
+ def __call__(self, img, furigana_filter_sensitivity=0):
779
+ img, is_path = input_to_pil_image(img)
780
+ if not img:
781
+ return (False, 'Invalid image provided')
782
+
783
+ with objc.autorelease_pool():
784
+ analyzer = self.VKCImageAnalyzer.alloc().init()
785
+ req = self.VKCImageAnalyzerRequest.alloc().initWithImage_requestType_(self._preprocess(img), 1) #VKAnalysisTypeText
786
+ req.setLocales_(['ja','en'])
787
+ self.result = None
788
+ analyzer.processRequest_progressHandler_completionHandler_(req, lambda progress: None, self._process)
789
+
790
+ CFRunLoopRunInMode(kCFRunLoopDefaultMode, 10.0, False)
791
+
792
+ if self.result is None:
793
+ return (False, 'Unknown error!')
794
+ return (True, self.result)
795
+
796
+ def _process(self, analysis, error):
797
+ res = ''
798
+ lines = analysis.allLines()
799
+ if lines:
800
+ for line in lines:
801
+ res += line.string() + '\n'
802
+ self.result = res
803
+ CFRunLoopStop(CFRunLoopGetCurrent())
804
+
805
+ def _preprocess(self, img):
806
+ image_bytes = pil_image_to_bytes(img, 'tiff')
807
+ ns_data = NSData.dataWithBytes_length_(image_bytes, len(image_bytes))
808
+ ns_image = NSImage.alloc().initWithData_(ns_data)
809
+ return ns_image
810
+
811
+
812
+ class WinRTOCR:
813
+ name = 'winrtocr'
814
+ readable_name = 'WinRT OCR'
815
+ key = 'w'
816
+ available = False
817
+
818
+ def __init__(self, config={}, lang='ja'):
819
+ if sys.platform == 'win32':
820
+ if int(platform.release()) < 10:
821
+ logger.warning('WinRT OCR is not supported on Windows older than 10!')
822
+ elif 'winocr' not in sys.modules:
823
+ logger.warning('winocr not available, WinRT OCR will not work!')
824
+ else:
825
+ self.available = True
826
+ logger.info('WinRT OCR ready')
827
+ else:
828
+ try:
829
+ self.url = config['url']
830
+ self.available = True
831
+ logger.info('WinRT OCR ready')
832
+ except:
833
+ logger.warning('Error reading URL from config, WinRT OCR will not work!')
834
+
835
+ def __call__(self, img, furigana_filter_sensitivity=0):
836
+ img, is_path = input_to_pil_image(img)
837
+ if not img:
838
+ return (False, 'Invalid image provided')
839
+
840
+ if sys.platform == 'win32':
841
+ res = winocr.recognize_pil_sync(img, lang='ja')['text']
842
+ else:
843
+ params = {'lang': 'ja'}
844
+ try:
845
+ res = requests.post(self.url, params=params, data=self._preprocess(img), timeout=3)
846
+ except requests.exceptions.Timeout:
847
+ return (False, 'Request timeout!')
848
+ except requests.exceptions.ConnectionError:
849
+ return (False, 'Connection error!')
850
+
851
+ if res.status_code != 200:
852
+ return (False, 'Unknown error!')
853
+
854
+ res = res.json()['text']
855
+
856
+ x = (True, res)
857
+
858
+
859
+ # img.close()
860
+ return x
861
+
862
+ def _preprocess(self, img):
863
+ return pil_image_to_bytes(img, png_compression=1)
864
+
865
+ class OneOCR:
866
+ name = 'oneocr'
867
+ readable_name = 'OneOCR'
868
+ key = 'z'
869
+ available = False
870
+
871
+ def __init__(self, config={}, lang='ja'):
872
+ self.initial_lang = lang
873
+ self.regex = get_regex(lang)
874
+ if sys.platform == 'win32':
875
+ if int(platform.release()) < 10:
876
+ logger.warning('OneOCR is not supported on Windows older than 10!')
877
+ elif 'oneocr' not in sys.modules:
878
+ logger.warning('oneocr not available, OneOCR will not work!')
879
+ elif not os.path.exists(os.path.expanduser('~/.config/oneocr/oneocr.dll')):
880
+ logger.warning('OneOCR DLLs not found, please install OwOCR Dependencies via OCR Tab in GSM.')
881
+ else:
882
+ try:
883
+ logger.info(f'Loading OneOCR model')
884
+ self.model = oneocr.OcrEngine()
885
+ except RuntimeError as e:
886
+ logger.warning(f'{e}, OneOCR will not work!')
887
+ else:
888
+ self.available = True
889
+ logger.info('OneOCR ready')
890
+ else:
891
+ try:
892
+ self.url = config['url']
893
+ self.available = True
894
+ logger.info('OneOCR ready')
895
+ except:
896
+ logger.warning('Error reading URL from config, OneOCR will not work!')
897
+
898
+ def get_regex(self, lang):
899
+ if lang == "ja":
900
+ self.regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
901
+ elif lang == "zh":
902
+ self.regex = re.compile(r'[\u4E00-\u9FFF]')
903
+ elif lang == "ko":
904
+ self.regex = re.compile(r'[\uAC00-\uD7AF]')
905
+ elif lang == "ar":
906
+ self.regex = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
907
+ elif lang == "ru":
908
+ self.regex = re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]')
909
+ elif lang == "el":
910
+ self.regex = re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]')
911
+ elif lang == "he":
912
+ self.regex = re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
913
+ elif lang == "th":
914
+ self.regex = re.compile(r'[\u0E00-\u0E7F]')
915
+ else:
916
+ self.regex = re.compile(
917
+ r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')
918
+
919
+ def __call__(self, img, furigana_filter_sensitivity=0, return_coords=False, multiple_crop_coords=False, return_one_box=True):
920
+ lang = get_ocr_language()
921
+ if furigana_filter_sensitivity is not None:
922
+ furigana_filter_sensitivity = get_furigana_filter_sensitivity()
923
+ else:
924
+ furigana_filter_sensitivity = 0
925
+ if lang != self.initial_lang:
926
+ self.initial_lang = lang
927
+ self.regex = get_regex(lang)
928
+ img, is_path = input_to_pil_image(img)
929
+ if not img:
930
+ return (False, 'Invalid image provided')
931
+ if img.width < 51 or img.height < 51:
932
+ new_width = max(img.width, 51)
933
+ new_height = max(img.height, 51)
934
+ new_img = Image.new("RGBA", (new_width, new_height), (0, 0, 0, 0))
935
+ new_img.paste(img, ((new_width - img.width) // 2, (new_height - img.height) // 2))
936
+ img = new_img
937
+ crop_coords = None
938
+ crop_coords_list = []
939
+ if sys.platform == 'win32':
940
+ try:
941
+ ocr_resp = self.model.recognize_pil(img)
942
+ # if os.path.exists(os.path.expanduser("~/GSM/temp")):
943
+ # with open(os.path.join(os.path.expanduser("~/GSM/temp"), 'oneocr_response.json'), 'w',
944
+ # encoding='utf-8') as f:
945
+ # json.dump(ocr_resp, f, indent=4, ensure_ascii=False)
946
+ # print(json.dumps(ocr_resp))
947
+ filtered_lines = [line for line in ocr_resp['lines'] if self.regex.search(line['text'])]
948
+ x_coords = [line['bounding_rect'][f'x{i}'] for line in filtered_lines for i in range(1, 5)]
949
+ y_coords = [line['bounding_rect'][f'y{i}'] for line in filtered_lines for i in range(1, 5)]
950
+ if x_coords and y_coords:
951
+ crop_coords = (min(x_coords) - 5, min(y_coords) - 5, max(x_coords) + 5, max(y_coords) + 5)
952
+ # logger.info(filtered_lines)
953
+ res = ''
954
+ skipped = []
955
+ boxes = []
956
+ if furigana_filter_sensitivity > 0:
957
+ for line in filtered_lines:
958
+ x1, x2, x3, x4 = line['bounding_rect']['x1'], line['bounding_rect']['x2'], \
959
+ line['bounding_rect']['x3'], line['bounding_rect']['x4']
960
+ y1, y2, y3, y4 = line['bounding_rect']['y1'], line['bounding_rect']['y2'], \
961
+ line['bounding_rect']['y3'], line['bounding_rect']['y4']
962
+ width = max(x2 - x1, x3 - x4)
963
+ height = max(y3 - y1, y4 - y2)
964
+ if width > furigana_filter_sensitivity and height > furigana_filter_sensitivity:
965
+ res += line['text']
966
+ else:
967
+ skipped.extend(char for char in line['text'])
968
+ continue
969
+ res += '\n'
970
+ # logger.info(
971
+ # f"Skipped {len(skipped)} chars due to furigana filter sensitivity: {furigana_filter_sensitivity}")
972
+ # widths, heights = [], []
973
+ # for line in ocr_resp['lines']:
974
+ # for word in line['words']:
975
+ # if self.kana_kanji_regex.search(word['text']) is None:
976
+ # continue
977
+ # # x1, x2, x3, x4 = line['bounding_rect']['x1'], line['bounding_rect']['x2'], line['bounding_rect']['x3'], line['bounding_rect']['x4']
978
+ # # y1, y2, y3, y4 = line['bounding_rect']['y1'], line['bounding_rect']['y2'], line['bounding_rect']['y3'], line['bounding_rect']['y4']
979
+ # x1, x2, x3, x4 = word['bounding_rect']['x1'], word['bounding_rect']['x2'], \
980
+ # word['bounding_rect']['x3'], word['bounding_rect']['x4']
981
+ # y1, y2, y3, y4 = word['bounding_rect']['y1'], word['bounding_rect']['y2'], \
982
+ # word['bounding_rect']['y3'], word['bounding_rect']['y4']
983
+ # widths.append(max(x2 - x1, x3 - x4))
984
+ # heights.append(max(y2 - y1, y3 - y4))
985
+ #
986
+ #
987
+ # max_width = max(sorted(widths)[:-max(1, len(widths) // 10)]) if len(widths) > 1 else 0
988
+ # max_height = max(sorted(heights)[:-max(1, len(heights) // 10)]) if len(heights) > 1 else 0
989
+ #
990
+ # required_width = max_width * furigana_filter_sensitivity
991
+ # required_height = max_height * furigana_filter_sensitivity
992
+ # for line in ocr_resp['lines']:
993
+ # for word in line['words']:
994
+ # x1, x2, x3, x4 = word['bounding_rect']['x1'], word['bounding_rect']['x2'], \
995
+ # word['bounding_rect']['x3'], word['bounding_rect']['x4']
996
+ # y1, y2, y3, y4 = word['bounding_rect']['y1'], word['bounding_rect']['y2'], \
997
+ # word['bounding_rect']['y3'], word['bounding_rect']['y4']
998
+ # width = max(x2 - x1, x3 - x4)
999
+ # height = max(y2 - y1, y3 - y4)
1000
+ # if furigana_filter_sensitivity == 0 or width > required_width or height > required_height:
1001
+ # res += word['text']
1002
+ # else:
1003
+ # continue
1004
+ # res += '\n'
1005
+ else:
1006
+ res = ocr_resp['text']
1007
+
1008
+ if multiple_crop_coords:
1009
+ logger.info(f"Getting multiple crop coords for {len(filtered_lines)} lines")
1010
+ for line in filtered_lines:
1011
+ crop_coords_list.append(
1012
+ (line['bounding_rect']['x1'] - 5, line['bounding_rect']['y1'] - 5,
1013
+ line['bounding_rect']['x3'] + 5, line['bounding_rect']['y3'] + 5))
1014
+
1015
+ except RuntimeError as e:
1016
+ return (False, e)
1017
+ else:
1018
+ try:
1019
+ res = requests.post(self.url, data=self._preprocess(img), timeout=3)
1020
+ except requests.exceptions.Timeout:
1021
+ return (False, 'Request timeout!')
1022
+ except requests.exceptions.ConnectionError:
1023
+ return (False, 'Connection error!')
1024
+
1025
+ if res.status_code != 200:
1026
+ return (False, 'Unknown error!')
1027
+
1028
+ res = res.json()['text']
1029
+
1030
+ x = [True, res]
1031
+ if return_coords:
1032
+ x.append(filtered_lines)
1033
+ if multiple_crop_coords:
1034
+ x.append(crop_coords_list)
1035
+ if return_one_box:
1036
+ x.append(crop_coords)
1037
+ if is_path:
1038
+ img.close()
1039
+ return x
1040
+
1041
+ def _preprocess(self, img):
1042
+ return pil_image_to_bytes(img, png_compression=1)
1043
+
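OneOCR's return value grows with the flags passed in (on the Windows path): it starts as [success, text] and appends the filtered line dicts, the per-line crop boxes, and the merged crop box, in that order. A hedged sketch, assuming the engine initialized successfully; the blank test image is only a placeholder:

engine = OneOCR()
if engine.available and sys.platform == 'win32':
    result = engine(Image.new('RGB', (200, 60), 'white'),
                    return_coords=True, multiple_crop_coords=True, return_one_box=True)
    if result[0]:  # on failure a (False, message) pair is returned instead
        success, text, line_dicts, crop_boxes, merged_box = result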
1044
+ class AzureImageAnalysis:
1045
+ name = 'azure'
1046
+ readable_name = 'Azure Image Analysis'
1047
+ key = 'v'
1048
+ available = False
1049
+
1050
+ def __init__(self, config={}, lang='ja'):
1051
+ if 'azure.ai.vision.imageanalysis' not in sys.modules:
1052
+ logger.warning('azure-ai-vision-imageanalysis not available, Azure Image Analysis will not work!')
1053
+ else:
1054
+ logger.info(f'Parsing Azure credentials')
1055
+ try:
1056
+ self.client = ImageAnalysisClient(config['endpoint'], AzureKeyCredential(config['api_key']))
1057
+ self.available = True
1058
+ logger.info('Azure Image Analysis ready')
1059
+ except:
1060
+ logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')
1061
+
1062
+ def __call__(self, img, furigana_filter_sensitivity=0):
1063
+ img, is_path = input_to_pil_image(img)
1064
+ if not img:
1065
+ return (False, 'Invalid image provided')
1066
+
1067
+ try:
1068
+ read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
1069
+ except ServiceRequestError:
1070
+ return (False, 'Connection error!')
1071
+ except:
1072
+ return (False, 'Unknown error!')
1073
+
1074
+ res = ''
1075
+ if read_result.read:
1076
+ for block in read_result.read.blocks:
1077
+ for line in block.lines:
1078
+ res += line.text + '\n'
1079
+ else:
1080
+ return (False, 'Unknown error!')
1081
+
1082
+ x = (True, res)
1083
+
1084
+ # img.close()
1085
+ return x
1086
+
1087
+ def _preprocess(self, img):
1088
+ if any(x < 50 for x in img.size):
1089
+ resize_factor = max(50 / img.width, 50 / img.height)
1090
+ new_w = int(img.width * resize_factor)
1091
+ new_h = int(img.height * resize_factor)
1092
+ img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
1093
+
1094
+ return pil_image_to_bytes(img)
1095
+
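AzureImageAnalysis only needs an endpoint and key from an Azure AI Vision resource. A sketch with placeholder values (not real credentials):

azure = AzureImageAnalysis({
    'endpoint': 'https://<your-resource>.cognitiveservices.azure.com/',  # placeholder
    'api_key': '<your-azure-key>',                                       # placeholder
})
if azure.available:
    ok, text = azure(Image.new('RGB', (100, 100), 'white'))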
1096
+ class EasyOCR:
1097
+ name = 'easyocr'
1098
+ readable_name = 'EasyOCR'
1099
+ key = 'e'
1100
+ available = False
1101
+
1102
+ def __init__(self, config={'gpu': True}, lang='ja'):
1103
+ if 'easyocr' not in sys.modules:
1104
+ logger.warning('easyocr not available, EasyOCR will not work!')
1105
+ else:
1106
+ logger.info('Loading EasyOCR model')
1107
+ logging.getLogger('easyocr.easyocr').setLevel(logging.ERROR)
1108
+ self.model = easyocr.Reader(['ja','en'], gpu=config['gpu'])
1109
+ self.available = True
1110
+ logger.info('EasyOCR ready')
1111
+
1112
+ def __call__(self, img, furigana_filter_sensitivity=0):
1113
+ img, is_path = input_to_pil_image(img)
1114
+ if not img:
1115
+ return (False, 'Invalid image provided')
1116
+
1117
+ res = ''
1118
+ read_result = self.model.readtext(self._preprocess(img), detail=0)
1119
+ for text in read_result:
1120
+ res += text + '\n'
1121
+
1122
+ x = (True, res)
1123
+
1124
+ # img.close()
1125
+ return x
1126
+
1127
+ def _preprocess(self, img):
1128
+ return pil_image_to_numpy_array(img)
1129
+
1130
+ class RapidOCR:
1131
+ name = 'rapidocr'
1132
+ readable_name = 'RapidOCR'
1133
+ key = 'r'
1134
+ available = False
1135
+
1136
+ def __init__(self, lang='ja'):
1137
+ if 'rapidocr_onnxruntime' not in sys.modules:
1138
+ logger.warning('rapidocr_onnxruntime not available, RapidOCR will not work!')
1139
+ else:
1140
+ rapidocr_model_file = os.path.join(os.path.expanduser('~'),'.cache','rapidocr_japan_PP-OCRv4_rec_infer.onnx')
1141
+ if not os.path.isfile(rapidocr_model_file):
1142
+ logger.info('Downloading RapidOCR model ' + rapidocr_model_file)
1143
+ try:
1144
+ cache_folder = os.path.join(os.path.expanduser('~'),'.cache')
1145
+ if not os.path.isdir(cache_folder):
1146
+ os.makedirs(cache_folder)
1147
+ urllib.request.urlretrieve('https://github.com/AuroraWright/owocr/raw/master/rapidocr_japan_PP-OCRv4_rec_infer.onnx', rapidocr_model_file)
1148
+ except:
1149
+ logger.warning('Download failed. RapidOCR will not work!')
1150
+ return
1151
+
1152
+ logger.info('Loading RapidOCR model')
1153
+ self.model = ROCR(rec_model_path=rapidocr_model_file)
1154
+ logging.getLogger().setLevel(logging.ERROR)
1155
+ self.available = True
1156
+ logger.info('RapidOCR ready')
1157
+
1158
+ def __call__(self, img, furigana_filter_sensitivity=0):
1159
+ img, is_path = input_to_pil_image(img)
1160
+ if not img:
1161
+ return (False, 'Invalid image provided')
1162
+
1163
+ res = ''
1164
+ read_results, elapsed = self.model(self._preprocess(img))
1165
+ if read_results:
1166
+ for read_result in read_results:
1167
+ res += read_result[1] + '\n'
1168
+
1169
+ x = (True, res)
1170
+
1171
+ # img.close()
1172
+ return x
1173
+
1174
+ def _preprocess(self, img):
1175
+ return pil_image_to_numpy_array(img)
1176
+
1177
+ class OCRSpace:
1178
+ name = 'ocrspace'
1179
+ readable_name = 'OCRSpace'
1180
+ key = 'o'
1181
+ available = False
1182
+
1183
+ def __init__(self, config={}, lang='ja'):
1184
+ try:
1185
+ self.api_key = config['api_key']
1186
+ self.max_byte_size = config.get('file_size_limit', 1000000)
1187
+ self.available = True
1188
+ logger.info('OCRSpace ready')
1189
+ except:
1190
+ logger.warning('Error reading API key from config, OCRSpace will not work!')
1191
+
1192
+ def __call__(self, img, furigana_filter_sensitivity=0):
1193
+ img, is_path = input_to_pil_image(img)
1194
+ if not img:
1195
+ return (False, 'Invalid image provided')
1196
+
1197
+ img_bytes, img_extension = self._preprocess(img)
1198
+ if not img_bytes:
1199
+ return (False, 'Image is too big!')
1200
+
1201
+ data = {
1202
+ 'apikey': self.api_key,
1203
+ 'language': 'jpn'
1204
+ }
1205
+ files = {'file': ('image.' + img_extension, img_bytes, 'image/' + img_extension)}
1206
+
1207
+ try:
1208
+ res = requests.post('https://api.ocr.space/parse/image', data=data, files=files, timeout=5)
1209
+ except requests.exceptions.Timeout:
1210
+ return (False, 'Request timeout!')
1211
+ except requests.exceptions.ConnectionError:
1212
+ return (False, 'Connection error!')
1213
+
1214
+ if res.status_code != 200:
1215
+ return (False, 'Unknown error!')
1216
+
1217
+ res = res.json()
1218
+
1219
+ if isinstance(res, str):
1220
+ return (False, 'Unknown error!')
1221
+ if res['IsErroredOnProcessing']:
1222
+ return (False, res['ErrorMessage'])
1223
+
1224
+ res = res['ParsedResults'][0]['ParsedText']
1225
+ x = (True, res)
1226
+
1227
+ # img.close()
1228
+ return x
1229
+
1230
+ def _preprocess(self, img):
1231
+ return limit_image_size(img, self.max_byte_size)
1232
+
1233
+
1234
+ class GeminiOCR:
1235
+ name = 'gemini'
1236
+ readable_name = 'Gemini'
1237
+ key = 'm'
1238
+ available = False
1239
+
1240
+ def __init__(self, config={'api_key': None}, lang='ja'):
1241
+ # if "google-generativeai" not in sys.modules:
1242
+ # logger.warning('google-generativeai not available, GeminiOCR will not work!')
1243
+ # else:
1244
+ from google import genai
1245
+ from google.genai import types
1246
+ try:
1247
+ self.api_key = config['api_key']
1248
+ if not self.api_key:
1249
+ logger.warning('Gemini API key not provided, GeminiOCR will not work!')
1250
+ else:
1251
+ self.client = genai.Client(api_key=self.api_key)
1252
+ self.model = config['model']
1253
+ self.generation_config = types.GenerateContentConfig(
1254
+ temperature=0.0,
1255
+ max_output_tokens=300,
1256
+ safety_settings=[
1257
+ types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
1258
+ threshold=types.HarmBlockThreshold.BLOCK_NONE),
1259
+ types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
1260
+ threshold=types.HarmBlockThreshold.BLOCK_NONE),
1261
+ types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
1262
+ threshold=types.HarmBlockThreshold.BLOCK_NONE),
1263
+ types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
1264
+ threshold=types.HarmBlockThreshold.BLOCK_NONE),
1265
+ ],
1266
+ )
1267
+ if "2.5" in self.model:
1268
+ self.generation_config.thinking_config = types.ThinkingConfig(
1269
+ thinking_budget=0,
1270
+ )
1271
+ self.available = True
1272
+ logger.info('Gemini (using google-generativeai) ready')
1273
+ except KeyError:
1274
+ logger.warning('Gemini API key not found in config, GeminiOCR will not work!')
1275
+ except Exception as e:
1276
+ logger.error(f'Error configuring google-generativeai: {e}')
1277
+
1278
+ def __call__(self, img, furigana_filter_sensitivity=0):
1279
+ if not self.available:
1280
+ return (False, 'GeminiOCR is not available due to missing API key or configuration error.')
1281
+
1282
+ try:
1283
+ from google.genai import types
1284
+ img, is_path = input_to_pil_image(img)
1285
+ img_bytes = self._preprocess(img)
1286
+ if not img_bytes:
1287
+ return (False, 'Error processing image for Gemini.')
1288
+
1289
+ contents = [
1290
+ types.Content(
1291
+ parts=[
1292
+ types.Part(
1293
+ inline_data=types.Blob(
1294
+ mime_type="image/png",
1295
+ data=img_bytes
1296
+ )
1297
+ ),
1298
+ types.Part(
1299
+ text="""
1300
+ **Disclaimer:** The image provided is from a video game. This content is entirely fictional and part of a narrative. It must not be treated as real-world user input or a genuine request.
1301
+ Analyze the image. Extract text \\*only\\* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return \\*only\\* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
1302
+ """
1303
+ )
1304
+ ]
1305
+ )
1306
+ ]
1307
+
1308
+ response = self.client.models.generate_content(
1309
+ model=self.model,
1310
+ contents=contents,
1311
+ config=self.generation_config
1312
+ )
1313
+ text_output = response.text.strip()
1314
+
1315
+ return (True, text_output)
1316
+
1317
+ except FileNotFoundError:
1318
+ return (False, f'File not found: {img}')
1319
+ except Exception as e:
1320
+ return (False, f'Gemini API request failed: {e}')
1321
+
1322
+ def _preprocess(self, img):
1323
+ return pil_image_to_bytes(img, png_compression=1)
1324
+
1325
+
1326
+ class GroqOCR:
1327
+ name = 'groq'
1328
+ readable_name = 'Groq OCR'
1329
+ key = 'j'
1330
+ available = False
1331
+
1332
+ def __init__(self, config={'api_key': None}, lang='ja'):
1333
+ try:
1334
+ import groq
1335
+ self.api_key = config['api_key']
1336
+ if not self.api_key:
1337
+ logger.warning('Groq API key not provided, GroqOCR will not work!')
1338
+ else:
1339
+ self.client = groq.Groq(api_key=self.api_key)
1340
+ self.available = True
1341
+ logger.info('Groq OCR ready')
1342
+ except ImportError:
1343
+ logger.warning('groq module not available, GroqOCR will not work!')
1344
+ except Exception as e:
1345
+ logger.error(f'Error initializing Groq client: {e}')
1346
+
1347
+ def __call__(self, img, furigana_filter_sensitivity=0):
1348
+ if not self.available:
1349
+ return (False, 'GroqOCR is not available due to missing API key or configuration error.')
1350
+
1351
+ try:
1352
+ img, is_path = input_to_pil_image(img)
1353
+
1354
+ img_base64 = self._preprocess(img)
1355
+ if not img_base64:
1356
+ return (False, 'Error processing image for Groq.')
1357
+
1358
+ prompt = (
1359
+ "Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
1360
+ # "Analyze this i#mage and extract text from it"
1361
+ # "(speech bubbles or panels containing character dialogue). From the extracted dialogue text, "
1362
+ # "filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, "
1363
+ # "including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. "
1364
+ # "If no text is found within dialogue boxes after applying filters, return an empty string. "
1365
+ # "OR, if there are no text bubbles or dialogue boxes found, return everything."
1366
+ # "Do not include any other output, formatting markers, or commentary, only the text from the image."
1367
+ )
1368
+
1369
+ response = self.client.chat.completions.create(
1370
+ model="meta-llama/llama-4-scout-17b-16e-instruct",
1371
+ messages=[
1372
+ {
1373
+ "role": "user",
1374
+ "content": [
1375
+ {"type": "text", "text": prompt},
1376
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
1377
+ ],
1378
+ }
1379
+ ],
1380
+ max_tokens=300,
1381
+ temperature=0.0
1382
+ )
1383
+
1384
+ if response.choices and response.choices[0].message.content:
1385
+ text_output = response.choices[0].message.content.strip()
1386
+ return (True, text_output)
1387
+ else:
1388
+ return (True, "")
1389
+
1390
+ except FileNotFoundError:
1391
+ return (False, f'File not found: {img}')
1392
+ except Exception as e:
1393
+ return (False, f'Groq API request failed: {e}')
1394
+
1395
+ def _preprocess(self, img):
1396
+ return base64.b64encode(pil_image_to_bytes(img, png_compression=1)).decode('utf-8')
1397
+
1398
+
1399
+ # OpenAI-Compatible Endpoint OCR using LM Studio
1400
+ class localLLMOCR:
1401
+ name= 'local_llm_ocr'
1402
+ readable_name = 'Local LLM OCR'
1403
+ key = 'a'
1404
+ available = False
1405
+ last_ocr_time = time.time() - 5
1406
+
1407
+ def __init__(self, config={}, lang='ja'):
1408
+ self.keep_llm_hot_thread = None
1409
+ # All three config values are required: url, model, api_key
1410
+ if not config or not (config.get('url') and config.get('model') and config.get('api_key')):
1411
+ logger.warning('Local LLM OCR requires url, model, and api_key in config, Local LLM OCR will not work!')
1412
+ return
1413
+
1414
+ try:
1415
+ import openai
1416
+ except ImportError:
1417
+ logger.warning('openai module not available, Local LLM OCR will not work!')
1418
+ return
1419
+ import threading
1420
+ try:
1421
+ self.api_url = config.get('url', 'http://localhost:1234/v1/chat/completions')
1422
+ self.model = config.get('model', 'qwen2.5-vl-3b-instruct')
1423
+ self.api_key = config.get('api_key', 'lm-studio')
1424
+ self.keep_warm = config.get('keep_warm', True)
1425
+ self.custom_prompt = config.get('prompt', None)
1426
+ self.available = True
1427
+ # if any(x in self.api_url for x in ['localhost', '127.0.0.1']):
1428
+ # if not self.check_connection(self.api_url):
1429
+ # logger.warning('Local LLM OCR API is not reachable')
1430
+ # return
1431
+ self.client = openai.OpenAI(
1432
+ base_url=self.api_url.replace('/v1/chat/completions', '/v1'),
1433
+ api_key=self.api_key
1434
+ )
1435
+ if self.client.models.retrieve(self.model):
1436
+ self.model = self.model
1437
+ logger.info(f'Local LLM OCR (OpenAI-compatible) ready with model {self.model}')
1438
+ if self.keep_warm:
1439
+ self.keep_llm_hot_thread = threading.Thread(target=self.keep_llm_warm, daemon=True)
1440
+ self.keep_llm_hot_thread.start()
1441
+ except Exception as e:
1442
+ logger.warning(f'Error initializing Local LLM OCR, Local LLM OCR will not work!')
1443
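For reference, a sketch of the config dict this constructor reads. The key names come from the config.get calls above; the values shown are the in-code defaults and are illustrative, not a confirmed GSM configuration.

    # 'url', 'model', and 'api_key' are required; 'keep_warm' and 'prompt' are optional.
    local_llm_config = {
        'url': 'http://localhost:1234/v1/chat/completions',
        'model': 'qwen2.5-vl-3b-instruct',
        'api_key': 'lm-studio',
        'keep_warm': True,   # background thread keeps the model loaded
        'prompt': None,      # optional override of the default OCR prompt
    }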
+
+     def check_connection(self, url, port=None):
+         import http.client
+         conn = http.client.HTTPConnection(url, port or 1234, timeout=0.1)
+         try:
+             conn.request("GET", "/v1/models")
+             response = conn.getresponse()
+             if response.status == 200:
+                 logger.info('Local LLM OCR API is reachable')
+                 return True
+             else:
+                 logger.warning('Local LLM OCR API is not reachable')
+                 return False
+         except Exception as e:
+             logger.warning(f'Error connecting to Local LLM OCR API: {e}')
+             return False
+         finally:
+             conn.close()
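Note that http.client.HTTPConnection expects a bare host name (plus an optional port) rather than a full URL, so a caller would need to strip the scheme and path from self.api_url first; the only in-tree call site is commented out in __init__ above. A hypothetical direct call:

    # Hypothetical call; 'localhost' and 1234 mirror the defaults used elsewhere in this class.
    if engine.check_connection('localhost', 1234):
        print('Local LLM OCR API is reachable')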
+
+     def keep_llm_warm(self):
+         def ocr_blank_black_image():
+             if self.last_ocr_time and (time.time() - self.last_ocr_time) < 5:
+                 return
+             import numpy as np
+             from PIL import Image
+             # Create a blank black image
+             blank_image = Image.fromarray(np.zeros((100, 100, 3), dtype=np.uint8))
+             logger.info('Keeping local LLM OCR warm with a blank black image')
+             self(blank_image)
+
+         while True:
+             ocr_blank_black_image()
+             time.sleep(5)
+
+     def __call__(self, img, furigana_filter_sensitivity=0):
+         import base64
+         try:
+             img, is_path = input_to_pil_image(img)
+             img_bytes = pil_image_to_bytes(img)
+             img_base64 = base64.b64encode(img_bytes).decode('utf-8')
+             if self.custom_prompt and self.custom_prompt.strip() != "":
+                 prompt = self.custom_prompt.strip()
+             else:
+                 prompt = f"""
+                 Extract all {CommonLanguages.from_code(get_ocr_language()).name} Text from Image. Ignore all Furigana. Do not return any commentary, just the text in the image. If there is no text in the image, return "" (Empty String).
+                 """
+
+             response = self.client.chat.completions.create(
+                 model=self.model,
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": [
+                             {"type": "text", "text": prompt},
+                             {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
+                         ],
+                     }
+                 ],
+                 max_tokens=4096,
+                 temperature=0.1
+             )
+             self.last_ocr_time = time.time()
+             if response.choices and response.choices[0].message.content:
+                 text_output = response.choices[0].message.content.strip()
+                 return (True, text_output)
+             else:
+                 return (True, "")
+         except Exception as e:
+             return (False, f'Local LLM OCR request failed: {e}')
+
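Putting the class together, a minimal end-to-end sketch. It assumes an OpenAI-compatible server (such as LM Studio) is already serving a vision-capable model at the configured URL; local_llm_config is the illustrative dict sketched after __init__ above.

    from PIL import Image

    # Illustrative only; returns (True, text) on success or (False, error message) on failure.
    engine = localLLMOCR(config=local_llm_config)
    if engine.available:
        success, text = engine(Image.open('screenshot.png'))
        print(text if success else f'OCR failed: {text}')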
+ # class QWENOCR:
+ #     name = 'qwenv2'
+ #     readable_name = 'Qwen2-VL'
+ #     key = 'q'
+
+ #     # Class-level attributes for model and processor to ensure they are loaded only once
+ #     model = None
+ #     processor = None
+ #     device = None
+ #     available = False
+
+ #     @classmethod
+ #     def initialize(cls):
+ #         import torch
+ #         from transformers import AutoModelForImageTextToText, AutoProcessor
+ #         """
+ #         Class method to initialize the model. Call this once at the start of your application.
+ #         This prevents reloading the model on every instantiation.
+ #         """
+ #         if cls.model is not None:
+ #             logger.info('Qwen2-VL is already initialized.')
+ #             return
+
+ #         try:
+ #             if not torch.cuda.is_available():
+ #                 logger.warning("CUDA not available, Qwen2-VL will run on CPU, which will be very slow.")
+ #                 # You might want to prevent initialization on CPU entirely
+ #                 # raise RuntimeError("CUDA is required for efficient Qwen2-VL operation.")
+
+ #             cls.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ #             cls.model = AutoModelForImageTextToText.from_pretrained(
+ #                 "Qwen/Qwen2-VL-2B-Instruct",
+ #                 torch_dtype="auto",  # Uses bfloat16/float16 if available, which is faster
+ #                 device_map=cls.device
+ #             )
+ #             # For PyTorch 2.0+, torch.compile can significantly speed up inference after a warm-up call
+ #             # cls.model = torch.compile(cls.model)
+
+ #             cls.processor = AutoProcessor.from_pretrained(
+ #                 "Qwen/Qwen2-VL-2B-Instruct",
+ #                 use_fast=True
+ #             )
+
+ #             cls.available = True
+
+ #             conversation = [
+ #                 {
+ #                     "role": "user",
+ #                     "content": [
+ #                         {"type": "image"},
+ #                         {"type": "text", "text": "Extract all the text from this image, ignore all furigana."},
+ #                     ],
+ #                 }
+ #             ]
+
+ #             # The same prompt is applied to all images in the batch
+ #             cls.text_prompt = cls.processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+ #             logger.info(f'Qwen2.5-VL ready on device: {cls.device}')
+ #         except Exception as e:
+ #             logger.warning(f'Qwen2-VL not available: {e}')
+ #             cls.available = False
+
+ #     def __init__(self, config={}, lang='ja'):
+ #         # The __init__ is now very lightweight. It just checks if initialization has happened.
+ #         if not self.available:
+ #             raise RuntimeError("QWENOCR has not been initialized. Call QWENOCR.initialize() first.")
+
+ #     def __call__(self, images):
+ #         """
+ #         Processes a single image or a list of images.
+ #         :param images: A single image (path or PIL.Image) or a list of images.
+ #         :return: A tuple (success, list_of_results)
+ #         """
+ #         if not self.available:
+ #             return (False, ['Qwen2-VL is not available.'])
+
+ #         try:
+ #             # Standardize input to be a list
+ #             if not isinstance(images, list):
+ #                 images = [images]
+
+ #             pil_images = [input_to_pil_image(img)[0] for img in images]
+
+ #             # The processor handles batching of images and text prompts
+ #             inputs = self.processor(
+ #                 text=[self.text_prompt] * len(pil_images),
+ #                 images=pil_images,
+ #                 padding=True,
+ #                 return_tensors="pt"
+ #             ).to(self.device)
+
+ #             output_ids = self.model.generate(**inputs, max_new_tokens=32)
+
+ #             # The decoding logic needs to be slightly adjusted for batching
+ #             input_ids_len = [len(x) for x in inputs.input_ids]
+ #             generated_ids = [
+ #                 output_ids[i][input_ids_len[i]:] for i in range(len(input_ids_len))
+ #             ]
+
+ #             output_text = self.processor.batch_decode(
+ #                 generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
+ #             )
+
+ #             return (True, output_text)
+ #         except Exception as e:
+ #             return (False, [f'Qwen2-VL inference failed: {e}'])
+
+
+ # QWENOCR.initialize()
+ # qwenocr = QWENOCR()
+
+ # localOCR = localLLMOCR(config={'url': 'http://localhost:1234/v1/chat/completions', 'model': 'qwen2.5-vl-3b-instruct', 'api_key': 'lm-studio'})
+
+ # for i in range(10):
+ #     start_time = time.time()
+ #     res, text = localOCR(Image.open(r"C:\Users\Beangate\GSM\GameSentenceMiner\GameSentenceMiner\owocr\owocr\test_furigana.png"))  # Example usage
+ #     end_time = time.time()
+
+ #     print(f"Time taken: {end_time - start_time:.2f} seconds")
+ #     print(text)
+ # class LocalOCR:
+ #     name = 'local_ocr'
+ #     readable_name = 'Local OCR'
+ #     key = '-'
+ #     available = False
+ #
+ #     def __init__(self, lang='ja'):
+ #         self.requests_session = requests.Session()
+ #         self.available = True
+ #         # logger.info('Local OCR ready')  # Uncomment if you have a logger defined
+ #
+ #     def __call__(self, img, furigana_filter_sensitivity=0):
+ #         if not isinstance(img, Image.Image):
+ #             try:
+ #                 img = Image.open(io.BytesIO(img))
+ #             except Exception:
+ #                 return (False, 'Invalid image provided')
+ #
+ #         img = input_to_pil_image(img)
+ #
+ #         img_base64 = self._preprocess(img)
+ #         if not img_base64:
+ #             return (False, 'Image preprocessing failed (e.g., too big after resize)!')
+ #
+ #         api_url = 'http://localhost:2333/api/ocr'
+ #         # Send as JSON with base64 encoded image
+ #         json_data = {
+ #             'image': img_base64
+ #         }
+ #
+ #         try:
+ #             res = self.requests_session.post(api_url, json=json_data, timeout=5)
+ #             print(res.content)
+ #         except requests.exceptions.Timeout:
+ #             return (False, 'Request timeout!')
+ #         except requests.exceptions.ConnectionError:
+ #             return (False, 'Connection error!')
+ #
+ #         if res.status_code != 200:
+ #             return (False, f'Error: {res.status_code} - {res.text}')
+ #
+ #         try:
+ #             data = res.json()
+ #             # Assuming the local OCR service returns text in a 'text' key
+ #             extracted_text = data.get('text', '')
+ #             return (True, extracted_text)
+ #         except requests.exceptions.JSONDecodeError:
+ #             return (False, 'Invalid JSON response from OCR service!')
+ #
+ #     def _preprocess(self, img):
+ #         return base64.b64encode(pil_image_to_bytes(img, png_compression=1)).decode('utf-8')
+
+ # lens = GeminiOCR(config={'model': 'gemini-2.5-flash-lite-preview-06-17', 'api_key': ''})
+ #
+ # res, text = lens(Image.open('test_furigana.png'))  # Example usage
+ #
+ # print(text)