GameSentenceMiner 2.14.7__py3-none-any.whl → 2.14.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- GameSentenceMiner/config_gui.py +19 -10
- GameSentenceMiner/gsm.py +68 -8
- GameSentenceMiner/locales/en_us.json +4 -0
- GameSentenceMiner/locales/ja_jp.json +4 -0
- GameSentenceMiner/locales/zh_cn.json +4 -0
- GameSentenceMiner/obs.py +12 -8
- {gamesentenceminer-2.14.7.dist-info → gamesentenceminer-2.14.9.dist-info}/METADATA +1 -2
- gamesentenceminer-2.14.9.dist-info/RECORD +24 -0
- GameSentenceMiner/ai/__init__.py +0 -0
- GameSentenceMiner/ai/ai_prompting.py +0 -473
- GameSentenceMiner/ocr/__init__.py +0 -0
- GameSentenceMiner/ocr/gsm_ocr_config.py +0 -174
- GameSentenceMiner/ocr/ocrconfig.py +0 -129
- GameSentenceMiner/ocr/owocr_area_selector.py +0 -629
- GameSentenceMiner/ocr/owocr_helper.py +0 -638
- GameSentenceMiner/ocr/ss_picker.py +0 -140
- GameSentenceMiner/owocr/owocr/__init__.py +0 -1
- GameSentenceMiner/owocr/owocr/__main__.py +0 -9
- GameSentenceMiner/owocr/owocr/config.py +0 -148
- GameSentenceMiner/owocr/owocr/lens_betterproto.py +0 -1238
- GameSentenceMiner/owocr/owocr/ocr.py +0 -1691
- GameSentenceMiner/owocr/owocr/run.py +0 -1817
- GameSentenceMiner/owocr/owocr/screen_coordinate_picker.py +0 -109
- GameSentenceMiner/tools/__init__.py +0 -0
- GameSentenceMiner/tools/audio_offset_selector.py +0 -215
- GameSentenceMiner/tools/ss_selector.py +0 -135
- GameSentenceMiner/tools/window_transparency.py +0 -214
- GameSentenceMiner/util/__init__.py +0 -0
- GameSentenceMiner/util/communication/__init__.py +0 -22
- GameSentenceMiner/util/communication/send.py +0 -7
- GameSentenceMiner/util/communication/websocket.py +0 -94
- GameSentenceMiner/util/configuration.py +0 -1198
- GameSentenceMiner/util/db.py +0 -408
- GameSentenceMiner/util/downloader/Untitled_json.py +0 -472
- GameSentenceMiner/util/downloader/__init__.py +0 -0
- GameSentenceMiner/util/downloader/download_tools.py +0 -194
- GameSentenceMiner/util/downloader/oneocr_dl.py +0 -250
- GameSentenceMiner/util/electron_config.py +0 -259
- GameSentenceMiner/util/ffmpeg.py +0 -571
- GameSentenceMiner/util/get_overlay_coords.py +0 -366
- GameSentenceMiner/util/gsm_utils.py +0 -323
- GameSentenceMiner/util/model.py +0 -206
- GameSentenceMiner/util/notification.py +0 -147
- GameSentenceMiner/util/text_log.py +0 -214
- GameSentenceMiner/web/__init__.py +0 -0
- GameSentenceMiner/web/service.py +0 -132
- GameSentenceMiner/web/static/__init__.py +0 -0
- GameSentenceMiner/web/static/apple-touch-icon.png +0 -0
- GameSentenceMiner/web/static/favicon-96x96.png +0 -0
- GameSentenceMiner/web/static/favicon.ico +0 -0
- GameSentenceMiner/web/static/favicon.svg +0 -3
- GameSentenceMiner/web/static/site.webmanifest +0 -21
- GameSentenceMiner/web/static/style.css +0 -292
- GameSentenceMiner/web/static/web-app-manifest-192x192.png +0 -0
- GameSentenceMiner/web/static/web-app-manifest-512x512.png +0 -0
- GameSentenceMiner/web/templates/__init__.py +0 -0
- GameSentenceMiner/web/templates/index.html +0 -50
- GameSentenceMiner/web/templates/text_replacements.html +0 -238
- GameSentenceMiner/web/templates/utility.html +0 -483
- GameSentenceMiner/web/texthooking_page.py +0 -584
- GameSentenceMiner/wip/__init___.py +0 -0
- gamesentenceminer-2.14.7.dist-info/RECORD +0 -77
- {gamesentenceminer-2.14.7.dist-info → gamesentenceminer-2.14.9.dist-info}/WHEEL +0 -0
- {gamesentenceminer-2.14.7.dist-info → gamesentenceminer-2.14.9.dist-info}/entry_points.txt +0 -0
- {gamesentenceminer-2.14.7.dist-info → gamesentenceminer-2.14.9.dist-info}/licenses/LICENSE +0 -0
- {gamesentenceminer-2.14.7.dist-info → gamesentenceminer-2.14.9.dist-info}/top_level.txt +0 -0
--- a/GameSentenceMiner/owocr/owocr/ocr.py
+++ /dev/null
@@ -1,1691 +0,0 @@
-import re
-import os
-import io
-import time
-from pathlib import Path
-import sys
-import platform
-import logging
-from math import sqrt, floor
-import json
-import base64
-from urllib.parse import urlparse, parse_qs
-
-import numpy as np
-import rapidfuzz.fuzz
-from PIL import Image
-from loguru import logger
-import requests
-
-
-try:
-    from GameSentenceMiner.util.electron_config import get_ocr_language, get_furigana_filter_sensitivity
-    from GameSentenceMiner.util.configuration import CommonLanguages
-except ImportError:
-    pass
-
-# from GameSentenceMiner.util.configuration import get_temporary_directory
-
-try:
-    from manga_ocr import MangaOcr as MOCR
-except ImportError:
-    pass
-
-try:
-    import Vision
-    import objc
-    from AppKit import NSData, NSImage, NSBundle
-    from CoreFoundation import CFRunLoopRunInMode, kCFRunLoopDefaultMode, CFRunLoopStop, CFRunLoopGetCurrent
-except ImportError:
-    pass
-
-try:
-    from google.cloud import vision
-    from google.oauth2 import service_account
-    from google.api_core.exceptions import ServiceUnavailable
-except ImportError:
-    pass
-
-try:
-    from azure.ai.vision.imageanalysis import ImageAnalysisClient
-    from azure.ai.vision.imageanalysis.models import VisualFeatures
-    from azure.core.credentials import AzureKeyCredential
-    from azure.core.exceptions import ServiceRequestError
-except ImportError:
-    pass
-
-try:
-    import easyocr
-except ImportError:
-    pass
-
-try:
-    from rapidocr_onnxruntime import RapidOCR as ROCR
-    import urllib.request
-except ImportError:
-    pass
-
-try:
-    import winocr
-except ImportError:
-    pass
-
-try:
-    try:
-        if os.path.exists(os.path.expanduser('~/.config/oneocr/oneocr.dll')):
-            import oneocr
-    except Exception as e:
-        oneocr = None
-        logger.warning(f'Failed to import OneOCR: {e}', exc_info=True)
-except ImportError:
-    pass
-
-try:
-    import pyjson5
-except ImportError:
-    pass
-
-try:
-    import betterproto
-    from GameSentenceMiner.owocr.owocr.lens_betterproto import *
-    import random
-except ImportError:
-    pass
-
-try:
-    import fpng_py
-    optimized_png_encode = True
-except:
-    optimized_png_encode = False
-
-
-def empty_post_process(text):
-    return text
-
-
-def post_process(text, keep_blank_lines=False):
-    import jaconv
-    if keep_blank_lines:
-        text = '\n'.join([''.join(i.split()) for i in text.splitlines()])
-    else:
-        text = ''.join([''.join(i.split()) for i in text.splitlines()])
-    text = text.replace('…', '...')
-    text = re.sub('[・.]{2,}', lambda x: (x.end() - x.start()) * '.', text)
-    text = jaconv.h2z(text, ascii=True, digit=True)
-    return text
-
-
-def input_to_pil_image(img):
-    is_path = False
-    if isinstance(img, Image.Image):
-        pil_image = img
-    elif isinstance(img, (bytes, bytearray)):
-        pil_image = Image.open(io.BytesIO(img))
-    elif isinstance(img, Path):
-        is_path = True
-        try:
-            pil_image = Image.open(img)
-            pil_image.load()
-        except (UnidentifiedImageError, OSError) as e:
-            return None
-    else:
-        raise ValueError(f'img must be a path, PIL.Image or bytes object, instead got: {img}')
-    return pil_image, is_path
-
-
-def pil_image_to_bytes(img, img_format='png', png_compression=6, jpeg_quality=80, optimize=False):
-    if img_format == 'png' and optimized_png_encode and not optimize:
-        raw_data = img.convert('RGBA').tobytes()
-        image_bytes = fpng_py.fpng_encode_image_to_memory(raw_data, img.width, img.height)
-    else:
-        image_bytes = io.BytesIO()
-        if img_format == 'jpeg':
-            img = img.convert('RGB')
-        img.save(image_bytes, format=img_format, compress_level=png_compression, quality=jpeg_quality, optimize=optimize, subsampling=0)
-        image_bytes = image_bytes.getvalue()
-    return image_bytes
-
-
-def pil_image_to_numpy_array(img):
-    return np.array(img.convert('RGBA'))
-
-
-def limit_image_size(img, max_size):
-    img_bytes = pil_image_to_bytes(img)
-    if len(img_bytes) <= max_size:
-        return img_bytes, 'png'
-
-    scaling_factor = 0.60 if any(x > 2000 for x in img.size) else 0.75
-    new_w = int(img.width * scaling_factor)
-    new_h = int(img.height * scaling_factor)
-    resized_img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
-    resized_img_bytes = pil_image_to_bytes(resized_img)
-    if len(resized_img_bytes) <= max_size:
-        return resized_img_bytes, 'png'
-
-    for _ in range(2):
-        jpeg_quality = 80
-        while jpeg_quality >= 60:
-            img_bytes = pil_image_to_bytes(img, 'jpeg', jpeg_quality=jpeg_quality, optimize=True)
-            if len(img_bytes) <= max_size:
-                return img_bytes, 'jpeg'
-            jpeg_quality -= 5
-        img = resized_img
-
-    return False, ''
-
-
-def get_regex(lang):
-    if lang == "ja":
-        return re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
-    elif lang == "zh":
-        return re.compile(r'[\u4E00-\u9FFF]')
-    elif lang == "ko":
-        return re.compile(r'[\uAC00-\uD7AF]')
-    elif lang == "ar":
-        return re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
-    elif lang == "ru":
-        return re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]')
-    elif lang == "el":
-        return re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]')
-    elif lang == "he":
-        return re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
-    elif lang == "th":
-        return re.compile(r'[\u0E00-\u0E7F]')
-    else:
-        return re.compile(
-            r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')
-
-
-class MangaOcr:
-    name = 'mangaocr'
-    readable_name = 'Manga OCR'
-    key = 'm'
-    available = False
-
-    def __init__(self, config={'pretrained_model_name_or_path':'kha-white/manga-ocr-base','force_cpu': False}, lang='ja'):
-        if 'manga_ocr' not in sys.modules:
-            logger.warning('manga-ocr not available, Manga OCR will not work!')
-        else:
-            logger.disable('manga_ocr')
-            logging.getLogger('transformers').setLevel(logging.ERROR) # silence transformers >=4.46 warnings
-            from manga_ocr import ocr
-            ocr.post_process = empty_post_process
-            logger.info(f'Loading Manga OCR model')
-            self.model = MOCR(config['pretrained_model_name_or_path'], config['force_cpu'])
-            self.available = True
-            logger.info('Manga OCR ready')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        x = (True, self.model(img))
-
-        # img.close()
-        return x
-
-class GoogleVision:
-    name = 'gvision'
-    readable_name = 'Google Vision'
-    key = 'g'
-    available = False
-
-    def __init__(self, lang='ja'):
-        if 'google.cloud' not in sys.modules:
-            logger.warning('google-cloud-vision not available, Google Vision will not work!')
-        else:
-            logger.info(f'Parsing Google credentials')
-            google_credentials_file = os.path.join(os.path.expanduser('~'),'.config','google_vision.json')
-            try:
-                google_credentials = service_account.Credentials.from_service_account_file(google_credentials_file)
-                self.client = vision.ImageAnnotatorClient(credentials=google_credentials)
-                self.available = True
-                logger.info('Google Vision ready')
-            except:
-                logger.warning('Error parsing Google credentials, Google Vision will not work!')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        image_bytes = self._preprocess(img)
-        image = vision.Image(content=image_bytes)
-        try:
-            response = self.client.text_detection(image=image)
-        except ServiceUnavailable:
-            return (False, 'Connection error!')
-        except:
-            return (False, 'Unknown error!')
-        texts = response.text_annotations
-        res = texts[0].description if len(texts) > 0 else ''
-        x = (True, res)
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        return pil_image_to_bytes(img)
-
-class GoogleLens:
-    name = 'glens'
-    readable_name = 'Google Lens'
-    key = 'l'
-    available = False
-
-    def __init__(self, lang='ja'):
-        import regex
-        self.regex = get_regex(lang)
-        self.initial_lang = lang
-        self.punctuation_regex = regex.compile(r'[\p{P}\p{S}]')
-        if 'betterproto' not in sys.modules:
-            logger.warning('betterproto not available, Google Lens will not work!')
-        else:
-            self.available = True
-            logger.info('Google Lens ready')
-
-    def __call__(self, img, furigana_filter_sensitivity=0, return_coords=False):
-        if furigana_filter_sensitivity != None:
-            furigana_filter_sensitivity = get_furigana_filter_sensitivity()
-        else:
-            furigana_filter_sensitivity = 0
-        lang = get_ocr_language()
-        img, is_path = input_to_pil_image(img)
-        if lang != self.initial_lang:
-            self.initial_lang = lang
-            self.regex = get_regex(lang)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        request = LensOverlayServerRequest()
-
-        request.objects_request.request_context.request_id.uuid = random.randint(0, 2**64 - 1)
-        request.objects_request.request_context.request_id.sequence_id = 0
-        request.objects_request.request_context.request_id.image_sequence_id = 0
-        request.objects_request.request_context.request_id.analytics_id = random.randbytes(16)
-        request.objects_request.request_context.request_id.routing_info = LensOverlayRoutingInfo()
-
-        request.objects_request.request_context.client_context.platform = Platform.WEB
-        request.objects_request.request_context.client_context.surface = Surface.CHROMIUM
-
-        request.objects_request.request_context.client_context.locale_context.language = 'ja'
-        request.objects_request.request_context.client_context.locale_context.region = 'Asia/Tokyo'
-        request.objects_request.request_context.client_context.locale_context.time_zone = '' # not set by chromium
-
-        request.objects_request.request_context.client_context.app_id = '' # not set by chromium
-
-        filter = AppliedFilter()
-        filter.filter_type = LensOverlayFilterType.AUTO_FILTER
-        request.objects_request.request_context.client_context.client_filters.filter.append(filter)
-
-        image_data = self._preprocess(img)
-        request.objects_request.image_data.payload.image_bytes = image_data[0]
-        request.objects_request.image_data.image_metadata.width = image_data[1]
-        request.objects_request.image_data.image_metadata.height = image_data[2]
-
-        payload = request.SerializeToString()
-
-        headers = {
-            'Host': 'lensfrontend-pa.googleapis.com',
-            'Connection': 'keep-alive',
-            'Content-Type': 'application/x-protobuf',
-            'X-Goog-Api-Key': 'AIzaSyDr2UxVnv_U85AbhhY8XSHSIavUW0DC-sY',
-            'Sec-Fetch-Site': 'none',
-            'Sec-Fetch-Mode': 'no-cors',
-            'Sec-Fetch-Dest': 'empty',
-            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
-            'Accept-Encoding': 'gzip, deflate, br, zstd',
-            'Accept-Language': 'ja-JP;q=0.6,ja;q=0.5'
-        }
-
-        try:
-            res = requests.post('https://lensfrontend-pa.googleapis.com/v1/crupload', data=payload, headers=headers, timeout=5)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if res.status_code != 200:
-            return (False, 'Unknown error!')
-
-        response_proto = LensOverlayServerResponse().FromString(res.content)
-        response_dict = response_proto.to_dict(betterproto.Casing.SNAKE)
-
-        if os.path.exists(r"C:\Users\Beangate\GSM\Electron App\test"):
-            with open(os.path.join(r"C:\Users\Beangate\GSM\Electron App\test", 'glens_response.json'), 'w', encoding='utf-8') as f:
-                json.dump(response_dict, f, indent=4, ensure_ascii=False)
-        res = ''
-        text = response_dict['objects_response']['text']
-        skipped = []
-        previous_line = None
-        if 'text_layout' in text:
-            for paragraph in text['text_layout']['paragraphs']:
-                if previous_line:
-                    prev_bbox = previous_line['geometry']['bounding_box']
-                    curr_bbox = paragraph['geometry']['bounding_box']
-                    vertical_space = abs(curr_bbox['center_y'] - prev_bbox['center_y']) * img.height
-                    prev_height = prev_bbox['height'] * img.height
-                    current_height = curr_bbox['height'] * img.height
-                    avg_height = (prev_height + current_height) / 2
-                    # If vertical space is close to previous line's height, add a blank line
-                    # logger.info(f"Vertical space: {vertical_space}, Average height: {avg_height}")
-                    # logger.info(avg_height * 2)
-                    if vertical_space > avg_height * 2:
-                        res += 'BLANK_LINE'
-                for line in paragraph['lines']:
-                    if furigana_filter_sensitivity:
-                        for word in line['words']:
-                            if not self.punctuation_regex.findall(word):
-                                continue
-                            if 'geometry' not in word:
-                                res += word['plain_text'] + word['text_separator']
-                                continue
-                            word_width = word['geometry']['bounding_box']['width'] * img.width
-                            word_height = word['geometry']['bounding_box']['height'] * img.height
-                            if word_width > furigana_filter_sensitivity and word_height > furigana_filter_sensitivity:
-                                res += word['plain_text'] + word['text_separator']
-                            else:
-                                skipped.extend(word['plain_text'])
-                                continue
-                    else:
-                        for word in line['words']:
-                            res += word['plain_text'] + word['text_separator']
-
-                previous_line = paragraph
-                res += '\n'
-        # logger.info(
-        #     f"Skipped {len(skipped)} chars due to furigana filter sensitivity: {furigana_filter_sensitivity}")
-        # widths = []
-        # heights = []
-        # if 'text_layout' in text:
-        #     paragraphs = text['text_layout']['paragraphs']
-        #     for paragraph in paragraphs:
-        #         for line in paragraph['lines']:
-        #             for word in line['words']:
-        #                 if self.kana_kanji_regex.search(word['plain_text']) is None:
-        #                     continue
-        #                 widths.append(word['geometry']['bounding_box']['width'])
-        #                 heights.append(word['geometry']['bounding_box']['height'])
-        #
-        # max_width = max(sorted(widths)[:-max(1, len(widths) // 10)]) if len(widths) > 1 else 0
-        # max_height = max(sorted(heights)[:-max(1, len(heights) // 10)]) if len(heights) > 1 else 0
-        #
-        # required_width = max_width * furigana_filter_sensitivity
-        # required_height = max_height * furigana_filter_sensitivity
-        #
-        # if 'text_layout' in text:
-        #     paragraphs = text['text_layout']['paragraphs']
-        #     for paragraph in paragraphs:
-        #         for line in paragraph['lines']:
-        #             if furigana_filter_sensitivity == 0 or line['geometry']['bounding_box']['width'] > required_width or line['geometry']['bounding_box']['height'] > required_height:
-        #                 for word in line['words']:
-        #                     res += word['plain_text'] + word['text_separator']
-        #             else:
-        #                 continue
-        #         res += '\n'
-        # else:
-        #     if 'text_layout' in text:
-        #         paragraphs = text['text_layout']['paragraphs']
-        #         for paragraph in paragraphs:
-        #             for line in paragraph['lines']:
-        #                 for word in line['words']:
-        #                     res += word['plain_text'] + word['text_separator']
-        #             else:
-        #                 continue
-        #             res += '\n'
-
-        if return_coords:
-            x = (True, res, response_dict)
-        else:
-            x = (True, res)
-
-        if skipped:
-            logger.info(f"Skipped {len(skipped)} chars due to furigana filter sensitivity: {furigana_filter_sensitivity}")
-            logger.debug(f"Skipped chars: {''.join(skipped)}")
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        if img.width * img.height > 3000000:
-            aspect_ratio = img.width / img.height
-            new_w = int(sqrt(3000000 * aspect_ratio))
-            new_h = int(new_w / aspect_ratio)
-            img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
-
-        return (pil_image_to_bytes(img), img.width, img.height)
-
-class GoogleLensWeb:
-    name = 'glensweb'
-    readable_name = 'Google Lens (web)'
-    key = 'k'
-    available = False
-
-    def __init__(self, lang='ja'):
-        if 'pyjson5' not in sys.modules:
-            logger.warning('pyjson5 not available, Google Lens (web) will not work!')
-        else:
-            self.requests_session = requests.Session()
-            self.available = True
-            logger.info('Google Lens (web) ready')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        url = 'https://lens.google.com/v3/upload'
-        files = {'encoded_image': ('image.png', self._preprocess(img), 'image/png')}
-        headers = {
-            'Host': 'lens.google.com',
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'ja-JP;q=0.6,ja;q=0.5',
-            'Accept-Encoding': 'gzip, deflate, br, zstd',
-            'Referer': 'https://www.google.com/',
-            'Origin': 'https://www.google.com',
-            'Alt-Used': 'lens.google.com',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'same-site',
-            'Priority': 'u=0, i',
-            'TE': 'trailers'
-        }
-        cookies = {'SOCS': 'CAESEwgDEgk0ODE3Nzk3MjQaAmVuIAEaBgiA_LyaBg'}
-
-        try:
-            res = self.requests_session.post(url, files=files, headers=headers, cookies=cookies, timeout=5, allow_redirects=False)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if res.status_code != 303:
-            return (False, 'Unknown error!')
-
-        redirect_url = res.headers.get('Location')
-        if not redirect_url:
-            return (False, 'Error getting redirect URL!')
-
-        parsed_url = urlparse(redirect_url)
-        query_params = parse_qs(parsed_url.query)
-
-        if ('vsrid' not in query_params) or ('gsessionid' not in query_params):
-            return (False, 'Unknown error!')
-
-        try:
-            res = self.requests_session.get(f"https://lens.google.com/qfmetadata?vsrid={query_params['vsrid'][0]}&gsessionid={query_params['gsessionid'][0]}", timeout=5)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if (len(res.text.splitlines()) != 3):
-            return (False, 'Unknown error!')
-
-        lens_object = pyjson5.loads(res.text.splitlines()[2])
-
-        res = ''
-        text = lens_object[0][2][0][0]
-        for paragraph in text:
-            for line in paragraph[1]:
-                for word in line[0]:
-                    res += word[1] + word[2]
-            res += '\n'
-
-        x = (True, res)
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        if img.width * img.height > 3000000:
-            aspect_ratio = img.width / img.height
-            new_w = int(sqrt(3000000 * aspect_ratio))
-            new_h = int(new_w / aspect_ratio)
-            img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
-
-        return pil_image_to_bytes(img)
-
-class Bing:
-    name = 'bing'
-    readable_name = 'Bing'
-    key = 'b'
-    available = False
-
-    def __init__(self, lang='ja'):
-        self.requests_session = requests.Session()
-        self.available = True
-        logger.info('Bing ready')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        img_bytes = self._preprocess(img)
-        if not img_bytes:
-            return (False, 'Image is too big!')
-
-        upload_url = 'https://www.bing.com/images/search?view=detailv2&iss=sbiupload'
-        upload_headers = {
-            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-            'accept-language': 'ja-JP;q=0.6,ja;q=0.5',
-            'cache-control': 'max-age=0',
-            'origin': 'https://www.bing.com',
-            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
-        }
-        files = {
-            'imgurl': (None, ''),
-            'cbir': (None, 'sbi'),
-            'imageBin': (None, img_bytes)
-        }
-
-        for _ in range(2):
-            api_host = urlparse(upload_url).netloc
-            try:
-                res = self.requests_session.post(upload_url, headers=upload_headers, files=files, timeout=5, allow_redirects=False)
-            except requests.exceptions.Timeout:
-                return (False, 'Request timeout!')
-            except requests.exceptions.ConnectionError:
-                return (False, 'Connection error!')
-
-            if res.status_code != 302:
-                return (False, 'Unknown error!')
-
-            redirect_url = res.headers.get('Location')
-            if not redirect_url:
-                return (False, 'Error getting redirect URL!')
-            if not redirect_url.startswith('https://'):
-                break
-            upload_url = redirect_url
-
-        parsed_url = urlparse(redirect_url)
-        query_params = parse_qs(parsed_url.query)
-
-        image_insights_token = query_params.get('insightsToken')
-        if not image_insights_token:
-            return (False, 'Error getting token!')
-        image_insights_token = image_insights_token[0]
-
-        api_url = f'https://{api_host}/images/api/custom/knowledge'
-        api_headers = {
-            'accept': '*/*',
-            'accept-language': 'ja-JP;q=0.6,ja;q=0.5',
-            'origin': 'https://www.bing.com',
-            'referer': f'https://www.bing.com/images/search?view=detailV2&insightstoken={image_insights_token}',
-            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0',
-        }
-        api_data_json = {
-            'imageInfo': {'imageInsightsToken': image_insights_token, 'source': 'Url'},
-            'knowledgeRequest': {'invokedSkills': ['OCR'], 'index': 1}
-        }
-        files = {
-            'knowledgeRequest': (None, json.dumps(api_data_json), 'application/json')
-        }
-
-        try:
-            res = self.requests_session.post(api_url, headers=api_headers, files=files, timeout=5)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if res.status_code != 200:
-            return (False, 'Unknown error!')
-
-        data = res.json()
-
-        res = ''
-        text_tag = None
-        for tag in data['tags']:
-            if tag.get('displayName') == '##TextRecognition':
-                text_tag = tag
-                break
-        if text_tag:
-            text_action = None
-            for action in text_tag['actions']:
-                if action.get('_type') == 'ImageKnowledge/TextRecognitionAction':
-                    text_action = action
-                    break
-            if text_action:
-                regions = text_action['data'].get('regions', [])
-                for region in regions:
-                    for line in region.get('lines', []):
-                        res += line['text'] + '\n'
-
-        x = (True, res)
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        max_pixel_size = 4000
-        max_byte_size = 767772
-        res = None
-
-        if any(x > max_pixel_size for x in img.size):
-            resize_factor = max(max_pixel_size / img.width, max_pixel_size / img.height)
-            new_w = int(img.width * resize_factor)
-            new_h = int(img.height * resize_factor)
-            img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
-
-        img_bytes, _ = limit_image_size(img, max_byte_size)
-
-        if img_bytes:
-            res = base64.b64encode(img_bytes).decode('utf-8')
-
-        return res
-
-class AppleVision:
-    name = 'avision'
-    readable_name = 'Apple Vision'
-    key = 'a'
-    available = False
-
-    def __init__(self, lang='ja'):
-        if sys.platform != 'darwin':
-            logger.warning('Apple Vision is not supported on non-macOS platforms!')
-        elif int(platform.mac_ver()[0].split('.')[0]) < 13:
-            logger.warning('Apple Vision is not supported on macOS older than Ventura/13.0!')
-        else:
-            self.available = True
-            logger.info('Apple Vision ready')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        with objc.autorelease_pool():
-            req = Vision.VNRecognizeTextRequest.alloc().init()
-
-            req.setRevision_(Vision.VNRecognizeTextRequestRevision3)
-            req.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
-            req.setUsesLanguageCorrection_(True)
-            req.setRecognitionLanguages_(['ja','en'])
-
-            handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(
-                self._preprocess(img), None
-            )
-
-            success = handler.performRequests_error_([req], None)
-            res = ''
-            if success[0]:
-                for result in req.results():
-                    res += result.text() + '\n'
-                x = (True, res)
-            else:
-                x = (False, 'Unknown error!')
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        return pil_image_to_bytes(img, 'tiff')
-
-
-class AppleLiveText:
-    name = 'alivetext'
-    readable_name = 'Apple Live Text'
-    key = 'd'
-    available = False
-
-    def __init__(self, lang='ja'):
-        if sys.platform != 'darwin':
-            logger.warning('Apple Live Text is not supported on non-macOS platforms!')
-        elif int(platform.mac_ver()[0].split('.')[0]) < 13:
-            logger.warning('Apple Live Text is not supported on macOS older than Ventura/13.0!')
-        else:
-            app_info = NSBundle.mainBundle().infoDictionary()
-            app_info['LSBackgroundOnly'] = '1'
-            self.VKCImageAnalyzer = objc.lookUpClass('VKCImageAnalyzer')
-            self.VKCImageAnalyzerRequest = objc.lookUpClass('VKCImageAnalyzerRequest')
-            objc.registerMetaDataForSelector(
-                b'VKCImageAnalyzer',
-                b'processRequest:progressHandler:completionHandler:',
-                {
-                    'arguments': {
-                        3: {
-                            'callable': {
-                                'retval': {'type': b'v'},
-                                'arguments': {
-                                    0: {'type': b'^v'},
-                                    1: {'type': b'd'},
-                                }
-                            }
-                        },
-                        4: {
-                            'callable': {
-                                'retval': {'type': b'v'},
-                                'arguments': {
-                                    0: {'type': b'^v'},
-                                    1: {'type': b'@'},
-                                    2: {'type': b'@'},
-                                }
-                            }
-                        }
-                    }
-                }
-            )
-            self.available = True
-            logger.info('Apple Live Text ready')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        with objc.autorelease_pool():
-            analyzer = self.VKCImageAnalyzer.alloc().init()
-            req = self.VKCImageAnalyzerRequest.alloc().initWithImage_requestType_(self._preprocess(img), 1) #VKAnalysisTypeText
-            req.setLocales_(['ja','en'])
-            self.result = None
-            analyzer.processRequest_progressHandler_completionHandler_(req, lambda progress: None, self._process)
-
-        CFRunLoopRunInMode(kCFRunLoopDefaultMode, 10.0, False)
-
-        if self.result == None:
-            return (False, 'Unknown error!')
-        return (True, self.result)
-
-    def _process(self, analysis, error):
-        res = ''
-        lines = analysis.allLines()
-        if lines:
-            for line in lines:
-                res += line.string() + '\n'
-        self.result = res
-        CFRunLoopStop(CFRunLoopGetCurrent())
-
-    def _preprocess(self, img):
-        image_bytes = pil_image_to_bytes(img, 'tiff')
-        ns_data = NSData.dataWithBytes_length_(image_bytes, len(image_bytes))
-        ns_image = NSImage.alloc().initWithData_(ns_data)
-        return ns_image
-
-
-class WinRTOCR:
-    name = 'winrtocr'
-    readable_name = 'WinRT OCR'
-    key = 'w'
-    available = False
-
-    def __init__(self, config={}, lang='ja'):
-        if sys.platform == 'win32':
-            if int(platform.release()) < 10:
-                logger.warning('WinRT OCR is not supported on Windows older than 10!')
-            elif 'winocr' not in sys.modules:
-                logger.warning('winocr not available, WinRT OCR will not work!')
-            else:
-                self.available = True
-                logger.info('WinRT OCR ready')
-        else:
-            try:
-                self.url = config['url']
-                self.available = True
-                logger.info('WinRT OCR ready')
-            except:
-                logger.warning('Error reading URL from config, WinRT OCR will not work!')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        if sys.platform == 'win32':
-            res = winocr.recognize_pil_sync(img, lang='ja')['text']
-        else:
-            params = {'lang': 'ja'}
-            try:
-                res = requests.post(self.url, params=params, data=self._preprocess(img), timeout=3)
-            except requests.exceptions.Timeout:
-                return (False, 'Request timeout!')
-            except requests.exceptions.ConnectionError:
-                return (False, 'Connection error!')
-
-            if res.status_code != 200:
-                return (False, 'Unknown error!')
-
-            res = res.json()['text']
-
-        x = (True, res)
-
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        return pil_image_to_bytes(img, png_compression=1)
-
-class OneOCR:
-    name = 'oneocr'
-    readable_name = 'OneOCR'
-    key = 'z'
-    available = False
-
-    def __init__(self, config={}, lang='ja'):
-        self.initial_lang = lang
-        self.regex = get_regex(lang)
-        if sys.platform == 'win32':
-            if int(platform.release()) < 10:
-                logger.warning('OneOCR is not supported on Windows older than 10!')
-            elif 'oneocr' not in sys.modules:
-                logger.warning('oneocr not available, OneOCR will not work!')
-            elif not os.path.exists(os.path.expanduser('~/.config/oneocr/oneocr.dll')):
-                logger.warning('OneOCR DLLs not found, please install OwOCR Dependencies via OCR Tab in GSM.')
-            else:
-                try:
-                    logger.info(f'Loading OneOCR model')
-                    self.model = oneocr.OcrEngine()
-                except RuntimeError as e:
-                    logger.warning(e + ', OneOCR will not work!')
-                else:
-                    self.available = True
-                    logger.info('OneOCR ready')
-        else:
-            try:
-                self.url = config['url']
-                self.available = True
-                logger.info('OneOCR ready')
-            except:
-                logger.warning('Error reading URL from config, OneOCR will not work!')
-
-    def get_regex(self, lang):
-        if lang == "ja":
-            self.regex = re.compile(r'[\u3041-\u3096\u30A1-\u30FA\u4E00-\u9FFF]')
-        elif lang == "zh":
-            self.regex = re.compile(r'[\u4E00-\u9FFF]')
-        elif lang == "ko":
-            self.regex = re.compile(r'[\uAC00-\uD7AF]')
-        elif lang == "ar":
-            self.regex = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]')
-        elif lang == "ru":
-            self.regex = re.compile(r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F\u1C80-\u1C8F]')
-        elif lang == "el":
-            self.regex = re.compile(r'[\u0370-\u03FF\u1F00-\u1FFF]')
-        elif lang == "he":
-            self.regex = re.compile(r'[\u0590-\u05FF\uFB1D-\uFB4F]')
-        elif lang == "th":
-            self.regex = re.compile(r'[\u0E00-\u0E7F]')
-        else:
-            self.regex = re.compile(
-                r'[a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1D00-\u1D7F\u1D80-\u1DBF\u1E00-\u1EFF\u2C60-\u2C7F\uA720-\uA7FF\uAB30-\uAB6F]')
-
-    def __call__(self, img, furigana_filter_sensitivity=0, return_coords=False, multiple_crop_coords=False, return_one_box=True):
-        lang = get_ocr_language()
-        if furigana_filter_sensitivity != None:
-            furigana_filter_sensitivity = get_furigana_filter_sensitivity()
-        else:
-            furigana_filter_sensitivity = 0
-        if lang != self.initial_lang:
-            self.initial_lang = lang
-            self.regex = get_regex(lang)
-        img, is_path = input_to_pil_image(img)
-        if img.width < 51 or img.height < 51:
-            new_width = max(img.width, 51)
-            new_height = max(img.height, 51)
-            new_img = Image.new("RGBA", (new_width, new_height), (0, 0, 0, 0))
-            new_img.paste(img, ((new_width - img.width) // 2, (new_height - img.height) // 2))
-            img = new_img
-        if not img:
-            return (False, 'Invalid image provided')
-        crop_coords = None
-        crop_coords_list = []
-        if sys.platform == 'win32':
-            try:
-                ocr_resp = self.model.recognize_pil(img)
-                # if os.path.exists(os.path.expanduser("~/GSM/temp")):
-                #     with open(os.path.join(os.path.expanduser("~/GSM/temp"), 'oneocr_response.json'), 'w',
-                #               encoding='utf-8') as f:
-                #         json.dump(ocr_resp, f, indent=4, ensure_ascii=False)
-                # print(json.dumps(ocr_resp))
-                filtered_lines = [line for line in ocr_resp['lines'] if self.regex.search(line['text'])]
-                x_coords = [line['bounding_rect'][f'x{i}'] for line in filtered_lines for i in range(1, 5)]
-                y_coords = [line['bounding_rect'][f'y{i}'] for line in filtered_lines for i in range(1, 5)]
-                if x_coords and y_coords:
-                    crop_coords = (min(x_coords) - 5, min(y_coords) - 5, max(x_coords) + 5, max(y_coords) + 5)
-                # logger.info(filtered_lines)
-                res = ''
-                skipped = []
-                boxes = []
-                if furigana_filter_sensitivity > 0:
-                    for line in filtered_lines:
-                        x1, x2, x3, x4 = line['bounding_rect']['x1'], line['bounding_rect']['x2'], \
-                            line['bounding_rect']['x3'], line['bounding_rect']['x4']
-                        y1, y2, y3, y4 = line['bounding_rect']['y1'], line['bounding_rect']['y2'], \
-                            line['bounding_rect']['y3'], line['bounding_rect']['y4']
-                        width = max(x2 - x1, x3 - x4)
-                        height = max(y3 - y1, y4 - y2)
-                        if width > furigana_filter_sensitivity and height > furigana_filter_sensitivity:
-                            res += line['text']
-                        else:
-                            skipped.extend(char for char in line['text'])
-                            continue
-                        res += '\n'
-                    # logger.info(
-                    #     f"Skipped {len(skipped)} chars due to furigana filter sensitivity: {furigana_filter_sensitivity}")
-                    # widths, heights = [], []
-                    # for line in ocr_resp['lines']:
-                    #     for word in line['words']:
-                    #         if self.kana_kanji_regex.search(word['text']) is None:
-                    #             continue
-                    #         # x1, x2, x3, x4 = line['bounding_rect']['x1'], line['bounding_rect']['x2'], line['bounding_rect']['x3'], line['bounding_rect']['x4']
-                    #         # y1, y2, y3, y4 = line['bounding_rect']['y1'], line['bounding_rect']['y2'], line['bounding_rect']['y3'], line['bounding_rect']['y4']
-                    #         x1, x2, x3, x4 = word['bounding_rect']['x1'], word['bounding_rect']['x2'], \
-                    #             word['bounding_rect']['x3'], word['bounding_rect']['x4']
-                    #         y1, y2, y3, y4 = word['bounding_rect']['y1'], word['bounding_rect']['y2'], \
-                    #             word['bounding_rect']['y3'], word['bounding_rect']['y4']
-                    #         widths.append(max(x2 - x1, x3 - x4))
-                    #         heights.append(max(y2 - y1, y3 - y4))
-                    #
-                    #
-                    # max_width = max(sorted(widths)[:-max(1, len(widths) // 10)]) if len(widths) > 1 else 0
-                    # max_height = max(sorted(heights)[:-max(1, len(heights) // 10)]) if len(heights) > 1 else 0
-                    #
-                    # required_width = max_width * furigana_filter_sensitivity
-                    # required_height = max_height * furigana_filter_sensitivity
-                    # for line in ocr_resp['lines']:
-                    #     for word in line['words']:
-                    #         x1, x2, x3, x4 = word['bounding_rect']['x1'], word['bounding_rect']['x2'], \
-                    #             word['bounding_rect']['x3'], word['bounding_rect']['x4']
-                    #         y1, y2, y3, y4 = word['bounding_rect']['y1'], word['bounding_rect']['y2'], \
-                    #             word['bounding_rect']['y3'], word['bounding_rect']['y4']
-                    #         width = max(x2 - x1, x3 - x4)
-                    #         height = max(y2 - y1, y3 - y4)
-                    #         if furigana_filter_sensitivity == 0 or width > required_width or height > required_height:
-                    #             res += word['text']
-                    #         else:
-                    #             continue
-                    #     res += '\n'
-                else:
-                    res = ocr_resp['text']
-
-                if multiple_crop_coords:
-                    logger.info(f"Getting multiple crop coords for {len(filtered_lines)} lines")
-                    for line in filtered_lines:
-                        crop_coords_list.append(
-                            (line['bounding_rect']['x1'] - 5, line['bounding_rect']['y1'] - 5,
-                             line['bounding_rect']['x3'] + 5, line['bounding_rect']['y3'] + 5))
-
-            except RuntimeError as e:
-                return (False, e)
-        else:
-            try:
-                res = requests.post(self.url, data=self._preprocess(img), timeout=3)
-            except requests.exceptions.Timeout:
-                return (False, 'Request timeout!')
-            except requests.exceptions.ConnectionError:
-                return (False, 'Connection error!')
-
-            if res.status_code != 200:
-                return (False, 'Unknown error!')
-
-            res = res.json()['text']
-
-        x = [True, res]
-        if return_coords:
-            x.append(filtered_lines)
-        if multiple_crop_coords:
-            x.append(crop_coords_list)
-        if return_one_box:
-            x.append(crop_coords)
-        if is_path:
-            img.close()
-        return x
-
-    def _preprocess(self, img):
-        return pil_image_to_bytes(img, png_compression=1)
-
-class AzureImageAnalysis:
-    name = 'azure'
-    readable_name = 'Azure Image Analysis'
-    key = 'v'
-    available = False
-
-    def __init__(self, config={}, lang='ja'):
-        if 'azure.ai.vision.imageanalysis' not in sys.modules:
-            logger.warning('azure-ai-vision-imageanalysis not available, Azure Image Analysis will not work!')
-        else:
-            logger.info(f'Parsing Azure credentials')
-            try:
-                self.client = ImageAnalysisClient(config['endpoint'], AzureKeyCredential(config['api_key']))
-                self.available = True
-                logger.info('Azure Image Analysis ready')
-            except:
-                logger.warning('Error parsing Azure credentials, Azure Image Analysis will not work!')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        try:
-            read_result = self.client.analyze(image_data=self._preprocess(img), visual_features=[VisualFeatures.READ])
-        except ServiceRequestError:
-            return (False, 'Connection error!')
-        except:
-            return (False, 'Unknown error!')
-
-        res = ''
-        if read_result.read:
-            for block in read_result.read.blocks:
-                for line in block.lines:
-                    res += line.text + '\n'
-        else:
-            return (False, 'Unknown error!')
-
-        x = (True, res)
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        if any(x < 50 for x in img.size):
-            resize_factor = max(50 / img.width, 50 / img.height)
-            new_w = int(img.width * resize_factor)
-            new_h = int(img.height * resize_factor)
-            img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
-
-        return pil_image_to_bytes(img)
-
-class EasyOCR:
-    name = 'easyocr'
-    readable_name = 'EasyOCR'
-    key = 'e'
-    available = False
-
-    def __init__(self, config={'gpu': True}, lang='ja'):
-        if 'easyocr' not in sys.modules:
-            logger.warning('easyocr not available, EasyOCR will not work!')
-        else:
-            logger.info('Loading EasyOCR model')
-            logging.getLogger('easyocr.easyocr').setLevel(logging.ERROR)
-            self.model = easyocr.Reader(['ja','en'], gpu=config['gpu'])
-            self.available = True
-            logger.info('EasyOCR ready')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        res = ''
-        read_result = self.model.readtext(self._preprocess(img), detail=0)
-        for text in read_result:
-            res += text + '\n'
-
-        x = (True, res)
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        return pil_image_to_numpy_array(img)
-
-class RapidOCR:
-    name = 'rapidocr'
-    readable_name = 'RapidOCR'
-    key = 'r'
-    available = False
-
-    def __init__(self, lang='ja'):
-        if 'rapidocr_onnxruntime' not in sys.modules:
-            logger.warning('rapidocr_onnxruntime not available, RapidOCR will not work!')
-        else:
-            rapidocr_model_file = os.path.join(os.path.expanduser('~'),'.cache','rapidocr_japan_PP-OCRv4_rec_infer.onnx')
-            if not os.path.isfile(rapidocr_model_file):
-                logger.info('Downloading RapidOCR model ' + rapidocr_model_file)
-                try:
-                    cache_folder = os.path.join(os.path.expanduser('~'),'.cache')
-                    if not os.path.isdir(cache_folder):
-                        os.makedirs(cache_folder)
-                    urllib.request.urlretrieve('https://github.com/AuroraWright/owocr/raw/master/rapidocr_japan_PP-OCRv4_rec_infer.onnx', rapidocr_model_file)
-                except:
-                    logger.warning('Download failed. RapidOCR will not work!')
-                    return
-
-            logger.info('Loading RapidOCR model')
-            self.model = ROCR(rec_model_path=rapidocr_model_file)
-            logging.getLogger().setLevel(logging.ERROR)
-            self.available = True
-            logger.info('RapidOCR ready')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        res = ''
-        read_results, elapsed = self.model(self._preprocess(img))
-        if read_results:
-            for read_result in read_results:
-                res += read_result[1] + '\n'
-
-        x = (True, res)
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        return pil_image_to_numpy_array(img)
-
-class OCRSpace:
-    name = 'ocrspace'
-    readable_name = 'OCRSpace'
-    key = 'o'
-    available = False
-
-    def __init__(self, config={}, lang='ja'):
-        try:
-            self.api_key = config['api_key']
-            self.max_byte_size = config.get('file_size_limit', 1000000)
-            self.available = True
-            logger.info('OCRSpace ready')
-        except:
-            logger.warning('Error reading API key from config, OCRSpace will not work!')
-
-    def __call__(self, img, furigana_filter_sensitivity=0):
-        img, is_path = input_to_pil_image(img)
-        if not img:
-            return (False, 'Invalid image provided')
-
-        img_bytes, img_extension = self._preprocess(img)
-        if not img_bytes:
-            return (False, 'Image is too big!')
-
-        data = {
-            'apikey': self.api_key,
-            'language': 'jpn'
-        }
-        files = {'file': ('image.' + img_extension, img_bytes, 'image/' + img_extension)}
-
-        try:
-            res = requests.post('https://api.ocr.space/parse/image', data=data, files=files, timeout=5)
-        except requests.exceptions.Timeout:
-            return (False, 'Request timeout!')
-        except requests.exceptions.ConnectionError:
-            return (False, 'Connection error!')
-
-        if res.status_code != 200:
-            return (False, 'Unknown error!')
-
-        res = res.json()
-
-        if isinstance(res, str):
-            return (False, 'Unknown error!')
-        if res['IsErroredOnProcessing']:
-            return (False, res['ErrorMessage'])
-
-        res = res['ParsedResults'][0]['ParsedText']
-        x = (True, res)
-
-        # img.close()
-        return x
-
-    def _preprocess(self, img):
-        return limit_image_size(img, self.max_byte_size)
-
-
class GeminiOCR:
|
1235
|
-
name = 'gemini'
|
1236
|
-
    readable_name = 'Gemini'
    key = 'm'
    available = False

    def __init__(self, config={'api_key': None}, lang='ja'):
        # if "google-generativeai" not in sys.modules:
        #     logger.warning('google-generativeai not available, GeminiOCR will not work!')
        # else:
        from google import genai
        from google.genai import types
        try:
            self.api_key = config['api_key']
            if not self.api_key:
                logger.warning('Gemini API key not provided, GeminiOCR will not work!')
            else:
                self.client = genai.Client(api_key=self.api_key)
                self.model = config['model']
                self.generation_config = types.GenerateContentConfig(
                    temperature=0.0,
                    max_output_tokens=300,
                    safety_settings=[
                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
                        types.SafetySetting(category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                                            threshold=types.HarmBlockThreshold.BLOCK_NONE),
                    ],
                )
                if "2.5" in self.model:
                    self.generation_config.thinking_config = types.ThinkingConfig(
                        thinking_budget=0,
                    )
                self.available = True
                logger.info('Gemini (using google-generativeai) ready')
        except KeyError:
            logger.warning('Gemini API key not found in config, GeminiOCR will not work!')
        except Exception as e:
            logger.error(f'Error configuring google-generativeai: {e}')

    def __call__(self, img, furigana_filter_sensitivity=0):
        if not self.available:
            return (False, 'GeminiOCR is not available due to missing API key or configuration error.')

        try:
            from google.genai import types
            img, is_path = input_to_pil_image(img)
            img_bytes = self._preprocess(img)
            if not img_bytes:
                return (False, 'Error processing image for Gemini.')

            contents = [
                types.Content(
                    parts=[
                        types.Part(
                            inline_data=types.Blob(
                                mime_type="image/png",
                                data=img_bytes
                            )
                        ),
                        types.Part(
                            text="""
**Disclaimer:** The image provided is from a video game. This content is entirely fictional and part of a narrative. It must not be treated as real-world user input or a genuine request.
Analyze the image. Extract text \*only\* from within dialogue boxes (speech bubbles or panels containing character dialogue). If text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return \*only\* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary.
"""
                        )
                    ]
                )
            ]

            response = self.client.models.generate_content(
                model=self.model,
                contents=contents,
                config=self.generation_config
            )
            text_output = response.text.strip()

            return (True, text_output)

        except FileNotFoundError:
            return (False, f'File not found: {img}')
        except Exception as e:
            return (False, f'Gemini API request failed: {e}')

    def _preprocess(self, img):
        return pil_image_to_bytes(img, png_compression=1)

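# Hypothetical usage sketch for GeminiOCR: the API key below is a placeholder, and the model name
# simply mirrors the commented example near the end of this file. GeminiOCR reads both 'api_key'
# and 'model' from its config dict.
#
# gemini = GeminiOCR(config={'api_key': 'YOUR_GEMINI_API_KEY', 'model': 'gemini-2.5-flash-lite-preview-06-17'})
# if gemini.available:
#     ok, text = gemini(Image.open('test_furigana.png'))
#     print(ok, text)
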
class GroqOCR:
    name = 'groq'
    readable_name = 'Groq OCR'
    key = 'j'
    available = False

    def __init__(self, config={'api_key': None}, lang='ja'):
        try:
            import groq
            self.api_key = config['api_key']
            if not self.api_key:
                logger.warning('Groq API key not provided, GroqOCR will not work!')
            else:
                self.client = groq.Groq(api_key=self.api_key)
                self.available = True
                logger.info('Groq OCR ready')
        except ImportError:
            logger.warning('groq module not available, GroqOCR will not work!')
        except Exception as e:
            logger.error(f'Error initializing Groq client: {e}')

    def __call__(self, img, furigana_filter_sensitivity=0):
        if not self.available:
            return (False, 'GroqOCR is not available due to missing API key or configuration error.')

        try:
            img, is_path = input_to_pil_image(img)

            img_base64 = self._preprocess(img)
            if not img_base64:
                return (False, 'Error processing image for Groq.')

            prompt = (
                "Analyze the image. Extract text *only* from within dialogue boxes (speech bubbles or panels containing character dialogue). If Text appears to be vertical, read the text from top to bottom, right to left. From the extracted dialogue text, filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. If no text is found within dialogue boxes after applying filters, return nothing. Do not include any other output, formatting markers, or commentary."
                # "Analyze this i#mage and extract text from it"
                # "(speech bubbles or panels containing character dialogue). From the extracted dialogue text, "
                # "filter out any furigana. Ignore and do not include any text found outside of dialogue boxes, "
                # "including character names, speaker labels, or sound effects. Return *only* the filtered dialogue text. "
                # "If no text is found within dialogue boxes after applying filters, return an empty string. "
                # "OR, if there are no text bubbles or dialogue boxes found, return everything."
                # "Do not include any other output, formatting markers, or commentary, only the text from the image."
            )

            response = self.client.chat.completions.create(
                model="meta-llama/llama-4-scout-17b-16e-instruct",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
                        ],
                    }
                ],
                max_tokens=300,
                temperature=0.0
            )

            if response.choices and response.choices[0].message.content:
                text_output = response.choices[0].message.content.strip()
                return (True, text_output)
            else:
                return (True, "")

        except FileNotFoundError:
            return (False, f'File not found: {img}')
        except Exception as e:
            return (False, f'Groq API request failed: {e}')

    def _preprocess(self, img):
        return base64.b64encode(pil_image_to_bytes(img, png_compression=1)).decode('utf-8')

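# Hypothetical usage sketch for GroqOCR; the API key below is a placeholder and must be replaced
# with a real Groq key for the call to succeed.
#
# groq_ocr = GroqOCR(config={'api_key': 'YOUR_GROQ_API_KEY'})
# if groq_ocr.available:
#     ok, text = groq_ocr(Image.open('test_furigana.png'))
#     print(ok, text)
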
# OpenAI-Compatible Endpoint OCR using LM Studio
class localLLMOCR:
    name = 'local_llm_ocr'
    readable_name = 'Local LLM OCR'
    key = 'a'
    available = False
    last_ocr_time = time.time() - 5

    def __init__(self, config={}, lang='ja'):
        self.keep_llm_hot_thread = None
        # All three config values are required: url, model, api_key
        if not config or not (config.get('url') and config.get('model') and config.get('api_key')):
            logger.warning('Local LLM OCR requires url, model, and api_key in config, Local LLM OCR will not work!')
            return

        try:
            import openai
        except ImportError:
            logger.warning('openai module not available, Local LLM OCR will not work!')
            return
        import threading
        try:
            self.api_url = config.get('url', 'http://localhost:1234/v1/chat/completions')
            self.model = config.get('model', 'qwen2.5-vl-3b-instruct')
            self.api_key = config.get('api_key', 'lm-studio')
            self.keep_warm = config.get('keep_warm', True)
            self.custom_prompt = config.get('prompt', None)
            self.available = True
            if any(x in self.api_url for x in ['localhost', '127.0.0.1']):
                if not self.check_connection(self.api_url):
                    logger.warning('Local LLM OCR API is not reachable')
                    return
            self.client = openai.OpenAI(
                base_url=self.api_url.replace('/v1/chat/completions', '/v1'),
                api_key=self.api_key
            )
            if self.client.models.retrieve(self.model):
                logger.info(f'Local LLM OCR (OpenAI-compatible) ready with model {self.model}')
                if self.keep_warm:
                    self.keep_llm_hot_thread = threading.Thread(target=self.keep_llm_warm, daemon=True)
                    self.keep_llm_hot_thread.start()
        except Exception as e:
            logger.warning(f'Error initializing Local LLM OCR, Local LLM OCR will not work: {e}')

    def check_connection(self, url, port=None):
        # Simple connectivity check with a very short timeout.
        # The configured value is a full URL, so extract the host and port before opening a raw HTTP connection.
        import http.client
        from urllib.parse import urlparse
        parsed = urlparse(url)
        conn = http.client.HTTPConnection(parsed.hostname or url, port or parsed.port or 1234, timeout=0.1)
        try:
            conn.request("GET", "/v1/models")
            response = conn.getresponse()
            if response.status == 200:
                logger.info('Local LLM OCR API is reachable')
                return True
            else:
                logger.warning('Local LLM OCR API is not reachable')
                return False
        except Exception as e:
            logger.warning(f'Error connecting to Local LLM OCR API: {e}')
            return False
        finally:
            conn.close()

    def keep_llm_warm(self):
        def ocr_blank_black_image():
            if self.last_ocr_time and (time.time() - self.last_ocr_time) < 5:
                return
            import numpy as np
            from PIL import Image
            # Create a blank black image
            blank_image = Image.fromarray(np.zeros((100, 100, 3), dtype=np.uint8))
            logger.info('Keeping local LLM OCR warm with a blank black image')
            self(blank_image)

        while True:
            ocr_blank_black_image()
            time.sleep(5)

    def __call__(self, img, furigana_filter_sensitivity=0):
        import base64
        try:
            img, is_path = input_to_pil_image(img)
            img_bytes = pil_image_to_bytes(img)
            img_base64 = base64.b64encode(img_bytes).decode('utf-8')
            if self.custom_prompt and self.custom_prompt.strip() != "":
                prompt = self.custom_prompt.strip()
            else:
                prompt = f"""
Extract all {CommonLanguages.from_code(get_ocr_language()).name} Text from Image. Ignore all Furigana. Do not return any commentary, just the text in the image. If there is no text in the image, return "" (Empty String).
"""

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
                        ],
                    }
                ],
                max_tokens=4096,
                temperature=0.1
            )
            self.last_ocr_time = time.time()
            if response.choices and response.choices[0].message.content:
                text_output = response.choices[0].message.content.strip()
                return (True, text_output)
            else:
                return (True, "")
        except Exception as e:
            return (False, f'Local LLM OCR request failed: {e}')

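# Hypothetical config sketch for localLLMOCR, assuming an LM Studio-style server on the default
# port; 'url', 'model', and 'api_key' are all required, while 'keep_warm' and 'prompt' are optional.
#
# local_llm = localLLMOCR(config={
#     'url': 'http://localhost:1234/v1/chat/completions',
#     'model': 'qwen2.5-vl-3b-instruct',
#     'api_key': 'lm-studio',
#     'keep_warm': False,
# })
# if local_llm.available:
#     ok, text = local_llm(Image.open('test_furigana.png'))
#     print(ok, text)
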
# class QWENOCR:
#     name = 'qwenv2'
#     readable_name = 'Qwen2-VL'
#     key = 'q'
#
#     # Class-level attributes for model and processor to ensure they are loaded only once
#     model = None
#     processor = None
#     device = None
#     available = False
#
#     @classmethod
#     def initialize(cls):
#         import torch
#         from transformers import AutoModelForImageTextToText, AutoProcessor
#         """
#         Class method to initialize the model. Call this once at the start of your application.
#         This prevents reloading the model on every instantiation.
#         """
#         if cls.model is not None:
#             logger.info('Qwen2-VL is already initialized.')
#             return
#
#         try:
#             if not torch.cuda.is_available():
#                 logger.warning("CUDA not available, Qwen2-VL will run on CPU, which will be very slow.")
#                 # You might want to prevent initialization on CPU entirely
#                 # raise RuntimeError("CUDA is required for efficient Qwen2-VL operation.")
#
#             cls.device = "cuda" if torch.cuda.is_available() else "cpu"
#
#             cls.model = AutoModelForImageTextToText.from_pretrained(
#                 "Qwen/Qwen2-VL-2B-Instruct",
#                 torch_dtype="auto",  # Uses bfloat16/float16 if available, which is faster
#                 device_map=cls.device
#             )
#             # For PyTorch 2.0+, torch.compile can significantly speed up inference after a warm-up call
#             # cls.model = torch.compile(cls.model)
#
#             cls.processor = AutoProcessor.from_pretrained(
#                 "Qwen/Qwen2-VL-2B-Instruct",
#                 use_fast=True
#             )
#
#             cls.available = True
#
#             conversation = [
#                 {
#                     "role": "user",
#                     "content": [
#                         {"type": "image"},
#                         {"type": "text", "text": "Extract all the text from this image, ignore all furigana."},
#                     ],
#                 }
#             ]
#
#             # The same prompt is applied to all images in the batch
#             cls.text_prompt = cls.processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
#             logger.info(f'Qwen2.5-VL ready on device: {cls.device}')
#         except Exception as e:
#             logger.warning(f'Qwen2-VL not available: {e}')
#             cls.available = False
#
#     def __init__(self, config={}, lang='ja'):
#         # The __init__ is now very lightweight. It just checks if initialization has happened.
#         if not self.available:
#             raise RuntimeError("QWENOCR has not been initialized. Call QWENOCR.initialize() first.")
#
#     def __call__(self, images):
#         """
#         Processes a single image or a list of images.
#         :param images: A single image (path or PIL.Image) or a list of images.
#         :return: A tuple (success, list_of_results)
#         """
#         if not self.available:
#             return (False, ['Qwen2-VL is not available.'])
#
#         try:
#             # Standardize input to be a list
#             if not isinstance(images, list):
#                 images = [images]
#
#             pil_images = [input_to_pil_image(img)[0] for img in images]
#
#             # The processor handles batching of images and text prompts
#             inputs = self.processor(
#                 text=[self.text_prompt] * len(pil_images),
#                 images=pil_images,
#                 padding=True,
#                 return_tensors="pt"
#             ).to(self.device)
#
#             output_ids = self.model.generate(**inputs, max_new_tokens=32)
#
#             # The decoding logic needs to be slightly adjusted for batching
#             input_ids_len = [len(x) for x in inputs.input_ids]
#             generated_ids = [
#                 output_ids[i][input_ids_len[i]:] for i in range(len(input_ids_len))
#             ]
#
#             output_text = self.processor.batch_decode(
#                 generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
#             )
#
#             return (True, output_text)
#         except Exception as e:
#             return (False, [f'Qwen2-VL inference failed: {e}'])


# QWENOCR.initialize()
# qwenocr = QWENOCR()

# localOCR = localLLMOCR(config={'api_url': 'http://localhost:1234/v1/chat/completions', 'model': 'qwen2.5-vl-3b-instruct'})

# for i in range(10):
#     start_time = time.time()
#     res, text = localOCR(Image.open(r"C:\Users\Beangate\GSM\GameSentenceMiner\GameSentenceMiner\owocr\owocr\test_furigana.png"))  # Example usage
#     end_time = time.time()
#
#     print(f"Time taken: {end_time - start_time:.2f} seconds")
#     print(text)
# class LocalOCR:
#     name = 'local_ocr'
#     readable_name = 'Local OCR'
#     key = '-'
#     available = False
#
#     def __init__(self, lang='ja'):
#         self.requests_session = requests.Session()
#         self.available = True
#         # logger.info('Local OCR ready')  # Uncomment if you have a logger defined
#
#     def __call__(self, img, furigana_filter_sensitivity=0):
#         if not isinstance(img, Image.Image):
#             try:
#                 img = Image.open(io.BytesIO(img))
#             except Exception:
#                 return (False, 'Invalid image provided')
#
#         img = input_to_pil_image(img)
#
#         img_base64 = self._preprocess(img)
#         if not img_base64:
#             return (False, 'Image preprocessing failed (e.g., too big after resize)!')
#
#         api_url = 'http://localhost:2333/api/ocr'
#         # Send as JSON with base64 encoded image
#         json_data = {
#             'image': img_base64
#         }
#
#         try:
#             res = self.requests_session.post(api_url, json=json_data, timeout=5)
#             print(res.content)
#         except requests.exceptions.Timeout:
#             return (False, 'Request timeout!')
#         except requests.exceptions.ConnectionError:
#             return (False, 'Connection error!')
#
#         if res.status_code != 200:
#             return (False, f'Error: {res.status_code} - {res.text}')
#
#         try:
#             data = res.json()
#             # Assuming the local OCR service returns text in a 'text' key
#             extracted_text = data.get('text', '')
#             return (True, extracted_text)
#         except requests.exceptions.JSONDecodeError:
#             return (False, 'Invalid JSON response from OCR service!')
#
#     def _preprocess(self, img):
#         return base64.b64encode(pil_image_to_bytes(img, png_compression=1)).decode('utf-8')

# lens = GeminiOCR(config={'model': 'gemini-2.5-flash-lite-preview-06-17', 'api_key': ''})
#
# res, text = lens(Image.open('test_furigana.png'))  # Example usage
#
# print(text)