openvisionkit 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openvisionkit/__init__.py +1 -0
- openvisionkit/_version.py +24 -0
- openvisionkit/capture/draw_object.py +296 -0
- openvisionkit/capture/image_template.py +61 -0
- openvisionkit/capture/screen_capture.py +13 -0
- openvisionkit/capture/video_recorder.py +128 -0
- openvisionkit/capture/video_template.py +336 -0
- openvisionkit/lib/classifier.py +186 -0
- openvisionkit/lib/face_detector.py +587 -0
- openvisionkit/lib/face_mesh_detector.py +913 -0
- openvisionkit/lib/form_detector.py +465 -0
- openvisionkit/lib/form_roi_annotator.py +679 -0
- openvisionkit/lib/form_roi_detector.py +1078 -0
- openvisionkit/lib/fps_counter.py +38 -0
- openvisionkit/lib/hair_segmentation.py +298 -0
- openvisionkit/lib/hand_detector.py +1230 -0
- openvisionkit/lib/image_detector.py +1095 -0
- openvisionkit/lib/object_detector.py +401 -0
- openvisionkit/lib/pose_detector.py +919 -0
- openvisionkit/lib/selfie_segmentation.py +528 -0
- openvisionkit/lib/text_detector.py +1229 -0
- openvisionkit/utility/live_plot.py +141 -0
- openvisionkit/utility/vision_utilis.py +871 -0
- openvisionkit-0.4.0.dist-info/METADATA +1018 -0
- openvisionkit-0.4.0.dist-info/RECORD +26 -0
- openvisionkit-0.4.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,1229 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
import cv2
|
|
4
|
+
import imutils
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import pytesseract
|
|
8
|
+
from PIL import Image, ImageEnhance, ImageFilter
|
|
9
|
+
from skimage.metrics import structural_similarity as ssim
|
|
10
|
+
|
|
11
|
+
"""
|
|
12
|
+
Usage:
|
|
13
|
+
tesseract --help | --help-extra | --help-psm | --help-oem | --version
|
|
14
|
+
tesseract --list-langs [--tessdata-dir PATH]
|
|
15
|
+
tesseract --print-fonts-table [options...] [configfile...]
|
|
16
|
+
tesseract --print-parameters [options...] [configfile...]
|
|
17
|
+
tesseract imagename|imagelist|stdin outputbase|stdout [options...] [configfile...]
|
|
18
|
+
|
|
19
|
+
OCR options:
|
|
20
|
+
--tessdata-dir PATH Specify the location of tessdata path.
|
|
21
|
+
--user-words PATH Specify the location of user words file.
|
|
22
|
+
--user-patterns PATH Specify the location of user patterns file.
|
|
23
|
+
--dpi VALUE Specify DPI for input image.
|
|
24
|
+
--loglevel LEVEL Specify logging level. LEVEL can be
|
|
25
|
+
ALL, TRACE, DEBUG, INFO, WARN, ERROR, FATAL or OFF.
|
|
26
|
+
-l LANG[+LANG] Specify language(s) used for OCR.
|
|
27
|
+
-c VAR=VALUE Set value for config variables.
|
|
28
|
+
Multiple -c arguments are allowed.
|
|
29
|
+
--psm PSM|NUM Specify page segmentation mode.
|
|
30
|
+
--oem OEM|NUM Specify OCR Engine mode.
|
|
31
|
+
NOTE: These options must occur before any configfile.
|
|
32
|
+
|
|
33
|
+
Page segmentation modes (PSM):
|
|
34
|
+
0|osd_only Orientation and script detection (OSD) only.
|
|
35
|
+
1|auto_osd Automatic page segmentation with OSD.
|
|
36
|
+
2|auto_only Automatic page segmentation, but no OSD, or OCR. (not implemented)
|
|
37
|
+
3|auto Fully automatic page segmentation, but no OSD. (Default)
|
|
38
|
+
4|single_column Assume a single column of text of variable sizes.
|
|
39
|
+
5|single_block_vert_text Assume a single uniform block of vertically aligned text.
|
|
40
|
+
6|single_block Assume a single uniform block of text.
|
|
41
|
+
7|single_line Treat the image as a single text line.
|
|
42
|
+
8|single_word Treat the image as a single word.
|
|
43
|
+
9|circle_word Treat the image as a single word in a circle.
|
|
44
|
+
10|single_char Treat the image as a single character.
|
|
45
|
+
11|sparse_text Sparse text. Find as much text as possible in no particular order.
|
|
46
|
+
12|sparse_text_osd Sparse text with OSD.
|
|
47
|
+
13|raw_line Raw line. Treat the image as a single text line,
|
|
48
|
+
bypassing hacks that are Tesseract-specific.
|
|
49
|
+
|
|
50
|
+
OCR Engine modes (OEM):
|
|
51
|
+
0|tesseract_only Legacy engine only.
|
|
52
|
+
1|lstm_only Neural nets LSTM engine only.
|
|
53
|
+
2|tesseract_lstm_combined Legacy + LSTM engines.
|
|
54
|
+
3|default Default, based on what is available.
|
|
55
|
+
|
|
56
|
+
Single options:
|
|
57
|
+
-h, --help Show minimal help message.
|
|
58
|
+
--help-extra Show extra help for advanced users.
|
|
59
|
+
--help-psm Show page segmentation modes.
|
|
60
|
+
--help-oem Show OCR Engine modes.
|
|
61
|
+
-v, --version Show version information.
|
|
62
|
+
--list-langs List available languages for tesseract engine.
|
|
63
|
+
--print-fonts-table Print tesseract fonts table.
|
|
64
|
+
--print-parameters Print tesseract parameters.
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
# spaCy is an optional dependency. NLP stays None when the package or its
# "en_core_web_sm" model cannot be loaded, and _get_doc() checks for that.
NLP = None
try:
    import spacy

    NLP = spacy.load("en_core_web_sm")
except Exception as load_error:
    print(
        "[WARNING] spaCy not found or failed to load. "
        "Entity extraction will be unavailable. "
        f"Error: {load_error}"
    )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class TextDetector:
|
|
79
|
+
"""
|
|
80
|
+
A class for detecting and extracting text from images using Tesseract OCR. It provides methods for preprocessing images, setting OCR configurations, and visualizing detected text with bounding boxes and labels. The class can be used for both character-level and word-level detection, and supports multiple languages and OCR engine modes.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
image (np.ndarray): The input image in which to detect text.
|
|
84
|
+
lang (str): The language(s) to use for OCR. Default is "eng" (English). Multiple languages can be specified by separating them with a plus sign (e.g., "eng+chi_sim").
|
|
85
|
+
oem (int): The OCR Engine mode to use. Default is 3 (default, based on what is available). Other options include 0 (legacy engine only), 1 (
|
|
86
|
+
neural nets LSTM engine only), and 2 (legacy + LSTM engines).
|
|
87
|
+
psm (int): The page segmentation mode to use. Default is 6 (assume a single uniform block of text). Other options include 0 (orientation and script detection only), 1 (automatic page segmentation with OSD), 2 (automatic page segmentation, but no OSD or OCR), 3 (fully automatic page segmentation
|
|
88
|
+
but no OSD), 4 (assume a single column of text), 5 (assume a single uniform block of vertically aligned text), 7 (treat the image as a single text line), 8 (treat the image as a single word), 9 (treat the image as a single word in a circle), 10 (treat the image as a single character), 11 (sparse text, find as much text as possible in no particular order), 12 (sparse text with OSD), and 13 (raw line, treat the image as a single text line bypassing Tesseract-specific hacks).
|
|
89
|
+
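preprocess (bool): Whether to apply preprocessing to the input image before performing OCR. Default is True. Preprocessing includes converting the image to grayscale, enhancing contrast with histogram equalization, reducing noise with Gaussian blur, and applying adaptive thresholding.
use_gpu (bool): Whether to enable OpenCV's OpenCL flag when the detector is created. Default is False.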
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
def __init__(
    self,
    image: np.ndarray,
    lang: str = "eng",
    oem: int = 3,
    psm: int = 6,
    preprocess: bool = True,
    use_gpu: bool = False,
):
    """
    Create a detector bound to one image and one Tesseract configuration.

    Args:
        image (np.ndarray): Image to run OCR on; a copy is kept as `original_image`.
        lang (str): Tesseract language string, e.g. "eng" or "eng+chi_sim".
        oem (int): Tesseract OCR engine mode.
        psm (int): Tesseract page segmentation mode.
        preprocess (bool): When True, `self.image` is replaced by the output of `_preprocess`.
        use_gpu (bool): When True, OpenCV's OpenCL flag is switched on.
    """
    self.set_image(image)

    self.lang, self.oem, self.psm = lang, oem, psm
    self.preprocess_enabled = preprocess

    frame_height, frame_width = image.shape[:2]
    self.height = frame_height
    self.width = frame_width

    # Command-line style options handed to every pytesseract call.
    self.config = f"--oem {oem} --psm {psm} -l {lang}"

    # Enable OpenCL (GPU acceleration if supported)
    if use_gpu:
        cv2.ocl.setUseOpenCL(True)

    # Only self.image is preprocessed; original_image keeps the untouched copy.
    if preprocess:
        self.image = self._preprocess(self.image)
|
|
116
|
+
|
|
117
|
+
def set_image(self, image: np.ndarray):
    """
    Replace the image this detector works on.

    Stores `image` as `self.image`, keeps an independent copy in
    `self.original_image` (the copy the drawing methods annotate), and
    refreshes the cached `self.height` / `self.width`.

    Args:
        image (np.ndarray): The new input image.
    """
    self.image = image
    self.original_image = image.copy()
    # Keep the cached dimensions in sync with the new image; otherwise code
    # that reads self.height / self.width after a swap sees the old size.
    self.height, self.width = image.shape[:2]
|
|
123
|
+
|
|
124
|
+
# PREPROCESSING
|
|
125
|
+
def _preprocess(self, image: np.ndarray) -> np.ndarray:
    """
    Preprocess the input image for better OCR results.

    Pipeline: grayscale conversion, histogram equalization, 3x3 Gaussian
    blur, then Gaussian adaptive thresholding (block size 11, C = 2).

    Args:
        image (np.ndarray): The input image; a multi-channel image is treated
            as BGR, a 2-D array is used as grayscale directly.

    Returns:
        np.ndarray: The thresholded single-channel image.
    """
    # A 2-D array is already grayscale; cvtColor(BGR2GRAY) would reject it.
    if image.ndim == 2:
        gray = image
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Contrast enhancement
    gray = cv2.equalizeHist(gray)

    # Noise reduction
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)

    # Adaptive threshold
    return cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
|
|
149
|
+
|
|
150
|
+
# INTERNATIONALIZATION
|
|
151
|
+
def set_language(self, lang: str):
    """
    Change the OCR language(s) and rebuild the Tesseract option string.

    Several languages can be combined with a plus sign. Which languages are
    usable depends on the local Tesseract installation
    (`tesseract --list-langs` prints them).

    Args:
        lang (str): Tesseract language string.

    Returns:
        None

    Example:
        'eng'
        'eng+chi_sim'
        'eng+mal+tam'
    """
    self.lang = lang
    engine_mode, segmentation_mode = self.oem, self.psm
    self.config = f"--oem {engine_mode} --psm {segmentation_mode} -l {lang}"
|
|
170
|
+
|
|
171
|
+
def detect_text(self):
    """
    Run Tesseract on `self.image` with the current configuration.

    Returns:
        str: The recognized text with leading and trailing whitespace removed.
    """
    return pytesseract.image_to_string(self.image, config=self.config).strip()
|
|
179
|
+
|
|
180
|
+
def detect_characters(
    self,
    draw_boxes=True,
    is_dark_background=False,
    adjust_text_height=20,
    bounding_box_color=(255, 0, 0),
    text_color=(255, 0, 0),
    font_scale=1,
    font_thickness=2,
    font=cv2.FONT_HERSHEY_SIMPLEX,
):
    """
    Detect individual alphanumeric characters and optionally annotate them.

    Args:
        draw_boxes (bool): Draw a rectangle and label per character on
            `self.original_image`.
        is_dark_background (bool): OCR an inverted version of `self.image`.
        adjust_text_height (int): Pixels the label is lifted above the box top.
        bounding_box_color (tuple): Color passed to `cv2.rectangle`.
        text_color (tuple): Color passed to `cv2.putText`.
        font_scale (float): Font scale for the label.
        font_thickness (int): Stroke thickness for the label.
        font (int): OpenCV font identifier.

    Returns:
        tuple: `(results, self.original_image)`; each result is a dict with
        keys "char", "x1", "y1", "x2", "y2", where the y values have been
        flipped to a top-left origin.
    """
    # Invert a local image only. Writing the inverted result back to
    # self.image would flip the polarity again on every later call.
    ocr_image = cv2.bitwise_not(self.image) if is_dark_background else self.image

    raw_boxes = pytesseract.image_to_boxes(ocr_image, config=self.config)

    # Tesseract box rows use a bottom-left origin relative to the image that
    # was OCR'd, so flip y against that image's height rather than a cached one.
    img_height = ocr_image.shape[0]

    results = []
    for line in raw_boxes.splitlines():
        parts = line.split()
        if len(parts) < 5:
            continue

        char = parts[0]
        # Skip punctuation / noise.
        if not char.isalnum():
            continue

        x1, y1, x2, y2 = map(int, parts[1:5])
        y1_cv = img_height - y1  # bottom edge in OpenCV coordinates
        y2_cv = img_height - y2  # top edge in OpenCV coordinates
        results.append({"char": char, "x1": x1, "y1": y1_cv, "x2": x2, "y2": y2_cv})

        if not draw_boxes:
            continue

        cv2.rectangle(
            self.original_image, (x1, y2_cv), (x2, y1_cv), bounding_box_color, 2
        )
        cv2.putText(
            self.original_image,
            char,
            (x1, y2_cv - adjust_text_height),
            font,
            font_scale,
            text_color,
            font_thickness,
            cv2.LINE_AA,
        )

    return results, self.original_image
|
|
242
|
+
|
|
243
|
+
def detect_digits(
    self,
    img,
    draw_boxes=True,
):
    """
    Detect digits in `img` using Tesseract's "digits" configuration.

    Args:
        img (np.ndarray): Image to OCR; it is annotated in place when
            `draw_boxes` is True.
        draw_boxes (bool): Draw a rectangle and label for every detected symbol.

    Returns:
        tuple: `(digit_text, img)` where `digit_text` is the list of
        recognized symbols in the order Tesseract reported them.
    """
    # shape[0] works for both grayscale (2-D) and color (3-D) images.
    img_height = img.shape[0]

    # Use a local option string: assigning it to self.config would silently
    # replace the language/oem/psm settings for every later OCR call.
    digits_config = r"--oem 3 --psm 6 outputbase digits"
    boxes = pytesseract.image_to_boxes(img, config=digits_config)

    digit_text = []
    for row in boxes.splitlines():
        fields = row.split()
        # A valid box row is "<char> <x1> <y1> <x2> <y2> ..."; skip anything shorter.
        if len(fields) < 5:
            continue

        symbol = fields[0]
        digit_text.append(symbol)
        if not draw_boxes:
            continue

        x1, y1, x2, y2 = (int(value) for value in fields[1:5])
        # Flip y: box rows use a bottom-left origin.
        cv2.rectangle(
            img, (x1, img_height - y1), (x2, img_height - y2), (50, 50, 255), 2
        )
        cv2.putText(
            img,
            symbol,
            (x1, img_height - y1 + 25),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (50, 50, 255),
            2,
        )
    return digit_text, img
|
|
268
|
+
|
|
269
|
+
def detect_words(
    self,
    draw_boxes=True,
    adjust_text_height=20,
    bounding_box_color=(255, 0, 0),
    text_color=(255, 0, 0),
    font_scale=1,
    font_thickness=2,
    font=cv2.FONT_HERSHEY_SIMPLEX,
):
    """
    Detect words with their confidence and bounding boxes.

    Parses the tab-separated output of `pytesseract.image_to_data`; only rows
    that split into exactly 12 fields (i.e. rows carrying a word) are kept.

    Args:
        draw_boxes (bool): Draw a rectangle and label per word on
            `self.original_image`.
        adjust_text_height (int): Pixels the label is lifted above the box top.
        bounding_box_color (tuple): Color passed to `cv2.rectangle`.
        text_color (tuple): Color passed to `cv2.putText`.
        font_scale (float): Font scale for the label.
        font_thickness (int): Stroke thickness for the label.
        font (int): OpenCV font identifier.

    Returns:
        tuple: `(results, self.original_image)`; each result is a dict with
        keys "text", "conf", "x", "y", "w", "h".
    """
    results = []
    tsv_output = pytesseract.image_to_data(self.image, config=self.config)

    # The first row is the column header.
    for row in tsv_output.splitlines()[1:]:
        fields = row.split()
        if len(fields) != 12:
            continue

        word = fields[11]
        left, top, box_w, box_h = map(int, fields[6:10])
        confidence = float(fields[10])
        results.append(
            {
                "text": word,
                "conf": confidence,
                "x": left,
                "y": top,
                "w": box_w,
                "h": box_h,
            }
        )

        if not draw_boxes:
            continue

        cv2.rectangle(
            self.original_image,
            (left, top),
            (left + box_w, top + box_h),
            bounding_box_color,
            2,
        )
        cv2.putText(
            self.original_image,
            word,
            (left, top - adjust_text_height),
            font,
            font_scale,
            text_color,
            font_thickness,
            cv2.LINE_AA,
        )

    return results, self.original_image
|
|
313
|
+
|
|
314
|
+
def image_to_osd(self) -> dict[str, Any]:
    """
    Return Tesseract's Orientation and Script Detection (OSD) result.

    Requires `osd.traineddata` in the Tesseract tessdata directory.

    Example:
        results = detector.image_to_osd()
        print("[INFO] detected orientation: {}".format(results["orientation"]))
        print("[INFO] rotate by {} degrees to correct".format(results["rotate"]))
        print("[INFO] detected script: {}".format(results["script"]))

    Returns:
        dict[str, Any]: The OSD fields reported by pytesseract.
    """
    osd = pytesseract.image_to_osd(self.image, output_type=pytesseract.Output.DICT)

    # With Output.DICT pytesseract already returns a mapping; calling
    # str.split on it (as the previous code did) raised AttributeError.
    if isinstance(osd, dict):
        return dict(osd)

    # Defensive fallback: parse "key: value" lines if text came back instead.
    parsed = {}
    for line in str(osd).splitlines():
        key, separator, value = line.partition(":")
        if separator:
            parsed[key.strip()] = value.strip()
    return parsed
|
|
340
|
+
|
|
341
|
+
def image_to_pdf_or_hocr(self, extension: str = "pdf") -> bytes:
    """
    Render `self.original_image` through Tesseract as a PDF or hOCR document.

    Args:
        extension (str): 'pdf' or 'hocr'.

    Returns:
        bytes: The document produced by pytesseract.
    """
    # NOTE(review): this reads original_image (which the draw_* paths annotate
    # in place) rather than self.image — confirm that is intended.
    render_options = {"extension": extension, "config": self.config}
    return pytesseract.image_to_pdf_or_hocr(self.original_image, **render_options)
|
|
348
|
+
|
|
349
|
+
def image_to_alto_xml(self) -> str:
    """
    Run OCR on `self.image` and return the result as ALTO XML.

    ALTO (Analyzed Layout and Text Object) is an XML format describing the
    layout and content of text in scanned documents.

    Returns:
        str: The ALTO XML document produced by pytesseract.
        NOTE(review): pytesseract appears to return `bytes` here — confirm
        and adjust the annotation if so.
    """
    alto_document = pytesseract.image_to_alto_xml(self.image, config=self.config)
    return alto_document
|
|
358
|
+
|
|
359
|
+
# # NLP-BASED METHODS (REQUIRE SPACY)
|
|
360
|
+
|
|
361
|
+
def clean_text(self, text=None):
    """
    Collapse all whitespace runs (including newlines) into single spaces.

    Args:
        text (str, optional): Text to clean. When omitted, the text is
            obtained from `self.image_to_string()` (not visible in this part
            of the class — presumably defined further down).

    Returns:
        str: The text with normalized spacing and no leading/trailing whitespace.
    """
    source_text = self.image_to_string() if text is None else text
    # split() with no argument drops every whitespace run, newlines included,
    # so joining the pieces yields the normalized string directly.
    return " ".join(source_text.split())
|
|
375
|
+
|
|
376
|
+
def _get_doc(self, text=None):
    """
    Parse text with the module-level spaCy pipeline.

    Args:
        text (str, optional): Text to parse; when omitted it is read via
            `self.image_to_string()`.

    Returns:
        The object returned by `NLP(text)`, or None when spaCy is unavailable.
    """
    if NLP is None:
        return None
    return NLP(self.image_to_string() if text is None else text)
|
|
382
|
+
|
|
383
|
+
def extract_entities(self, text: str | None = None):
|
|
384
|
+
"""
|
|
385
|
+
Extract named entities from the detected text using spaCy's NLP capabilities. This method takes the text extracted from the input image (either provided as an argument or obtained by calling the `detect_text` method) and processes it using a spaCy language model to identify and extract named entities such as people, organizations, locations, dates, etc. The method returns a list of dictionaries, where each dictionary contains the extracted entity text and its corresponding label (e.g., "PERSON", "ORG", "GPE", etc.). If spaCy is not installed or the language model cannot be loaded, the method will return an empty list.
|
|
386
|
+
Args:
|
|
387
|
+
text (str, optional): The text from which to extract entities. If not provided, the method will call `detect_text` to obtain the text from the input image.
|
|
388
|
+
Returns:
|
|
389
|
+
List[Dict[str, str]]: A list of dictionaries, each containing the extracted entity text and its corresponding label. For example: [{"text": "John Doe", "label": "
|
|
390
|
+
"""
|
|
391
|
+
doc = self._get_doc(text)
|
|
392
|
+
if not doc:
|
|
393
|
+
return []
|
|
394
|
+
|
|
395
|
+
return [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
|
|
396
|
+
|
|
397
|
+
def extract_keywords(self, text=None):
    """
    Extract keywords (non-stop-word nouns and proper nouns) using spaCy.

    Args:
        text (str, optional): Text to analyze; when omitted, `_get_doc`
            obtains it from the image.

    Returns:
        list[str]: Token texts whose part of speech is NOUN or PROPN and that
        are not stop words. Empty when no document is available.
    """
    doc = self._get_doc(text)
    keywords = []
    if not doc:
        return keywords

    wanted_pos = ("NOUN", "PROPN")
    for token in doc:
        if token.pos_ not in wanted_pos:
            continue
        if token.is_stop:
            continue
        keywords.append(token.text)
    return keywords
|
|
416
|
+
|
|
417
|
+
def detect_text_from_nosisy_image(self):
    """
    Detect text from a noisy or low-contrast image.

    Converts `self.image` to a grayscale PIL image, applies a median filter,
    doubles the contrast and runs OCR on the result.

    Returns:
        str: The extracted text with leading and trailing whitespace removed.
    """
    source = self.image
    if isinstance(source, np.ndarray):
        # Image.open() expects a path or file object, not an array; arrays
        # must go through Image.fromarray().
        if source.ndim == 3 and source.shape[2] == 3:
            # assumes OpenCV BGR channel order, as _preprocess does
            source = cv2.cvtColor(source, cv2.COLOR_BGR2RGB)
        elif source.ndim == 3 and source.shape[2] == 4:
            source = cv2.cvtColor(source, cv2.COLOR_BGRA2RGBA)
        img = Image.fromarray(source)
    else:
        # Still accept a path / file object, which is what the old code handled.
        img = Image.open(source)

    # Convert the image to grayscale
    img = img.convert("L")

    # Apply a median filter to reduce noise
    img = img.filter(ImageFilter.MedianFilter())

    # Enhance the image contrast
    img = ImageEnhance.Contrast(img).enhance(2)

    # NOTE(review): this call uses Tesseract defaults rather than self.config
    # (language / psm / oem) — confirm whether that is intended.
    text = pytesseract.image_to_string(img)

    return text.strip()
|
|
444
|
+
|
|
445
|
+
def summarize(self, text=None, max_sentences=3):
    """
    Build a naive summary from the leading sentences of the text.

    The text is split into sentences by spaCy and the first `max_sentences`
    of them are joined with single spaces; no relevance ranking is applied.

    Args:
        text (str, optional): Text to summarize; when omitted, `_get_doc`
            obtains it from the image.
        max_sentences (int): The maximum number of sentences to include.

    Returns:
        str: The joined leading sentences, or an empty string when no
        document is available.
    """
    doc = self._get_doc(text)
    if not doc:
        return ""

    leading_sentences = list(doc.sents)[:max_sentences]
    return " ".join(map(str, leading_sentences))
|
|
462
|
+
|
|
463
|
+
def extract_relations(self, text=None):
    """
    Subject-verb-object extraction.

    For every token whose dependency label is "ROOT", collects the left
    children labelled "nsubj"/"nsubjpass" as subjects and the right children
    labelled "dobj"/"attr" as objects. A relation is recorded only when both
    lists are non-empty.

    Args:
        text (str, optional): Text to analyze; when omitted, `_get_doc`
            obtains it from the image.

    Returns:
        list[dict[str, Any]]: Dicts with keys "subject" (list of str),
        "verb" (str) and "object" (list of str). Empty when no document is
        available.
    """
    doc = self._get_doc(text)
    if not doc:
        return []

    subject_deps = ("nsubj", "nsubjpass")
    object_deps = ("dobj", "attr")

    relations = []
    for root in (tok for tok in doc if tok.dep_ == "ROOT"):
        subjects = [child.text for child in root.lefts if child.dep_ in subject_deps]
        objects = [child.text for child in root.rights if child.dep_ in object_deps]
        if not (subjects and objects):
            continue
        relations.append({"subject": subjects, "verb": root.text, "object": objects})

    return relations
|
|
492
|
+
|
|
493
|
+
def group_entities(self, text=None):
    """
    Group extracted entities by their labels.

    Args:
        text (str, optional): Text to analyze; forwarded to `extract_entities`.

    Returns:
        dict[str, list[str]]: Entity label -> entity texts, in the order the
        entities were returned. Empty when `extract_entities` yields nothing.
    """
    grouped = {}
    for entity in self.extract_entities(text):
        label = entity["label"]
        if label not in grouped:
            grouped[label] = []
        grouped[label].append(entity["text"])
    return grouped
|
|
506
|
+
|
|
507
|
+
def enable_gpu(self):
    """
    Turn on OpenCV's OpenCL flag.

    This is a process-wide OpenCV setting, not a per-detector one.
    NOTE(review): whether it actually accelerates anything depends on the
    OpenCV build, the drivers and the operations used — confirm before
    relying on it.
    """
    use_opencl = True
    cv2.ocl.setUseOpenCL(use_opencl)
|
|
514
|
+
|
|
515
|
+
def disable_gpu(self):
    """
    Turn off OpenCV's OpenCL flag.

    This is a process-wide OpenCV setting, not a per-detector one. Useful
    when GPU resources are limited or when debugging OpenCL-related issues.
    """
    use_opencl = False
    cv2.ocl.setUseOpenCL(use_opencl)
|
|
521
|
+
|
|
522
|
+
def get_confidence(self) -> float:
    """
    Calculate the average confidence of the detected words.

    Returns:
        float: Mean of the per-word "conf" values reported by `detect_words`,
        or 0.0 when no words are detected.
    """
    # This is a read-only query: ask detect_words not to draw, otherwise every
    # call would paint boxes and labels onto self.original_image.
    words, _ = self.detect_words(draw_boxes=False)
    if not words:
        return 0.0

    return sum(word["conf"] for word in words) / len(words)
|
|
533
|
+
|
|
534
|
+
def get_words(self) -> list[str]:
    """
    Retrieve the text of the detected words.

    Returns:
        list[str]: The "text" value of every entry returned by `detect_words`.
    """
    # Read-only query: disable drawing so self.original_image is not annotated
    # as a side effect of merely listing the words.
    words, _ = self.detect_words(draw_boxes=False)
    return [word["text"] for word in words]
|
|
542
|
+
|
|
543
|
+
def get_lines(self) -> list[str]:
    """
    Retrieve the non-empty lines of text detected in the image.

    The full text comes from `self.image_to_string()`; it is split on
    newlines, each piece is stripped, and blank pieces are dropped.

    Returns:
        list[str]: The stripped, non-empty lines in their original order.
    """
    full_text = self.image_to_string()
    stripped_lines = (piece.strip() for piece in full_text.split("\n"))
    return [piece for piece in stripped_lines if piece]
|
|
551
|
+
|
|
552
|
+
def to_dataframe(self):
    """
    Wrap the OCR data table in a pandas DataFrame.

    The data comes from `self.image_to_data()` (not visible in this part of
    the class) and is passed to `pd.DataFrame` unchanged.

    Returns:
        pd.DataFrame: A DataFrame built from that data.
    """
    ocr_table = self.image_to_data()
    frame = pd.DataFrame(ocr_table)
    return frame
|
|
558
|
+
|
|
559
|
+
def detect_document(self):
    """
    Detect a four-cornered document outline in `self.image`.

    Runs Canny edge detection, takes the five largest contours by area and
    returns the first one whose polygon approximation has four vertices.

    Returns:
        np.ndarray | None: The four-point approximation returned by
        `cv2.approxPolyDP`, or None when no such contour is found.
    """
    frame = self.image
    # With preprocess=True self.image is already single-channel; passing it to
    # cvtColor(BGR2GRAY) would raise, so only convert multi-channel input.
    gray = frame if frame.ndim == 2 else cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    edged = cv2.Canny(gray, 75, 200)

    contours = imutils.grab_contours(
        cv2.findContours(edged, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    )

    # Largest regions first: the document is expected to be among the biggest.
    candidates = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

    for contour in candidates:
        perimeter = cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, 0.02 * perimeter, True)

        if len(approx) == 4:
            return approx

    return None
|
|
582
|
+
|
|
583
|
+
@staticmethod
def fallback_ssim(image1, image2, form_name, draw_frame=False):
    """
    Compare two images with structural similarity (SSIM) when feature matching is not possible.

    The function takes no `self`; it is declared as a static method so that it receives
    exactly the arguments written at the call site, whether it is reached through the class
    or through an instance.

    Args:
        image1: Reference BGR image; `image2` is resized to its width and height.
        image2: BGR image to compare against `image1`.
        form_name: Label used in the title of the optional debug window.
        draw_frame: When True, show the SSIM map in a blocking OpenCV window.

    Returns:
        dict: "matches" (always 0), "homography" (always None), "aligned_image"
        (`image2` resized to the size of `image1`) and "ssim_score" (the SSIM score).
    """
    image2_resized = cv2.resize(image2, (image1.shape[1], image1.shape[0]))

    gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(image2_resized, cv2.COLOR_BGR2GRAY)

    score, diff = ssim(gray1, gray2, full=True)

    if draw_frame:
        # The SSIM map can contain negative values; clip before the uint8 cast so they
        # do not wrap around to bright pixels in the preview.
        diff = (np.clip(diff, 0.0, 1.0) * 255).astype("uint8")
        cv2.imshow(f"{form_name} - SSIM Diff (score={score:.3f})", diff)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": 0,
        "homography": None,
        "aligned_image": image2_resized,
        "ssim_score": score,
    }
|
|
604
|
+
|
|
605
|
+
def compare_matches_knn_matcher(
    self,
    image2,
    form_name,
    no_of_feature=500,
    matched_amount=50,
    percentage_of_matches=20,
    draw_matches=False,
    draw_aligned=False,
):
    """
    Align `image2` to `self.image` using ORB keypoints, KNN matching and a RANSAC homography.

    Args:
        image2: Image to compare against and warp onto `self.image`.
        form_name: Label used in the titles of the optional debug windows.
        no_of_feature: Passed as `features` to `detect_keypoints` for both images.
        matched_amount: Maximum number of matches drawn in the visualisation.
        percentage_of_matches: Percentage of the ratio-test survivors (best first) kept for
            the homography; never fewer than four are kept.
        draw_matches: When True, show the match visualisation in a blocking window.
        draw_aligned: When True, show the warped image in a blocking window.

    Returns:
        dict: On success, the keys "matches", "homography", "matched_image" and
        "aligned_image". When descriptors are missing, fewer than four matches survive the
        ratio test, or no homography can be estimated, the result of `fallback_ssim` is
        returned instead.
    """
    text_form_2 = TextDetector(image2, preprocess=False)

    keypoints1, descriptors1, _ = self.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )
    keypoints2, descriptors2, _ = text_form_2.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )

    # fallback_ssim takes (image1, image2, form_name) and no instance, so resolve it on the
    # class; going through the bound instance would shift every argument by one.
    fallback = type(self).fallback_ssim

    if descriptors1 is None or descriptors2 is None:
        print("Feature detection failed → using SSIM fallback")
        return fallback(self.image, image2, form_name)

    bf = cv2.BFMatcher(cv2.NORM_HAMMING)
    matches = bf.knnMatch(descriptors1, descriptors2, k=2)

    # Ratio test. knnMatch can return fewer than two neighbours for a query descriptor
    # (e.g. when the second image has a single descriptor), so skip those entries.
    good_matches = []
    for pair in matches:
        if len(pair) < 2:
            continue
        best, second_best = pair
        if best.distance < 0.75 * second_best.distance:
            good_matches.append(best)

    if len(good_matches) < 4:
        print("Not enough matches → using fallback")
        return fallback(self.image, image2, form_name)

    good_matches = sorted(good_matches, key=lambda x: x.distance)

    # Keep the best percentage, but never fewer than the four points a homography needs.
    keep_n = int(len(good_matches) * (percentage_of_matches / 100))
    good_matches = good_matches[: max(keep_n, 4)]

    matchedImage = cv2.drawMatches(
        self.image,
        keypoints1,
        image2,
        keypoints2,
        good_matches[:matched_amount],
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
    )

    sourcePoints = np.float32(
        [keypoints1[m.queryIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)
    destinationPoints = np.float32(
        [keypoints2[m.trainIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)

    # Maps points of image2 onto self.image.
    M, mask = cv2.findHomography(destinationPoints, sourcePoints, cv2.RANSAC, 5.0)

    if M is None:
        print("Homography could not be computed so it will be using fallback")
        return fallback(self.image, image2, form_name)

    h, w = self.image.shape[:2]
    imageTransformed = cv2.warpPerspective(image2, M, (w, h))

    if draw_matches:
        # Scale the side-by-side canvas by its own dimensions to keep its aspect ratio.
        mh, mw = matchedImage.shape[:2]
        matchedImage_small = cv2.resize(
            matchedImage, (max(1, mw // 3), max(1, mh // 3))
        )
        cv2.imshow(f"{form_name} - Matches (Inliers)", matchedImage_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    if draw_aligned:
        imageTransformed_small = cv2.resize(
            imageTransformed, (max(1, w // 3), max(1, h // 3))
        )
        cv2.imshow(f"{form_name} - Aligned", imageTransformed_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": len(good_matches),
        "homography": M,
        "matched_image": matchedImage,
        "aligned_image": imageTransformed,
    }
|
|
703
|
+
|
|
704
|
+
def compare_matches_bf_matcher(
    self,
    image2,
    form_name,
    no_of_feature=500,
    matched_amount=50,
    percentage_of_matches=20,
    draw_matches=False,
    draw_aligned=False,
):
    """
    Align `image2` to `self.image` using ORB keypoints, brute-force matching and a RANSAC homography.

    Unlike `compare_matches_knn_matcher`, this variant takes the single best match per
    descriptor (`BFMatcher.match`) and applies no ratio test.

    Args:
        image2: Image to compare against and warp onto `self.image`.
        form_name: Label used in the titles of the optional debug windows.
        no_of_feature: Passed as `features` to `detect_keypoints` for both images.
        matched_amount: Maximum number of matches drawn in the visualisation.
        percentage_of_matches: Percentage of the matches (best first) kept for the
            homography; never fewer than four are kept.
        draw_matches: When True, show the match visualisation in a blocking window.
        draw_aligned: When True, show the warped image in a blocking window.

    Returns:
        dict: On success, the keys "matches", "homography", "matched_image" and
        "aligned_image". When descriptors are missing, fewer than four matches are found,
        or no homography can be estimated, the result of `fallback_ssim` is returned instead.
    """
    text_form_2 = TextDetector(image2, preprocess=False)

    keypoints1, descriptors1, _ = self.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )
    keypoints2, descriptors2, _ = text_form_2.detect_keypoints(
        features=no_of_feature, draw_keypoints=False
    )

    # fallback_ssim takes (image1, image2, form_name) and no instance, so resolve it on the
    # class; going through the bound instance would shift every argument by one.
    fallback = type(self).fallback_ssim

    if descriptors1 is None or descriptors2 is None:
        print("Feature detection failed → using SSIM fallback")
        return fallback(self.image, image2, form_name)

    # Plain brute-force matching on Hamming distance: one best match per query descriptor.
    bf = cv2.BFMatcher(cv2.NORM_HAMMING)
    matches = bf.match(descriptors1, descriptors2)

    # A homography needs at least four correspondences; bail out before findHomography
    # would reject the input.
    if len(matches) < 4:
        print("Not enough matches → using fallback")
        return fallback(self.image, image2, form_name)

    good_matches = sorted(matches, key=lambda x: x.distance)

    # Keep the best percentage, but never fewer than four.
    keep_n = int(len(good_matches) * (percentage_of_matches / 100))
    good_matches = good_matches[: max(keep_n, 4)]

    matchedImage = cv2.drawMatches(
        self.image,
        keypoints1,
        image2,
        keypoints2,
        good_matches[:matched_amount],
        None,
        flags=cv2.DrawMatchesFlags_NOT_DRAW_SINGLE_POINTS,
    )

    sourcePoints = np.float32(
        [keypoints1[m.queryIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)
    destinationPoints = np.float32(
        [keypoints2[m.trainIdx].pt for m in good_matches]
    ).reshape(-1, 1, 2)

    # Maps points of image2 onto self.image.
    M, mask = cv2.findHomography(destinationPoints, sourcePoints, cv2.RANSAC, 5.0)

    if M is None:
        print("Homography could not be computed so it will be using fallback")
        return fallback(self.image, image2, form_name)

    h, w = self.image.shape[:2]
    imageTransformed = cv2.warpPerspective(image2, M, (w, h))

    if draw_matches:
        # Scale the side-by-side canvas by its own dimensions to keep its aspect ratio.
        mh, mw = matchedImage.shape[:2]
        matchedImage_small = cv2.resize(
            matchedImage, (max(1, mw // 3), max(1, mh // 3))
        )
        cv2.imshow(f"{form_name} - Matches (Inliers)", matchedImage_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    if draw_aligned:
        imageTransformed_small = cv2.resize(
            imageTransformed, (max(1, w // 3), max(1, h // 3))
        )
        cv2.imshow(f"{form_name} - Aligned", imageTransformed_small)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {
        "matches": len(good_matches),
        "homography": M,
        "matched_image": matchedImage,
        "aligned_image": imageTransformed,
    }
|
|
794
|
+
|
|
795
|
+
def detect_tables(self):
    """
    Detect table-like regions via morphological line extraction and OCR each of them.

    Horizontal and vertical strokes are isolated from the processed image with morphological
    opening (40x1 and 1x40 rectangular kernels), merged into one mask, and the bounding box
    of every external contour of that mask is cropped from `self.image` and passed to
    Tesseract with `self.config`.

    Returns:
        List[str]: One OCR string per detected region, in contour order.
    """
    img = self.get_processed_image()

    horizontal = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    vertical = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))

    h_lines = cv2.morphologyEx(img, cv2.MORPH_OPEN, horizontal)
    v_lines = cv2.morphologyEx(img, cv2.MORPH_OPEN, vertical)

    # cv2.add saturates at 255; `h_lines + v_lines` on uint8 arrays wraps modulo 256, which
    # can erase pixels where both line images are bright (e.g. 128 + 128 -> 0).
    mask = cv2.add(h_lines, v_lines)

    cnts = imutils.grab_contours(
        cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    )

    tables = []
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        roi = self.image[y : y + h, x : x + w]
        tables.append(pytesseract.image_to_string(roi, config=self.config))

    return tables
|
|
822
|
+
|
|
823
|
+
def analyze_layout(self):
    """
    Split the detected words into titles and paragraphs using their box height.

    Words come from `detect_words(draw_boxes=False)`. Two result shapes are accepted:
    a list of word dicts (text under "text", height under "h" or "height"), and a dict of
    parallel lists with "text" and "height" keys. Words whose text is blank are skipped.

    Returns:
        dict: {"titles": [...], "paragraphs": [...]} where a word taller than 30 pixels is
        treated as a title and every other word as paragraph text.
    """
    data, _ = self.detect_words(draw_boxes=False)
    layout = {"titles": [], "paragraphs": []}

    if isinstance(data, dict):
        # Column-oriented result: parallel "text" / "height" lists.
        entries = zip(data["text"], data["height"])
    else:
        # Row-oriented result: one dict per word.
        entries = (
            (word["text"], word.get("h", word.get("height", 0))) for word in data
        )

    for text, size in entries:
        if not text.strip():
            continue
        # Heuristic: large font = title.
        bucket = "titles" if size > 30 else "paragraphs"
        layout[bucket].append(text)

    return layout
|
|
848
|
+
|
|
849
|
+
def _preprocess_image_cursive(self, img: Any) -> np.ndarray:
    """
    Prepare an image for handwriting OCR.

    Steps: load from disk when a path is given, convert to grayscale, denoise, apply an
    inverted adaptive threshold, then a small morphological closing.

    Args:
        img: Either a file path (str) or an image array. A 2-D array is treated as already
            grayscale; anything else is converted from BGR.

    Returns:
        np.ndarray: The thresholded and closed single-channel image.

    Raises:
        FileNotFoundError: If `img` is a path that OpenCV cannot read.
    """
    if isinstance(img, str):
        path = img
        img = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising.
        if img is None:
            raise FileNotFoundError(f"Could not read image from path: {path}")

    # Convert to grayscale unless the input already is a single-channel image.
    gray = img if np.ndim(img) == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Denoise (important for cursive)
    gray = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)

    # Adaptive threshold (better for handwriting than global threshold)
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 31, 15
    )

    # Morphological closing to connect broken cursive strokes
    kernel = np.ones((2, 2), np.uint8)
    processed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel, iterations=1)

    return processed
|
|
869
|
+
|
|
870
|
+
def extract_cursive_text(self, image_input):
    """
    Run OCR on an image after the cursive-oriented preprocessing step.

    Args:
        image_input: Passed unchanged to `_preprocess_image_cursive`.

    Returns:
        tuple: (text returned by Tesseract, the preprocessed image that was fed to it).
    """
    prepared = self._preprocess_image_cursive(image_input)

    # Fixed Tesseract options for this path: OCR engine mode 3, page segmentation mode 6.
    recognised = pytesseract.image_to_string(prepared, config=r"--oem 3 --psm 6")

    return recognised, prepared
|
|
879
|
+
|
|
880
|
+
# ORB: Oriented FAST and Rotated BRIEF for keypoint detection and description: Provide a faster feature detection method.
|
|
881
|
+
# Why do we need Orb:
|
|
882
|
+
# 1. Speed: ORB is designed to be fast, making it suitable for real-time applications and large datasets.
|
|
883
|
+
# 2. Rotation Invariance: ORB is robust to rotation, which means it can detect features even when the image is rotated, making it more versatile for various applications.
|
|
884
|
+
# 3. Feature detection
|
|
885
|
+
# 4. Low memory usage: ORB is more memory-efficient compared to other feature detectors, which can be beneficial when working with limited resources or large images.
|
|
886
|
+
|
|
887
|
+
# License restrictions for SIFT and SURF: both were published as patented algorithms, so their use has been restricted by licensing terms (SIFT's patent expired in 2020; SURF is still shipped only in OpenCV's non-free contrib module).
|
|
888
|
+
# In contrast, ORB is an open-source algorithm that is not subject to such restrictions, making it freely available for use in both academic and commercial applications.
|
|
889
|
+
|
|
890
|
+
# # ORB Architecture:
|
|
891
|
+
|
|
892
|
+
# FAST Detector: ORB uses the FAST (Features from Accelerated Segment Test) algorithm for keypoint detection, which is a corner detection method that identifies points
|
|
893
|
+
# in the image where there is a significant change in intensity. FAST is known for its speed and efficiency in detecting keypoints, making it a suitable choice for real-time applications.
|
|
894
|
+
|
|
895
|
+
# Harris Corner Measure: ORB incorporates the Harris corner measure to filter and rank the detected keypoints. This measure evaluates the strength of the corners detected by FAST and helps in selecting the most relevant keypoints for further processing.
|
|
896
|
+
# By using the Harris corner measure, ORB can improve the quality of the detected features and enhance the overall performance of feature matching and recognition tasks.
|
|
897
|
+
|
|
898
|
+
# # BRIEF Descriptor: ORB uses the BRIEF (Binary Robust Independent Elementary Features) descriptor to describe the detected keypoints. BRIEF is a binary descriptor that encodes the local image patch around each keypoint into a compact binary string.
|
|
899
|
+
# The BRIEF descriptor is designed to be fast to compute and compare, making it suitable for real-time applications. It captures the intensity differences between pairs of points in the local image patch, storing the outcome of each comparison as one bit of the descriptor.
|
|
900
|
+
|
|
901
|
+
# Locality Sensitive Hashing (LSH) is a technique used in ORB to efficiently match features by hashing them into buckets based on their descriptors.
|
|
902
|
+
# This allows for faster retrieval of similar features during the matching process, improving the overall performance of feature detection and matching in ORB.
|
|
903
|
+
|
|
904
|
+
# Hamming distance is used in ORB to compare binary descriptors. It measures the number of bit positions at which the corresponding bits are different between
|
|
905
|
+
# two binary strings.
|
|
906
|
+
|
|
907
|
+
# In ORB, the descriptors are binary strings, and the Hamming distance is used to determine the similarity between two descriptors. A smaller Hamming distance indicates a closer match between the features.
|
|
908
|
+
def detect_keypoints(
    self, features=500, draw_keypoints=False, keypoint_color=(0, 255, 0)
):
    """
    Detect ORB keypoints and compute their descriptors on `self.image`.

    Args:
        features: Passed to `cv2.ORB_create` as `nfeatures`.
        draw_keypoints: When True, `self.image` is replaced by a copy with the keypoints
            drawn on it (rich-keypoint style).
        keypoint_color: Colour used when drawing the keypoints.

    Returns:
        tuple: (keypoints, descriptors, self.image), where `self.image` is the annotated
        image if `draw_keypoints` was set and the untouched image otherwise.
    """
    detector = cv2.ORB_create(nfeatures=features)
    keypoints, descriptors = detector.detectAndCompute(self.image, None)

    if not draw_keypoints:
        return keypoints, descriptors, self.image

    # NOTE: this overwrites the stored image with the annotated version.
    self.image = cv2.drawKeypoints(
        self.image,
        keypoints,
        None,
        color=keypoint_color,
        flags=cv2.DrawMatchesFlags_DRAW_RICH_KEYPOINTS,
    )
    return keypoints, descriptors, self.image
|
|
922
|
+
|
|
923
|
+
# Image Utilities
|
|
924
|
+
def resize(self, width=None, height=None):
    """
    Resize `self.image` with `imutils.resize`, store the result and return it.

    Args:
        width: Forwarded to `imutils.resize` as `width`.
        height: Forwarded to `imutils.resize` as `height`.

    Returns:
        The resized image, which is also the new value of `self.image`.
    """
    resized = imutils.resize(self.image, width=width, height=height)
    self.image = resized
    return self.image
|
|
927
|
+
|
|
928
|
+
def rotate(self, angle):
    """
    Rotate `self.image` with `imutils.rotate`, store the result and return it.

    Args:
        angle: Rotation angle forwarded to `imutils.rotate`.

    Returns:
        The rotated image, which is also the new value of `self.image`.
    """
    rotated = imutils.rotate(self.image, angle)
    self.image = rotated
    return self.image
|
|
931
|
+
|
|
932
|
+
def rotate_bound(self, angle):
    """
    Rotate `self.image` with `imutils.rotate_bound`, store the result and return it.

    Args:
        angle: Rotation angle forwarded to `imutils.rotate_bound`.

    Returns:
        The rotated image, which is also the new value of `self.image`.
    """
    rotated = imutils.rotate_bound(self.image, angle)
    self.image = rotated
    return self.image
|
|
935
|
+
|
|
936
|
+
def auto_canny(self, sigma=0.33):
    """
    Run Canny edge detection with thresholds derived from the image's median intensity.

    Args:
        sigma: Relative spread around the median; the thresholds are
            (1 - sigma) * median and (1 + sigma) * median, clamped to [0, 255].

    Returns:
        The edge map produced by `cv2.Canny` on the grayscale version of `self.image`.
    """
    grayscale = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)
    median = np.median(grayscale)

    low_threshold = int(max(0, (1.0 - sigma) * median))
    high_threshold = int(min(255, (1.0 + sigma) * median))

    edges = cv2.Canny(grayscale, low_threshold, high_threshold)
    return edges
|
|
942
|
+
|
|
943
|
+
def deskew(self):
    """
    Estimate the skew of `self.image` and rotate the image to compensate.

    The angle is taken from the minimum-area rectangle around all non-zero grayscale
    pixels; the image is then rotated with `imutils.rotate_bound` and stored.

    Returns:
        The rotated image, which is also the new value of `self.image`.
    """
    grayscale = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)

    # (row, col) coordinates of every non-zero pixel.
    rows, cols = np.where(grayscale > 0)
    points = np.column_stack((rows, cols))
    rect_angle = cv2.minAreaRect(points)[-1]

    # NOTE(review): the `< -45` branch presumes minAreaRect reports angles in [-90, 0);
    # confirm against the OpenCV version in use.
    if rect_angle < -45:
        correction = -(90 + rect_angle)
    else:
        correction = -rect_angle

    self.image = imutils.rotate_bound(self.image, correction)
    return self.image
|
|
952
|
+
|
|
953
|
+
# ─────────────────────────── NEW METHODS ───────────────────────────
|
|
954
|
+
|
|
955
|
+
def filter_words_by_confidence(self, min_conf=60.0):
    """Keep only the detected words whose confidence reaches a threshold.

    Args:
        min_conf: Minimum value of a word's "conf" entry for it to be kept.
    Returns:
        tuple: (list of kept word dicts, a fresh copy of `self.original_image`).
    """
    detected, _ = self.detect_words(draw_boxes=False)

    kept = []
    for entry in detected:
        if entry["conf"] >= min_conf:
            kept.append(entry)

    # The image returned is an unannotated copy of the original, not the detector's output.
    return kept, self.original_image.copy()
|
|
966
|
+
|
|
967
|
+
def detect_numbers(self, text=None):
    """Find every numeric token in a piece of text.

    A token is a run of digits, optionally followed by further digit groups joined with
    "." or ",", delimited by word boundaries.

    Args:
        text: String to scan. When None, the result of `detect_text()` is scanned instead.
    Returns:
        List[str]: The matching number strings in order of appearance.
    """
    import re

    number_pattern = re.compile(r"\b\d+(?:[.,]\d+)*\b")
    source = self.detect_text() if text is None else text
    return number_pattern.findall(source)
|
|
980
|
+
|
|
981
|
+
def detect_paragraphs(self):
    """Segment the original image into paragraph blocks and OCR each block.

    The image is binarised (inverted Otsu threshold), dilated horizontally and then
    vertically so neighbouring glyphs fuse into blobs, and the bounding box of every
    external contour is read with Tesseract. Boxes narrower than 30 px or shorter than
    10 px, and boxes that yield no text, are discarded.

    Returns:
        List[dict]: Each dict has keys 'bbox' (x, y, w, h) and 'text' (OCR string),
        sorted top-to-bottom and then left-to-right by the box origin.
    """
    gray = cv2.cvtColor(self.original_image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Dilate horizontally to merge words into lines, then vertically into paragraphs
    kernel_h = cv2.getStructuringElement(cv2.MORPH_RECT, (30, 1))
    kernel_v = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 20))
    dilated = cv2.dilate(binary, kernel_h, iterations=2)
    dilated = cv2.dilate(dilated, kernel_v, iterations=2)

    # cv2.findContours returns a different number of values across OpenCV major
    # versions; grab_contours picks out the contour list for all of them.
    contours = imutils.grab_contours(
        cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    )

    paragraphs = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w < 30 or h < 10:
            continue
        roi = self.original_image[y : y + h, x : x + w]
        text = pytesseract.image_to_string(roi, config=self.config).strip()
        if text:
            paragraphs.append({"bbox": (x, y, w, h), "text": text})

    return sorted(paragraphs, key=lambda p: (p["bbox"][1], p["bbox"][0]))
|
|
1008
|
+
|
|
1009
|
+
def export_to_csv(self, path="detections.csv"):
    """Write the word-level detections to a CSV file.

    The file gets a header row followed by one row per word returned by
    `detect_words(draw_boxes=False)`. Columns: text, conf, x, y, w, h.

    Args:
        path: Destination file; it is created or overwritten.
    Returns:
        str: Absolute path of the written file.
    """
    import csv
    import os

    columns = ["text", "conf", "x", "y", "w", "h"]
    rows, _ = self.detect_words(draw_boxes=False)

    with open(path, "w", newline="", encoding="utf-8") as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        for row in rows:
            sheet.writerow(row)

    return os.path.abspath(path)
|
|
1027
|
+
|
|
1028
|
+
def get_text_regions(self):
    """Return the bounding box, text and confidence of every non-empty OCR entry.

    Tesseract is run on `self.image` with page segmentation mode 11 and the instance's
    `oem` and `lang` settings. Entries whose text is blank after stripping are dropped.

    Returns:
        List[dict]: [{'bbox': (x, y, w, h), 'text': str, 'conf': float}]
    """
    tess_config = f"--oem {self.oem} --psm 11 -l {self.lang}"
    ocr = pytesseract.image_to_data(
        self.image, config=tess_config, output_type=pytesseract.Output.DICT
    )

    regions = []
    for idx, raw_text in enumerate(ocr["text"]):
        cleaned = raw_text.strip()
        if cleaned:
            box = (
                ocr["left"][idx],
                ocr["top"][idx],
                ocr["width"][idx],
                ocr["height"][idx],
            )
            regions.append(
                {"bbox": box, "text": cleaned, "conf": float(ocr["conf"][idx])}
            )
    return regions
|
|
1055
|
+
|
|
1056
|
+
def highlight_words(self, target_words, color=(0, 255, 0), thickness=2):
    """Outline and label selected words on a copy of the original image.

    Matching is case-insensitive and compares whole detected words.

    Args:
        target_words: Iterable of word strings to highlight.
        color: BGR colour of the rectangle and the label.
        thickness: Border thickness of the rectangle.
    Returns:
        Annotated copy of `self.original_image`.
    """
    detected, _ = self.detect_words(draw_boxes=False)
    canvas = self.original_image.copy()
    wanted = {candidate.lower() for candidate in target_words}

    for entry in detected:
        if entry["text"].lower() not in wanted:
            continue
        left, top = entry["x"], entry["y"]
        width, height = entry["w"], entry["h"]
        cv2.rectangle(canvas, (left, top), (left + width, top + height), color, thickness)
        # The label sits just above the box, clamped so it never leaves the image at the top.
        label_origin = (left, max(0, top - 5))
        cv2.putText(
            canvas,
            entry["text"],
            label_origin,
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            color,
            1,
            cv2.LINE_AA,
        )
    return canvas
|
|
1085
|
+
|
|
1086
|
+
# ─────────────────────────── UTILITY METHODS ───────────────────────────
|
|
1087
|
+
|
|
1088
|
+
def is_text_present(self, min_confidence=60.0):
    """Report whether any word reaches the confidence threshold.

    Args:
        min_confidence: Threshold forwarded to `filter_words_by_confidence`.
    Returns:
        bool: True when at least one word passes the filter; False when none does or when
        anything raises along the way.
    """
    try:
        outcome = self.filter_words_by_confidence(min_confidence)
        # Normally a (words, image) tuple; a bare sequence of words is tolerated as well.
        if isinstance(outcome, tuple):
            confident_words = outcome[0]
        else:
            confident_words = outcome
        return len(confident_words) > 0
    except Exception:
        return False
|
|
1104
|
+
|
|
1105
|
+
def extract_dates(self, text=None):
    """Collect date-like substrings from text.

    Recognised shapes: D/M/YYYY-style slashed dates, YYYY-MM-DD, "Month D, YYYY" and
    "D Month YYYY" (month names matched case-insensitively from their three-letter prefix).

    Args:
        text: String to scan. When None, the result of `detect_text()` is scanned instead.
    Returns:
        List[str]: Unique matches, grouped by pattern in the order listed above and in
        order of appearance within each pattern.
    """
    import re

    if text is None:
        text = self.detect_text()

    date_patterns = (
        r"\b\d{1,2}/\d{1,2}/\d{4}\b",
        r"\b\d{4}-\d{2}-\d{2}\b",
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},?\s+\d{4}\b",
        r"\b\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{4}\b",
    )

    # A dict keeps first-seen order while dropping repeats.
    unique = {}
    for expr in date_patterns:
        for hit in re.findall(expr, text, re.IGNORECASE):
            unique.setdefault(hit, None)
    return list(unique)
|
|
1126
|
+
|
|
1127
|
+
def extract_phone_numbers(self, text=None):
    """Collect phone-number-like substrings from text.

    Args:
        text: String to scan. When None, the result of `detect_text()` is scanned instead.
    Returns:
        List[str]: Every match, in order of appearance.
    """
    import re

    if text is None:
        text = self.detect_text()

    # Optional country code, optional (possibly parenthesised) area code, then two digit groups.
    phone_pattern = re.compile(
        r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4,6}"
    )
    return phone_pattern.findall(text)
|
|
1142
|
+
|
|
1143
|
+
def extract_emails(self, text=None):
    """Collect e-mail addresses from text.

    Args:
        text: String to scan. When None, the result of `detect_text()` is scanned instead.
    Returns:
        List[str]: Every address found, in order of appearance.
    """
    import re

    email_pattern = re.compile(r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b")
    haystack = self.detect_text() if text is None else text
    return email_pattern.findall(haystack)
|
|
1155
|
+
|
|
1156
|
+
def get_reading_order(self, words):
    """Sort word dicts into reading order (top-to-bottom, left-to-right).

    The vertical position is read from 'top', falling back to 'y'; the horizontal position
    from 'left', falling back to 'x'. A word with neither key is treated as sitting at 0 on
    that axis. The fallback keys matter because other methods of this class read word
    positions from 'x'/'y', for which a 'top'/'left'-only sort would be a no-op.

    Args:
        words: List of word dicts carrying 'top'/'left' or 'x'/'y' position keys.
    Returns:
        List[dict]: A new list with the words sorted by (vertical, horizontal) position.
    """

    def position(word):
        vertical = word.get("top", word.get("y", 0))
        horizontal = word.get("left", word.get("x", 0))
        return (vertical, horizontal)

    return sorted(words, key=position)
|
|
1165
|
+
|
|
1166
|
+
def get_text_density(self):
    """Compute the ratio of non-whitespace characters to image pixel area.

    Every whitespace character is excluded from the count, including tabs, carriage
    returns and form feeds, not only spaces and newlines.

    Returns:
        float: Character count divided by (width * height) of `self.image`.
        0.0 if the area is zero.
    """
    text = self.detect_text()
    # str.split() with no argument splits on any whitespace run, so joining the pieces
    # leaves only the non-whitespace characters.
    char_count = len("".join(text.split()))
    h, w = self.image.shape[:2]
    area = w * h
    return float(char_count) / area if area > 0 else 0.0
|
|
1177
|
+
|
|
1178
|
+
def redact_sensitive(self, patterns=None):
    """Black out words matching sensitive patterns (emails, phone numbers).

    Each detected word is tested on its own against every active pattern
    (case-insensitively); the bounding box of a matching word is filled with black.

    Args:
        patterns: Optional list of regex strings. Defaults to email and
            phone number patterns when omitted or empty.
    Returns:
        np.ndarray: Copy of self.image with the matching word boxes filled in.
    """
    import re

    out = self.image.copy()

    # The other callers in this class unpack detect_words() as (word_dicts, image).
    # Take whichever element is the word sequence so the reversed (image, word_dicts)
    # order is tolerated too.
    first, second = self.detect_words()
    words = first if isinstance(first, (list, tuple)) else second

    default_patterns = [
        r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b",
        r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4,6}",
    ]
    active = patterns or default_patterns

    for word in words:
        if any(re.search(p, word.get("text", ""), re.IGNORECASE) for p in active):
            # Support both x/y/w/h keys and left/top/width/height keys.
            x = word.get("left", word.get("x", 0))
            y = word.get("top", word.get("y", 0))
            word_w = word.get("width", word.get("w", 0))
            word_h = word.get("height", word.get("h", 0))
            # Thickness -1 fills the rectangle.
            cv2.rectangle(out, (x, y), (x + word_w, y + word_h), (0, 0, 0), -1)
    return out
|
|
1205
|
+
|
|
1206
|
+
def detect_language(self, text=None):
    """Guess the language of a piece of text with the optional `langdetect` package.

    Args:
        text: String to analyse. When None, the result of `detect_text()` is analysed instead.
    Returns:
        str: The code returned by `langdetect.detect` (e.g. 'en', 'fr'), or 'unknown' when
        the text is blank, `langdetect` is not installed (a warning is emitted in that
        case), or detection raises.
    """
    try:
        from langdetect import detect

        if text is None:
            text = self.detect_text()
        # Blank input is reported as unknown without calling the detector.
        return detect(text) if text.strip() else "unknown"
    except ImportError:
        import warnings

        warnings.warn("langdetect not installed; returning 'unknown'", stacklevel=2)
        return "unknown"
    except Exception:
        return "unknown"
|