py2ls 0.1.10.1__py3-none-any.whl → 0.1.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/ips.py +770 -2
- py2ls/netfinder.py +33 -8
- py2ls/ocr.py +258 -94
- py2ls/translator.py +470 -119
- {py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/METADATA +1 -1
- {py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/RECORD +7 -7
- {py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/WHEEL +1 -1
py2ls/netfinder.py
CHANGED
@@ -80,17 +80,40 @@ def get_tags(content, ascending=True):
|
|
80
80
|
return tag_names
|
81
81
|
|
82
82
|
|
83
|
-
def get_attr(content, where, attr):
|
83
|
+
def get_attr(content, where=None, attr=None, **kwargs):
|
84
|
+
"""
|
85
|
+
usage: nt.get_attr(soup, where="a", attr="href", class_="res-1foik6i")
|
86
|
+
|
87
|
+
Extracts the specified attribute from tags in the content.
|
88
|
+
|
89
|
+
Parameters:
|
90
|
+
- content: BeautifulSoup object of the HTML content.
|
91
|
+
- where: The tag name to search for (e.g., 'time').
|
92
|
+
- attr: The attribute to extract (e.g., 'datetime').
|
93
|
+
- kwargs: Additional filtering conditions for find_all.
|
94
|
+
|
95
|
+
Returns:
|
96
|
+
- A list of attribute values if found; otherwise, prints debug info.
|
97
|
+
"""
|
98
|
+
# Extract all tags from the content
|
84
99
|
all_tags = get_tags(content)
|
85
100
|
if all([where, attr]):
|
86
101
|
if where in all_tags:
|
87
|
-
|
88
|
-
|
102
|
+
if kwargs:
|
103
|
+
element_ = content.find_all(where, **kwargs)
|
104
|
+
else:
|
105
|
+
element_ = content.find_all(where)
|
106
|
+
attr_values = [i.get(attr) for i in element_ if i.has_attr(attr)]
|
107
|
+
if attr_values:
|
108
|
+
return attr_values
|
109
|
+
else:
|
110
|
+
print(f"The attribute '{attr}' is not found in the elements.")
|
89
111
|
else:
|
90
|
-
print(
|
91
|
-
|
92
|
-
)
|
112
|
+
print(f"Cannot find tag '{where}' in the content.")
|
113
|
+
print("Available tags:")
|
93
114
|
pp(all_tags)
|
115
|
+
else:
|
116
|
+
print("Please provide both 'where' (tag name) and 'attr' (attribute).")
|
94
117
|
|
95
118
|
|
96
119
|
def extract_text_from_content(
|
@@ -159,8 +182,10 @@ def extract_text_from_content(
|
|
159
182
|
else:
|
160
183
|
result_set = content.find_all(where, attrs=dict(**search_kwargs))
|
161
184
|
if "get" in kwargs:
|
162
|
-
|
163
|
-
return get_attr(
|
185
|
+
del search_kwargs["get"] # rm 'get' key
|
186
|
+
return get_attr(
|
187
|
+
content, where=where, attr=kwargs["get"], **search_kwargs
|
188
|
+
)
|
164
189
|
if not result_set:
|
165
190
|
print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
|
166
191
|
if extend:
|
py2ls/ocr.py
CHANGED
@@ -11,6 +11,7 @@ import re
|
|
11
11
|
|
12
12
|
from PIL import Image, ImageDraw, ImageFont
|
13
13
|
import PIL.PngImagePlugin
|
14
|
+
import pytesseract
|
14
15
|
|
15
16
|
"""
|
16
17
|
Optical Character Recognition (OCR)
|
@@ -18,25 +19,125 @@ import PIL.PngImagePlugin
|
|
18
19
|
|
19
20
|
# Valid language codes
|
20
21
|
lang_valid = {
|
21
|
-
"
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
22
|
+
"easyocr": {
|
23
|
+
"english": "en",
|
24
|
+
"thai": "th",
|
25
|
+
"chinese_traditional": "ch_tra",
|
26
|
+
"chinese": "ch_sim",
|
27
|
+
"japanese": "ja",
|
28
|
+
"korean": "ko",
|
29
|
+
"tamil": "ta",
|
30
|
+
"telugu": "te",
|
31
|
+
"kannada": "kn",
|
32
|
+
"german": "de",
|
33
|
+
},
|
34
|
+
"pytesseract": {
|
35
|
+
"afrikaans": "afr",
|
36
|
+
"amharic": "amh",
|
37
|
+
"arabic": "ara",
|
38
|
+
"assamese": "asm",
|
39
|
+
"azerbaijani": "aze",
|
40
|
+
"azerbaijani_cyrillic": "aze_cyrl",
|
41
|
+
"belarusian": "bel",
|
42
|
+
"bengali": "ben",
|
43
|
+
"tibetan": "bod",
|
44
|
+
"bosnian": "bos",
|
45
|
+
"breton": "bre",
|
46
|
+
"bulgarian": "bul",
|
47
|
+
"catalan": "cat",
|
48
|
+
"cebuano": "ceb",
|
49
|
+
"czech": "ces",
|
50
|
+
"chinese": "chi_sim",
|
51
|
+
"chinese_vertical": "chi_sim_vert",
|
52
|
+
"chinese_traditional": "chi_tra",
|
53
|
+
"chinese_traditional_vertical": "chi_tra_vert",
|
54
|
+
"cherokee": "chr",
|
55
|
+
"corsican": "cos",
|
56
|
+
"welsh": "cym",
|
57
|
+
"danish": "dan",
|
58
|
+
"danish_fraktur": "dan_frak",
|
59
|
+
"german": "deu",
|
60
|
+
"german_fraktur": "deu_frak",
|
61
|
+
"german_latf": "deu_latf",
|
62
|
+
"dhivehi": "div",
|
63
|
+
"dzongkha": "dzo",
|
64
|
+
"greek": "ell",
|
65
|
+
"english": "eng",
|
66
|
+
"middle_english": "enm",
|
67
|
+
"esperanto": "epo",
|
68
|
+
"math_equations": "equ",
|
69
|
+
"estonian": "est",
|
70
|
+
"basque": "eus",
|
71
|
+
"faroese": "fao",
|
72
|
+
"persian": "fas",
|
73
|
+
"filipino": "fil",
|
74
|
+
"finnish": "fin",
|
75
|
+
"french": "fra",
|
76
|
+
"middle_french": "frm",
|
77
|
+
"frisian": "fry",
|
78
|
+
"scottish_gaelic": "gla",
|
79
|
+
"irish": "gle",
|
80
|
+
"galician": "glg",
|
81
|
+
"ancient_greek": "grc",
|
82
|
+
"gujarati": "guj",
|
83
|
+
"haitian_creole": "hat",
|
84
|
+
"hebrew": "heb",
|
85
|
+
"hindi": "hin",
|
86
|
+
"croatian": "hrv",
|
87
|
+
"hungarian": "hun",
|
88
|
+
"armenian": "hye",
|
89
|
+
"inuktitut": "iku",
|
90
|
+
"indonesian": "ind",
|
91
|
+
"icelandic": "isl",
|
92
|
+
"italian": "ita",
|
93
|
+
"old_italian": "ita_old",
|
94
|
+
"javanese": "jav",
|
95
|
+
"japanese": "jpn",
|
96
|
+
"japanese_vertical": "jpn_vert",
|
97
|
+
"kannada": "kan",
|
98
|
+
"georgian": "kat",
|
99
|
+
"old_georgian": "kat_old",
|
100
|
+
"kazakh": "kaz",
|
101
|
+
"khmer": "khm",
|
102
|
+
"kyrgyz": "kir",
|
103
|
+
"kurdish_kurmanji": "kmr",
|
104
|
+
"korean": "kor",
|
105
|
+
"korean_vertical": "kor_vert",
|
106
|
+
"lao": "lao",
|
107
|
+
"latin": "lat",
|
108
|
+
"latvian": "lav",
|
109
|
+
"lithuanian": "lit",
|
110
|
+
"luxembourgish": "ltz",
|
111
|
+
"malayalam": "mal",
|
112
|
+
"marathi": "mar",
|
113
|
+
"macedonian": "mkd",
|
114
|
+
"maltese": "mlt",
|
115
|
+
"mongolian": "mon",
|
116
|
+
"maori": "mri",
|
117
|
+
"malay": "msa",
|
118
|
+
"burmese": "mya",
|
119
|
+
"nepali": "nep",
|
120
|
+
"dutch": "nld",
|
121
|
+
"norwegian": "nor",
|
122
|
+
"occitan": "oci",
|
123
|
+
"oriya": "ori",
|
124
|
+
"script_detection": "osd",
|
125
|
+
"punjabi": "pan",
|
126
|
+
"polish": "pol",
|
127
|
+
"portuguese": "por",
|
128
|
+
},
|
31
129
|
}
|
32
130
|
|
33
131
|
|
34
|
-
def lang_auto_detect(
|
132
|
+
def lang_auto_detect(
|
133
|
+
lang,
|
134
|
+
model="easyocr", # "easyocr" or "pytesseract"
|
135
|
+
):
|
35
136
|
res_lang = []
|
36
137
|
if isinstance(lang, str):
|
37
138
|
lang = [lang]
|
38
139
|
for i in lang:
|
39
|
-
res_lang.append(lang_valid[strcmp(i, list(lang_valid.keys()))[0]])
|
140
|
+
res_lang.append(lang_valid[model][strcmp(i, list(lang_valid[model].keys()))[0]])
|
40
141
|
return res_lang
|
41
142
|
|
42
143
|
|
@@ -140,7 +241,13 @@ def correct_skew(image):
|
|
140
241
|
angle = -(90 + angle)
|
141
242
|
else:
|
142
243
|
angle = -angle
|
143
|
-
|
244
|
+
(h, w) = image.shape[:2]
|
245
|
+
center = (w // 2, h // 2)
|
246
|
+
M = cv2.getRotationMatrix2D(center, angle, 1.0)
|
247
|
+
rotated = cv2.warpAffine(
|
248
|
+
image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
|
249
|
+
)
|
250
|
+
return rotated
|
144
251
|
|
145
252
|
|
146
253
|
def undistort_image(image, camera_matrix, dist_coeffs):
|
@@ -183,8 +290,8 @@ def preprocess_img(
|
|
183
290
|
threshold=True,
|
184
291
|
threshold_method="adaptive",
|
185
292
|
rotate="auto",
|
186
|
-
skew=
|
187
|
-
|
293
|
+
skew=False,
|
294
|
+
blur=True,
|
188
295
|
blur_ksize=(5, 5),
|
189
296
|
morph=True,
|
190
297
|
morph_op="open",
|
@@ -240,9 +347,9 @@ def preprocess_img(
|
|
240
347
|
else:
|
241
348
|
img_preprocessed = image
|
242
349
|
|
243
|
-
#
|
244
|
-
|
245
|
-
|
350
|
+
# Correct skew
|
351
|
+
if skew:
|
352
|
+
img_preprocessed = correct_skew(image)
|
246
353
|
|
247
354
|
# Convert to grayscale
|
248
355
|
if grayscale:
|
@@ -264,8 +371,8 @@ def preprocess_img(
|
|
264
371
|
img_preprocessed, 127, 255, cv2.THRESH_BINARY
|
265
372
|
)
|
266
373
|
|
267
|
-
# Denoise
|
268
|
-
if
|
374
|
+
# Denoise by Gaussian Blur
|
375
|
+
if blur:
|
269
376
|
img_preprocessed = cv2.GaussianBlur(img_preprocessed, blur_ksize, 0)
|
270
377
|
|
271
378
|
# 形态学处理
|
@@ -372,6 +479,7 @@ def text_postprocess(
|
|
372
479
|
def get_text(
|
373
480
|
image,
|
374
481
|
lang=["ch_sim", "en"],
|
482
|
+
model="easyocr", # "pytesseract"
|
375
483
|
thr=0.25,
|
376
484
|
gpu=True,
|
377
485
|
decoder="wordbeamsearch", #'greedy', 'beamsearch' and 'wordbeamsearch'(hightly accurate)
|
@@ -382,7 +490,7 @@ def get_text(
|
|
382
490
|
ax=None,
|
383
491
|
cmap=cv2.COLOR_BGR2RGB, # draw_box
|
384
492
|
font=cv2.FONT_HERSHEY_SIMPLEX,
|
385
|
-
|
493
|
+
font_scale=0.8,
|
386
494
|
thickness_text=2, # Line thickness of 2 px
|
387
495
|
color_box=(0, 255, 0), # draw_box
|
388
496
|
color_text=(0, 0, 255), # draw_box
|
@@ -428,7 +536,10 @@ def get_text(
|
|
428
536
|
adjust_contrast=0.7
|
429
537
|
)
|
430
538
|
"""
|
431
|
-
|
539
|
+
|
540
|
+
if ax is None:
|
541
|
+
ax = plt.gca()
|
542
|
+
lang = lang_auto_detect(lang, model)
|
432
543
|
print(f"detecting language(s):{lang}")
|
433
544
|
if isinstance(image, str):
|
434
545
|
image = cv2.imread(image)
|
@@ -441,80 +552,133 @@ def get_text(
|
|
441
552
|
if preprocess is None:
|
442
553
|
preprocess = {}
|
443
554
|
image_process = preprocess_img(image, **preprocess)
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
555
|
+
if "easy" in model.lower():
|
556
|
+
# Perform OCR on the image
|
557
|
+
reader = easyocr.Reader(lang, gpu=gpu)
|
558
|
+
detections = reader.readtext(image_process, decoder=decoder, **kwargs)
|
559
|
+
if postprocess is None:
|
560
|
+
postprocess = dict(
|
561
|
+
spell_check=True,
|
562
|
+
clean=True,
|
563
|
+
filter=dict(min_length=2),
|
564
|
+
pattern=None,
|
565
|
+
merge=True,
|
566
|
+
)
|
567
|
+
text_corr = []
|
568
|
+
for _, text, _ in detections:
|
569
|
+
text_corr.extend(text_postprocess(text, **postprocess))
|
570
|
+
if show:
|
571
|
+
for bbox, text, score in detections:
|
572
|
+
if score > thr:
|
573
|
+
top_left = tuple(map(int, bbox[0]))
|
574
|
+
bottom_right = tuple(map(int, bbox[2]))
|
575
|
+
image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
|
576
|
+
# image = cv2.putText(
|
577
|
+
# image, text, top_left, font, font_scale, color_text, thickness_text
|
578
|
+
# )
|
579
|
+
image = add_text_pil(
|
580
|
+
image,
|
581
|
+
text,
|
582
|
+
top_left,
|
583
|
+
font_size=font_scale * 32,
|
584
|
+
color=color_text,
|
585
|
+
)
|
586
|
+
img_cmp = cv2.cvtColor(image, cmap)
|
587
|
+
ax.imshow(img_cmp)
|
588
|
+
ax.axis("off")
|
589
|
+
# plt.show()
|
590
|
+
# 根据输出类型返回相应的结果
|
591
|
+
if output == "all":
|
592
|
+
return ax, detections
|
593
|
+
elif "t" in output.lower() and "x" in output.lower():
|
594
|
+
# 提取文本,过滤低置信度的结果
|
595
|
+
text = [text_ for _, text_, score_ in detections if score_ >= thr]
|
596
|
+
if postprocess:
|
597
|
+
return ax, text
|
598
|
+
else:
|
599
|
+
return text_corr
|
600
|
+
elif "score" in output.lower() or "prob" in output.lower():
|
601
|
+
# 提取分数
|
602
|
+
scores = [score_ for _, _, score_ in detections]
|
603
|
+
return ax, scores
|
604
|
+
elif "box" in output.lower():
|
605
|
+
# 提取边界框,过滤低置信度的结果
|
606
|
+
bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
|
607
|
+
return ax, bboxes
|
486
608
|
else:
|
487
|
-
|
488
|
-
|
489
|
-
# 提取分数
|
490
|
-
scores = [score_ for _, _, score_ in detections]
|
491
|
-
return ax, scores
|
492
|
-
elif "box" in output.lower():
|
493
|
-
# 提取边界框,过滤低置信度的结果
|
494
|
-
bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
|
495
|
-
return ax, bboxes
|
609
|
+
# 默认返回所有检测信息
|
610
|
+
return ax, detections
|
496
611
|
else:
|
497
|
-
#
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
612
|
+
# 根据输出类型返回相应的结果
|
613
|
+
if output == "all":
|
614
|
+
return detections
|
615
|
+
elif "t" in output.lower() and "x" in output.lower():
|
616
|
+
# 提取文本,过滤低置信度的结果
|
617
|
+
text = [text_ for _, text_, score_ in detections if score_ >= thr]
|
618
|
+
return text
|
619
|
+
elif "score" in output.lower() or "prob" in output.lower():
|
620
|
+
# 提取分数
|
621
|
+
scores = [score_ for _, _, score_ in detections]
|
622
|
+
return scores
|
623
|
+
elif "box" in output.lower():
|
624
|
+
# 提取边界框,过滤低置信度的结果
|
625
|
+
bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
|
626
|
+
return bboxes
|
627
|
+
else:
|
628
|
+
# 默认返回所有检测信息
|
629
|
+
return detections
|
630
|
+
else: # "pytesseract"
|
631
|
+
text = pytesseract.image_to_string(image_process, lang="+".join(lang), **kwargs)
|
632
|
+
bboxes = pytesseract.image_to_boxes(image_process, **kwargs)
|
633
|
+
if show:
|
634
|
+
# Image dimensions
|
635
|
+
h, w, _ = image.shape
|
636
|
+
|
637
|
+
for line in bboxes.splitlines():
|
638
|
+
parts = line.split()
|
639
|
+
if len(parts) == 6:
|
640
|
+
char, left, bottom, right, top, _ = parts
|
641
|
+
left, bottom, right, top = map(int, [left, bottom, right, top])
|
642
|
+
|
643
|
+
# Convert Tesseract coordinates (bottom-left and top-right) to (top-left and bottom-right)
|
644
|
+
top_left = (left, h - top)
|
645
|
+
bottom_right = (right, h - bottom)
|
646
|
+
|
647
|
+
# Draw the bounding box
|
648
|
+
image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
|
649
|
+
image = add_text_pil(
|
650
|
+
image,
|
651
|
+
char,
|
652
|
+
top_left,
|
653
|
+
font_size=font_scale * 32,
|
654
|
+
color=color_text,
|
655
|
+
)
|
656
|
+
img_cmp = cv2.cvtColor(image, cmap)
|
657
|
+
ax.imshow(img_cmp)
|
658
|
+
ax.axis("off")
|
659
|
+
if output == "all":
|
660
|
+
# Get verbose data including boxes, confidences, line and page numbers
|
661
|
+
detections = pytesseract.image_to_data(image_process)
|
662
|
+
return ax, detections
|
663
|
+
elif "t" in output.lower() and "x" in output.lower():
|
664
|
+
return ax, text
|
665
|
+
elif "box" in output.lower():
|
666
|
+
return ax, bboxes
|
667
|
+
else:
|
668
|
+
# Get information about orientation and script detection
|
669
|
+
return pytesseract.image_to_osd(image_process, **kwargs)
|
515
670
|
else:
|
516
|
-
|
517
|
-
|
671
|
+
if output == "all":
|
672
|
+
# Get verbose data including boxes, confidences, line and page numbers
|
673
|
+
detections = pytesseract.image_to_data(image_process, **kwargs)
|
674
|
+
return detections
|
675
|
+
elif "t" in output.lower() and "x" in output.lower():
|
676
|
+
return text
|
677
|
+
elif "box" in output.lower():
|
678
|
+
return bboxes
|
679
|
+
else:
|
680
|
+
# Get information about orientation and script detection
|
681
|
+
return pytesseract.image_to_osd(image_process, **kwargs)
|
518
682
|
|
519
683
|
|
520
684
|
def draw_box(
|
@@ -543,7 +707,7 @@ def draw_box(
|
|
543
707
|
bottom_right = tuple(map(int, bbox[2]))
|
544
708
|
image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
|
545
709
|
# image = cv2.putText(
|
546
|
-
# image, text, top_left, font,
|
710
|
+
# image, text, top_left, font, font_scale, color_text, thickness_text
|
547
711
|
# )
|
548
712
|
image = add_text_pil(
|
549
713
|
image, text, top_left, font_size=font_scale * 32, color=color_text
|