py2ls 0.1.10.1__py3-none-any.whl → 0.1.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
py2ls/netfinder.py CHANGED
@@ -80,17 +80,40 @@ def get_tags(content, ascending=True):
80
80
  return tag_names
81
81
 
82
82
 
83
- def get_attr(content, where, attr):
83
+ def get_attr(content, where=None, attr=None, **kwargs):
84
+ """
85
+ usage: nt.get_attr(soup, where="a", attr="href", class_="res-1foik6i")
86
+
87
+ Extracts the specified attribute from tags in the content.
88
+
89
+ Parameters:
90
+ - content: BeautifulSoup object of the HTML content.
91
+ - where: The tag name to search for (e.g., 'time').
92
+ - attr: The attribute to extract (e.g., 'datetime').
93
+ - kwargs: Additional filtering conditions for find_all.
94
+
95
+ Returns:
96
+ - A list of attribute values if found; otherwise, prints debug info.
97
+ """
98
+ # Extract all tags from the content
84
99
  all_tags = get_tags(content)
85
100
  if all([where, attr]):
86
101
  if where in all_tags:
87
- element_ = content.find_all(where)
88
- return [i[attr] for i in element_]
102
+ if kwargs:
103
+ element_ = content.find_all(where, **kwargs)
104
+ else:
105
+ element_ = content.find_all(where)
106
+ attr_values = [i.get(attr) for i in element_ if i.has_attr(attr)]
107
+ if attr_values:
108
+ return attr_values
109
+ else:
110
+ print(f"The attribute '{attr}' is not found in the elements.")
89
111
  else:
90
- print(
91
- f"cannot find attr {attr} in tag_name{where}\n or possibly cannot find the tag_names:"
92
- )
112
+ print(f"Cannot find tag '{where}' in the content.")
113
+ print("Available tags:")
93
114
  pp(all_tags)
115
+ else:
116
+ print("Please provide both 'where' (tag name) and 'attr' (attribute).")
94
117
 
95
118
 
96
119
  def extract_text_from_content(
@@ -159,8 +182,10 @@ def extract_text_from_content(
159
182
  else:
160
183
  result_set = content.find_all(where, attrs=dict(**search_kwargs))
161
184
  if "get" in kwargs:
162
- attr = kwargs["get"]
163
- return get_attr(content, where, attr)
185
+ del search_kwargs["get"] # rm 'get' key
186
+ return get_attr(
187
+ content, where=where, attr=kwargs["get"], **search_kwargs
188
+ )
164
189
  if not result_set:
165
190
  print("Failed: check the 'attrs' setting: attrs={'id':'xample'}")
166
191
  if extend:
py2ls/ocr.py CHANGED
@@ -11,6 +11,7 @@ import re
11
11
 
12
12
  from PIL import Image, ImageDraw, ImageFont
13
13
  import PIL.PngImagePlugin
14
+ import pytesseract
14
15
 
15
16
  """
16
17
  Optical Character Recognition (OCR)
@@ -18,25 +19,125 @@ import PIL.PngImagePlugin
18
19
 
19
20
  # Valid language codes
20
21
  lang_valid = {
21
- "english": "en",
22
- "thai": "th",
23
- "chinese_traditional": "ch_tra",
24
- "chinese": "ch_sim",
25
- "japanese": "ja",
26
- "korean": "ko",
27
- "tamil": "ta",
28
- "telugu": "te",
29
- "kannada": "kn",
30
- "german": "de",
22
+ "easyocr": {
23
+ "english": "en",
24
+ "thai": "th",
25
+ "chinese_traditional": "ch_tra",
26
+ "chinese": "ch_sim",
27
+ "japanese": "ja",
28
+ "korean": "ko",
29
+ "tamil": "ta",
30
+ "telugu": "te",
31
+ "kannada": "kn",
32
+ "german": "de",
33
+ },
34
+ "pytesseract": {
35
+ "afrikaans": "afr",
36
+ "amharic": "amh",
37
+ "arabic": "ara",
38
+ "assamese": "asm",
39
+ "azerbaijani": "aze",
40
+ "azerbaijani_cyrillic": "aze_cyrl",
41
+ "belarusian": "bel",
42
+ "bengali": "ben",
43
+ "tibetan": "bod",
44
+ "bosnian": "bos",
45
+ "breton": "bre",
46
+ "bulgarian": "bul",
47
+ "catalan": "cat",
48
+ "cebuano": "ceb",
49
+ "czech": "ces",
50
+ "chinese": "chi_sim",
51
+ "chinese_vertical": "chi_sim_vert",
52
+ "chinese_traditional": "chi_tra",
53
+ "chinese_traditional_vertical": "chi_tra_vert",
54
+ "cherokee": "chr",
55
+ "corsican": "cos",
56
+ "welsh": "cym",
57
+ "danish": "dan",
58
+ "danish_fraktur": "dan_frak",
59
+ "german": "deu",
60
+ "german_fraktur": "deu_frak",
61
+ "german_latf": "deu_latf",
62
+ "dhivehi": "div",
63
+ "dzongkha": "dzo",
64
+ "greek": "ell",
65
+ "english": "eng",
66
+ "middle_english": "enm",
67
+ "esperanto": "epo",
68
+ "math_equations": "equ",
69
+ "estonian": "est",
70
+ "basque": "eus",
71
+ "faroese": "fao",
72
+ "persian": "fas",
73
+ "filipino": "fil",
74
+ "finnish": "fin",
75
+ "french": "fra",
76
+ "middle_french": "frm",
77
+ "frisian": "fry",
78
+ "scottish_gaelic": "gla",
79
+ "irish": "gle",
80
+ "galician": "glg",
81
+ "ancient_greek": "grc",
82
+ "gujarati": "guj",
83
+ "haitian_creole": "hat",
84
+ "hebrew": "heb",
85
+ "hindi": "hin",
86
+ "croatian": "hrv",
87
+ "hungarian": "hun",
88
+ "armenian": "hye",
89
+ "inuktitut": "iku",
90
+ "indonesian": "ind",
91
+ "icelandic": "isl",
92
+ "italian": "ita",
93
+ "old_italian": "ita_old",
94
+ "javanese": "jav",
95
+ "japanese": "jpn",
96
+ "japanese_vertical": "jpn_vert",
97
+ "kannada": "kan",
98
+ "georgian": "kat",
99
+ "old_georgian": "kat_old",
100
+ "kazakh": "kaz",
101
+ "khmer": "khm",
102
+ "kyrgyz": "kir",
103
+ "kurdish_kurmanji": "kmr",
104
+ "korean": "kor",
105
+ "korean_vertical": "kor_vert",
106
+ "lao": "lao",
107
+ "latin": "lat",
108
+ "latvian": "lav",
109
+ "lithuanian": "lit",
110
+ "luxembourgish": "ltz",
111
+ "malayalam": "mal",
112
+ "marathi": "mar",
113
+ "macedonian": "mkd",
114
+ "maltese": "mlt",
115
+ "mongolian": "mon",
116
+ "maori": "mri",
117
+ "malay": "msa",
118
+ "burmese": "mya",
119
+ "nepali": "nep",
120
+ "dutch": "nld",
121
+ "norwegian": "nor",
122
+ "occitan": "oci",
123
+ "oriya": "ori",
124
+ "script_detection": "osd",
125
+ "punjabi": "pan",
126
+ "polish": "pol",
127
+ "portuguese": "por",
128
+ },
31
129
  }
32
130
 
33
131
 
34
- def lang_auto_detect(lang):
132
+ def lang_auto_detect(
133
+ lang,
134
+ model="easyocr", # "easyocr" or "pytesseract"
135
+ ):
35
136
  res_lang = []
36
137
  if isinstance(lang, str):
37
138
  lang = [lang]
38
139
  for i in lang:
39
- res_lang.append(lang_valid[strcmp(i, list(lang_valid.keys()))[0]])
140
+ res_lang.append(lang_valid[model][strcmp(i, list(lang_valid[model].keys()))[0]])
40
141
  return res_lang
41
142
 
42
143
 
@@ -140,7 +241,13 @@ def correct_skew(image):
140
241
  angle = -(90 + angle)
141
242
  else:
142
243
  angle = -angle
143
- return rotate_image(image, angle)
244
+ (h, w) = image.shape[:2]
245
+ center = (w // 2, h // 2)
246
+ M = cv2.getRotationMatrix2D(center, angle, 1.0)
247
+ rotated = cv2.warpAffine(
248
+ image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
249
+ )
250
+ return rotated
144
251
 
145
252
 
146
253
  def undistort_image(image, camera_matrix, dist_coeffs):
@@ -183,8 +290,8 @@ def preprocess_img(
183
290
  threshold=True,
184
291
  threshold_method="adaptive",
185
292
  rotate="auto",
186
- skew=True,
187
- denoise=True,
293
+ skew=False,
294
+ blur=True,
188
295
  blur_ksize=(5, 5),
189
296
  morph=True,
190
297
  morph_op="open",
@@ -240,9 +347,9 @@ def preprocess_img(
240
347
  else:
241
348
  img_preprocessed = image
242
349
 
243
- # # Correct skew
244
- # if skew:
245
- # img_preprocessed = correct_skew(image)
350
+ # Correct skew
351
+ if skew:
352
+ img_preprocessed = correct_skew(image)
246
353
 
247
354
  # Convert to grayscale
248
355
  if grayscale:
@@ -264,8 +371,8 @@ def preprocess_img(
264
371
  img_preprocessed, 127, 255, cv2.THRESH_BINARY
265
372
  )
266
373
 
267
- # Denoise
268
- if denoise:
374
+ # Denoise by Gaussian Blur
375
+ if blur:
269
376
  img_preprocessed = cv2.GaussianBlur(img_preprocessed, blur_ksize, 0)
270
377
 
271
378
  # 形态学处理
@@ -372,6 +479,7 @@ def text_postprocess(
372
479
  def get_text(
373
480
  image,
374
481
  lang=["ch_sim", "en"],
482
+ model="easyocr", # "pytesseract"
375
483
  thr=0.25,
376
484
  gpu=True,
377
485
  decoder="wordbeamsearch", #'greedy', 'beamsearch' and 'wordbeamsearch'(hightly accurate)
@@ -382,7 +490,7 @@ def get_text(
382
490
  ax=None,
383
491
  cmap=cv2.COLOR_BGR2RGB, # draw_box
384
492
  font=cv2.FONT_HERSHEY_SIMPLEX,
385
- fontScale=0.8,
493
+ font_scale=0.8,
386
494
  thickness_text=2, # Line thickness of 2 px
387
495
  color_box=(0, 255, 0), # draw_box
388
496
  color_text=(0, 0, 255), # draw_box
@@ -428,7 +536,10 @@ def get_text(
428
536
  adjust_contrast=0.7
429
537
  )
430
538
  """
431
- lang = lang_auto_detect(lang)
539
+
540
+ if ax is None:
541
+ ax = plt.gca()
542
+ lang = lang_auto_detect(lang, model)
432
543
  print(f"detecting language(s):{lang}")
433
544
  if isinstance(image, str):
434
545
  image = cv2.imread(image)
@@ -441,80 +552,133 @@ def get_text(
441
552
  if preprocess is None:
442
553
  preprocess = {}
443
554
  image_process = preprocess_img(image, **preprocess)
444
-
445
- # Perform OCR on the image
446
- reader = easyocr.Reader(lang, gpu=gpu)
447
- detections = reader.readtext(image_process, decoder=decoder, **kwargs)
448
- if postprocess is None:
449
- postprocess = dict(
450
- spell_check=True,
451
- clean=True,
452
- filter=dict(min_length=2),
453
- pattern=None,
454
- merge=True,
455
- )
456
- text_corr = []
457
- for _, text, _ in detections:
458
- text_corr.extend(text_postprocess(text, **postprocess))
459
- if show:
460
- if ax is None:
461
- ax = plt.gca()
462
- for bbox, text, score in detections:
463
- if score > thr:
464
- top_left = tuple(map(int, bbox[0]))
465
- bottom_right = tuple(map(int, bbox[2]))
466
- image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
467
- # image = cv2.putText(
468
- # image, text, top_left, font, fontScale, color_text, thickness_text
469
- # )
470
- image = add_text_pil(
471
- image, text, top_left, font_size=fontScale * 32, color=color_text
472
- )
473
-
474
- img_cmp = cv2.cvtColor(image, cmap)
475
- ax.imshow(img_cmp)
476
- ax.axis("off")
477
- # plt.show()
478
- # 根据输出类型返回相应的结果
479
- if output == "all":
480
- return ax, detections
481
- elif "t" in output.lower() and "x" in output.lower():
482
- # 提取文本,过滤低置信度的结果
483
- text = [text_ for _, text_, score_ in detections if score_ >= thr]
484
- if postprocess:
485
- return ax, text
555
+ if "easy" in model.lower():
556
+ # Perform OCR on the image
557
+ reader = easyocr.Reader(lang, gpu=gpu)
558
+ detections = reader.readtext(image_process, decoder=decoder, **kwargs)
559
+ if postprocess is None:
560
+ postprocess = dict(
561
+ spell_check=True,
562
+ clean=True,
563
+ filter=dict(min_length=2),
564
+ pattern=None,
565
+ merge=True,
566
+ )
567
+ text_corr = []
568
+ for _, text, _ in detections:
569
+ text_corr.extend(text_postprocess(text, **postprocess))
570
+ if show:
571
+ for bbox, text, score in detections:
572
+ if score > thr:
573
+ top_left = tuple(map(int, bbox[0]))
574
+ bottom_right = tuple(map(int, bbox[2]))
575
+ image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
576
+ # image = cv2.putText(
577
+ # image, text, top_left, font, font_scale, color_text, thickness_text
578
+ # )
579
+ image = add_text_pil(
580
+ image,
581
+ text,
582
+ top_left,
583
+ font_size=font_scale * 32,
584
+ color=color_text,
585
+ )
586
+ img_cmp = cv2.cvtColor(image, cmap)
587
+ ax.imshow(img_cmp)
588
+ ax.axis("off")
589
+ # plt.show()
590
+ # 根据输出类型返回相应的结果
591
+ if output == "all":
592
+ return ax, detections
593
+ elif "t" in output.lower() and "x" in output.lower():
594
+ # 提取文本,过滤低置信度的结果
595
+ text = [text_ for _, text_, score_ in detections if score_ >= thr]
596
+ if postprocess:
597
+ return ax, text
598
+ else:
599
+ return text_corr
600
+ elif "score" in output.lower() or "prob" in output.lower():
601
+ # 提取分数
602
+ scores = [score_ for _, _, score_ in detections]
603
+ return ax, scores
604
+ elif "box" in output.lower():
605
+ # 提取边界框,过滤低置信度的结果
606
+ bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
607
+ return ax, bboxes
486
608
  else:
487
- return text_corr
488
- elif "score" in output.lower() or "prob" in output.lower():
489
- # 提取分数
490
- scores = [score_ for _, _, score_ in detections]
491
- return ax, scores
492
- elif "box" in output.lower():
493
- # 提取边界框,过滤低置信度的结果
494
- bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
495
- return ax, bboxes
609
+ # 默认返回所有检测信息
610
+ return ax, detections
496
611
  else:
497
- # 默认返回所有检测信息
498
- return ax, detections
499
- else:
500
- # 根据输出类型返回相应的结果
501
- if output == "all":
502
- return detections
503
- elif "t" in output.lower() and "x" in output.lower():
504
- # 提取文本,过滤低置信度的结果
505
- text = [text_ for _, text_, score_ in detections if score_ >= thr]
506
- return text
507
- elif "score" in output.lower() or "prob" in output.lower():
508
- # 提取分数
509
- scores = [score_ for _, _, score_ in detections]
510
- return scores
511
- elif "box" in output.lower():
512
- # 提取边界框,过滤低置信度的结果
513
- bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
514
- return bboxes
612
+ # 根据输出类型返回相应的结果
613
+ if output == "all":
614
+ return detections
615
+ elif "t" in output.lower() and "x" in output.lower():
616
+ # 提取文本,过滤低置信度的结果
617
+ text = [text_ for _, text_, score_ in detections if score_ >= thr]
618
+ return text
619
+ elif "score" in output.lower() or "prob" in output.lower():
620
+ # 提取分数
621
+ scores = [score_ for _, _, score_ in detections]
622
+ return scores
623
+ elif "box" in output.lower():
624
+ # 提取边界框,过滤低置信度的结果
625
+ bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
626
+ return bboxes
627
+ else:
628
+ # 默认返回所有检测信息
629
+ return detections
630
+ else: # "pytesseract"
631
+ text = pytesseract.image_to_string(image_process, lang="+".join(lang), **kwargs)
632
+ bboxes = pytesseract.image_to_boxes(image_process, **kwargs)
633
+ if show:
634
+ # Image dimensions
635
+ h, w, _ = image.shape
636
+
637
+ for line in bboxes.splitlines():
638
+ parts = line.split()
639
+ if len(parts) == 6:
640
+ char, left, bottom, right, top, _ = parts
641
+ left, bottom, right, top = map(int, [left, bottom, right, top])
642
+
643
+ # Convert Tesseract coordinates (bottom-left and top-right) to (top-left and bottom-right)
644
+ top_left = (left, h - top)
645
+ bottom_right = (right, h - bottom)
646
+
647
+ # Draw the bounding box
648
+ image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
649
+ image = add_text_pil(
650
+ image,
651
+ char,
652
+ top_left,
653
+ font_size=font_scale * 32,
654
+ color=color_text,
655
+ )
656
+ img_cmp = cv2.cvtColor(image, cmap)
657
+ ax.imshow(img_cmp)
658
+ ax.axis("off")
659
+ if output == "all":
660
+ # Get verbose data including boxes, confidences, line and page numbers
661
+ detections = pytesseract.image_to_data(image_process)
662
+ return ax, detections
663
+ elif "t" in output.lower() and "x" in output.lower():
664
+ return ax, text
665
+ elif "box" in output.lower():
666
+ return ax, bboxes
667
+ else:
668
+ # Get information about orientation and script detection
669
+ return pytesseract.image_to_osd(image_process, **kwargs)
515
670
  else:
516
- # 默认返回所有检测信息
517
- return detections
671
+ if output == "all":
672
+ # Get verbose data including boxes, confidences, line and page numbers
673
+ detections = pytesseract.image_to_data(image_process, **kwargs)
674
+ return detections
675
+ elif "t" in output.lower() and "x" in output.lower():
676
+ return text
677
+ elif "box" in output.lower():
678
+ return bboxes
679
+ else:
680
+ # Get information about orientation and script detection
681
+ return pytesseract.image_to_osd(image_process, **kwargs)
518
682
 
519
683
 
520
684
  def draw_box(
@@ -543,7 +707,7 @@ def draw_box(
543
707
  bottom_right = tuple(map(int, bbox[2]))
544
708
  image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
545
709
  # image = cv2.putText(
546
- # image, text, top_left, font, fontScale, color_text, thickness_text
710
+ # image, text, top_left, font, font_scale, color_text, thickness_text
547
711
  # )
548
712
  image = add_text_pil(
549
713
  image, text, top_left, font_size=font_scale * 32, color=color_text