PyPI - py2ls - Versions diffs - 0.1.10.1__py3-none-any.whl → 0.1.10.2__py3-none-any.whl - Mend

py2ls 0.1.10.1__py3-none-any.whl → 0.1.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

py2ls/ips.py +770 -2
py2ls/netfinder.py +33 -8
py2ls/ocr.py +258 -94
py2ls/translator.py +470 -119
{py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/METADATA +1 -1
{py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/RECORD +7 -7
{py2ls-0.1.10.1.dist-info → py2ls-0.1.10.2.dist-info}/WHEEL +1 -1

py2ls/netfinder.py CHANGED Viewed

@@ -80,17 +80,40 @@ def get_tags(content, ascending=True):
             return tag_names
-def get_attr(content, where, attr):
+def get_attr(content, where=None, attr=None, **kwargs):
+    """
+    usage: nt.get_attr(soup, where="a", attr="href", class_="res-1foik6i")
+    Extracts the specified attribute from tags in the content.
+    Parameters:
+    - content: BeautifulSoup object of the HTML content.
+    - where: The tag name to search for (e.g., 'time').
+    - attr: The attribute to extract (e.g., 'datetime').
+    - kwargs: Additional filtering conditions for find_all.
+    Returns:
+    - A list of attribute values if found; otherwise, prints debug info.
+    """
+    # Extract all tags from the content
     all_tags = get_tags(content)
     if all([where, attr]):
         if where in all_tags:
-            element_ = content.find_all(where)
-            return [i[attr] for i in element_]
+            if kwargs:
+                element_ = content.find_all(where, **kwargs)
+            else:
+                element_ = content.find_all(where)
+            attr_values = [i.get(attr) for i in element_ if i.has_attr(attr)]
+            if attr_values:
+                return attr_values
+            else:
+                print(f"The attribute '{attr}' is not found in the elements.")
         else:
-            print(
-                f"cannot find attr {attr} in tag_name{where}\n or possibly cannot find the tag_names:"
-            )
+            print(f"Cannot find tag '{where}' in the content.")
+            print("Available tags:")
             pp(all_tags)
+    else:
+        print("Please provide both 'where' (tag name) and 'attr' (attribute).")
 def extract_text_from_content(
@@ -159,8 +182,10 @@ def extract_text_from_content(
             else:
                 result_set = content.find_all(where, attrs=dict(**search_kwargs))
             if "get" in kwargs:
-                attr = kwargs["get"]
-                return get_attr(content, where, attr)
+                del search_kwargs["get"]  # rm 'get' key
+                return get_attr(
+                    content, where=where, attr=kwargs["get"], **search_kwargs
+                )
             if not result_set:
                 print("Failed: check the 'attrs' setting:  attrs={'id':'xample'}")
             if extend:

py2ls/ocr.py CHANGED Viewed

@@ -11,6 +11,7 @@ import re
 from PIL import Image, ImageDraw, ImageFont
 import PIL.PngImagePlugin
+import pytesseract
 """
     Optical Character Recognition (OCR)
@@ -18,25 +19,125 @@ import PIL.PngImagePlugin
 # Valid language codes
 lang_valid = {
-    "english": "en",
-    "thai": "th",
-    "chinese_traditional": "ch_tra",
-    "chinese": "ch_sim",
-    "japanese": "ja",
-    "korean": "ko",
-    "tamil": "ta",
-    "telugu": "te",
-    "kannada": "kn",
-    "german": "de",
+    "easyocr": {
+        "english": "en",
+        "thai": "th",
+        "chinese_traditional": "ch_tra",
+        "chinese": "ch_sim",
+        "japanese": "ja",
+        "korean": "ko",
+        "tamil": "ta",
+        "telugu": "te",
+        "kannada": "kn",
+        "german": "de",
+    },
+    "pytesseract": {
+        "afrikaans": "afr",
+        "amharic": "amh",
+        "arabic": "ara",
+        "assamese": "asm",
+        "azerbaijani": "aze",
+        "azerbaijani_cyrillic": "aze_cyrl",
+        "belarusian": "bel",
+        "bengali": "ben",
+        "tibetan": "bod",
+        "bosnian": "bos",
+        "breton": "bre",
+        "bulgarian": "bul",
+        "catalan": "cat",
+        "cebuano": "ceb",
+        "czech": "ces",
+        "chinese": "chi_sim",
+        "chinese_vertical": "chi_sim_vert",
+        "chinese_traditional": "chi_tra",
+        "chinese_traditional_vertical": "chi_tra_vert",
+        "cherokee": "chr",
+        "corsican": "cos",
+        "welsh": "cym",
+        "danish": "dan",
+        "danish_fraktur": "dan_frak",
+        "german": "deu",
+        "german_fraktur": "deu_frak",
+        "german_latf": "deu_latf",
+        "dhivehi": "div",
+        "dzongkha": "dzo",
+        "greek": "ell",
+        "english": "eng",
+        "middle_english": "enm",
+        "esperanto": "epo",
+        "math_equations": "equ",
+        "estonian": "est",
+        "basque": "eus",
+        "faroese": "fao",
+        "persian": "fas",
+        "filipino": "fil",
+        "finnish": "fin",
+        "french": "fra",
+        "middle_french": "frm",
+        "frisian": "fry",
+        "scottish_gaelic": "gla",
+        "irish": "gle",
+        "galician": "glg",
+        "ancient_greek": "grc",
+        "gujarati": "guj",
+        "haitian_creole": "hat",
+        "hebrew": "heb",
+        "hindi": "hin",
+        "croatian": "hrv",
+        "hungarian": "hun",
+        "armenian": "hye",
+        "inuktitut": "iku",
+        "indonesian": "ind",
+        "icelandic": "isl",
+        "italian": "ita",
+        "old_italian": "ita_old",
+        "javanese": "jav",
+        "japanese": "jpn",
+        "japanese_vertical": "jpn_vert",
+        "kannada": "kan",
+        "georgian": "kat",
+        "old_georgian": "kat_old",
+        "kazakh": "kaz",
+        "khmer": "khm",
+        "kyrgyz": "kir",
+        "kurdish_kurmanji": "kmr",
+        "korean": "kor",
+        "korean_vertical": "kor_vert",
+        "lao": "lao",
+        "latin": "lat",
+        "latvian": "lav",
+        "lithuanian": "lit",
+        "luxembourgish": "ltz",
+        "malayalam": "mal",
+        "marathi": "mar",
+        "macedonian": "mkd",
+        "maltese": "mlt",
+        "mongolian": "mon",
+        "maori": "mri",
+        "malay": "msa",
+        "burmese": "mya",
+        "nepali": "nep",
+        "dutch": "nld",
+        "norwegian": "nor",
+        "occitan": "oci",
+        "oriya": "ori",
+        "script_detection": "osd",
+        "punjabi": "pan",
+        "polish": "pol",
+        "portuguese": "por",
+    },
 }
-def lang_auto_detect(lang):
+def lang_auto_detect(
+    lang,
+    model="easyocr",  # "easyocr" or "pytesseract"
+):
     res_lang = []
     if isinstance(lang, str):
         lang = [lang]
     for i in lang:
-        res_lang.append(lang_valid[strcmp(i, list(lang_valid.keys()))[0]])
+        res_lang.append(lang_valid[model][strcmp(i, list(lang_valid[model].keys()))[0]])
     return res_lang
@@ -140,7 +241,13 @@ def correct_skew(image):
         angle = -(90 + angle)
     else:
         angle = -angle
-    return rotate_image(image, angle)
+    (h, w) = image.shape[:2]
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, angle, 1.0)
+    rotated = cv2.warpAffine(
+        image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
+    )
+    return rotated
 def undistort_image(image, camera_matrix, dist_coeffs):
@@ -183,8 +290,8 @@ def preprocess_img(
     threshold=True,
     threshold_method="adaptive",
     rotate="auto",
-    skew=True,
-    denoise=True,
+    skew=False,
+    blur=True,
     blur_ksize=(5, 5),
     morph=True,
     morph_op="open",
@@ -240,9 +347,9 @@ def preprocess_img(
     else:
         img_preprocessed = image
-    # # Correct skew
-    # if skew:
-    #     img_preprocessed = correct_skew(image)
+    # Correct skew
+    if skew:
+        img_preprocessed = correct_skew(image)
     # Convert to grayscale
     if grayscale:
@@ -264,8 +371,8 @@ def preprocess_img(
                 img_preprocessed, 127, 255, cv2.THRESH_BINARY
             )
-    # Denoise
-    if denoise:
+    # Denoise by Gaussian Blur
+    if blur:
         img_preprocessed = cv2.GaussianBlur(img_preprocessed, blur_ksize, 0)
     # 形态学处理
@@ -372,6 +479,7 @@ def text_postprocess(
 def get_text(
     image,
     lang=["ch_sim", "en"],
+    model="easyocr",  # "pytesseract"
     thr=0.25,
     gpu=True,
     decoder="wordbeamsearch",  #'greedy', 'beamsearch' and 'wordbeamsearch'(hightly accurate)
@@ -382,7 +490,7 @@ def get_text(
     ax=None,
     cmap=cv2.COLOR_BGR2RGB,  # draw_box
     font=cv2.FONT_HERSHEY_SIMPLEX,
-    fontScale=0.8,
+    font_scale=0.8,
     thickness_text=2,  # Line thickness of 2 px
     color_box=(0, 255, 0),  # draw_box
     color_text=(0, 0, 255),  # draw_box
@@ -428,7 +536,10 @@ def get_text(
         adjust_contrast=0.7
     )
     """
-    lang = lang_auto_detect(lang)
+    if ax is None:
+        ax = plt.gca()
+    lang = lang_auto_detect(lang, model)
     print(f"detecting language(s):{lang}")
     if isinstance(image, str):
         image = cv2.imread(image)
@@ -441,80 +552,133 @@ def get_text(
     if preprocess is None:
         preprocess = {}
     image_process = preprocess_img(image, **preprocess)
-    # Perform OCR on the image
-    reader = easyocr.Reader(lang, gpu=gpu)
-    detections = reader.readtext(image_process, decoder=decoder, **kwargs)
-    if postprocess is None:
-        postprocess = dict(
-            spell_check=True,
-            clean=True,
-            filter=dict(min_length=2),
-            pattern=None,
-            merge=True,
-        )
-        text_corr = []
-        for _, text, _ in detections:
-            text_corr.extend(text_postprocess(text, **postprocess))
-    if show:
-        if ax is None:
-            ax = plt.gca()
-        for bbox, text, score in detections:
-            if score > thr:
-                top_left = tuple(map(int, bbox[0]))
-                bottom_right = tuple(map(int, bbox[2]))
-                image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
-                # image = cv2.putText(
-                #     image, text, top_left, font, fontScale, color_text, thickness_text
-                # )
-                image = add_text_pil(
-                    image, text, top_left, font_size=fontScale * 32, color=color_text
-                )
-        img_cmp = cv2.cvtColor(image, cmap)
-        ax.imshow(img_cmp)
-        ax.axis("off")
-        # plt.show()
-        # 根据输出类型返回相应的结果
-        if output == "all":
-            return ax, detections
-        elif "t" in output.lower() and "x" in output.lower():
-            # 提取文本，过滤低置信度的结果
-            text = [text_ for _, text_, score_ in detections if score_ >= thr]
-            if postprocess:
-                return ax, text
+    if "easy" in model.lower():
+        # Perform OCR on the image
+        reader = easyocr.Reader(lang, gpu=gpu)
+        detections = reader.readtext(image_process, decoder=decoder, **kwargs)
+        if postprocess is None:
+            postprocess = dict(
+                spell_check=True,
+                clean=True,
+                filter=dict(min_length=2),
+                pattern=None,
+                merge=True,
+            )
+            text_corr = []
+            for _, text, _ in detections:
+                text_corr.extend(text_postprocess(text, **postprocess))
+        if show:
+            for bbox, text, score in detections:
+                if score > thr:
+                    top_left = tuple(map(int, bbox[0]))
+                    bottom_right = tuple(map(int, bbox[2]))
+                    image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
+                    # image = cv2.putText(
+                    #     image, text, top_left, font, font_scale, color_text, thickness_text
+                    # )
+                    image = add_text_pil(
+                        image,
+                        text,
+                        top_left,
+                        font_size=font_scale * 32,
+                        color=color_text,
+                    )
+            img_cmp = cv2.cvtColor(image, cmap)
+            ax.imshow(img_cmp)
+            ax.axis("off")
+            # plt.show()
+            # 根据输出类型返回相应的结果
+            if output == "all":
+                return ax, detections
+            elif "t" in output.lower() and "x" in output.lower():
+                # 提取文本，过滤低置信度的结果
+                text = [text_ for _, text_, score_ in detections if score_ >= thr]
+                if postprocess:
+                    return ax, text
+                else:
+                    return text_corr
+            elif "score" in output.lower() or "prob" in output.lower():
+                # 提取分数
+                scores = [score_ for _, _, score_ in detections]
+                return ax, scores
+            elif "box" in output.lower():
+                # 提取边界框，过滤低置信度的结果
+                bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
+                return ax, bboxes
             else:
-                return text_corr
-        elif "score" in output.lower() or "prob" in output.lower():
-            # 提取分数
-            scores = [score_ for _, _, score_ in detections]
-            return ax, scores
-        elif "box" in output.lower():
-            # 提取边界框，过滤低置信度的结果
-            bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
-            return ax, bboxes
+                # 默认返回所有检测信息
+                return ax, detections
         else:
-            # 默认返回所有检测信息
-            return ax, detections
-    else:
-        # 根据输出类型返回相应的结果
-        if output == "all":
-            return detections
-        elif "t" in output.lower() and "x" in output.lower():
-            # 提取文本，过滤低置信度的结果
-            text = [text_ for _, text_, score_ in detections if score_ >= thr]
-            return text
-        elif "score" in output.lower() or "prob" in output.lower():
-            # 提取分数
-            scores = [score_ for _, _, score_ in detections]
-            return scores
-        elif "box" in output.lower():
-            # 提取边界框，过滤低置信度的结果
-            bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
-            return bboxes
+            # 根据输出类型返回相应的结果
+            if output == "all":
+                return detections
+            elif "t" in output.lower() and "x" in output.lower():
+                # 提取文本，过滤低置信度的结果
+                text = [text_ for _, text_, score_ in detections if score_ >= thr]
+                return text
+            elif "score" in output.lower() or "prob" in output.lower():
+                # 提取分数
+                scores = [score_ for _, _, score_ in detections]
+                return scores
+            elif "box" in output.lower():
+                # 提取边界框，过滤低置信度的结果
+                bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
+                return bboxes
+            else:
+                # 默认返回所有检测信息
+                return detections
+    else:  # "pytesseract"
+        text = pytesseract.image_to_string(image_process, lang="+".join(lang), **kwargs)
+        bboxes = pytesseract.image_to_boxes(image_process, **kwargs)
+        if show:
+            # Image dimensions
+            h, w, _ = image.shape
+            for line in bboxes.splitlines():
+                parts = line.split()
+                if len(parts) == 6:
+                    char, left, bottom, right, top, _ = parts
+                    left, bottom, right, top = map(int, [left, bottom, right, top])
+                    # Convert Tesseract coordinates (bottom-left and top-right) to (top-left and bottom-right)
+                    top_left = (left, h - top)
+                    bottom_right = (right, h - bottom)
+                    # Draw the bounding box
+                    image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
+                    image = add_text_pil(
+                        image,
+                        char,
+                        top_left,
+                        font_size=font_scale * 32,
+                        color=color_text,
+                    )
+            img_cmp = cv2.cvtColor(image, cmap)
+            ax.imshow(img_cmp)
+            ax.axis("off")
+            if output == "all":
+                # Get verbose data including boxes, confidences, line and page numbers
+                detections = pytesseract.image_to_data(image_process)
+                return ax, detections
+            elif "t" in output.lower() and "x" in output.lower():
+                return ax, text
+            elif "box" in output.lower():
+                return ax, bboxes
+            else:
+                # Get information about orientation and script detection
+                return pytesseract.image_to_osd(image_process, **kwargs)
         else:
-            # 默认返回所有检测信息
-            return detections
+            if output == "all":
+                # Get verbose data including boxes, confidences, line and page numbers
+                detections = pytesseract.image_to_data(image_process, **kwargs)
+                return detections
+            elif "t" in output.lower() and "x" in output.lower():
+                return text
+            elif "box" in output.lower():
+                return bboxes
+            else:
+                # Get information about orientation and script detection
+                return pytesseract.image_to_osd(image_process, **kwargs)
 def draw_box(
@@ -543,7 +707,7 @@ def draw_box(
             bottom_right = tuple(map(int, bbox[2]))
             image = cv2.rectangle(image, top_left, bottom_right, color_box, 2)
             # image = cv2.putText(
-            #     image, text, top_left, font, fontScale, color_text, thickness_text
+            #     image, text, top_left, font, font_scale, color_text, thickness_text
             # )
             image = add_text_pil(
                 image, text, top_left, font_size=font_scale * 32, color=color_text

py2ls 0.1.10.1__py3-none-any.whl → 0.1.10.2__py3-none-any.whl

py2ls 0.1.10.1__py3-none-any.whl → 0.1.10.2__py3-none-any.whl