py2ls 0.2.4.32__py3-none-any.whl → 0.2.4.33__py3-none-any.whl
- py2ls/.git/index +0 -0
- py2ls/ips.py +736 -164
- py2ls/netfinder.py +99 -0
- py2ls/ocr.py +140 -126
- py2ls/plot.py +612 -376
- {py2ls-0.2.4.32.dist-info → py2ls-0.2.4.33.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.32.dist-info → py2ls-0.2.4.33.dist-info}/RECORD +8 -8
- {py2ls-0.2.4.32.dist-info → py2ls-0.2.4.33.dist-info}/WHEEL +0 -0
py2ls/netfinder.py
CHANGED
@@ -1,4 +1,5 @@
 from bs4 import BeautifulSoup
+import scrapy
 import requests
 import os
 import pandas as pd
@@ -332,6 +333,94 @@ def parse_cookies(cookies_str):
 
     return cookies_dict
 
+class FetchSpider(scrapy.Spider):
+    name = "fetch_spider"
+
+    def __init__(self, url, parser="html.parser", cookies=None, headers=None, *args, **kwargs):
+        super(FetchSpider, self).__init__(*args, **kwargs)
+        self.start_urls = [url]
+        self.cookies = cookies
+        self.headers = headers
+        self.parser = parser
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(
+                url,
+                cookies=self.cookies,
+                headers=self.headers,
+                callback=self.parse
+            )
+
+    def parse(self, response):
+        # Use the desired parser (default: html.parser)
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(response.text, self.parser)
+        yield {"content": soup}
+
+
+def fetch_scrapy(
+    url,
+    parser="html.parser",
+    cookies=None,
+    headers=None,
+    settings=None,
+):
+    """
+    Fetches content using Scrapy.
+
+    Args:
+        url (str): The URL to scrape.
+        parser (str): Parser for BeautifulSoup (e.g., "lxml", "html.parser").
+        cookies (dict): Cookies to pass in the request.
+        headers (dict): HTTP headers for the request.
+        settings (dict): Scrapy settings, if any.
+
+    Returns:
+        dict: Parsed content as a dictionary.
+    """
+    from scrapy.utils.project import get_project_settings
+    from scrapy.crawler import CrawlerProcess
+    from scrapy.signalmanager import dispatcher
+    from scrapy import signals
+    import scrapy
+
+    # Container for scraped content
+    content = []
+
+    # Callback function for item scraped signal
+    def handle_item(item, response, spider):
+        content.append(item["content"])
+
+    # Scrapy settings
+    process_settings = settings or get_project_settings()
+    process_settings.update(
+        {
+            "USER_AGENT": "CustomUserAgent/1.0",  # Use a custom user agent
+            "DOWNLOAD_DELAY": 1,  # Prevent overloading servers
+            "COOKIES_ENABLED": bool(cookies),
+            "LOG_LEVEL": "ERROR",  # Minimize log verbosity
+        }
+    )
+
+    # Initialize and configure Scrapy process
+    process = CrawlerProcess(settings=process_settings)
+    dispatcher.connect(handle_item, signal=signals.item_scraped)
+
+    # Start the Scrapy crawl
+    process.crawl(
+        FetchSpider,
+        url=url,
+        parser=parser,
+        cookies=cookies,
+        headers=headers,
+    )
+    process.start()  # Blocks until all crawls are finished
+
+    # Return the first scraped content or None if empty
+    return content[0] if content else None
+
+
 def fetch_all(
     url,
     parser="lxml",
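The new FetchSpider/fetch_scrapy pair runs a one-shot Scrapy crawl and hands back the first page as the BeautifulSoup object yielded by the spider. A minimal usage sketch (the URL and header values are placeholders, not from the source):

    from py2ls import netfinder

    # Blocks until the crawl finishes; returns a BeautifulSoup object or None.
    soup = netfinder.fetch_scrapy(
        "https://example.com",                          # placeholder URL
        parser="lxml",
        headers={"User-Agent": "CustomUserAgent/1.0"},
    )
    if soup is not None:
        print(soup.title)

Because CrawlerProcess starts the Twisted reactor, which cannot be restarted within the same interpreter, fetch_scrapy is best treated as a once-per-process call.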
@@ -558,6 +647,16 @@ def fetch_all(
             else:
                 logger.warning("Selenium could not fetch content")
                 return None, None
+        elif 'scr' in driver.lower():
+            settings = {
+                "USER_AGENT": user_agent(),
+                "DOWNLOAD_DELAY": 1,  # Prevent overloading the server
+                "COOKIES_ENABLED": True if cookies else False,
+                "LOG_LEVEL": "WARNING",  # Reduce log verbosity
+            }
+            content=fetch_scrapy(url, parser=parser, cookies=cookies, headers=headers, settings=settings)
+            return parser, content
+
     except requests.RequestException as e:
         logger.error(f"Error fetching URL '{url}': {e}")
         return None, None
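This branch lets fetch_all route a request through the Scrapy backend whenever the driver string contains 'scr'. A hedged sketch — driver is assumed to be a keyword argument of fetch_all, as the driver.lower() check above implies, and "scrapy" is simply a readable value that satisfies the substring test:

    from py2ls import netfinder

    # Returns (parser, content) like the requests/selenium branches.
    parser, content = netfinder.fetch_all(
        "https://example.com",   # placeholder URL
        parser="lxml",
        driver="scrapy",
    )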
py2ls/ocr.py
CHANGED
@@ -1,23 +1,16 @@
-
+
 import cv2
 import numpy as np
 import matplotlib.pyplot as plt
 from py2ls.ips import (
     strcmp,
     detect_angle,
-)
-
-
-
-from PIL import Image, ImageDraw, ImageFont
-import PIL.PngImagePlugin
-import pytesseract
-from paddleocr import PaddleOCR
+    str2words,
+    isa
+)
 import logging
-
-logging.getLogger("ppocr").setLevel(
-    logging.WARNING
-)  # or logging.ERROR to show only error messages
+#logging.getLogger("ppocr").setLevel(logging.ERROR)
+logging.getLogger("ppocr").setLevel(logging.WARNING)
 
 """
 Optical Character Recognition (OCR)
@@ -285,10 +278,12 @@ def add_text_pil(
     image,
     text,
     position,
+    cvt_cmp=True,
     font_size=12,
     color=(0, 0, 0),
     bg_color=(133, 203, 245, 100),
 ):
+    from PIL import Image, ImageDraw, ImageFont
     # Convert the image to PIL format
     pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)).convert("RGBA")
     # Define the font (make sure to use a font that supports Chinese characters)
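add_text_pil now takes a cvt_cmp flag and imports PIL lazily; it still expects an OpenCV BGR array and returns one. A small sketch with arbitrary text and position values:

    import cv2
    import numpy as np
    from py2ls import ocr

    img = np.full((120, 400, 3), 255, dtype=np.uint8)   # blank white BGR image
    img = ocr.add_text_pil(
        img,
        "detected text",
        (10, 40),            # top-left corner of the label
        cvt_cmp=True,        # keep the default BGR/RGB round-trip
        font_size=24,
        color=(0, 0, 0),
        bg_color=(133, 203, 245, 100),
    )
    cv2.imwrite("labeled.png", img)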
@@ -337,7 +332,7 @@ def add_text_pil(
     overlay = overlay.convert("RGBA")
     combined = Image.alpha_composite(pil_image, overlay)
     # Convert the image back to OpenCV format
-    image = cv2.cvtColor(np.array(combined), cv2.COLOR_RGBA2BGR)
+    image = cv2.cvtColor(np.array(combined), cv2.COLOR_RGBA2BGR) #if cvt_cmp else np.array(combined)
     return image
 
 
@@ -348,7 +343,7 @@ def preprocess_img(
     threshold_method="adaptive",
     rotate="auto",
     skew=False,
-    blur=True,
+    blur=False,#True,
     blur_ksize=(5, 5),
     morph=True,
     morph_op="open",
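Blurring is now disabled by default in preprocess_img, so callers that depended on the old behaviour must request it explicitly — either directly, as sketched below with a placeholder path, or through the preprocess dict that get_text forwards:

    from py2ls import ocr

    cleaned = ocr.preprocess_img(
        "scan.png",              # placeholder image path
        blur=True,               # re-enable the blur that used to be the default
        blur_ksize=(5, 5),
        threshold_method="adaptive",
        rotate="auto",
    )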
@@ -384,12 +379,14 @@ def preprocess_img(
         clahe_grid_size: CLAHE grid size.
         edge_detection: whether to perform edge detection.
     """
+    import PIL.PngImagePlugin
     if isinstance(image, PIL.PngImagePlugin.PngImageFile):
         image = np.array(image)
     if isinstance(image, str):
         image = cv2.imread(image)
     if not isinstance(image, np.ndarray):
         image = np.array(image)
+
     try:
         if image.shape[1] == 4: # Check if it has an alpha channel
             # Drop the alpha channel (if needed), or handle it as required
@@ -507,6 +504,8 @@ def text_postprocess(
     pattern=None,
     merge=True,
 ):
+    import re
+    from spellchecker import SpellChecker
 
     def correct_spelling(text_list):
         spell = SpellChecker()
@@ -531,9 +530,9 @@ def text_postprocess(
         return merged_text
 
     results = text
-    print(results)
     if spell_check:
-        results = correct_spelling(results)
+        # results = correct_spelling(results)
+        results=str2words(results)
     if clean:
         results = clean_text(results)
     if filter:
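Spell correction in text_postprocess is now delegated to py2ls.ips.str2words; the old SpellChecker path stays in the file but is commented out. A sketch of the new behaviour, assuming str2words takes the raw string and returns the corrected text, as the call above suggests:

    from py2ls import ocr

    raw = "Ths is a smple OCR reslt"
    fixed = ocr.text_postprocess(
        raw,
        spell_check=True,   # now routed through str2words
        clean=True,
    )
    print(fixed)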
@@ -552,42 +551,39 @@ def get_text(
     image,
     lang=["ch_sim", "en"],
     model="paddleocr", # "pytesseract","paddleocr","easyocr"
-    thr=0.1,
+    thr=0.1,
     gpu=True,
     decoder="wordbeamsearch", #'greedy', 'beamsearch' and 'wordbeamsearch'(hightly accurate)
     output="txt",
     preprocess=None,
-    postprocess=
+    postprocess=False,# do not check spell
     show=True,
     ax=None,
     cmap=cv2.COLOR_BGR2RGB, # draw_box
-    font=cv2.FONT_HERSHEY_SIMPLEX
-
-
-    box_color=(0, 255, 0), # draw_box
-
-    bg_color=(133, 203, 245, 100)
+    font=cv2.FONT_HERSHEY_SIMPLEX,# draw_box
+    fontsize=8,# draw_box
+    figsize=[10,10],
+    box_color = (0, 255, 0), # draw_box
+    fontcolor = (0, 0, 0),# draw_box
+    bg_color=(133, 203, 245, 100),# draw_box
     usage=False,
     **kwargs,
 ):
     """
-
-
-
-
-
-
-
-
-
-
-
-
-    font_color: text color.
-    kwargs: extra arguments passed to the EasyOCR readtext function.
-
-    # Uage
+    image: input image path or image data.
+    lang: list of OCR languages.
+    thr: confidence threshold; detections below it are filtered out.
+    gpu: whether to use the GPU.
+    output: output type: 'all' (all detections), 'text' (text only), 'score' (confidence scores) or 'box' (bounding boxes).
+    preprocess: dict of preprocessing parameters passed to preprocess_img.
+    show: whether to display the result image.
+    ax: Matplotlib axes used to display the image.
+    cmap: color map used to display the image.
+    box_color: bounding box color.
+    fontcolor: text color.
+    kwargs: extra arguments passed to the EasyOCR readtext function.
     """
+    from PIL import Image
     if usage:
         print(
             """
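get_text gains several drawing-related keywords (fontsize, figsize, fontcolor and a per-box bg_color), and postprocess now defaults to False, so spell checking has to be requested explicitly. A usage sketch with a placeholder path; note that in this configuration (EasyOCR, show=True, text output, postprocess=False) the function returns the uncorrected text list rather than an (ax, text) tuple:

    from py2ls import ocr

    texts = ocr.get_text(
        "receipt.png",          # placeholder image path
        model="easyocr",
        lang=["en"],
        thr=0.2,
        output="text",
        postprocess=False,      # new default: skip spell checking
        show=True,
        fontsize=10,
        figsize=[8, 8],
        fontcolor=(0, 0, 0),
        box_color=(0, 255, 0),
    )
    print(texts)                # list of detected strings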
@@ -612,16 +608,20 @@ def get_text(
                 "edge_detection": False
             },
             adjust_contrast=0.7
-        )
-        """
-        )
+        )""")
 
-    models = ["easyocr", "paddleocr", "pytesseract","ddddocr"]
+    models = ["easyocr", "paddleocr", "pytesseract","ddddocr","zerox"]
     model = strcmp(model, models)[0]
     lang = lang_auto_detect(lang, model)
-
-
+    cvt_cmp=True
+    if isinstance(image, str) and isa(image,'file'):
         image = cv2.imread(image)
+    elif isa(image,'image'):
+        cvt_cmp=False
+        print(1)
+        image = np.array(image)
+    else:
+        raise ValueError(f"not support image with {type(image)} type")
 
     # Ensure lang is always a list
     if isinstance(lang, str):
@@ -631,110 +631,94 @@ def get_text(
     if preprocess is None:
         preprocess = {}
     image_process = preprocess_img(image, **preprocess)
+    plt.figure(figsize=figsize) if show else None
+    # plt.subplot(131)
+    # plt.imshow(cv2.cvtColor(image, cmap)) if cvt_cmp else plt.imshow(image)
+    # plt.subplot(132)
+    # plt.imshow(image_process)
+    # plt.subplot(133)
     if "easy" in model.lower():
+        import easyocr
         print(f"detecting language(s):{lang}")
         # Perform OCR on the image
         reader = easyocr.Reader(lang, gpu=gpu)
         detections = reader.readtext(image_process, decoder=decoder, **kwargs)
-
-
-
-
-
-            pattern=None,
-            merge=True,
-        )
-        text_corr = []
-        [
-            text_corr.extend(text_postprocess(text, **postprocess))
-            for _, text, _ in detections
-        ]
+
+        text_corr = []
+        for _, text, _ in detections:
+            text_corr.append(text_postprocess(text) if postprocess else text)
+
         if show:
             if ax is None:
                 ax = plt.gca()
-            for bbox, text, score in detections:
+            for i, (bbox, text, score) in enumerate(detections):
                 if score > thr:
                     top_left = tuple(map(int, bbox[0]))
                     bottom_right = tuple(map(int, bbox[2]))
-                    image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
-                    # image = cv2.putText(
-                    # image, text, top_left, font, font_scale, font_color, thickness_text
-                    # )
+                    image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
                     image = add_text_pil(
                         image,
-
+                        text_corr[i],
                         top_left,
-
-
+                        cvt_cmp=cvt_cmp,
+                        font_size=fontsize *6,
+                        color=fontcolor,
                     )
-
-
+            try:
+                img_cmp = cv2.cvtColor(image, cmap) if cvt_cmp else image
+            except:
+                img_cmp=image
+
+            ax.imshow(img_cmp) if cvt_cmp else ax.imshow(image)
            ax.axis("off")
-
-        # Return the result according to the requested output type
+
         if output == "all":
             return ax, detections
         elif "t" in output.lower() and "x" in output.lower():
-            # Extract text, filtering out low-confidence results
            text = [text_ for _, text_, score_ in detections if score_ >= thr]
            if postprocess:
                return ax, text
            else:
                return text_corr
        elif "score" in output.lower() or "prob" in output.lower():
-            # Extract confidence scores
            scores = [score_ for _, _, score_ in detections]
            return ax, scores
        elif "box" in output.lower():
-            # Extract bounding boxes, filtering out low-confidence results
            bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
            return ax, bboxes
        else:
-            # By default, return all detection info
            return ax, detections
        else:
-        # Return the result according to the requested output type
            if output == "all":
                return detections
            elif "t" in output.lower() and "x" in output.lower():
-                # Extract text, filtering out low-confidence results
                text = [text_ for _, text_, score_ in detections if score_ >= thr]
                return text
            elif "score" in output.lower() or "prob" in output.lower():
-                # Extract confidence scores
                scores = [score_ for _, _, score_ in detections]
                return scores
            elif "box" in output.lower():
-                # Extract bounding boxes, filtering out low-confidence results
                bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
                return bboxes
            else:
-                # By default, return all detection info
                return detections
    elif "pad" in model.lower():
+        from paddleocr import PaddleOCR
+        lang=strcmp(lang, ['ch','en','french','german','korean','japan'])[0]
        ocr = PaddleOCR(
            use_angle_cls=True,
            cls=True,
+            lang=lang
        ) # PaddleOCR supports only one language at a time
-
+        cls=kwargs.pop('cls',True)
+        result = ocr.ocr(image_process,cls=cls, **kwargs)
        detections = []
        if result[0] is not None:
            for line in result[0]:
                bbox, (text, score) = line
+                text = str2words(text) if postprocess else text # check spell
                detections.append((bbox, text, score))
-
-        postprocess = dict(
-            spell_check=True,
-            clean=True,
-            filter=dict(min_length=2),
-            pattern=None,
-            merge=True,
-        )
-        text_corr = []
-        [
-            text_corr.extend(text_postprocess(text, **postprocess))
-            for _, text, _ in detections
-        ]
+
        if show:
            if ax is None:
                ax = plt.gca()
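The PaddleOCR branch now resolves the requested language against PaddleOCR's own codes via strcmp, pops cls from kwargs before calling ocr.ocr, and runs str2words over each recognised line when postprocess is truthy. A hedged sketch, assuming lang_auto_detect accepts a plain code such as "ch":

    from py2ls import ocr

    detections = ocr.get_text(
        "invoice.png",        # placeholder image path
        model="paddleocr",
        lang="ch",            # matched onto ['ch','en','french','german','korean','japan']
        output="all",
        postprocess=True,     # apply str2words to each recognised line
        show=False,
    )
    for bbox, text, score in detections:
        print(f"{score:.2f}  {text}")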
@@ -746,60 +730,48 @@ def get_text(
                    ) # Bottom-left for more accurate placement
                    bottom_right = tuple(map(int, bbox[2]))
                    image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
-                    # image = cv2.putText(
-                    # image, text, top_left, font, font_scale, font_color, thickness_text
-                    # )
                    image = add_text_pil(
                        image,
                        text,
                        top_left,
-
-
+                        cvt_cmp=cvt_cmp,
+                        font_size=fontsize *6,
+                        color=fontcolor,
                        bg_color=bg_color,
                    )
-
-
+            try:
+                img_cmp = cv2.cvtColor(image, cmap) if cvt_cmp else image
+            except:
+                img_cmp = image
+
+            ax.imshow(img_cmp)
            ax.axis("off")
-            # plt.show()
-        # Return the result according to the requested output type
        if output == "all":
            return ax, detections
        elif "t" in output.lower() and "x" in output.lower():
-            # Extract text, filtering out low-confidence results
            text = [text_ for _, text_, score_ in detections if score_ >= thr]
-
-                return ax, text
-            else:
-                return text_corr
+            return ax, text
        elif "score" in output.lower() or "prob" in output.lower():
-            # Extract confidence scores
            scores = [score_ for _, _, score_ in detections]
            return ax, scores
        elif "box" in output.lower():
-            # Extract bounding boxes, filtering out low-confidence results
            bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
            return ax, bboxes
        else:
-            # By default, return all detection info
            return ax, detections
        else:
-        # Return the result according to the requested output type
            if output == "all":
                return detections
            elif "t" in output.lower() and "x" in output.lower():
-                # Extract text, filtering out low-confidence results
                text = [text_ for _, text_, score_ in detections if score_ >= thr]
                return text
            elif "score" in output.lower() or "prob" in output.lower():
-                # Extract confidence scores
                scores = [score_ for _, _, score_ in detections]
                return scores
            elif "box" in output.lower():
-                # Extract bounding boxes, filtering out low-confidence results
                bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
                return bboxes
            else:
-                # By default, return all detection info
                return detections
    elif "ddddocr" in model.lower():
        import ddddocr
@@ -844,7 +816,51 @@ def get_text(
            ax.imshow(image_vis)
            ax.axis("off")
        return detections
+
+    elif "zerox" in model.lower():
+        from pyzerox import zerox
+        result = zerox(image_process)
+        detections = [(bbox, text, score) for bbox, text, score in result]
+        # Postprocess and visualize
+        if postprocess is None:
+            postprocess = dict(
+                spell_check=True,
+                clean=True,
+                filter=dict(min_length=2),
+                pattern=None,
+                merge=True,
+            )
+        text_corr = [text_postprocess(text, **postprocess) for _, text, _ in detections]
+
+        # Display results if 'show' is True
+        if show:
+            if ax is None:
+                ax = plt.gca()
+            for bbox, text, score in detections:
+                if score > thr:
+                    top_left = tuple(map(int, bbox[0]))
+                    bottom_right = tuple(map(int, bbox[2]))
+                    image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
+                    image = add_text_pil(image, text, top_left, cvt_cmp=cvt_cmp,font_size=fontsize *6, color=fontcolor, bg_color=bg_color)
+            ax.imshow(image)
+            ax.axis("off")
+
+        # Return result based on 'output' type
+        if output == "all":
+            return ax, detections
+        elif "t" in output.lower() and "x" in output.lower():
+            text = [text_ for _, text_, score_ in detections if score_ >= thr]
+            return ax, text
+        elif "score" in output.lower() or "prob" in output.lower():
+            scores = [score_ for _, _, score_ in detections]
+            return ax, scores
+        elif "box" in output.lower():
+            bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
+            return ax, bboxes
+        else:
+            return detections
    else: # "pytesseract"
+        import pytesseract
        if ax is None:
            ax = plt.gca()
        text = pytesseract.image_to_string(image_process, lang="+".join(lang), **kwargs)
@@ -869,8 +885,9 @@ def get_text(
                    image,
                    char,
                    left,
-
-
+                    cvt_cmp=cvt_cmp,
+                    font_size=fontsize *6,
+                    color=fontcolor,
                )
        img_cmp = cv2.cvtColor(image, cmap)
        ax.imshow(img_cmp)
@@ -906,8 +923,8 @@ def draw_box(
    thr=0.25,
    cmap=cv2.COLOR_BGR2RGB,
    box_color=(0, 255, 0), # draw_box
-
-
+    fontcolor=(0, 0, 255), # draw_box
+    fontsize=8,
    show=True,
    ax=None,
    **kwargs,
@@ -924,12 +941,9 @@
        if score > thr:
            top_left = tuple(map(int, bbox[0]))
            bottom_right = tuple(map(int, bbox[2]))
-            image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
-            # image = cv2.putText(
-            # image, text, top_left, font, font_scale, font_color, thickness_text
-            # )
+            image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
            image = add_text_pil(
-                image, text, top_left, font_size=
+                image, text, top_left, cvt_cmp=cvt_cmp,font_size=fontsize *6, color=fontcolor
            )

    img_cmp = cv2.cvtColor(image, cmap)
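draw_box picks up the same fontcolor/fontsize pair used by get_text. A hedged sketch — the diff only shows the keyword tail of the signature, so the leading image/detections arguments are an assumption based on how the body uses them:

    from py2ls import ocr

    detections = ocr.get_text("receipt.png", model="easyocr", output="all", show=False)
    ax = ocr.draw_box(
        "receipt.png",          # assumed: the image (path or array) drawn on
        detections,             # assumed: (bbox, text, score) triples from get_text
        thr=0.3,
        box_color=(0, 255, 0),
        fontcolor=(0, 0, 255),
        fontsize=10,
        show=True,
    )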