PyPI - py2ls - Versions diffs - 0.2.4.32__py3-none-any.whl → 0.2.4.34__py3-none-any.whl - Mend

py2ls 0.2.4.32__py3-none-any.whl → 0.2.4.34__py3-none-any.whl

Files changed (8) hide show

py2ls/.git/index +0 -0
py2ls/ips.py +779 -194
py2ls/netfinder.py +99 -0
py2ls/ocr.py +139 -126
py2ls/plot.py +612 -376
{py2ls-0.2.4.32.dist-info → py2ls-0.2.4.34.dist-info}/METADATA +2 -2
{py2ls-0.2.4.32.dist-info → py2ls-0.2.4.34.dist-info}/RECORD +8 -8
{py2ls-0.2.4.32.dist-info → py2ls-0.2.4.34.dist-info}/WHEEL +0 -0

py2ls/netfinder.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from bs4 import BeautifulSoup
+import scrapy
 import requests
 import os
 import pandas as pd
@@ -332,6 +333,94 @@ def parse_cookies(cookies_str):
     return cookies_dict
+class FetchSpider(scrapy.Spider):
+    """Spider that requests one URL and yields the page parsed by BeautifulSoup.
+    Args:
+        url (str): The single URL to request.
+        parser (str): Parser name handed to BeautifulSoup in ``parse``.
+        cookies: Forwarded unchanged to ``scrapy.Request(cookies=...)``.
+        headers: Forwarded unchanged to ``scrapy.Request(headers=...)``.
+        *args, **kwargs: Passed through to ``scrapy.Spider.__init__``.
+    """
+    name = "fetch_spider"
+    def __init__(self, url, parser="html.parser", cookies=None, headers=None, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Request options are kept on the instance so start_requests can reuse them.
+        self.parser = parser
+        self.headers = headers
+        self.cookies = cookies
+        self.start_urls = [url]
+    def start_requests(self):
+        """Yield one request per entry in ``start_urls``, routed to ``parse``."""
+        request_options = {
+            "cookies": self.cookies,
+            "headers": self.headers,
+            "callback": self.parse,
+        }
+        yield from (
+            scrapy.Request(target, **request_options) for target in self.start_urls
+        )
+    def parse(self, response):
+        """Yield ``{"content": soup}`` where soup is the parsed response body."""
+        # Parse with whichever parser was requested (default: html.parser)
+        from bs4 import BeautifulSoup
+        parsed_page = BeautifulSoup(response.text, self.parser)
+        yield {"content": parsed_page}
+def fetch_scrapy(
+    url,
+    parser="html.parser",
+    cookies=None,
+    headers=None,
+    settings=None,
+):
+    """
+    Fetch one URL with Scrapy and return the page parsed by BeautifulSoup.
+    Args:
+        url (str): The URL to scrape.
+        parser (str): Parser for BeautifulSoup (e.g., "lxml", "html.parser").
+        cookies (dict): Cookies to pass in the request.
+        headers (dict): HTTP headers for the request.
+        settings (dict): Scrapy settings. Values given here take precedence
+            over this function's defaults; the mapping is not modified.
+    Returns:
+        The "content" value of the first item scraped by FetchSpider
+        (a BeautifulSoup object), or None if nothing was scraped.
+    """
+    from scrapy.utils.project import get_project_settings
+    from scrapy.crawler import CrawlerProcess
+    from scrapy import signals
+    # Container for scraped content
+    content = []
+    # Callback function for item scraped signal
+    def handle_item(item, response, spider):
+        content.append(item["content"])
+    # Fallback values, used only where the caller did not choose a setting
+    default_settings = {
+        "USER_AGENT": "CustomUserAgent/1.0",  # Use a custom user agent
+        "DOWNLOAD_DELAY": 1,  # Prevent overloading servers
+        "COOKIES_ENABLED": bool(cookies),
+        "LOG_LEVEL": "ERROR",  # Minimize log verbosity
+    }
+    if settings:
+        # Build a new dict so the caller's mapping is left untouched and its
+        # values (e.g. USER_AGENT, LOG_LEVEL) win over the defaults above.
+        process_settings = dict(default_settings)
+        process_settings.update(settings)
+    else:
+        process_settings = get_project_settings()
+        process_settings.update(default_settings)
+    # Initialize and configure Scrapy process
+    process = CrawlerProcess(settings=process_settings)
+    # Connect the receiver on this crawler's own signal manager rather than on
+    # the global dispatcher, so it is scoped to this crawl only.
+    crawler = process.create_crawler(FetchSpider)
+    crawler.signals.connect(handle_item, signal=signals.item_scraped)
+    # Start the Scrapy crawl
+    process.crawl(
+        crawler,
+        url=url,
+        parser=parser,
+        cookies=cookies,
+        headers=headers,
+    )
+    # NOTE(review): Scrapy's docs state the Twisted reactor cannot be restarted,
+    # so a second call in the same process is expected to fail - confirm.
+    process.start()  # Blocks until all crawls are finished
+    # Return the first scraped content or None if empty
+    return content[0] if content else None
 def fetch_all(
     url,
     parser="lxml",
@@ -558,6 +647,16 @@ def fetch_all(
             else:
                 logger.warning("Selenium could not fetch content")
                 return None, None
+        elif 'scr' in driver.lower():
+            settings = {
+                "USER_AGENT": user_agent(),
+                "DOWNLOAD_DELAY": 1,  # Prevent overloading the server
+                "COOKIES_ENABLED": True if cookies else False,
+                "LOG_LEVEL": "WARNING",  # Reduce log verbosity
+            }
+            content=fetch_scrapy(url, parser=parser, cookies=cookies, headers=headers, settings=settings)
+            return parser, content
     except requests.RequestException as e:
         logger.error(f"Error fetching URL '{url}': {e}")
         return None, None

py2ls/ocr.py CHANGED Viewed

@@ -1,24 +1,15 @@
-import easyocr
 import cv2
 import numpy as np
 import matplotlib.pyplot as plt
 from py2ls.ips import (
     strcmp,
     detect_angle,
-)  # Ensure this function is defined in your 'ips' module
-from spellchecker import SpellChecker
-import re
-from PIL import Image, ImageDraw, ImageFont
-import PIL.PngImagePlugin
-import pytesseract
-from paddleocr import PaddleOCR
+    str2words,
+    isa
+)
 import logging
-logging.getLogger("ppocr").setLevel(
-    logging.WARNING
-)  # or logging.ERROR to show only error messages
 """
     Optical Character Recognition (OCR)
 """
@@ -285,10 +276,12 @@ def add_text_pil(
     image,
     text,
     position,
+    cvt_cmp=True,
     font_size=12,
     color=(0, 0, 0),
     bg_color=(133, 203, 245, 100),
 ):
+    from PIL import Image, ImageDraw, ImageFont
     # Convert the image to PIL format
     pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)).convert("RGBA")
     # Define the font (make sure to use a font that supports Chinese characters)
@@ -337,7 +330,7 @@ def add_text_pil(
         overlay = overlay.convert("RGBA")
     combined = Image.alpha_composite(pil_image, overlay)
     # Convert the image back to OpenCV format
-    image = cv2.cvtColor(np.array(combined), cv2.COLOR_RGBA2BGR)
+    image = cv2.cvtColor(np.array(combined), cv2.COLOR_RGBA2BGR) #if cvt_cmp else np.array(combined)
     return image
@@ -348,7 +341,7 @@ def preprocess_img(
     threshold_method="adaptive",
     rotate="auto",
     skew=False,
-    blur=True,
+    blur=False,#True,
     blur_ksize=(5, 5),
     morph=True,
     morph_op="open",
@@ -384,12 +377,14 @@ def preprocess_img(
     clahe_grid_size: CLAHE 的网格大小。
     edge_detection: 是否进行边缘检测。
     """
+    import PIL.PngImagePlugin
     if isinstance(image, PIL.PngImagePlugin.PngImageFile):
         image = np.array(image)
     if isinstance(image, str):
         image = cv2.imread(image)
     if not isinstance(image, np.ndarray):
         image = np.array(image)
     try:
         if image.shape[1] == 4:  # Check if it has an alpha channel
             # Drop the alpha channel (if needed), or handle it as required
@@ -507,6 +502,8 @@ def text_postprocess(
     pattern=None,
     merge=True,
 ):
+    import re
+    from spellchecker import SpellChecker
     def correct_spelling(text_list):
         spell = SpellChecker()
@@ -531,9 +528,9 @@ def text_postprocess(
         return merged_text
     results = text
-    print(results)
     if spell_check:
-        results = correct_spelling(results)
+        # results = correct_spelling(results)
+        results=str2words(results)
     if clean:
         results = clean_text(results)
     if filter:
@@ -552,42 +549,39 @@ def get_text(
     image,
     lang=["ch_sim", "en"],
     model="paddleocr",  # "pytesseract","paddleocr","easyocr"
-    thr=0.1,
+    thr=0.1,
     gpu=True,
     decoder="wordbeamsearch",  #'greedy', 'beamsearch' and 'wordbeamsearch'(hightly accurate)
     output="txt",
     preprocess=None,
-    postprocess="not ready",
+    postprocess=False,# do not check spell
     show=True,
     ax=None,
     cmap=cv2.COLOR_BGR2RGB,  # draw_box
-    font=cv2.FONT_HERSHEY_SIMPLEX,
-    font_scale=0.8,
-    thickness_text=2,  # Line thickness of 2 px
-    box_color=(0, 255, 0),  # draw_box
-    font_color=(0, 0, 0),
-    bg_color=(133, 203, 245, 100),
+    font=cv2.FONT_HERSHEY_SIMPLEX,# draw_box
+    fontsize=8,# draw_box
+    figsize=[10,10],
+    box_color = (0, 255, 0),  # draw_box
+    fontcolor = (0, 0, 0),# draw_box
+    bg_color=(133, 203, 245, 100),# draw_box
     usage=False,
     **kwargs,
 ):
     """
-    功能: 该函数使用 EasyOCR 进行文本识别，并允许自定义图像预处理步骤和结果展示。
-    参数:
-    image: 输入的图像路径或图像数据。
-    lang: OCR 语言列表。
-    thr: 置信度阈值，低于此阈值的检测结果将被过滤。
-    gpu: 是否使用 GPU。
-    output: 输出类型，可以是 'all'（返回所有检测结果）、'text'（返回文本）、'score'（返回置信度分数）、'box'（返回边界框）。
-    preprocess: 预处理参数字典，传递给 preprocess_img 函数。
-    show: 是否显示结果图像。
-    ax: 用于显示图像的 Matplotlib 子图。
-    cmap: 用于显示图像的颜色映射。
-    box_color: 边界框的颜色。
-    font_color: 文本的颜色。
-    kwargs: 传递给 EasyOCR readtext 函数的其他参数。
-    # Uage
+        image: 输入的图像路径或图像数据。
+        lang: OCR 语言列表。
+        thr: 置信度阈值，低于此阈值的检测结果将被过滤。
+        gpu: 是否使用 GPU。
+        output: 输出类型，可以是 'all'（返回所有检测结果）、'text'（返回文本）、'score'（返回置信度分数）、'box'（返回边界框）。
+        preprocess: 预处理参数字典，传递给 preprocess_img 函数。
+        show: 是否显示结果图像。
+        ax: 用于显示图像的 Matplotlib 子图。
+        cmap: 用于显示图像的颜色映射。
+        box_color: 边界框的颜色。
+        fontcolor: 文本的颜色。
+        kwargs: 传递给 EasyOCR readtext 函数的其他参数。
     """
+    from PIL import Image
     if usage:
         print(
             """
@@ -612,16 +606,19 @@ def get_text(
                 "edge_detection": False
             },
             adjust_contrast=0.7
-        )
-    """
-        )
+        )""")
-    models = ["easyocr", "paddleocr", "pytesseract","ddddocr"]
+    models = ["easyocr", "paddleocr", "pytesseract","ddddocr","zerox"]
     model = strcmp(model, models)[0]
     lang = lang_auto_detect(lang, model)
-    if isinstance(image, str):
-        dir_img=image
+    cvt_cmp=True
+    if isinstance(image, str) and isa(image,'file'):
         image = cv2.imread(image)
+    elif isa(image,'image'):
+        cvt_cmp=False
+        image = np.array(image)
+    else:
+        raise ValueError(f"not support image with {type(image)} type")
     # Ensure lang is always a list
     if isinstance(lang, str):
@@ -631,110 +628,96 @@ def get_text(
     if preprocess is None:
         preprocess = {}
     image_process = preprocess_img(image, **preprocess)
+    plt.figure(figsize=figsize) if show else None
+    # plt.subplot(131)
+    # plt.imshow(cv2.cvtColor(image, cmap))  if cvt_cmp else plt.imshow(image)
+    # plt.subplot(132)
+    # plt.imshow(image_process)
+    # plt.subplot(133)
     if "easy" in model.lower():
+        import easyocr
         print(f"detecting language(s):{lang}")
         # Perform OCR on the image
         reader = easyocr.Reader(lang, gpu=gpu)
         detections = reader.readtext(image_process, decoder=decoder, **kwargs)
-        if postprocess is None:
-            postprocess = dict(
-                spell_check=True,
-                clean=True,
-                filter=dict(min_length=2),
-                pattern=None,
-                merge=True,
-            )
-            text_corr = []
-            [
-                text_corr.extend(text_postprocess(text, **postprocess))
-                for _, text, _ in detections
-            ]
+        text_corr = []
+        for _, text, _ in detections:
+            text_corr.append(text_postprocess(text) if postprocess else text)
         if show:
             if ax is None:
                 ax = plt.gca()
-            for bbox, text, score in detections:
+            for i, (bbox, text, score) in enumerate(detections):
                 if score > thr:
                     top_left = tuple(map(int, bbox[0]))
                     bottom_right = tuple(map(int, bbox[2]))
-                    image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
-                    # image = cv2.putText(
-                    #     image, text, top_left, font, font_scale, font_color, thickness_text
-                    # )
+                    image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
                     image = add_text_pil(
                         image,
-                        text,
+                        text_corr[i],
                         top_left,
-                        font_size=font_scale * 32,
-                        color=font_color,
+                        cvt_cmp=cvt_cmp,
+                        font_size=fontsize *6,
+                        color=fontcolor,
                     )
-            # img_cmp = cv2.cvtColor(image, cmap)
-            ax.imshow(image)
+            try:
+                img_cmp = cv2.cvtColor(image, cmap) if cvt_cmp else image
+            except:
+                img_cmp=image
+            ax.imshow(img_cmp) if cvt_cmp else ax.imshow(image)
             ax.axis("off")
-            # plt.show()
-            # 根据输出类型返回相应的结果
             if output == "all":
                 return ax, detections
             elif "t" in output.lower() and "x" in output.lower():
-                # 提取文本，过滤低置信度的结果
                 text = [text_ for _, text_, score_ in detections if score_ >= thr]
                 if postprocess:
                     return ax, text
                 else:
                     return text_corr
             elif "score" in output.lower() or "prob" in output.lower():
-                # 提取分数
                 scores = [score_ for _, _, score_ in detections]
                 return ax, scores
             elif "box" in output.lower():
-                # 提取边界框，过滤低置信度的结果
                 bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
                 return ax, bboxes
             else:
-                # 默认返回所有检测信息
                 return ax, detections
         else:
-            # 根据输出类型返回相应的结果
             if output == "all":
                 return detections
             elif "t" in output.lower() and "x" in output.lower():
-                # 提取文本，过滤低置信度的结果
                 text = [text_ for _, text_, score_ in detections if score_ >= thr]
                 return text
             elif "score" in output.lower() or "prob" in output.lower():
-                # 提取分数
                 scores = [score_ for _, _, score_ in detections]
                 return scores
             elif "box" in output.lower():
-                # 提取边界框，过滤低置信度的结果
                 bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
                 return bboxes
             else:
-                # 默认返回所有检测信息
                 return detections
     elif "pad" in model.lower():
+        from paddleocr import PaddleOCR
+        logging.getLogger("ppocr").setLevel(logging.ERROR)
+        lang=strcmp(lang, ['ch','en','french','german','korean','japan'])[0]
         ocr = PaddleOCR(
             use_angle_cls=True,
             cls=True,
+            lang=lang
         )  # PaddleOCR supports only one language at a time
-        result = ocr.ocr(image_process, **kwargs)
+        cls=kwargs.pop('cls',True)
+        result = ocr.ocr(image_process,cls=cls, **kwargs)
         detections = []
         if result[0] is not None:
             for line in result[0]:
                 bbox, (text, score) = line
+                text = str2words(text) if postprocess else text # check spell
                 detections.append((bbox, text, score))
-        if postprocess is None:
-            postprocess = dict(
-                spell_check=True,
-                clean=True,
-                filter=dict(min_length=2),
-                pattern=None,
-                merge=True,
-            )
-            text_corr = []
-            [
-                text_corr.extend(text_postprocess(text, **postprocess))
-                for _, text, _ in detections
-            ]
         if show:
             if ax is None:
                 ax = plt.gca()
@@ -746,60 +729,48 @@ def get_text(
                     )  # Bottom-left for more accurate placement
                     bottom_right = tuple(map(int, bbox[2]))
                     image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
-                    # image = cv2.putText(
-                    #     image, text, top_left, font, font_scale, font_color, thickness_text
-                    # )
                     image = add_text_pil(
                         image,
                         text,
                         top_left,
-                        font_size=font_scale * 32,
-                        color=font_color,
+                        cvt_cmp=cvt_cmp,
+                        font_size=fontsize *6,
+                        color=fontcolor,
                         bg_color=bg_color,
                     )
-            img_cmp = cv2.cvtColor(image, cmap)
-            ax.imshow(image)
+            try:
+                img_cmp = cv2.cvtColor(image, cmap) if cvt_cmp else image
+            except:
+                img_cmp = image
+            ax.imshow(img_cmp)
             ax.axis("off")
-            # plt.show()
-            # 根据输出类型返回相应的结果
             if output == "all":
                 return ax, detections
             elif "t" in output.lower() and "x" in output.lower():
-                # 提取文本，过滤低置信度的结果
                 text = [text_ for _, text_, score_ in detections if score_ >= thr]
-                if postprocess:
-                    return ax, text
-                else:
-                    return text_corr
+                return ax, text
             elif "score" in output.lower() or "prob" in output.lower():
-                # 提取分数
                 scores = [score_ for _, _, score_ in detections]
                 return ax, scores
             elif "box" in output.lower():
-                # 提取边界框，过滤低置信度的结果
                 bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
                 return ax, bboxes
             else:
-                # 默认返回所有检测信息
                 return ax, detections
         else:
-            # 根据输出类型返回相应的结果
             if output == "all":
                 return detections
             elif "t" in output.lower() and "x" in output.lower():
-                # 提取文本，过滤低置信度的结果
                 text = [text_ for _, text_, score_ in detections if score_ >= thr]
                 return text
             elif "score" in output.lower() or "prob" in output.lower():
-                # 提取分数
                 scores = [score_ for _, _, score_ in detections]
                 return scores
             elif "box" in output.lower():
-                # 提取边界框，过滤低置信度的结果
                 bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
                 return bboxes
             else:
-                # 默认返回所有检测信息
                 return detections
     elif "ddddocr" in  model.lower():
         import ddddocr
@@ -844,7 +815,51 @@ def get_text(
             ax.imshow(image_vis)
             ax.axis("off")
         return detections
+    elif "zerox" in model.lower():
+        from pyzerox import zerox
+        result = zerox(image_process)
+        detections = [(bbox, text, score) for bbox, text, score in result]
+        # Postprocess and visualize
+        if postprocess is None:
+            postprocess = dict(
+                spell_check=True,
+                clean=True,
+                filter=dict(min_length=2),
+                pattern=None,
+                merge=True,
+            )
+        text_corr = [text_postprocess(text, **postprocess) for _, text, _ in detections]
+        # Display results if 'show' is True
+        if show:
+            if ax is None:
+                ax = plt.gca()
+            for bbox, text, score in detections:
+                if score > thr:
+                    top_left = tuple(map(int, bbox[0]))
+                    bottom_right = tuple(map(int, bbox[2]))
+                    image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
+                    image = add_text_pil(image, text, top_left, cvt_cmp=cvt_cmp,font_size=fontsize *6, color=fontcolor, bg_color=bg_color)
+            ax.imshow(image)
+            ax.axis("off")
+        # Return result based on 'output' type
+        if output == "all":
+            return ax, detections
+        elif "t" in output.lower() and "x" in output.lower():
+            text = [text_ for _, text_, score_ in detections if score_ >= thr]
+            return ax, text
+        elif "score" in output.lower() or "prob" in output.lower():
+            scores = [score_ for _, _, score_ in detections]
+            return ax, scores
+        elif "box" in output.lower():
+            bboxes = [bbox_ for bbox_, _, score_ in detections if score_ >= thr]
+            return ax, bboxes
+        else:
+            return detections
     else:  # "pytesseract"
+        import pytesseract
         if ax is None:
             ax = plt.gca()
         text = pytesseract.image_to_string(image_process, lang="+".join(lang), **kwargs)
@@ -869,8 +884,9 @@ def get_text(
                         image,
                         char,
                         left,
-                        font_size=font_scale * 32,
-                        color=font_color,
+                        cvt_cmp=cvt_cmp,
+                        font_size=fontsize *6,
+                        color=fontcolor,
                     )
             img_cmp = cv2.cvtColor(image, cmap)
             ax.imshow(img_cmp)
@@ -906,8 +922,8 @@ def draw_box(
     thr=0.25,
     cmap=cv2.COLOR_BGR2RGB,
     box_color=(0, 255, 0),  # draw_box
-    font_color=(0, 0, 255),  # draw_box
-    font_scale=0.8,
+    fontcolor=(0, 0, 255),  # draw_box
+    fontsize=8,
     show=True,
     ax=None,
     **kwargs,
@@ -924,12 +940,9 @@ def draw_box(
         if score > thr:
             top_left = tuple(map(int, bbox[0]))
             bottom_right = tuple(map(int, bbox[2]))
-            image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
-            # image = cv2.putText(
-            #     image, text, top_left, font, font_scale, font_color, thickness_text
-            # )
+            image = cv2.rectangle(image, top_left, bottom_right, box_color, 2)
             image = add_text_pil(
-                image, text, top_left, font_size=font_scale * 32, color=font_color
+                image, text, top_left, cvt_cmp=cvt_cmp,font_size=fontsize *6, color=fontcolor
             )
     img_cmp = cv2.cvtColor(image, cmap)

py2ls 0.2.4.32__py3-none-any.whl → 0.2.4.34__py3-none-any.whl

py2ls 0.2.4.32__py3-none-any.whl → 0.2.4.34__py3-none-any.whl