PyPI - hjxdl - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

hjxdl 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

hdl/_version.py +2 -2
hdl/utils/llm/extract.py +119 -0
{hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/METADATA +1 -1
{hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/RECORD +6 -5
{hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/WHEEL +1 -1
{hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/top_level.txt +0 -0

hdl/_version.py CHANGED Viewed

@@ -12,5 +12,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '0.1.1'
-__version_tuple__ = version_tuple = (0, 1, 1)
+__version__ = version = '0.1.3'
+__version_tuple__ = version_tuple = (0, 1, 3)

hdl/utils/llm/extract.py ADDED Viewed

@@ -0,0 +1,119 @@
+import pdfplumber
+import pytesseract
+from PIL import Image
+import pandas as pd
+import io
+from spire.doc import Document
+from spire.doc.common import *
+class DocExtractor():
+    def __init__(
+        self,
+        lang: str = "chi_sim"
+    ) -> None:
+        self.lang = lang
+    @classmethod
+    def text_from_doc(
+        doc_path
+    ):
+        document = Document()
+        # Load a Word document
+        document.LoadFromFile(doc_path)
+        document_text = document.GetText()
+        return document_text
+    @staticmethod
+    def text_from_plain(
+        txt_path
+    ):
+        with open(txt_path, "r") as f:
+            text = f.read()
+        return text
+    @staticmethod
+    def extract_text_from_image(
+        image: Image.Image,
+    ) -> str:
+        return pytesseract.image_to_string(image, lang=self.lang)
+    @staticmethod
+    def is_within_bbox(
+        bbox1, bbox2
+    ):
+        """Check if bbox1 is within bbox2."""
+        return bbox1[0] >= bbox2[0] and bbox1[1] >= bbox2[1] and bbox1[2] <= bbox2[2] and bbox1[3] <= bbox2[3]
+    def text_tables_from_pdf(
+        self,
+        pdf_path,
+        table_from_pic: bool = False
+    ):
+        all_tables = []
+        all_texts = []
+        with pdfplumber.open(pdf_path) as pdf:
+            for page_number, page in enumerate(pdf.pages):
+                tables = page.find_tables()
+                page_text = page.extract_text(x_tolerance=0.1, y_tolerance=0.1) or ''
+                page_text_lines = page_text.split('\n')
+                # Extract tables
+                if tables:
+                    for table in tables:
+                        if table and len(table.extract()) > 1:
+                            table_data = table.extract()
+                            df = pd.DataFrame(table_data[1:], columns=table_data[0])
+                            df['Page'] = page_number + 1  # 添加页码信息
+                            all_tables.append(df)
+                # Get bounding boxes for tables
+                table_bboxes = [table.bbox for table in tables]
+                # Filter out text within table bounding boxes
+                non_table_text = []
+                for char in page.chars:
+                    char_bbox = (char['x0'], char['top'], char['x1'], char['bottom'])
+                    if not any(self.is_within_bbox(char_bbox, table_bbox) for table_bbox in table_bboxes):
+                        non_table_text.append(char['text'])
+                remaining_text = ''.join(non_table_text).strip()
+                if remaining_text:
+                    all_texts.append(remaining_text)
+                # Extract tables from images if specified
+                if table_from_pic:
+                    for img in page.images:
+                        try:
+                            x0, top, x1, bottom = img["x0"], img["top"], img["x1"], img["bottom"]
+                            if x0 < 0 or top < 0 or x1 > page.width or bottom > page.height:
+                                print(f"Skipping image with invalid bounds on page {page_number + 1}")
+                                continue
+                            cropped_image = page.within_bbox((x0, top, x1, bottom)).to_image()
+                            img_bytes = io.BytesIO()
+                            cropped_image.save(img_bytes, format='PNG')
+                            img_bytes.seek(0)
+                            pil_image = Image.open(img_bytes)
+                            ocr_text = self.extract_text_from_image(pil_image, lang=self.lang)
+                            table = [line.split() for line in ocr_text.split('\n') if line.strip()]
+                            if table:
+                                num_columns = max(len(row) for row in table)
+                                for row in table:
+                                    if len(row) != num_columns:
+                                        row.extend([''] * (num_columns - len(row)))
+                                df = pd.DataFrame(table[1:], columns=table[0])
+                                df['Page'] = page_number + 1
+                                all_tables.append(df)
+                        except Exception as e:
+                            print(f"Error processing image on page {page_number + 1}: {e}")
+        if all_tables:
+            return all_texts, all_tables
+        else:
+            return all_texts, [pd.DataFrame()]

{hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: hjxdl
-Version: 0.1.1
+Version: 0.1.3
 Summary: A collection of functions for Jupyter notebooks
 Home-page: https://github.com/huluxiaohuowa/hdl
 Author: Jianxing Hu

{hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 hdl/__init__.py,sha256=5sZZNySv08wwfzJcSDssGTqUn9wlmDsR6R4XB8J8mFM,70
-hdl/_version.py,sha256=PKIMyjdUACH4-ONvtunQCnYE2UhlMfp9su83e3HXl5E,411
+hdl/_version.py,sha256=L5DCMp1QAlSqy-8bW7d51bLubTxNjZGYc5fMQkb752U,411
 hdl/args/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdl/args/loss_args.py,sha256=s7YzSdd7IjD24rZvvOrxLLFqMZQb9YylxKeyelSdrTk,70
 hdl/controllers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -86,9 +86,10 @@ hdl/utils/general/glob.py,sha256=8-RCnt6L297wMIfn34ZAMCsGCZUjHG3MGglGZI1cX0g,491
 hdl/utils/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdl/utils/llm/chat.py,sha256=H2c8assJlSdZQKIfPkYrVZHqv66TsdsxtaLXv0kNe1w,11565
 hdl/utils/llm/embs.py,sha256=sC8tga7HgDwPI2m7TDWKp9kkxEIMxEyMtgmEhfRi4vI,6362
+hdl/utils/llm/extract.py,sha256=eF-oHu5sMtes8I6ZfNXnEykPfzqbn-2WvnIKiUMz6BA,4573
 hdl/utils/schedulers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 hdl/utils/schedulers/norm_lr.py,sha256=bDwCmdEK-WkgxQMFBiMuchv8Mm7C0-GZJ6usm-PQk14,4461
-hjxdl-0.1.1.dist-info/METADATA,sha256=ORYQSX57x_WZQ1pG6KIjvesKNPshg1vCKv_8kw0hyas,542
-hjxdl-0.1.1.dist-info/WHEEL,sha256=pd56usn78UTvq1xeX_ZwFhoK6jE5u5wzu4TTBIG5cQ0,91
-hjxdl-0.1.1.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
-hjxdl-0.1.1.dist-info/RECORD,,
+hjxdl-0.1.3.dist-info/METADATA,sha256=e-L25DrhaIVW_yfpSpmoqH4k-J6yaTooOGviFgDXFwo,542
+hjxdl-0.1.3.dist-info/WHEEL,sha256=-oYQCr74JF3a37z2nRlQays_SX2MqOANoqVjBBAP2yE,91
+hjxdl-0.1.3.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
+hjxdl-0.1.3.dist-info/RECORD,,

{hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (71.0.0)
+Generator: setuptools (71.0.3)
 Root-Is-Purelib: true
 Tag: py3-none-any

{hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

hjxdl 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

hjxdl 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl