hjxdl 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hdl/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.1.1'
16
- __version_tuple__ = version_tuple = (0, 1, 1)
15
+ __version__ = version = '0.1.3'
16
+ __version_tuple__ = version_tuple = (0, 1, 3)
@@ -0,0 +1,119 @@
1
+ import pdfplumber
2
+ import pytesseract
3
+ from PIL import Image
4
+ import pandas as pd
5
+ import io
6
+ from spire.doc import Document
7
+ from spire.doc.common import *
8
+
9
+
10
class DocExtractor():
    """Extract text and tables from Word, plain-text and PDF documents.

    The instance stores the OCR language (``lang``) that is passed to
    Tesseract when tables are recovered from images embedded in PDF pages.
    """

    def __init__(
        self,
        lang: str = "chi_sim"
    ) -> None:
        """Create an extractor.

        Args:
            lang: Language code handed to ``pytesseract.image_to_string``.
        """
        self.lang = lang

    @staticmethod
    def text_from_doc(
        doc_path
    ):
        """Return the text of the Word document at ``doc_path``.

        This was previously declared as a ``classmethod`` without a ``cls``
        parameter, so any call with a path raised ``TypeError``; it is a
        static method now, which keeps ``DocExtractor.text_from_doc(path)``
        and ``instance.text_from_doc(path)`` both valid.

        Args:
            doc_path: Path passed to ``Document.LoadFromFile``.

        Returns:
            Whatever ``Document.GetText()`` returns for the loaded file.
        """
        document = Document()
        try:
            # Load a Word document
            document.LoadFromFile(doc_path)
            return document.GetText()
        finally:
            # Release the document even when loading or extraction fails.
            document.Close()

    @staticmethod
    def text_from_plain(
        txt_path,
        encoding: str = "utf-8"
    ):
        """Return the full contents of a text file.

        Args:
            txt_path: Path of the file to read.
            encoding: Text encoding used to decode the file. An explicit
                default avoids depending on the platform locale.

        Returns:
            The file contents as a single string.
        """
        with open(txt_path, "r", encoding=encoding) as f:
            return f.read()

    @staticmethod
    def extract_text_from_image(
        image: "Image.Image",
        lang: str = "chi_sim",
    ) -> str:
        """Run OCR on ``image`` and return the recognised text.

        The method is static, so it cannot read ``self.lang``; the language
        is therefore an explicit parameter (the original body referenced
        ``self`` and failed with ``NameError``).

        Args:
            image: Image to recognise.
            lang: Language code forwarded to Tesseract.

        Returns:
            The string produced by ``pytesseract.image_to_string``.
        """
        return pytesseract.image_to_string(image, lang=lang)

    @staticmethod
    def is_within_bbox(
        bbox1, bbox2
    ):
        """Check if bbox1 is within bbox2.

        Both boxes are indexed as ``[0]..[3]``; the comparison requires the
        first two entries of ``bbox1`` to be >= and the last two to be <=
        the corresponding entries of ``bbox2`` (edges touching count as
        inside).
        """
        return (
            bbox1[0] >= bbox2[0]
            and bbox1[1] >= bbox2[1]
            and bbox1[2] <= bbox2[2]
            and bbox1[3] <= bbox2[3]
        )

    @staticmethod
    def _frames_from_tables(tables, page_number):
        """Turn the tables found on one page into DataFrames tagged with the page."""
        frames = []
        for table in tables:
            if not table:
                continue
            # Extract once; the first row is used as the header.
            table_data = table.extract()
            if len(table_data) > 1:
                df = pd.DataFrame(table_data[1:], columns=table_data[0])
                df['Page'] = page_number  # record the page number
                frames.append(df)
        return frames

    def _text_outside_tables(self, page, tables):
        """Join the characters of ``page`` that lie outside every table bbox."""
        table_bboxes = [table.bbox for table in tables]
        kept = []
        for char in page.chars:
            char_bbox = (char['x0'], char['top'], char['x1'], char['bottom'])
            if not any(
                self.is_within_bbox(char_bbox, table_bbox)
                for table_bbox in table_bboxes
            ):
                kept.append(char['text'])
        return ''.join(kept).strip()

    @staticmethod
    def _frame_from_ocr_text(ocr_text, page_number):
        """Build a DataFrame from whitespace-separated OCR lines, or None if empty."""
        rows = [line.split() for line in ocr_text.split('\n') if line.strip()]
        if not rows:
            return None
        # Pad short rows so every row has the same number of cells.
        num_columns = max(len(row) for row in rows)
        for row in rows:
            row.extend([''] * (num_columns - len(row)))
        df = pd.DataFrame(rows[1:], columns=rows[0])
        df['Page'] = page_number
        return df

    def _frames_from_images(self, page, page_number):
        """OCR each image on ``page`` and return the tables recovered from them."""
        frames = []
        for img in page.images:
            # Best effort: one unreadable image must not abort the document.
            try:
                x0, top, x1, bottom = img["x0"], img["top"], img["x1"], img["bottom"]
                if x0 < 0 or top < 0 or x1 > page.width or bottom > page.height:
                    print(f"Skipping image with invalid bounds on page {page_number}")
                    continue

                cropped_image = page.within_bbox((x0, top, x1, bottom)).to_image()
                img_bytes = io.BytesIO()
                cropped_image.save(img_bytes, format='PNG')
                img_bytes.seek(0)
                pil_image = Image.open(img_bytes)

                ocr_text = self.extract_text_from_image(pil_image, lang=self.lang)
                df = self._frame_from_ocr_text(ocr_text, page_number)
                if df is not None:
                    frames.append(df)
            except Exception as e:
                print(f"Error processing image on page {page_number}: {e}")
        return frames

    def text_tables_from_pdf(
        self,
        pdf_path,
        table_from_pic: bool = False
    ):
        """Extract non-table text and tables from every page of a PDF.

        Args:
            pdf_path: Path passed to ``pdfplumber.open``.
            table_from_pic: When true, images on each page are additionally
                OCR'd (using ``self.lang``) and parsed as tables.

        Returns:
            A tuple ``(all_texts, all_tables)``. ``all_texts`` holds one
            string per page that has text outside its table bounding boxes.
            ``all_tables`` is a list of DataFrames, each with a ``Page``
            column holding the 1-based page number; when no table was found
            it is a list containing a single empty DataFrame.
        """
        all_tables = []
        all_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                tables = page.find_tables()

                # Extract tables
                all_tables.extend(self._frames_from_tables(tables, page_number))

                # Filter out text within table bounding boxes
                remaining_text = self._text_outside_tables(page, tables)
                if remaining_text:
                    all_texts.append(remaining_text)

                # Extract tables from images if specified
                if table_from_pic:
                    all_tables.extend(self._frames_from_images(page, page_number))

        if not all_tables:
            return all_texts, [pd.DataFrame()]
        return all_texts, all_tables
118
+
119
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hjxdl
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: A collection of functions for Jupyter notebooks
5
5
  Home-page: https://github.com/huluxiaohuowa/hdl
6
6
  Author: Jianxing Hu
@@ -1,5 +1,5 @@
1
1
  hdl/__init__.py,sha256=5sZZNySv08wwfzJcSDssGTqUn9wlmDsR6R4XB8J8mFM,70
2
- hdl/_version.py,sha256=PKIMyjdUACH4-ONvtunQCnYE2UhlMfp9su83e3HXl5E,411
2
+ hdl/_version.py,sha256=L5DCMp1QAlSqy-8bW7d51bLubTxNjZGYc5fMQkb752U,411
3
3
  hdl/args/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  hdl/args/loss_args.py,sha256=s7YzSdd7IjD24rZvvOrxLLFqMZQb9YylxKeyelSdrTk,70
5
5
  hdl/controllers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -86,9 +86,10 @@ hdl/utils/general/glob.py,sha256=8-RCnt6L297wMIfn34ZAMCsGCZUjHG3MGglGZI1cX0g,491
86
86
  hdl/utils/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
87
  hdl/utils/llm/chat.py,sha256=H2c8assJlSdZQKIfPkYrVZHqv66TsdsxtaLXv0kNe1w,11565
88
88
  hdl/utils/llm/embs.py,sha256=sC8tga7HgDwPI2m7TDWKp9kkxEIMxEyMtgmEhfRi4vI,6362
89
+ hdl/utils/llm/extract.py,sha256=eF-oHu5sMtes8I6ZfNXnEykPfzqbn-2WvnIKiUMz6BA,4573
89
90
  hdl/utils/schedulers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
91
  hdl/utils/schedulers/norm_lr.py,sha256=bDwCmdEK-WkgxQMFBiMuchv8Mm7C0-GZJ6usm-PQk14,4461
91
- hjxdl-0.1.1.dist-info/METADATA,sha256=ORYQSX57x_WZQ1pG6KIjvesKNPshg1vCKv_8kw0hyas,542
92
- hjxdl-0.1.1.dist-info/WHEEL,sha256=pd56usn78UTvq1xeX_ZwFhoK6jE5u5wzu4TTBIG5cQ0,91
93
- hjxdl-0.1.1.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
94
- hjxdl-0.1.1.dist-info/RECORD,,
92
+ hjxdl-0.1.3.dist-info/METADATA,sha256=e-L25DrhaIVW_yfpSpmoqH4k-J6yaTooOGviFgDXFwo,542
93
+ hjxdl-0.1.3.dist-info/WHEEL,sha256=-oYQCr74JF3a37z2nRlQays_SX2MqOANoqVjBBAP2yE,91
94
+ hjxdl-0.1.3.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
95
+ hjxdl-0.1.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.0.0)
2
+ Generator: setuptools (71.0.3)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5