hjxdl 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hdl/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '0.1.1'
16
- __version_tuple__ = version_tuple = (0, 1, 1)
15
+ __version__ = version = '0.1.2'
16
+ __version_tuple__ = version_tuple = (0, 1, 2)
@@ -0,0 +1,121 @@
1
+ import pdfplumber
2
+ import pytesseract
3
+ from PIL import Image
4
+ import pandas as pd
5
+ import io
6
+ from spire.doc import Document
7
+ from spire.doc.common import *
8
+
9
+
10
class DocExtractor():
    """Extract text and tables from Word, plain-text and PDF documents.

    The instance stores the list of files it was created with and the
    Tesseract language code used when OCR is requested.
    """

    def __init__(
        self,
        doc_files: list,
        lang: str = "chi_sim"
    ) -> None:
        """Store the input files and the OCR language.

        Args:
            doc_files: Paths of the documents this extractor was created for.
            lang: Language code handed to Tesseract for OCR.
        """
        self.doc_files = doc_files
        self.lang = lang

    @staticmethod
    def text_from_doc(
        doc_path
    ):
        """Return the text of a Word document loaded through Spire.Doc.

        This was declared as a ``classmethod`` without a ``cls`` parameter,
        which made every call fail with ``TypeError``; it uses no class
        state, so it is a ``staticmethod``.

        Args:
            doc_path: Path of the Word document to load.

        Returns:
            Whatever ``Document.GetText()`` returns for the loaded file.
        """
        document = Document()
        try:
            # Load a Word document
            document.LoadFromFile(doc_path)
            return document.GetText()
        finally:
            # Release the document even when loading or extraction fails.
            document.Close()

    @staticmethod
    def text_from_plain(
        txt_path,
        encoding=None
    ):
        """Return the full content of a text file.

        Args:
            txt_path: Path of the file to read.
            encoding: Text encoding passed to ``open``. ``None`` (the
                default) keeps the platform default encoding.

        Returns:
            The file content as a single string.
        """
        with open(txt_path, "r", encoding=encoding) as f:
            return f.read()

    @staticmethod
    def extract_text_from_image(
        image: "Image.Image",
        lang: str = "chi_sim",
    ) -> str:
        """Run Tesseract OCR on an image and return the recognised text.

        A static method has no ``self``, so the language is an explicit
        parameter instead of being read from the instance.

        Args:
            image: Image to recognise.
            lang: Language code handed to Tesseract.

        Returns:
            The string produced by ``pytesseract.image_to_string``.
        """
        return pytesseract.image_to_string(image, lang=lang)

    @staticmethod
    def is_within_bbox(
        bbox1, bbox2
    ):
        """Check if bbox1 is within bbox2.

        Both boxes are indexed as ``(left, top, right, bottom)``; touching
        edges count as inside.
        """
        return (
            bbox1[0] >= bbox2[0]
            and bbox1[1] >= bbox2[1]
            and bbox1[2] <= bbox2[2]
            and bbox1[3] <= bbox2[3]
        )

    @staticmethod
    def _table_from_ocr_text(ocr_text, page_label):
        """Build a DataFrame from whitespace-separated OCR text.

        Each non-blank line becomes a row split on whitespace, short rows
        are padded with empty strings, the first row supplies the column
        names and a ``Page`` column holding ``page_label`` is appended.
        Returns ``None`` when the text has no non-blank line.
        """
        rows = [line.split() for line in ocr_text.split('\n') if line.strip()]
        if not rows:
            return None

        width = max(len(row) for row in rows)
        for row in rows:
            row.extend([''] * (width - len(row)))

        df = pd.DataFrame(rows[1:], columns=rows[0])
        df['Page'] = page_label
        return df

    def _tables_from_page_images(self, page, page_label):
        """OCR every image on ``page`` and return the tables recovered.

        Failures are reported with ``print`` and skipped so that one bad
        image does not abort the whole document.
        """
        found = []
        for img in page.images:
            try:
                x0, top, x1, bottom = img["x0"], img["top"], img["x1"], img["bottom"]
                if x0 < 0 or top < 0 or x1 > page.width or bottom > page.height:
                    print(f"Skipping image with invalid bounds on page {page_label}")
                    continue

                # Render the image region and reload it as a PIL image
                # through an in-memory PNG.
                cropped_image = page.within_bbox((x0, top, x1, bottom)).to_image()
                img_bytes = io.BytesIO()
                cropped_image.save(img_bytes, format='PNG')
                img_bytes.seek(0)
                pil_image = Image.open(img_bytes)

                ocr_text = self.extract_text_from_image(pil_image, lang=self.lang)
                df = self._table_from_ocr_text(ocr_text, page_label)
                if df is not None:
                    found.append(df)
            except Exception as e:
                print(f"Error processing image on page {page_label}: {e}")
        return found

    def text_tables_from_pdf(
        self,
        pdf_path,
        table_from_pic: bool = False
    ):
        """Extract non-table text and tables from every page of a PDF.

        Args:
            pdf_path: Path of the PDF handed to ``pdfplumber.open``.
            table_from_pic: When true, images on each page are additionally
                OCR-ed with ``self.lang`` and parsed as tables.

        Returns:
            A ``(texts, tables)`` tuple. ``texts`` has one string per page
            that contains characters outside every detected table; the
            characters are concatenated as-is. ``tables`` is a list of
            DataFrames carrying a 1-based ``Page`` column, or a list with a
            single empty DataFrame when no table was found.
        """
        all_tables = []
        all_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages):
                page_label = page_number + 1
                tables = page.find_tables()

                # Extract tables; a table needs a header plus at least one row.
                for table in tables:
                    if not table:
                        continue
                    table_data = table.extract()
                    if len(table_data) > 1:
                        df = pd.DataFrame(table_data[1:], columns=table_data[0])
                        df['Page'] = page_label  # add page-number information
                        all_tables.append(df)

                # Get bounding boxes for tables
                table_bboxes = [table.bbox for table in tables]

                # Filter out text within table bounding boxes
                non_table_text = []
                for char in page.chars:
                    char_bbox = (char['x0'], char['top'], char['x1'], char['bottom'])
                    if not any(
                        self.is_within_bbox(char_bbox, table_bbox)
                        for table_bbox in table_bboxes
                    ):
                        non_table_text.append(char['text'])
                remaining_text = ''.join(non_table_text).strip()
                if remaining_text:
                    all_texts.append(remaining_text)

                # Extract tables from images if specified
                if table_from_pic:
                    all_tables.extend(
                        self._tables_from_page_images(page, page_label)
                    )

        if all_tables:
            return all_texts, all_tables
        return all_texts, [pd.DataFrame()]
120
+
121
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hjxdl
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: A collection of functions for Jupyter notebooks
5
5
  Home-page: https://github.com/huluxiaohuowa/hdl
6
6
  Author: Jianxing Hu
@@ -1,5 +1,5 @@
1
1
  hdl/__init__.py,sha256=5sZZNySv08wwfzJcSDssGTqUn9wlmDsR6R4XB8J8mFM,70
2
- hdl/_version.py,sha256=PKIMyjdUACH4-ONvtunQCnYE2UhlMfp9su83e3HXl5E,411
2
+ hdl/_version.py,sha256=SFCDdrYA67D1Je-jHgVVh4LOopkPvuV6NMtqSJ7Tfhg,411
3
3
  hdl/args/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  hdl/args/loss_args.py,sha256=s7YzSdd7IjD24rZvvOrxLLFqMZQb9YylxKeyelSdrTk,70
5
5
  hdl/controllers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -86,9 +86,10 @@ hdl/utils/general/glob.py,sha256=8-RCnt6L297wMIfn34ZAMCsGCZUjHG3MGglGZI1cX0g,491
86
86
  hdl/utils/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
87
87
  hdl/utils/llm/chat.py,sha256=H2c8assJlSdZQKIfPkYrVZHqv66TsdsxtaLXv0kNe1w,11565
88
88
  hdl/utils/llm/embs.py,sha256=sC8tga7HgDwPI2m7TDWKp9kkxEIMxEyMtgmEhfRi4vI,6362
89
+ hdl/utils/llm/extract.py,sha256=Ze77dYrXqRTPiz3NMJiLvRMbXZ_-TY5Rq37Cionc1E4,4633
89
90
  hdl/utils/schedulers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
91
  hdl/utils/schedulers/norm_lr.py,sha256=bDwCmdEK-WkgxQMFBiMuchv8Mm7C0-GZJ6usm-PQk14,4461
91
- hjxdl-0.1.1.dist-info/METADATA,sha256=ORYQSX57x_WZQ1pG6KIjvesKNPshg1vCKv_8kw0hyas,542
92
- hjxdl-0.1.1.dist-info/WHEEL,sha256=pd56usn78UTvq1xeX_ZwFhoK6jE5u5wzu4TTBIG5cQ0,91
93
- hjxdl-0.1.1.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
94
- hjxdl-0.1.1.dist-info/RECORD,,
92
+ hjxdl-0.1.2.dist-info/METADATA,sha256=IgnNn-u7YPXh3WAFGSBihoMWK4jLwQ4XllCPoGf9r-o,542
93
+ hjxdl-0.1.2.dist-info/WHEEL,sha256=-oYQCr74JF3a37z2nRlQays_SX2MqOANoqVjBBAP2yE,91
94
+ hjxdl-0.1.2.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
95
+ hjxdl-0.1.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (71.0.0)
2
+ Generator: setuptools (71.0.3)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5