hjxdl 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hdl/_version.py +2 -2
- hdl/utils/llm/extract.py +119 -0
- {hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/METADATA +1 -1
- {hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/RECORD +6 -5
- {hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/WHEEL +1 -1
- {hjxdl-0.1.1.dist-info → hjxdl-0.1.3.dist-info}/top_level.txt +0 -0
hdl/_version.py
CHANGED
hdl/utils/llm/extract.py
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
import pdfplumber
|
2
|
+
import pytesseract
|
3
|
+
from PIL import Image
|
4
|
+
import pandas as pd
|
5
|
+
import io
|
6
|
+
from spire.doc import Document
|
7
|
+
from spire.doc.common import *
|
8
|
+
|
9
|
+
|
10
|
+
class DocExtractor:
    """Extract text and tables from Word, plain-text and PDF documents.

    Word files are read through ``spire.doc``, PDFs through ``pdfplumber``,
    and images embedded in PDF pages can optionally be OCR'd with
    ``pytesseract`` using the language configured on the instance.
    """

    def __init__(
        self,
        lang: str = "chi_sim"
    ) -> None:
        """Remember the default OCR language.

        Args:
            lang: Language code handed to ``pytesseract.image_to_string``
                whenever no explicit language is given.
        """
        self.lang = lang

    @staticmethod
    def text_from_doc(
        doc_path
    ):
        """Return the full text of a Word document.

        This was declared as a ``@classmethod`` without a ``cls`` parameter,
        so any call with a path raised ``TypeError``. It uses no class state,
        hence it is a ``@staticmethod`` like ``text_from_plain``.

        Args:
            doc_path: Path of the Word document to load.

        Returns:
            The text reported by ``Document.GetText()``.
        """
        document = Document()
        try:
            # Load a Word document
            document.LoadFromFile(doc_path)
            return document.GetText()
        finally:
            # Release the document even if loading or extraction fails.
            document.Close()

    @staticmethod
    def text_from_plain(
        txt_path,
        encoding: str = "utf-8"
    ):
        """Return the content of a text file.

        Args:
            txt_path: Path of the file to read.
            encoding: Text encoding used to decode the file. It is explicit so
                that the result does not depend on the platform default.

        Returns:
            The whole file content as one string.
        """
        with open(txt_path, "r", encoding=encoding) as f:
            return f.read()

    def extract_text_from_image(
        self,
        image: "Image.Image",
        lang: "str | None" = None,
    ) -> str:
        """Run OCR on an image and return the recognised text.

        This was a ``@staticmethod`` that read ``self.lang`` and was called
        with a ``lang=`` keyword it did not accept; both are fixed here.

        Args:
            image: Image to recognise.
            lang: OCR language code; ``None`` falls back to ``self.lang``.

        Returns:
            The string produced by ``pytesseract.image_to_string``.
        """
        ocr_lang = self.lang if lang is None else lang
        return pytesseract.image_to_string(image, lang=ocr_lang)

    @staticmethod
    def is_within_bbox(
        bbox1, bbox2
    ):
        """Check if bbox1 is within bbox2.

        Both boxes are indexable as ``(x0, top, x1, bottom)``; touching edges
        count as inside.
        """
        return (
            bbox1[0] >= bbox2[0]
            and bbox1[1] >= bbox2[1]
            and bbox1[2] <= bbox2[2]
            and bbox1[3] <= bbox2[3]
        )

    @staticmethod
    def _rows_to_frame(rows, page_no):
        """Build a DataFrame from rows (first row is the header) and tag it with its page."""
        df = pd.DataFrame(rows[1:], columns=rows[0])
        df['Page'] = page_no  # record the 1-based source page
        return df

    def _text_outside_tables(self, page, table_bboxes):
        """Join the characters of ``page`` that lie outside every table bounding box."""
        kept = []
        for char in page.chars:
            char_bbox = (char['x0'], char['top'], char['x1'], char['bottom'])
            if not any(self.is_within_bbox(char_bbox, bbox) for bbox in table_bboxes):
                kept.append(char['text'])
        # Characters are concatenated as-is; line breaks are not reconstructed.
        return ''.join(kept).strip()

    def _tables_from_page_images(self, page, page_no):
        """OCR every image on ``page`` and return one DataFrame per image that yields text."""
        frames = []
        for img in page.images:
            # Best effort: a failure on one image must not abort the whole PDF.
            try:
                x0, top, x1, bottom = img["x0"], img["top"], img["x1"], img["bottom"]
                if x0 < 0 or top < 0 or x1 > page.width or bottom > page.height:
                    print(f"Skipping image with invalid bounds on page {page_no}")
                    continue

                # Render the image region and hand it to PIL through an
                # in-memory PNG.
                cropped_image = page.within_bbox((x0, top, x1, bottom)).to_image()
                img_bytes = io.BytesIO()
                cropped_image.save(img_bytes, format='PNG')
                img_bytes.seek(0)
                pil_image = Image.open(img_bytes)

                ocr_text = self.extract_text_from_image(pil_image)

                # One row per non-blank OCR line, cells split on whitespace.
                rows = [line.split() for line in ocr_text.split('\n') if line.strip()]
                if not rows:
                    continue

                # Pad short rows so every row has as many cells as the widest.
                num_columns = max(len(row) for row in rows)
                for row in rows:
                    row.extend([''] * (num_columns - len(row)))

                frames.append(self._rows_to_frame(rows, page_no))
            except Exception as e:
                print(f"Error processing image on page {page_no}: {e}")
        return frames

    def text_tables_from_pdf(
        self,
        pdf_path,
        table_from_pic: bool = False
    ):
        """Extract non-table text and tables from every page of a PDF.

        Text is now collected for every page. Previously it was gathered only
        on pages where at least one table was found, so table-free pages
        contributed nothing.

        Args:
            pdf_path: Path of the PDF, passed to ``pdfplumber.open``.
            table_from_pic: When true, images on each page are OCR'd and the
                recognised lines are turned into additional tables.

        Returns:
            A ``(texts, tables)`` tuple. ``texts`` has one string per page
            whose text outside table bounding boxes is non-empty. ``tables``
            is a list of DataFrames whose first extracted row is the header
            and whose ``'Page'`` column holds the 1-based page number; when
            no table is found it is a list with one empty DataFrame.
        """
        all_tables = []
        all_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_no, page in enumerate(pdf.pages, start=1):
                tables = page.find_tables()

                # Extract tables; a table needs a header and at least one row.
                for table in tables:
                    table_data = table.extract()
                    if len(table_data) > 1:
                        all_tables.append(self._rows_to_frame(table_data, page_no))

                # Keep only the text lying outside the table bounding boxes.
                table_bboxes = [table.bbox for table in tables]
                remaining_text = self._text_outside_tables(page, table_bboxes)
                if remaining_text:
                    all_texts.append(remaining_text)

                # Extract tables from images if specified
                if table_from_pic:
                    all_tables.extend(self._tables_from_page_images(page, page_no))

        if all_tables:
            return all_texts, all_tables
        return all_texts, [pd.DataFrame()]
|
118
|
+
|
119
|
+
|
@@ -1,5 +1,5 @@
|
|
1
1
|
hdl/__init__.py,sha256=5sZZNySv08wwfzJcSDssGTqUn9wlmDsR6R4XB8J8mFM,70
|
2
|
-
hdl/_version.py,sha256=
|
2
|
+
hdl/_version.py,sha256=L5DCMp1QAlSqy-8bW7d51bLubTxNjZGYc5fMQkb752U,411
|
3
3
|
hdl/args/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
hdl/args/loss_args.py,sha256=s7YzSdd7IjD24rZvvOrxLLFqMZQb9YylxKeyelSdrTk,70
|
5
5
|
hdl/controllers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -86,9 +86,10 @@ hdl/utils/general/glob.py,sha256=8-RCnt6L297wMIfn34ZAMCsGCZUjHG3MGglGZI1cX0g,491
|
|
86
86
|
hdl/utils/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
87
87
|
hdl/utils/llm/chat.py,sha256=H2c8assJlSdZQKIfPkYrVZHqv66TsdsxtaLXv0kNe1w,11565
|
88
88
|
hdl/utils/llm/embs.py,sha256=sC8tga7HgDwPI2m7TDWKp9kkxEIMxEyMtgmEhfRi4vI,6362
|
89
|
+
hdl/utils/llm/extract.py,sha256=eF-oHu5sMtes8I6ZfNXnEykPfzqbn-2WvnIKiUMz6BA,4573
|
89
90
|
hdl/utils/schedulers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
90
91
|
hdl/utils/schedulers/norm_lr.py,sha256=bDwCmdEK-WkgxQMFBiMuchv8Mm7C0-GZJ6usm-PQk14,4461
|
91
|
-
hjxdl-0.1.
|
92
|
-
hjxdl-0.1.
|
93
|
-
hjxdl-0.1.
|
94
|
-
hjxdl-0.1.
|
92
|
+
hjxdl-0.1.3.dist-info/METADATA,sha256=e-L25DrhaIVW_yfpSpmoqH4k-J6yaTooOGviFgDXFwo,542
|
93
|
+
hjxdl-0.1.3.dist-info/WHEEL,sha256=-oYQCr74JF3a37z2nRlQays_SX2MqOANoqVjBBAP2yE,91
|
94
|
+
hjxdl-0.1.3.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
|
95
|
+
hjxdl-0.1.3.dist-info/RECORD,,
|
File without changes
|