hjxdl 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hdl/_version.py +2 -2
- hdl/utils/llm/extract.py +44 -2
- {hjxdl-0.1.4.dist-info → hjxdl-0.1.5.dist-info}/METADATA +1 -1
- {hjxdl-0.1.4.dist-info → hjxdl-0.1.5.dist-info}/RECORD +6 -6
- {hjxdl-0.1.4.dist-info → hjxdl-0.1.5.dist-info}/WHEEL +0 -0
- {hjxdl-0.1.4.dist-info → hjxdl-0.1.5.dist-info}/top_level.txt +0 -0
hdl/_version.py
CHANGED
hdl/utils/llm/extract.py
CHANGED
@@ -13,13 +13,22 @@ class DocExtractor():
|
|
13
13
|
ltp_model_path: str = None,
|
14
14
|
lang: str = "chi_sim"
|
15
15
|
) -> None:
|
16
|
+
"""Initialize the object with the specified LTP model path and language.
|
17
|
+
|
18
|
+
Args:
|
19
|
+
ltp_model_path (str): The file path to the LTP model. Default is None.
|
20
|
+
lang (str): The language to be used for processing. Default is "chi_sim".
|
21
|
+
|
22
|
+
Returns:
|
23
|
+
None
|
24
|
+
"""
|
16
25
|
self.ltp_model_path = ltp_model_path
|
17
26
|
self.lang = lang
|
18
27
|
|
19
28
|
self.split = None
|
20
29
|
if self.ltp_model_path is not None:
|
21
30
|
from ltp import StnSplit, LTP
|
22
|
-
ltp = LTP(
|
31
|
+
ltp = LTP(self.ltp_model_path)
|
23
32
|
self.split = StnSplit()
|
24
33
|
# sents = self.split.split(text)
|
25
34
|
|
@@ -38,6 +47,14 @@ class DocExtractor():
|
|
38
47
|
def text_from_plain(
|
39
48
|
txt_path
|
40
49
|
):
|
50
|
+
"""Reads and returns the text content from a plain text file.
|
51
|
+
|
52
|
+
Args:
|
53
|
+
txt_path (str): The path to the plain text file.
|
54
|
+
|
55
|
+
Returns:
|
56
|
+
str: The text content read from the file.
|
57
|
+
"""
|
41
58
|
with open(txt_path, "r") as f:
|
42
59
|
text = f.read()
|
43
60
|
return text
|
@@ -46,13 +63,29 @@ class DocExtractor():
|
|
46
63
|
def extract_text_from_image(
|
47
64
|
image: Image.Image,
|
48
65
|
) -> str:
|
66
|
+
"""Extracts text from the given image using pytesseract.
|
67
|
+
|
68
|
+
Args:
|
69
|
+
image (PIL.Image.Image): The input image from which text needs to be extracted.
|
70
|
+
|
71
|
+
Returns:
|
72
|
+
str: The extracted text from the image.
|
73
|
+
"""
|
49
74
|
return pytesseract.image_to_string(image, lang=self.lang)
|
50
75
|
|
51
76
|
@staticmethod
|
52
77
|
def is_within_bbox(
|
53
78
|
bbox1, bbox2
|
54
79
|
):
|
55
|
-
"""Check if bbox1 is within bbox2.
|
80
|
+
"""Check if bbox1 is within bbox2.
|
81
|
+
|
82
|
+
Args:
|
83
|
+
bbox1 (list): List of 4 integers representing the bounding box coordinates [x_min, y_min, x_max, y_max].
|
84
|
+
bbox2 (list): List of 4 integers representing the bounding box coordinates [x_min, y_min, x_max, y_max].
|
85
|
+
|
86
|
+
Returns:
|
87
|
+
bool: True if bbox1 is within bbox2, False otherwise.
|
88
|
+
"""
|
56
89
|
return bbox1[0] >= bbox2[0] and bbox1[1] >= bbox2[1] and bbox1[2] <= bbox2[2] and bbox1[3] <= bbox2[3]
|
57
90
|
|
58
91
|
def text_tables_from_pdf(
|
@@ -60,6 +93,15 @@ class DocExtractor():
|
|
60
93
|
pdf_path,
|
61
94
|
table_from_pic: bool = False
|
62
95
|
):
|
96
|
+
"""Extract text and tables from a PDF file.
|
97
|
+
|
98
|
+
Args:
|
99
|
+
pdf_path (str): Path to the PDF file.
|
100
|
+
table_from_pic (bool, optional): Whether to extract tables from images in the PDF. Defaults to False.
|
101
|
+
|
102
|
+
Returns:
|
103
|
+
tuple: A tuple containing a list of extracted texts and a list of extracted tables as DataFrames.
|
104
|
+
"""
|
63
105
|
all_tables = []
|
64
106
|
all_texts = []
|
65
107
|
with pdfplumber.open(pdf_path) as pdf:
|
@@ -1,5 +1,5 @@
|
|
1
1
|
hdl/__init__.py,sha256=5sZZNySv08wwfzJcSDssGTqUn9wlmDsR6R4XB8J8mFM,70
|
2
|
-
hdl/_version.py,sha256=
|
2
|
+
hdl/_version.py,sha256=zBVX2byWL6NrFlwjvahpnvSqDsdtebZW0K9WM_cj20U,411
|
3
3
|
hdl/args/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
hdl/args/loss_args.py,sha256=s7YzSdd7IjD24rZvvOrxLLFqMZQb9YylxKeyelSdrTk,70
|
5
5
|
hdl/controllers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -86,10 +86,10 @@ hdl/utils/general/glob.py,sha256=8-RCnt6L297wMIfn34ZAMCsGCZUjHG3MGglGZI1cX0g,491
|
|
86
86
|
hdl/utils/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
87
87
|
hdl/utils/llm/chat.py,sha256=H2c8assJlSdZQKIfPkYrVZHqv66TsdsxtaLXv0kNe1w,11565
|
88
88
|
hdl/utils/llm/embs.py,sha256=sC8tga7HgDwPI2m7TDWKp9kkxEIMxEyMtgmEhfRi4vI,6362
|
89
|
-
hdl/utils/llm/extract.py,sha256=
|
89
|
+
hdl/utils/llm/extract.py,sha256=WbTlQmcPNfrKmzSZSKSdWLA0LqLAgoa4J_IcjxBXACI,6506
|
90
90
|
hdl/utils/schedulers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
91
91
|
hdl/utils/schedulers/norm_lr.py,sha256=bDwCmdEK-WkgxQMFBiMuchv8Mm7C0-GZJ6usm-PQk14,4461
|
92
|
-
hjxdl-0.1.
|
93
|
-
hjxdl-0.1.
|
94
|
-
hjxdl-0.1.
|
95
|
-
hjxdl-0.1.
|
92
|
+
hjxdl-0.1.5.dist-info/METADATA,sha256=bcWpSx6Y2t3rb8n8Ry--JImcj0wIJh6CcZRDdE3xzc8,542
|
93
|
+
hjxdl-0.1.5.dist-info/WHEEL,sha256=-oYQCr74JF3a37z2nRlQays_SX2MqOANoqVjBBAP2yE,91
|
94
|
+
hjxdl-0.1.5.dist-info/top_level.txt,sha256=-kxwTM5JPhylp06z3zAVO3w6_h7wtBfBo2zgM6YZoTk,4
|
95
|
+
hjxdl-0.1.5.dist-info/RECORD,,
|
File without changes
|
File without changes
|