pdf2data-tools 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2data/__init__.py +5 -0
- pdf2data/cli/block_extractor.py +157 -0
- pdf2data/cli/block_finder.py +50 -0
- pdf2data/cli/evaluator.py +45 -0
- pdf2data/cli/metadata_finder.py +67 -0
- pdf2data/cli/pdf2data.py +354 -0
- pdf2data/cli/reference_extractor.py +35 -0
- pdf2data/cli/table_detector.py +81 -0
- pdf2data/cli/text_extractor.py +112 -0
- pdf2data/cli/text_finder.py +70 -0
- pdf2data/cli/upgrade.py +51 -0
- pdf2data/docling.py +266 -0
- pdf2data/evaluator.py +350 -0
- pdf2data/keywords.py +202 -0
- pdf2data/mask.py +689 -0
- pdf2data/metadata.py +127 -0
- pdf2data/mineru.py +356 -0
- pdf2data/mineru_vlm.py +236 -0
- pdf2data/old_code/block.py +877 -0
- pdf2data/padle_pipeline.py +417 -0
- pdf2data/pdf2data_pipeline.py +413 -0
- pdf2data/pdf_classifier.py +46 -0
- pdf2data/pipeline.py +231 -0
- pdf2data/references.py +49 -0
- pdf2data/support.py +1484 -0
- pdf2data/text.py +315 -0
- pdf2data/upgrade.py +221 -0
- pdf2data_tools-0.0.2.dist-info/METADATA +114 -0
- pdf2data_tools-0.0.2.dist-info/RECORD +33 -0
- pdf2data_tools-0.0.2.dist-info/WHEEL +5 -0
- pdf2data_tools-0.0.2.dist-info/entry_points.txt +10 -0
- pdf2data_tools-0.0.2.dist-info/licenses/LICENSE +16 -0
- pdf2data_tools-0.0.2.dist-info/top_level.txt +1 -0
pdf2data/cli/block_extractor.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@click.command()
@click.argument("input_folder", type=str)
@click.argument("output_folder", type=str)
@click.option(
    "--pipeline",
    default="MinerU",  # NotDefined, MinerU, Docling, PaddlePPStructure, PaddleVL, MinerUVL
    help="Define the pipeline to be used",
)
@click.option(
    "--layout_model",
    default="DocLayout-YOLO-DocStructBench",
    help="model used to determine the overall layout",
)
@click.option(
    "--table_model",
    default=None,
    help="model used to identify the tables",
)
@click.option(
    "--layout_model_threshold",
    default=0.6,
    help="threshold of the layout model",
)
@click.option(
    "--table_model_threshold",
    default=0.6,
    help="threshold of the table detection model",
)
@click.option(
    "--struct_model",
    default="microsoft/table-structure-recognition-v1.1-all",
    help="table structure detection model",
)
@click.option(
    "--device",
    default="cpu",
    help="device to run the mask models",
)
def block_extractor(
    input_folder: str,
    output_folder: str,
    pipeline: Optional[str],
    layout_model: str,
    table_model: Optional[str],  # FIX: default is None, so the type is Optional
    layout_model_threshold: float,
    table_model_threshold: float,
    struct_model: str,
    device: str,
) -> None:
    """Extract blocks (tables/figures, no running text) from every PDF in
    ``input_folder`` and write the results to ``output_folder``.

    The heavy pipeline dependency is imported lazily inside the branch for the
    selected ``--pipeline`` so that only the chosen backend needs to be
    installed.  All pipelines are run with ``extract_text=False`` and
    ``extract_equations=False`` — this command extracts blocks only.

    Parameters
    ----------
    input_folder : str
        Folder containing the PDF files to process.
    output_folder : str
        Folder where the per-document results are written.
    pipeline : Optional[str]
        Backend pipeline name; see the ``--pipeline`` option.
    layout_model, table_model, struct_model : model identifiers used only by
        the ``NotDefined`` (in-house) pipeline.
    layout_model_threshold, table_model_threshold : float
        Detection thresholds used only by the ``NotDefined`` pipeline.
    device : str
        Device for the mask models (``NotDefined`` pipeline only).

    Raises
    ------
    ValueError
        If ``pipeline`` is not one of the supported backends.
    """
    if pipeline == "NotDefined":
        from pdf2data.pdf2data_pipeline import PDF2Data
        pdf2data_pipeline: PDF2Data = PDF2Data(layout_model=layout_model,
                                               layout_model_threshold=layout_model_threshold,
                                               table_model=table_model,
                                               table_model_threshold=table_model_threshold,
                                               table_structure_model=struct_model,
                                               device=device,
                                               input_folder=input_folder,
                                               output_folder=output_folder,
                                               extract_text=False,
                                               extract_equations=False)
        pdf2data_pipeline.pdf_transform()
    elif pipeline == "MinerU":
        from pdf2data.mineru import MinerU
        miner_pipeline: MinerU = MinerU(
            input_folder=input_folder,
            output_folder=output_folder,
            extract_equations=False,
            extract_text=False)
        miner_pipeline.pdf_transform()
    elif pipeline == "Docling":
        from pdf2data.docling import Docling
        docling_pipeline: Docling = Docling(
            input_folder=input_folder,
            output_folder=output_folder,
            extract_equations=False,
            extract_text=False)
        docling_pipeline.pdf_transform()
    elif pipeline in ["PaddlePPStructure", "PaddleVL"]:
        from pdf2data.padle_pipeline import PaddlePPStructure
        paddle_pipeline: PaddlePPStructure = PaddlePPStructure(
            extractor_name=pipeline,
            input_folder=input_folder,
            output_folder=output_folder,
            extract_equations=False,
            extract_text=False)
        paddle_pipeline.pdf_transform()
    elif pipeline == "MinerUVL":
        from pdf2data.mineru_vlm import MinerUVLM
        mineru_vlm_pipeline: MinerUVLM = MinerUVLM(
            input_folder=input_folder,
            output_folder=output_folder,
            extract_equations=False,
            extract_text=False)
        mineru_vlm_pipeline.pdf_transform()
    else:
        # FIX: an unrecognised pipeline name used to fall through silently
        # and do nothing; fail loudly instead.
        raise ValueError(
            f"Unknown pipeline '{pipeline}'; expected one of: NotDefined, "
            "MinerU, Docling, PaddlePPStructure, PaddleVL, MinerUVL"
        )
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def main() -> None:
    """Console-script entry point: run the ``block_extractor`` click command."""
    block_extractor()


# FIX: removed a dead commented-out ``@click.option("--table_model", ...)``
# string literal that referenced an obsolete default model.


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
import json
|
|
4
|
+
from pdf2data.support import get_doc_list
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from pdf2data.keywords import BlockFinder
|
|
9
|
+
|
|
10
|
+
@click.command()
@click.argument("input_folder", type=str)
@click.argument("output_folder", type=str)
@click.argument("keywords_file", type=str)
@click.option(
    "--generic_file",
    default=None,
    help="file containing generic keywords",
)
@click.option(
    "--find_tables",
    default=True,
    help="True to look for tables, False otherwise.",
)
@click.option(
    "--find_figures",
    default=False,
    help="True to look for figures, False otherwise.",
)
def block_finder(input_folder: str, output_folder: str, keywords_file: str, generic_file: str, find_tables: bool, find_figures: bool) -> None:
    """Search every extracted document under ``input_folder`` for blocks
    matching the keywords in ``keywords_file`` and write a single
    ``found_blocks.json`` (document name -> matches) into ``output_folder``.

    Each document is expected at ``<input_folder>/<doc>/<doc>_blocks.json``,
    the layout produced by the block-extraction commands.
    """
    # FIX: idiomatic truthiness check instead of ``... is False``.
    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)
    finder: BlockFinder = BlockFinder(keywords_file_path=keywords_file,
                                      generic_keywords_file_path=generic_file)
    doc_list: List[str] = get_doc_list(input_folder, "")
    final_results_dict: Dict[str, Any] = {}
    results_path = f"{output_folder}/found_blocks.json"
    for doc in doc_list:
        print(doc)
        blocks_path: str = f"{input_folder}/{doc}/{doc}_blocks.json"
        blocks = finder.find(blocks_path, tables=find_tables, figures=find_figures)
        final_results_dict[doc] = blocks
    # FIX: serialize straight to the file handle (json.dump) instead of
    # building an intermediate string with json.dumps + write.
    with open(results_path, "w") as f:
        json.dump(final_results_dict, f, indent=4)
|
|
44
|
+
|
|
45
|
+
def main() -> None:
    """Console-script entry point: run the ``block_finder`` click command."""
    block_finder()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from pdf2data.evaluator import Evaluator
|
|
7
|
+
from pdf2data.support import get_doc_list
|
|
8
|
+
|
|
9
|
+
@click.command()
@click.argument("ref_folder", type=str)
@click.argument("results_folder", type=str)
@click.argument("eval_file_path", type=str)
@click.argument("type", type=str)
@click.option(
    "--string_threshold",
    default=0.6,
    help="similarity threshold between strings",
)
@click.option(
    "--box_threshold",
    default=0.5,
    help="similarity threshold between boxes",
)
def evaluator(ref_folder: str, results_folder: str, eval_file_path: str, type: str, string_threshold: float, box_threshold: float) -> None:
    """Evaluate extraction results in ``results_folder`` against the ground
    truth in ``ref_folder``, writing the evaluation to ``eval_file_path``.

    ``type`` selects what is evaluated: ``metadata``, ``text``, ``blocks`` or
    ``table_detection``.  (The parameter shadows the ``type`` builtin, but it
    is kept because click maps the ``type`` CLI argument to it by name.)

    Raises
    ------
    AttributeError
        If ``type`` is not one of the supported evaluation kinds.
    """
    # FIX: set literal instead of set([...]).
    available_types: set = {"metadata", "text", "blocks", "table_detection"}
    if type not in available_types:
        # FIX: typo in the error message ("availabe" -> "available").
        raise AttributeError(f"The specified type is not available, the available types are {available_types}")
    # FIX: renamed local so it no longer shadows this function's own name.
    doc_evaluator = Evaluator(ref_folder=ref_folder, result_folder=results_folder, eval_file_path=eval_file_path, string_similarity=string_threshold, iou_threshold=box_threshold)
    if type == "metadata":
        doc_evaluator.eval_metadata()
    elif type == "text":
        doc_evaluator.eval_text()
    elif type == "blocks":
        doc_evaluator.eval_blocks()
    elif type == "table_detection":
        doc_evaluator.eval_table_detector()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def main() -> None:
    """Console-script entry point: run the ``evaluator`` click command."""
    evaluator()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from pdf2data.metadata import Metadata
|
|
8
|
+
from pdf2data.support import get_doc_list
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@click.command()
@click.argument("path", type=str)
@click.option(
    "--output_folder",
    default=None,
    help="Output folder path",
)
@click.option(
    "--file_type",
    default="all",
    help="file type to be analysed, options are all, pdf or cermxml",
)
def metadata_finder(path: str, output_folder: Optional[str], file_type: str) -> None:
    """Create a json file containing the metadata for each file in path.

    Parameters
    ----------
    path : str
        path to the file or folder to be processed
    output_folder : Optional[str]
        Output folder path to save the generated json file containing the
        metadata; when None the files are written to the current directory
    file_type : str
        which files to analyse when ``path`` is a folder: "all", "pdf" or
        "cermxml"

    Raises
    ------
    ValueError
        If ``path`` is a folder and ``file_type`` is not a supported option.
    """
    if os.path.isfile(path):
        file_list: List[str] = [path]
        input_folder: str = ""
    elif file_type == "all":
        file_list = get_doc_list(path, "pdf") + get_doc_list(path, "cermxml")
        input_folder = path + "/"
    elif file_type == "pdf":
        file_list = get_doc_list(path, "pdf")
        input_folder = path + "/"
    elif file_type == "cermxml":
        file_list = get_doc_list(path, "cermxml")
        input_folder = path + "/"
    else:
        # FIX: previously an invalid file_type left ``file_list`` unbound and
        # crashed later with a NameError; report the real problem instead.
        raise ValueError(f"Unsupported file_type '{file_type}'; expected 'all', 'pdf' or 'cermxml'")
    if output_folder is None:
        output_path = ""
    else:
        output_path = output_folder + "/"
    for file in file_list:
        file_path: str = f"{input_folder}{file}"
        file_name: str = os.path.splitext(file)[0]
        metadata: Metadata = Metadata(file_path=file_path)
        metadata.update()
        metadata_dict: Dict[str, Any] = metadata.__dict__
        # The local path is an implementation detail; keep it out of the output.
        del metadata_dict["file_path"]
        # Serialize the metadata dictionary directly into the json file.
        with open(f"{output_path}{file_name}_metadata.json", "w") as j:
            json.dump(metadata_dict, j, indent=4)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main() -> None:
    """Console-script entry point: run the ``metadata_finder`` click command."""
    metadata_finder()


if __name__ == "__main__":
    main()
|
pdf2data/cli/pdf2data.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Any, Dict, List
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
|
|
8
|
+
from pdf2data.references import References
|
|
9
|
+
from pdf2data.metadata import Metadata
|
|
10
|
+
|
|
11
|
+
@click.command()
@click.argument("input_folder", type=str)
@click.argument("output_folder", type=str)
@click.option(
    "--pipeline",
    default="MinerU",  # NotDefined, MinerU, Docling, PaddlePPStructure, PaddleVL, MinerUVL
    help="Define the pipeline to be used",
)
@click.option(
    "--layout_model",
    default="PP-DocLayout-L",
    help="model used to determine the overall layout",
)
@click.option(
    "--layout_model_threshold",
    default=0.7,
    help="threshold of the layout model",
)
@click.option(
    "--table_model",
    default=None,
    help="model used to identify the tables",
)
@click.option(
    "--table_model_threshold",
    default=0.5,
    help="threshold of the table detection model",
)
@click.option(
    "--text_extractor_type",
    default="layoutparser",
    help="type of the text extractor, available: ['layoutparser', 'cermine', 'minersix']",
)
@click.option(
    "--reference_extractor_type",
    default="anystyle",
    help="type of the reference extractor, available: ['anystyle', 'cermine']",
)
@click.option(
    "--metadata_extractor_type",
    default="pdf2doi",
    help="type of the metadata extractor, available: ['pdf2doi', 'cermine']",
)
@click.option(
    "--extract_tables",
    default=True,
    help="True to extract tables, False otherwise",
)
@click.option(
    "--extract_figures",
    default=True,
    help="True to extract figures, False otherwise",
)
@click.option(
    "--correct_struct",
    default=False,
    help="True to to correct the table structure using the words position, False otherwise",
)
@click.option(
    "--table_zoom",
    default=1.5,
    help="zoom of the image containing the table",
)
@click.option(
    "--figure_zoom",
    default=3,
    help="zoom of the figures extracted",
)
@click.option(
    "--x_table_corr",
    default=0.01,
    help="factor correct the table coordinates in the x axis",
)
@click.option(
    "--y_table_corr",
    default=0.01,
    help="factor correct the table coordinates in the y axis",
)
@click.option(
    "--iou_lines",
    default=0.5,
    help="iou value to supress collumns and rows",
)
@click.option(
    "--iou_struct",
    default=0.02,
    help="minimum iou between table and row/collumn to consider the row/collumn as correct",
)
@click.option(
    "--word_factor",
    default=1.0,
    help="factor used to determine the maximum distance to consider two different words as a single table entry",
)
@click.option(
    "--word_iou",
    default=0.00001,
    help="iou value to consider that a word is inside a specific table entry",
)
@click.option(
    "--struct_model_threshold",
    default=0.3,
    help="table structure detection model threshold",
)
@click.option(
    "--reconstructor_type",
    default="entry_by_entry",
    help="type of reconstructor used, options: 'entry_by_entry' or 'word_by_word'",
)
@click.option(
    "--brightness",
    default=1.0,
    help="brightness factor of the Table image",
)
@click.option(
    "--contrast",
    default=1.1,
    help="contrast factor of the Table image",
)
@click.option(
    "--device",
    default="cpu",
    help="device to run the mask models",
)
@click.option(
    "--letter_ratio",
    default=4.0,
    help="minimum ratio between letter and ratio to consider a column as a row index or a row as a collumn header",
)
def pdf2data(input_folder: str,
             output_folder: str,
             pipeline: str,
             layout_model: str,
             table_model: Optional[str],  # FIX: default is None
             layout_model_threshold: float,
             table_model_threshold: float,
             # FIX: these three were annotated ``float`` but hold strings.
             text_extractor_type: str,
             reference_extractor_type: str,
             metadata_extractor_type: str,
             extract_tables: bool,
             extract_figures: bool,
             correct_struct: bool,
             table_zoom: float,
             figure_zoom: float,
             x_table_corr: float,
             y_table_corr: float,
             iou_lines: float,
             iou_struct: float,
             word_factor: float,
             word_iou: float,
             struct_model_threshold: float,
             reconstructor_type: str,
             brightness: float,
             contrast: float,
             device: str,
             letter_ratio: float,
             ) -> None:
    """Run the full PDF-to-data pipeline over every PDF in ``input_folder``.

    Stage 1 dispatches to the selected ``--pipeline`` backend (imported
    lazily) with ``extract_references=True``.  Stage 2 post-processes each
    produced output folder: parses the ``*_references.txt`` dump into a
    structured reference list, attaches document metadata, and rewrites
    ``*_content.json`` with metadata first.  Timing information is written to
    ``extraction_metadata.json`` (minutes).

    Many of the table-tuning options (``extract_tables`` ... ``letter_ratio``)
    are consumed only by the in-house ``NotDefined`` pipeline; they are kept
    on the CLI for backward compatibility.

    Raises
    ------
    ValueError
        If ``pipeline`` is not one of the supported backends.
    """
    start_time = time.time()
    if pipeline == "NotDefined":
        from pdf2data.pdf2data_pipeline import PDF2Data
        pdf2data_pipeline: PDF2Data = PDF2Data(layout_model=layout_model,
                                               layout_model_threshold=layout_model_threshold,
                                               table_model=table_model,
                                               table_model_threshold=table_model_threshold,
                                               device=device,
                                               input_folder=input_folder,
                                               output_folder=output_folder,
                                               extract_references=True)
        pdf2data_pipeline.pdf_transform()
    elif pipeline == "MinerU":
        from pdf2data.mineru import MinerU
        miner_pipeline: MinerU = MinerU(
            input_folder=input_folder,
            output_folder=output_folder,
            extract_references=True)
        miner_pipeline.pdf_transform()
    elif pipeline == "Docling":
        from pdf2data.docling import Docling
        docling_pipeline: Docling = Docling(
            input_folder=input_folder,
            output_folder=output_folder,
            extract_references=True)
        docling_pipeline.pdf_transform()
    elif pipeline in ["PaddlePPStructure", "PaddleVL"]:
        from pdf2data.padle_pipeline import PaddlePPStructure
        paddle_pipeline: PaddlePPStructure = PaddlePPStructure(
            extractor_name=pipeline,
            input_folder=input_folder,
            output_folder=output_folder,
            extract_references=True)
        paddle_pipeline.pdf_transform()
    elif pipeline == "MinerUVL":
        from pdf2data.mineru_vlm import MinerUVLM
        mineru_vlm_pipeline: MinerUVLM = MinerUVLM(
            input_folder=input_folder,
            output_folder=output_folder,
            extract_references=True)
        mineru_vlm_pipeline.pdf_transform()
    else:
        # FIX: an unknown pipeline used to fall through silently and then
        # post-process a possibly empty output folder.
        raise ValueError(
            f"Unknown pipeline '{pipeline}'; expected one of: NotDefined, "
            "MinerU, Docling, PaddlePPStructure, PaddleVL, MinerUVL"
        )
    # FIX: operator precedence — the original ``time.time() - start_time / 60``
    # subtracted ``start_time / 60`` from the current epoch time instead of
    # converting the elapsed seconds to minutes.
    document_extraction_time = (time.time() - start_time) / 60
    folders_list = os.listdir(output_folder)
    # FIX: the progress counter was initialised to 1 but never incremented;
    # enumerate keeps it in step with the loop.
    for number, folder in enumerate(folders_list, start=1):
        print(f"{number} // {len(folders_list)} processed")
        print(folder)
        pdf_file_path = input_folder + "/" + folder + ".pdf"
        file_folder = output_folder + "/" + folder
        txt_file_path = file_folder + "/" + folder + "_references.txt"
        with open(file_folder + "/" + folder + "_content.json", "r") as f:
            content_dict = json.load(f)
        reference_parser = References(
            file_path=txt_file_path, output_folder=file_folder
        )
        content_dict["references"] = reference_parser.generate_reference_list()
        # The raw reference dump is an intermediate artefact; drop it.
        os.remove(txt_file_path)
        metadata: Metadata = Metadata(file_path=pdf_file_path)
        metadata.update()
        metadata_dict: Dict[str, Any] = metadata.__dict__
        # FIX: consistent with the other CLI commands — do not leak the local
        # file path into the published content json.
        metadata_dict.pop("file_path", None)
        content_dict["metadata"] = metadata_dict
        # Rebuild the dict so "metadata" is the first key in the output json.
        new_content_dict = {"metadata": content_dict["metadata"]}
        new_content_dict.update(content_dict)
        content_json = json.dumps(new_content_dict, indent=4)
        with open(file_folder + "/" + folder + "_content.json", "w") as f:
            f.write(content_json)
    # FIX: same precedence bug as above.
    full_time = (time.time() - start_time) / 60
    extraction_metadata_dict = {"pipeline": pipeline, "document_extraction_time": document_extraction_time, "full_time": full_time}
    with open(output_folder + "/extraction_metadata.json", "w") as f:
        f.write(json.dumps(extraction_metadata_dict, indent=4))
|
|
348
|
+
|
|
349
|
+
def main() -> None:
    """Console-script entry point: run the ``pdf2data`` click command."""
    pdf2data()


if __name__ == "__main__":
    main()
|