magic-pdf 0.9.2__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +1 -1
- magic_pdf/libs/Constants.py +3 -1
- magic_pdf/libs/config_reader.py +1 -1
- magic_pdf/libs/draw_bbox.py +10 -4
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/pdf_extract_kit.py +42 -297
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
- magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
- magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
- magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
- magic_pdf/model/sub_modules/model_init.py +144 -0
- magic_pdf/model/sub_modules/model_utils.py +51 -0
- magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +259 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +168 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
- magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
- magic_pdf/model/sub_modules/table/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +14 -0
- magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
- magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
- magic_pdf/model/sub_modules/table/table_utils.py +11 -0
- magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
- magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +1 -1
- magic_pdf/para/para_split_v3.py +13 -15
- magic_pdf/pdf_parse_union_core_v2.py +56 -19
- magic_pdf/resources/model_config/model_configs.yaml +2 -1
- magic_pdf/tools/common.py +47 -3
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/METADATA +9 -3
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/RECORD +65 -44
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/WHEEL +1 -1
- magic_pdf/model/pek_sub_modules/post_process.py +0 -36
- magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
- /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/top_level.txt +0 -0
@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import (
|
|
30
30
|
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
|
31
31
|
ocr_prepare_bboxes_for_layout_split_v2
|
32
32
|
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
|
33
|
-
|
34
|
-
|
33
|
+
fix_discarded_block,
|
34
|
+
fix_block_spans_v2)
|
35
35
|
from magic_pdf.pre_proc.ocr_span_list_modify import (
|
36
36
|
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
|
37
37
|
remove_overlaps_min_spans)
|
@@ -164,8 +164,8 @@ class ModelSingleton:
|
|
164
164
|
|
165
165
|
|
166
166
|
def do_predict(boxes: List[List[int]], model) -> List[int]:
|
167
|
-
from magic_pdf.model.
|
168
|
-
|
167
|
+
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (boxes2inputs, parse_logits,
|
168
|
+
prepare_inputs)
|
169
169
|
|
170
170
|
inputs = boxes2inputs(boxes)
|
171
171
|
inputs = prepare_inputs(inputs, model)
|
@@ -174,23 +174,57 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
|
|
174
174
|
|
175
175
|
|
176
176
|
def cal_block_index(fix_blocks, sorted_bboxes):
|
177
|
-
for block in fix_blocks:
|
178
177
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
178
|
+
if sorted_bboxes is not None:
|
179
|
+
# 使用layoutreader排序
|
180
|
+
for block in fix_blocks:
|
181
|
+
line_index_list = []
|
182
|
+
if len(block['lines']) == 0:
|
183
|
+
block['index'] = sorted_bboxes.index(block['bbox'])
|
184
|
+
else:
|
185
|
+
for line in block['lines']:
|
186
|
+
line['index'] = sorted_bboxes.index(line['bbox'])
|
187
|
+
line_index_list.append(line['index'])
|
188
|
+
median_value = statistics.median(line_index_list)
|
189
|
+
block['index'] = median_value
|
190
|
+
|
191
|
+
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
|
192
|
+
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
193
|
+
block['virtual_lines'] = copy.deepcopy(block['lines'])
|
194
|
+
block['lines'] = copy.deepcopy(block['real_lines'])
|
195
|
+
del block['real_lines']
|
196
|
+
else:
|
197
|
+
# 使用xycut排序
|
198
|
+
block_bboxes = []
|
199
|
+
for block in fix_blocks:
|
200
|
+
block_bboxes.append(block['bbox'])
|
201
|
+
|
202
|
+
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
|
203
|
+
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
204
|
+
block['virtual_lines'] = copy.deepcopy(block['lines'])
|
205
|
+
block['lines'] = copy.deepcopy(block['real_lines'])
|
206
|
+
del block['real_lines']
|
207
|
+
|
208
|
+
import numpy as np
|
209
|
+
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut
|
210
|
+
|
211
|
+
random_boxes = np.array(block_bboxes)
|
212
|
+
np.random.shuffle(random_boxes)
|
213
|
+
res = []
|
214
|
+
recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
|
215
|
+
assert len(res) == len(block_bboxes)
|
216
|
+
sorted_boxes = random_boxes[np.array(res)].tolist()
|
217
|
+
|
218
|
+
for i, block in enumerate(fix_blocks):
|
219
|
+
block['index'] = sorted_boxes.index(block['bbox'])
|
220
|
+
|
221
|
+
# 生成line index
|
222
|
+
sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
|
223
|
+
line_inedx = 1
|
224
|
+
for block in sorted_blocks:
|
183
225
|
for line in block['lines']:
|
184
|
-
line['index'] =
|
185
|
-
|
186
|
-
median_value = statistics.median(line_index_list)
|
187
|
-
block['index'] = median_value
|
188
|
-
|
189
|
-
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
|
190
|
-
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
191
|
-
block['virtual_lines'] = copy.deepcopy(block['lines'])
|
192
|
-
block['lines'] = copy.deepcopy(block['real_lines'])
|
193
|
-
del block['real_lines']
|
226
|
+
line['index'] = line_inedx
|
227
|
+
line_inedx += 1
|
194
228
|
|
195
229
|
return fix_blocks
|
196
230
|
|
@@ -264,6 +298,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
|
|
264
298
|
block['lines'].append({'bbox': line, 'spans': []})
|
265
299
|
page_line_list.extend(lines)
|
266
300
|
|
301
|
+
if len(page_line_list) > 200: # layoutreader最高支持512line
|
302
|
+
return None
|
303
|
+
|
267
304
|
# 使用layoutreader排序
|
268
305
|
x_scale = 1000.0 / page_w
|
269
306
|
y_scale = 1000.0 / page_h
|
magic_pdf/tools/common.py
CHANGED
@@ -14,6 +14,9 @@ from magic_pdf.pipe.TXTPipe import TXTPipe
|
|
14
14
|
from magic_pdf.pipe.UNIPipe import UNIPipe
|
15
15
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
16
16
|
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
17
|
+
import fitz
|
18
|
+
# from io import BytesIO
|
19
|
+
# from pypdf import PdfReader, PdfWriter
|
17
20
|
|
18
21
|
|
19
22
|
def prepare_env(output_dir, pdf_file_name, method):
|
@@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method):
|
|
26
29
|
return local_image_dir, local_md_dir
|
27
30
|
|
28
31
|
|
32
|
+
# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
|
33
|
+
# # 将字节数据包装在 BytesIO 对象中
|
34
|
+
# pdf_file = BytesIO(pdf_bytes)
|
35
|
+
# # 读取 PDF 的字节数据
|
36
|
+
# reader = PdfReader(pdf_file)
|
37
|
+
# # 创建一个新的 PDF 写入器
|
38
|
+
# writer = PdfWriter()
|
39
|
+
# # 将所有页面添加到新的 PDF 写入器中
|
40
|
+
# end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
|
41
|
+
# if end_page_id > len(reader.pages) - 1:
|
42
|
+
# logger.warning("end_page_id is out of range, use pdf_docs length")
|
43
|
+
# end_page_id = len(reader.pages) - 1
|
44
|
+
# for i, page in enumerate(reader.pages):
|
45
|
+
# if start_page_id <= i <= end_page_id:
|
46
|
+
# writer.add_page(page)
|
47
|
+
# # 创建一个字节缓冲区来存储输出的 PDF 数据
|
48
|
+
# output_buffer = BytesIO()
|
49
|
+
# # 将 PDF 写入字节缓冲区
|
50
|
+
# writer.write(output_buffer)
|
51
|
+
# # 获取字节缓冲区的内容
|
52
|
+
# converted_pdf_bytes = output_buffer.getvalue()
|
53
|
+
# return converted_pdf_bytes
|
54
|
+
|
55
|
+
|
56
|
+
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
|
57
|
+
document = fitz.open("pdf", pdf_bytes)
|
58
|
+
output_document = fitz.open()
|
59
|
+
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
|
60
|
+
if end_page_id > len(document) - 1:
|
61
|
+
logger.warning("end_page_id is out of range, use pdf_docs length")
|
62
|
+
end_page_id = len(document) - 1
|
63
|
+
output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
|
64
|
+
output_bytes = output_document.tobytes()
|
65
|
+
return output_bytes
|
66
|
+
|
67
|
+
|
29
68
|
def do_parse(
|
30
69
|
output_dir,
|
31
70
|
pdf_file_name,
|
@@ -55,6 +94,8 @@ def do_parse(
|
|
55
94
|
f_draw_model_bbox = True
|
56
95
|
f_draw_line_sort_bbox = True
|
57
96
|
|
97
|
+
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
|
98
|
+
|
58
99
|
orig_model_list = copy.deepcopy(model_list)
|
59
100
|
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
|
60
101
|
parse_method)
|
@@ -66,15 +107,18 @@ def do_parse(
|
|
66
107
|
if parse_method == 'auto':
|
67
108
|
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
|
68
109
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
|
69
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
110
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
111
|
+
lang=lang,
|
70
112
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
71
113
|
elif parse_method == 'txt':
|
72
114
|
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
73
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
115
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
116
|
+
lang=lang,
|
74
117
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
75
118
|
elif parse_method == 'ocr':
|
76
119
|
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
77
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
120
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
121
|
+
lang=lang,
|
78
122
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
79
123
|
else:
|
80
124
|
logger.error('unknown parse method')
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.9.2
|
3
|
+
Version: 0.9.3
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -26,6 +26,9 @@ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
|
|
26
26
|
Requires-Dist: einops; extra == "full"
|
27
27
|
Requires-Dist: accelerate; extra == "full"
|
28
28
|
Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
|
29
|
+
Requires-Dist: rapidocr-paddle; extra == "full"
|
30
|
+
Requires-Dist: rapid-table; extra == "full"
|
31
|
+
Requires-Dist: PyYAML; extra == "full"
|
29
32
|
Requires-Dist: detectron2; extra == "full"
|
30
33
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
|
31
34
|
Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
@@ -80,6 +83,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
80
83
|
</div>
|
81
84
|
|
82
85
|
# Changelog
|
86
|
+
- 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
|
83
87
|
- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
|
84
88
|
- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
|
85
89
|
- Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
|
@@ -284,7 +288,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
284
288
|
"enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
|
285
289
|
},
|
286
290
|
"table-config": {
|
287
|
-
"model": "
|
291
|
+
"model": "rapid_table", // When using structEqTable, please change to "struct_eqtable".
|
288
292
|
"enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
|
289
293
|
"max_time": 400
|
290
294
|
}
|
@@ -299,7 +303,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
299
303
|
- [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
|
300
304
|
- Quick Deployment with Docker
|
301
305
|
> [!IMPORTANT]
|
302
|
-
> Docker requires a GPU with at least
|
306
|
+
> Docker requires a GPU with at least 8GB of VRAM, and all acceleration features are enabled by default.
|
303
307
|
>
|
304
308
|
> Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
|
305
309
|
>
|
@@ -459,7 +463,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
|
|
459
463
|
# Acknowledgments
|
460
464
|
|
461
465
|
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
466
|
+
- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
|
462
467
|
- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
|
468
|
+
- [RapidTable](https://github.com/RapidAI/RapidTable)
|
463
469
|
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
464
470
|
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
465
471
|
- [layoutreader](https://github.com/ppaanngggg/layoutreader)
|
@@ -2,7 +2,7 @@ magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
magic_pdf/pdf_parse_by_ocr.py,sha256=E-AYHUXjzorFli0CEtmnAi09SI2STJ7FX58yjU0c9PI,810
|
3
3
|
magic_pdf/pdf_parse_by_txt.py,sha256=YeFYVAdfwF1CXOHq0LVE5131nqPHA14nt5t_sb-CMMk,709
|
4
4
|
magic_pdf/pdf_parse_union_core.py,sha256=AGIrP7ahc6Ycku0PxAlbjZhwqsdJ8iuRPIn-PFASKWY,11772
|
5
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=
|
5
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=GAgSP0PqbPg4U_nJXUztr-uBmakIK5rKwuxv0o9nMK0,25228
|
6
6
|
magic_pdf/user_api.py,sha256=gM-3RQYc6pMEsVvEPFXfWf5RBjAvHcUccugL6fXpP_U,3991
|
7
7
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
|
@@ -23,7 +23,7 @@ magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
|
23
23
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
24
24
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
25
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
26
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
26
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=lM5UBDueiZcm4_z-jtmcgbJH2jhaXhMVY5ubggaKqHU,12954
|
27
27
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
28
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
29
29
|
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
@@ -38,7 +38,7 @@ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl
|
|
38
38
|
magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
|
39
39
|
magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
|
40
40
|
magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
|
41
|
-
magic_pdf/libs/Constants.py,sha256=
|
41
|
+
magic_pdf/libs/Constants.py,sha256=ptiwMvWDUmzRZ0IbP1bM3PjGJ24BQVQQHO4sCeioPv8,1173
|
42
42
|
magic_pdf/libs/MakeContentConfig.py,sha256=Do5VKNQp3gfUKyhrZStfzfBj7l-vbsYpsJFF1SsmEc0,248
|
43
43
|
magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
|
44
44
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -46,11 +46,11 @@ magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,169
|
|
46
46
|
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
47
47
|
magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
|
48
48
|
magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
|
49
|
-
magic_pdf/libs/config_reader.py,sha256=
|
49
|
+
magic_pdf/libs/config_reader.py,sha256=7QIeUPLb8CNa7E3n8TT3MN61lZdYVTylxn5cyXPsPfA,4066
|
50
50
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
51
51
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
52
52
|
magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
|
53
|
-
magic_pdf/libs/draw_bbox.py,sha256=
|
53
|
+
magic_pdf/libs/draw_bbox.py,sha256=Ri_jbOv3Tgnx6s1IscRIWiIKNfUHPkGW8v4q4jPtgo8,17623
|
54
54
|
magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw,2148
|
55
55
|
magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
|
56
56
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
@@ -65,42 +65,63 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
65
65
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
66
66
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
67
67
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
68
|
-
magic_pdf/libs/version.py,sha256=
|
68
|
+
magic_pdf/libs/version.py,sha256=xKd3pzbczuMsdB08eLAOqZDUd_q1IRxwZ_ccAFL4c4A,22
|
69
69
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
70
70
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
71
71
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
|
72
72
|
magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
|
73
73
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
74
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
75
|
-
magic_pdf/model/ppTableModel.py,sha256=fqMuMahN2BW4sKGCgFLsi1X1OFaIG8Dab_eHUhKPcH4,2692
|
74
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=6y8tQSwse8cAgqjDoJvJ-uSPdT8FYzyUeCW5g7j1Tyw,10126
|
76
75
|
magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
|
77
|
-
magic_pdf/model/
|
78
|
-
magic_pdf/model/
|
79
|
-
magic_pdf/model/
|
80
|
-
magic_pdf/model/
|
81
|
-
magic_pdf/model/
|
82
|
-
magic_pdf/model/
|
83
|
-
magic_pdf/model/
|
84
|
-
magic_pdf/model/
|
85
|
-
magic_pdf/model/
|
86
|
-
magic_pdf/model/
|
87
|
-
magic_pdf/model/
|
88
|
-
magic_pdf/model/
|
89
|
-
magic_pdf/model/
|
90
|
-
magic_pdf/model/
|
91
|
-
magic_pdf/model/
|
92
|
-
magic_pdf/model/
|
93
|
-
magic_pdf/model/
|
94
|
-
magic_pdf/model/
|
95
|
-
magic_pdf/model/
|
96
|
-
magic_pdf/model/
|
97
|
-
magic_pdf/model/
|
98
|
-
magic_pdf/model/
|
99
|
-
magic_pdf/model/
|
100
|
-
magic_pdf/model/
|
101
|
-
magic_pdf/model/
|
102
|
-
magic_pdf/model/
|
103
|
-
magic_pdf/model/
|
76
|
+
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
77
|
+
magic_pdf/model/sub_modules/model_init.py,sha256=iFugp79H_QLi-P7t_6Ug0qIs2oOc4zSnf-8hhZhezHA,5021
|
78
|
+
magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
|
79
|
+
magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
80
|
+
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
|
81
|
+
magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
82
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
83
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
|
84
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
|
85
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
|
86
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
|
87
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
|
88
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
|
89
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
|
90
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
|
91
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
|
92
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
|
93
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
|
94
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
|
95
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
|
96
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
|
97
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
|
98
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
|
99
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
|
100
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
101
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
102
|
+
magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
103
|
+
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=A0eABWvJLyRH6kENWU31g66D2QQos12S0hEmbOuoB0g,347
|
104
|
+
magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
105
|
+
magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
106
|
+
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1PgpFrE0RcmCRl19oXbudxwgXc,3528
|
107
|
+
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
108
|
+
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=2QAxxs0awZ_osLMiL-oP8Ik6VQ3f2C4dgJ0EV93bxlQ,9202
|
111
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=BZ7wtkYvvcKtv8jUOI1n6wsSramt-Ob5faP7UeqrfCU,6710
|
112
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
113
|
+
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
115
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
|
116
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2uLSaZSo,7349
|
117
|
+
magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
118
|
+
magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
|
119
|
+
magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
|
+
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=UT__wzKQ4tVxlxgFacDqJfTyBU911CTJXD_6CTw6iS8,516
|
121
|
+
magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
122
|
+
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
|
123
|
+
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
124
|
+
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=keSvrxuTVqc8PbNenwb43VDhJqqzp0ayxK691kxClac,2702
|
104
125
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
105
126
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
106
127
|
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
@@ -112,7 +133,7 @@ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG
|
|
112
133
|
magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
|
113
134
|
magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
|
114
135
|
magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
|
115
|
-
magic_pdf/para/para_split_v3.py,sha256=
|
136
|
+
magic_pdf/para/para_split_v3.py,sha256=vSJ5_QqGKP1rbTbGQg5ONNpybidpTdbgXZgTGd2bGsw,14539
|
116
137
|
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
117
138
|
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
118
139
|
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
@@ -155,7 +176,7 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
|
|
155
176
|
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
156
177
|
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
157
178
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
158
|
-
magic_pdf/resources/model_config/model_configs.yaml,sha256=
|
179
|
+
magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
|
159
180
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
|
160
181
|
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
161
182
|
magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
|
@@ -167,12 +188,12 @@ magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,
|
|
167
188
|
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
168
189
|
magic_pdf/tools/cli.py,sha256=yl2E-DYxBN3XF7bWOBseYxptbmeE7tXWpwV-sp2aGIE,3140
|
169
190
|
magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,4122
|
170
|
-
magic_pdf/tools/common.py,sha256=
|
191
|
+
magic_pdf/tools/common.py,sha256=oo6DsbriyQv0azRNZSt4B-13eXvsMsPgE_kwgO0-aM8,7364
|
171
192
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
172
193
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
173
|
-
magic_pdf-0.9.
|
174
|
-
magic_pdf-0.9.
|
175
|
-
magic_pdf-0.9.
|
176
|
-
magic_pdf-0.9.
|
177
|
-
magic_pdf-0.9.
|
178
|
-
magic_pdf-0.9.
|
194
|
+
magic_pdf-0.9.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
195
|
+
magic_pdf-0.9.3.dist-info/METADATA,sha256=IpWvg-cnoZ9euLIh_3PYmPGh-DCQ8n8Lp2Ar4oyUfuc,40128
|
196
|
+
magic_pdf-0.9.3.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
|
197
|
+
magic_pdf-0.9.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
198
|
+
magic_pdf-0.9.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
199
|
+
magic_pdf-0.9.3.dist-info/RECORD,,
|
@@ -1,36 +0,0 @@
|
|
1
|
-
import re
|
2
|
-
|
3
|
-
def layout_rm_equation(layout_res):
|
4
|
-
rm_idxs = []
|
5
|
-
for idx, ele in enumerate(layout_res['layout_dets']):
|
6
|
-
if ele['category_id'] == 10:
|
7
|
-
rm_idxs.append(idx)
|
8
|
-
|
9
|
-
for idx in rm_idxs[::-1]:
|
10
|
-
del layout_res['layout_dets'][idx]
|
11
|
-
return layout_res
|
12
|
-
|
13
|
-
|
14
|
-
def get_croped_image(image_pil, bbox):
|
15
|
-
x_min, y_min, x_max, y_max = bbox
|
16
|
-
croped_img = image_pil.crop((x_min, y_min, x_max, y_max))
|
17
|
-
return croped_img
|
18
|
-
|
19
|
-
|
20
|
-
def latex_rm_whitespace(s: str):
|
21
|
-
"""Remove unnecessary whitespace from LaTeX code.
|
22
|
-
"""
|
23
|
-
text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
|
24
|
-
letter = '[a-zA-Z]'
|
25
|
-
noletter = '[\W_^\d]'
|
26
|
-
names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
|
27
|
-
s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
|
28
|
-
news = s
|
29
|
-
while True:
|
30
|
-
s = news
|
31
|
-
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
|
32
|
-
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
|
33
|
-
news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
|
34
|
-
if news == s:
|
35
|
-
break
|
36
|
-
return s
|