magic-pdf 0.9.1__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. magic_pdf/dict2md/ocr_mkcontent.py +1 -1
  2. magic_pdf/libs/Constants.py +3 -1
  3. magic_pdf/libs/config_reader.py +1 -1
  4. magic_pdf/libs/draw_bbox.py +10 -4
  5. magic_pdf/libs/version.py +1 -1
  6. magic_pdf/model/pdf_extract_kit.py +42 -310
  7. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  8. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  9. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  10. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  11. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  12. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  13. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  14. magic_pdf/model/sub_modules/model_init.py +144 -0
  15. magic_pdf/model/sub_modules/model_utils.py +51 -0
  16. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  17. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  18. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +259 -0
  19. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +168 -0
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  21. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  24. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  26. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +14 -0
  27. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  28. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  29. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  30. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  31. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +1 -1
  32. magic_pdf/para/para_split_v3.py +13 -15
  33. magic_pdf/pdf_parse_union_core_v2.py +56 -19
  34. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  35. magic_pdf/tools/common.py +47 -3
  36. {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/METADATA +35 -25
  37. {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/RECORD +65 -44
  38. {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/WHEEL +1 -1
  39. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  40. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  41. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  42. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  43. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  44. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  45. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  46. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  47. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  48. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  49. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  50. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  51. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  52. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  53. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  54. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  55. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  56. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  57. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  58. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  59. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  60. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  61. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  62. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  63. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  64. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  65. {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/LICENSE.md +0 -0
  66. {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/top_level.txt +0 -0
magic_pdf/pdf_parse_union_core_v2.py CHANGED
@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import (
30
30
  from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
31
31
  ocr_prepare_bboxes_for_layout_split_v2
32
32
  from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
33
- fix_block_spans,
34
- fix_discarded_block, fix_block_spans_v2)
33
+ fix_discarded_block,
34
+ fix_block_spans_v2)
35
35
  from magic_pdf.pre_proc.ocr_span_list_modify import (
36
36
  get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
37
37
  remove_overlaps_min_spans)
@@ -164,8 +164,8 @@ class ModelSingleton:
164
164
 
165
165
 
166
166
  def do_predict(boxes: List[List[int]], model) -> List[int]:
167
- from magic_pdf.model.v3.helpers import (boxes2inputs, parse_logits,
168
- prepare_inputs)
167
+ from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (boxes2inputs, parse_logits,
168
+ prepare_inputs)
169
169
 
170
170
  inputs = boxes2inputs(boxes)
171
171
  inputs = prepare_inputs(inputs, model)
@@ -174,23 +174,57 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
174
174
 
175
175
 
176
176
  def cal_block_index(fix_blocks, sorted_bboxes):
177
- for block in fix_blocks:
178
177
 
179
- line_index_list = []
180
- if len(block['lines']) == 0:
181
- block['index'] = sorted_bboxes.index(block['bbox'])
182
- else:
178
+ if sorted_bboxes is not None:
179
+ # 使用layoutreader排序
180
+ for block in fix_blocks:
181
+ line_index_list = []
182
+ if len(block['lines']) == 0:
183
+ block['index'] = sorted_bboxes.index(block['bbox'])
184
+ else:
185
+ for line in block['lines']:
186
+ line['index'] = sorted_bboxes.index(line['bbox'])
187
+ line_index_list.append(line['index'])
188
+ median_value = statistics.median(line_index_list)
189
+ block['index'] = median_value
190
+
191
+ # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
192
+ if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
193
+ block['virtual_lines'] = copy.deepcopy(block['lines'])
194
+ block['lines'] = copy.deepcopy(block['real_lines'])
195
+ del block['real_lines']
196
+ else:
197
+ # 使用xycut排序
198
+ block_bboxes = []
199
+ for block in fix_blocks:
200
+ block_bboxes.append(block['bbox'])
201
+
202
+ # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
203
+ if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
204
+ block['virtual_lines'] = copy.deepcopy(block['lines'])
205
+ block['lines'] = copy.deepcopy(block['real_lines'])
206
+ del block['real_lines']
207
+
208
+ import numpy as np
209
+ from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut
210
+
211
+ random_boxes = np.array(block_bboxes)
212
+ np.random.shuffle(random_boxes)
213
+ res = []
214
+ recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
215
+ assert len(res) == len(block_bboxes)
216
+ sorted_boxes = random_boxes[np.array(res)].tolist()
217
+
218
+ for i, block in enumerate(fix_blocks):
219
+ block['index'] = sorted_boxes.index(block['bbox'])
220
+
221
+ # 生成line index
222
+ sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
223
+ line_inedx = 1
224
+ for block in sorted_blocks:
183
225
  for line in block['lines']:
184
- line['index'] = sorted_bboxes.index(line['bbox'])
185
- line_index_list.append(line['index'])
186
- median_value = statistics.median(line_index_list)
187
- block['index'] = median_value
188
-
189
- # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
190
- if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
191
- block['virtual_lines'] = copy.deepcopy(block['lines'])
192
- block['lines'] = copy.deepcopy(block['real_lines'])
193
- del block['real_lines']
226
+ line['index'] = line_inedx
227
+ line_inedx += 1
194
228
 
195
229
  return fix_blocks
196
230
 
@@ -264,6 +298,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
264
298
  block['lines'].append({'bbox': line, 'spans': []})
265
299
  page_line_list.extend(lines)
266
300
 
301
+ if len(page_line_list) > 200: # layoutreader最高支持512line
302
+ return None
303
+
267
304
  # 使用layoutreader排序
268
305
  x_scale = 1000.0 / page_w
269
306
  y_scale = 1000.0 / page_h
magic_pdf/resources/model_config/model_configs.yaml CHANGED
@@ -4,4 +4,5 @@ weights:
4
4
  yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
5
5
  unimernet_small: MFR/unimernet_small
6
6
  struct_eqtable: TabRec/StructEqTable
7
- tablemaster: TabRec/TableMaster
7
+ tablemaster: TabRec/TableMaster
8
+ rapid_table: TabRec/RapidTable
magic_pdf/tools/common.py CHANGED
@@ -14,6 +14,9 @@ from magic_pdf.pipe.TXTPipe import TXTPipe
14
14
  from magic_pdf.pipe.UNIPipe import UNIPipe
15
15
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
16
  from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
17
+ import fitz
18
+ # from io import BytesIO
19
+ # from pypdf import PdfReader, PdfWriter
17
20
 
18
21
 
19
22
  def prepare_env(output_dir, pdf_file_name, method):
@@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method):
26
29
  return local_image_dir, local_md_dir
27
30
 
28
31
 
32
+ # def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
33
+ # # 将字节数据包装在 BytesIO 对象中
34
+ # pdf_file = BytesIO(pdf_bytes)
35
+ # # 读取 PDF 的字节数据
36
+ # reader = PdfReader(pdf_file)
37
+ # # 创建一个新的 PDF 写入器
38
+ # writer = PdfWriter()
39
+ # # 将所有页面添加到新的 PDF 写入器中
40
+ # end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
41
+ # if end_page_id > len(reader.pages) - 1:
42
+ # logger.warning("end_page_id is out of range, use pdf_docs length")
43
+ # end_page_id = len(reader.pages) - 1
44
+ # for i, page in enumerate(reader.pages):
45
+ # if start_page_id <= i <= end_page_id:
46
+ # writer.add_page(page)
47
+ # # 创建一个字节缓冲区来存储输出的 PDF 数据
48
+ # output_buffer = BytesIO()
49
+ # # 将 PDF 写入字节缓冲区
50
+ # writer.write(output_buffer)
51
+ # # 获取字节缓冲区的内容
52
+ # converted_pdf_bytes = output_buffer.getvalue()
53
+ # return converted_pdf_bytes
54
+
55
+
56
+ def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
57
+ document = fitz.open("pdf", pdf_bytes)
58
+ output_document = fitz.open()
59
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
60
+ if end_page_id > len(document) - 1:
61
+ logger.warning("end_page_id is out of range, use pdf_docs length")
62
+ end_page_id = len(document) - 1
63
+ output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
64
+ output_bytes = output_document.tobytes()
65
+ return output_bytes
66
+
67
+
29
68
  def do_parse(
30
69
  output_dir,
31
70
  pdf_file_name,
@@ -55,6 +94,8 @@ def do_parse(
55
94
  f_draw_model_bbox = True
56
95
  f_draw_line_sort_bbox = True
57
96
 
97
+ pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
98
+
58
99
  orig_model_list = copy.deepcopy(model_list)
59
100
  local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
60
101
  parse_method)
@@ -66,15 +107,18 @@ def do_parse(
66
107
  if parse_method == 'auto':
67
108
  jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
68
109
  pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
69
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
110
+ # start_page_id=start_page_id, end_page_id=end_page_id,
111
+ lang=lang,
70
112
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
71
113
  elif parse_method == 'txt':
72
114
  pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
73
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
115
+ # start_page_id=start_page_id, end_page_id=end_page_id,
116
+ lang=lang,
74
117
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
75
118
  elif parse_method == 'ocr':
76
119
  pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
77
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
120
+ # start_page_id=start_page_id, end_page_id=end_page_id,
121
+ lang=lang,
78
122
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
79
123
  else:
80
124
  logger.error('unknown parse method')
{magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.9.1
3
+ Version: 0.9.3
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -26,6 +26,9 @@ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
26
26
  Requires-Dist: einops; extra == "full"
27
27
  Requires-Dist: accelerate; extra == "full"
28
28
  Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
29
+ Requires-Dist: rapidocr-paddle; extra == "full"
30
+ Requires-Dist: rapid-table; extra == "full"
31
+ Requires-Dist: PyYAML; extra == "full"
29
32
  Requires-Dist: detectron2; extra == "full"
30
33
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
31
34
  Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
@@ -80,7 +83,8 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
80
83
  </div>
81
84
 
82
85
  # Changelog
83
- - 2024/11/06 0.9.1 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
86
+ - 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
87
+ - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
84
88
  - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
85
89
  - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
86
90
  - Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.
@@ -176,13 +180,14 @@ There are three different ways to experience MinerU:
176
180
  - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
177
181
  - [Linux/Windows + CUDA](#Using-GPU)
178
182
 
179
- **⚠️ Pre-installation Notice—Hardware and Software Environment Support**
180
-
181
- To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
182
-
183
- By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
184
-
185
- In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
183
+ > [!WARNING]
184
+ > **Pre-installation Notice—Hardware and Software Environment Support**
185
+ >
186
+ > To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
187
+ >
188
+ > By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
189
+ >
190
+ > In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
186
191
 
187
192
  <table>
188
193
  <tr>
@@ -262,11 +267,13 @@ Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for de
262
267
  After completing the [2. Download model weight files](#2-download-model-weight-files) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
263
268
  You can find the `magic-pdf.json` file in your 【user directory】.
264
269
 
270
+ > [!TIP]
265
271
  > The user directory for Windows is "C:\\Users\\username", for Linux it is "/home/username", and for macOS it is "/Users/username".
266
272
 
267
273
  You can modify certain configurations in this file to enable or disable features, such as table recognition:
268
274
 
269
275
 
276
+ > [!NOTE]
270
277
  > If the following items are not present in the JSON, please manually add the required items and remove the comment content (standard JSON does not support comments).
271
278
 
272
279
  ```json
@@ -281,7 +288,7 @@ You can modify certain configurations in this file to enable or disable features
281
288
  "enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
282
289
  },
283
290
  "table-config": {
284
- "model": "tablemaster", // When using structEqTable, please change to "struct_eqtable".
291
+ "model": "rapid_table", // When using structEqTable, please change to "struct_eqtable".
285
292
  "enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
286
293
  "max_time": 400
287
294
  }
@@ -295,13 +302,14 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
295
302
  - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
296
303
  - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
297
304
  - Quick Deployment with Docker
298
- > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
299
- >
300
- > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
301
- >
302
- > ```bash
303
- > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
304
- > ```
305
+ > [!IMPORTANT]
306
+ > Docker requires a GPU with at least 8GB of VRAM, and all acceleration features are enabled by default.
307
+ >
308
+ > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
309
+ >
310
+ > ```bash
311
+ > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
312
+ > ```
305
313
  ```bash
306
314
  wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
307
315
  docker build -t mineru:latest .
@@ -363,8 +371,8 @@ The results will be saved in the `{some_output_dir}` directory. The output file
363
371
  ├── some_pdf_spans.pdf # smallest granularity bbox position information diagram
364
372
  └── some_pdf_content_list.json # Rich text JSON arranged in reading order
365
373
  ```
366
-
367
- For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
374
+ > [!TIP]
375
+ > For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
368
376
 
369
377
  ### API
370
378
 
@@ -415,12 +423,12 @@ TODO
415
423
 
416
424
  # TODO
417
425
 
418
- - 🗹 Reading order based on the model
419
- - 🗹 Recognition of `index` and `list` in the main text
420
- - 🗹 Table recognition
421
- - Code block recognition in the main text
422
- - [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
423
- - Geometric shape recognition
426
+ - [x] Reading order based on the model
427
+ - [x] Recognition of `index` and `list` in the main text
428
+ - [x] Table recognition
429
+ - [ ] Code block recognition in the main text
430
+ - [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
431
+ - [ ] Geometric shape recognition
424
432
 
425
433
  # Known Issues
426
434
 
@@ -455,7 +463,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
455
463
  # Acknowledgments
456
464
 
457
465
  - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
466
+ - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
458
467
  - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
468
+ - [RapidTable](https://github.com/RapidAI/RapidTable)
459
469
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
460
470
  - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
461
471
  - [layoutreader](https://github.com/ppaanngggg/layoutreader)
{magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/RECORD CHANGED
@@ -2,7 +2,7 @@ magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  magic_pdf/pdf_parse_by_ocr.py,sha256=E-AYHUXjzorFli0CEtmnAi09SI2STJ7FX58yjU0c9PI,810
3
3
  magic_pdf/pdf_parse_by_txt.py,sha256=YeFYVAdfwF1CXOHq0LVE5131nqPHA14nt5t_sb-CMMk,709
4
4
  magic_pdf/pdf_parse_union_core.py,sha256=AGIrP7ahc6Ycku0PxAlbjZhwqsdJ8iuRPIn-PFASKWY,11772
5
- magic_pdf/pdf_parse_union_core_v2.py,sha256=18V8aYLz0gLQSMxGJDgUCWSltr15L8s8ClNqB17DNvw,23647
5
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=GAgSP0PqbPg4U_nJXUztr-uBmakIK5rKwuxv0o9nMK0,25228
6
6
  magic_pdf/user_api.py,sha256=gM-3RQYc6pMEsVvEPFXfWf5RBjAvHcUccugL6fXpP_U,3991
7
7
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
@@ -23,7 +23,7 @@ magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
23
23
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
24
24
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
26
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=ClxKUwrK7wlXKCcDfuTryztKl5e8pzcnh5x_fODFm2U,12928
26
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=lM5UBDueiZcm4_z-jtmcgbJH2jhaXhMVY5ubggaKqHU,12954
27
27
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
29
29
  magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -38,7 +38,7 @@ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl
38
38
  magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
39
39
  magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
40
40
  magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
41
- magic_pdf/libs/Constants.py,sha256=u9i-ivxxcGp7Hu_zvrLA1jdPcEnYhPb7wXOSUaOSwGQ,1140
41
+ magic_pdf/libs/Constants.py,sha256=ptiwMvWDUmzRZ0IbP1bM3PjGJ24BQVQQHO4sCeioPv8,1173
42
42
  magic_pdf/libs/MakeContentConfig.py,sha256=Do5VKNQp3gfUKyhrZStfzfBj7l-vbsYpsJFF1SsmEc0,248
43
43
  magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
44
44
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -46,11 +46,11 @@ magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,169
46
46
  magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
47
47
  magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
48
48
  magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
49
- magic_pdf/libs/config_reader.py,sha256=0uMOTSZE2crvqdGj2j21tlwV0nNRZha44Y9X98gQ3AQ,4067
49
+ magic_pdf/libs/config_reader.py,sha256=7QIeUPLb8CNa7E3n8TT3MN61lZdYVTylxn5cyXPsPfA,4066
50
50
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
51
51
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
52
52
  magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
53
- magic_pdf/libs/draw_bbox.py,sha256=J_RaG6sRkqNyUd75NiWbDk-3LIE8es-zNIbcwR9myvo,17196
53
+ magic_pdf/libs/draw_bbox.py,sha256=Ri_jbOv3Tgnx6s1IscRIWiIKNfUHPkGW8v4q4jPtgo8,17623
54
54
  magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw,2148
55
55
  magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
56
56
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
@@ -65,42 +65,63 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
65
65
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
66
66
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
67
67
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
68
- magic_pdf/libs/version.py,sha256=UwJXM8JY2T3tE2id0K2k_lEaVThbRTrGO1mNibyzIz8,22
68
+ magic_pdf/libs/version.py,sha256=xKd3pzbczuMsdB08eLAOqZDUd_q1IRxwZ_ccAFL4c4A,22
69
69
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
70
70
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
71
71
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
72
72
  magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
73
73
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
74
- magic_pdf/model/pdf_extract_kit.py,sha256=7BVcVkrIAI2aTAUHD_Xrq0yAuy4BEAAJEicOM8Hr0Xw,21593
75
- magic_pdf/model/ppTableModel.py,sha256=fqMuMahN2BW4sKGCgFLsi1X1OFaIG8Dab_eHUhKPcH4,2692
74
+ magic_pdf/model/pdf_extract_kit.py,sha256=6y8tQSwse8cAgqjDoJvJ-uSPdT8FYzyUeCW5g7j1Tyw,10126
76
75
  magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
77
- magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
- magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
79
- magic_pdf/model/pek_sub_modules/self_modify.py,sha256=NGUr8t4bXSeh38hwrfs6qxhf0IW-f3J96bnrg1xw8BA,14281
80
- magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
- magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
82
- magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
83
- magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
84
- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
85
- magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
86
- magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
87
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
88
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
89
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
90
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
91
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
92
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
93
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
94
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
95
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
96
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
97
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
98
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
99
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
100
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qQthlYDvDPah1mzzrnKXU4fYqlJdXOPBnJ8tYf-o_0k,1384
101
- magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
102
- magic_pdf/model/v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
- magic_pdf/model/v3/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
76
+ magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ magic_pdf/model/sub_modules/model_init.py,sha256=iFugp79H_QLi-P7t_6Ug0qIs2oOc4zSnf-8hhZhezHA,5021
78
+ magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
79
+ magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
81
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
+ magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
+ magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
84
+ magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
85
+ magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
86
+ magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
87
+ magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
88
+ magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
89
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
90
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
91
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
92
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
93
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
94
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
95
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
96
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
97
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
98
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
99
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
100
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
101
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
102
+ magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
+ magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=A0eABWvJLyRH6kENWU31g66D2QQos12S0hEmbOuoB0g,347
104
+ magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
+ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
+ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1PgpFrE0RcmCRl19oXbudxwgXc,3528
107
+ magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
+ magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
+ magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
+ magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=2QAxxs0awZ_osLMiL-oP8Ik6VQ3f2C4dgJ0EV93bxlQ,9202
111
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=BZ7wtkYvvcKtv8jUOI1n6wsSramt-Ob5faP7UeqrfCU,6710
112
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
113
+ magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
115
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
116
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2uLSaZSo,7349
117
+ magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
118
+ magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
119
+ magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
+ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=UT__wzKQ4tVxlxgFacDqJfTyBU911CTJXD_6CTw6iS8,516
121
+ magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
+ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
123
+ magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
+ magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=keSvrxuTVqc8PbNenwb43VDhJqqzp0ayxK691kxClac,2702
104
125
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
126
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
106
127
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -112,7 +133,7 @@ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG
112
133
  magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
113
134
  magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
114
135
  magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
115
- magic_pdf/para/para_split_v3.py,sha256=k02I9Rdc8jfYr3bMT_Gm38b5ginkl-ZIU5C_XcfAcs8,14704
136
+ magic_pdf/para/para_split_v3.py,sha256=vSJ5_QqGKP1rbTbGQg5ONNpybidpTdbgXZgTGd2bGsw,14539
116
137
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
117
138
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
118
139
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -155,7 +176,7 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
155
176
  magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
156
177
  magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
157
178
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
158
- magic_pdf/resources/model_config/model_configs.yaml,sha256=muwPuO6C8Z5oEStE-wWEt7vmkenmJiqTbkRfUfle_vU,257
179
+ magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
159
180
  magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
160
181
  magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
161
182
  magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
@@ -167,12 +188,12 @@ magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,
167
188
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
189
  magic_pdf/tools/cli.py,sha256=yl2E-DYxBN3XF7bWOBseYxptbmeE7tXWpwV-sp2aGIE,3140
169
190
  magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,4122
170
- magic_pdf/tools/common.py,sha256=2S8N60pcA6bFqAmdchoEmn22l9ntQxEfyaKpxfCKJ-Y,5465
191
+ magic_pdf/tools/common.py,sha256=oo6DsbriyQv0azRNZSt4B-13eXvsMsPgE_kwgO0-aM8,7364
171
192
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
172
193
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
173
- magic_pdf-0.9.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
174
- magic_pdf-0.9.1.dist-info/METADATA,sha256=2NLbuQt-GzeMws3412i4A8XaDr8xuMZBymu7n3XY7S0,39624
175
- magic_pdf-0.9.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
176
- magic_pdf-0.9.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
177
- magic_pdf-0.9.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
178
- magic_pdf-0.9.1.dist-info/RECORD,,
194
+ magic_pdf-0.9.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
195
+ magic_pdf-0.9.3.dist-info/METADATA,sha256=IpWvg-cnoZ9euLIh_3PYmPGh-DCQ8n8Lp2Ar4oyUfuc,40128
196
+ magic_pdf-0.9.3.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
197
+ magic_pdf-0.9.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
198
+ magic_pdf-0.9.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
199
+ magic_pdf-0.9.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.44.0)
2
+ Generator: bdist_wheel (0.45.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,36 +0,0 @@
1
- import re
2
-
3
- def layout_rm_equation(layout_res):
4
- rm_idxs = []
5
- for idx, ele in enumerate(layout_res['layout_dets']):
6
- if ele['category_id'] == 10:
7
- rm_idxs.append(idx)
8
-
9
- for idx in rm_idxs[::-1]:
10
- del layout_res['layout_dets'][idx]
11
- return layout_res
12
-
13
-
14
- def get_croped_image(image_pil, bbox):
15
- x_min, y_min, x_max, y_max = bbox
16
- croped_img = image_pil.crop((x_min, y_min, x_max, y_max))
17
- return croped_img
18
-
19
-
20
- def latex_rm_whitespace(s: str):
21
- """Remove unnecessary whitespace from LaTeX code.
22
- """
23
- text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
24
- letter = '[a-zA-Z]'
25
- noletter = '[\W_^\d]'
26
- names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
27
- s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
28
- news = s
29
- while True:
30
- s = news
31
- news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
32
- news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
33
- news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
34
- if news == s:
35
- break
36
- return s