magic-pdf 0.9.2__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +1 -1
  2. magic_pdf/libs/Constants.py +3 -1
  3. magic_pdf/libs/config_reader.py +1 -1
  4. magic_pdf/libs/draw_bbox.py +10 -4
  5. magic_pdf/libs/version.py +1 -1
  6. magic_pdf/model/pdf_extract_kit.py +42 -297
  7. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  8. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  9. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  10. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  11. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  12. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  13. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  14. magic_pdf/model/sub_modules/model_init.py +144 -0
  15. magic_pdf/model/sub_modules/model_utils.py +51 -0
  16. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  17. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  18. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +259 -0
  19. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +168 -0
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  21. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  24. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  26. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +14 -0
  27. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  28. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  29. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  30. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  31. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +1 -1
  32. magic_pdf/para/para_split_v3.py +13 -15
  33. magic_pdf/pdf_parse_union_core_v2.py +56 -19
  34. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  35. magic_pdf/tools/common.py +47 -3
  36. {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/METADATA +9 -3
  37. {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/RECORD +65 -44
  38. {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/WHEEL +1 -1
  39. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  40. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  41. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  42. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  43. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  44. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  45. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  46. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  47. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  48. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  49. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  50. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  51. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  52. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  53. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  54. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  55. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  56. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  57. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  58. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  59. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  60. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  61. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  62. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  63. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  64. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  65. {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/LICENSE.md +0 -0
  66. {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.9.2.dist-info → magic_pdf-0.9.3.dist-info}/top_level.txt +0 -0
@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import (
30
30
  from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
31
31
  ocr_prepare_bboxes_for_layout_split_v2
32
32
  from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
33
- fix_block_spans,
34
- fix_discarded_block, fix_block_spans_v2)
33
+ fix_discarded_block,
34
+ fix_block_spans_v2)
35
35
  from magic_pdf.pre_proc.ocr_span_list_modify import (
36
36
  get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
37
37
  remove_overlaps_min_spans)
@@ -164,8 +164,8 @@ class ModelSingleton:
164
164
 
165
165
 
166
166
  def do_predict(boxes: List[List[int]], model) -> List[int]:
167
- from magic_pdf.model.v3.helpers import (boxes2inputs, parse_logits,
168
- prepare_inputs)
167
+ from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (boxes2inputs, parse_logits,
168
+ prepare_inputs)
169
169
 
170
170
  inputs = boxes2inputs(boxes)
171
171
  inputs = prepare_inputs(inputs, model)
@@ -174,23 +174,57 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
174
174
 
175
175
 
176
176
  def cal_block_index(fix_blocks, sorted_bboxes):
177
- for block in fix_blocks:
178
177
 
179
- line_index_list = []
180
- if len(block['lines']) == 0:
181
- block['index'] = sorted_bboxes.index(block['bbox'])
182
- else:
178
+ if sorted_bboxes is not None:
179
+ # 使用layoutreader排序
180
+ for block in fix_blocks:
181
+ line_index_list = []
182
+ if len(block['lines']) == 0:
183
+ block['index'] = sorted_bboxes.index(block['bbox'])
184
+ else:
185
+ for line in block['lines']:
186
+ line['index'] = sorted_bboxes.index(line['bbox'])
187
+ line_index_list.append(line['index'])
188
+ median_value = statistics.median(line_index_list)
189
+ block['index'] = median_value
190
+
191
+ # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
192
+ if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
193
+ block['virtual_lines'] = copy.deepcopy(block['lines'])
194
+ block['lines'] = copy.deepcopy(block['real_lines'])
195
+ del block['real_lines']
196
+ else:
197
+ # 使用xycut排序
198
+ block_bboxes = []
199
+ for block in fix_blocks:
200
+ block_bboxes.append(block['bbox'])
201
+
202
+ # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
203
+ if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
204
+ block['virtual_lines'] = copy.deepcopy(block['lines'])
205
+ block['lines'] = copy.deepcopy(block['real_lines'])
206
+ del block['real_lines']
207
+
208
+ import numpy as np
209
+ from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut
210
+
211
+ random_boxes = np.array(block_bboxes)
212
+ np.random.shuffle(random_boxes)
213
+ res = []
214
+ recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
215
+ assert len(res) == len(block_bboxes)
216
+ sorted_boxes = random_boxes[np.array(res)].tolist()
217
+
218
+ for i, block in enumerate(fix_blocks):
219
+ block['index'] = sorted_boxes.index(block['bbox'])
220
+
221
+ # 生成line index
222
+ sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
223
+ line_inedx = 1
224
+ for block in sorted_blocks:
183
225
  for line in block['lines']:
184
- line['index'] = sorted_bboxes.index(line['bbox'])
185
- line_index_list.append(line['index'])
186
- median_value = statistics.median(line_index_list)
187
- block['index'] = median_value
188
-
189
- # 删除图表body block中的虚拟line信息, 并用real_lines信息回填
190
- if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
191
- block['virtual_lines'] = copy.deepcopy(block['lines'])
192
- block['lines'] = copy.deepcopy(block['real_lines'])
193
- del block['real_lines']
226
+ line['index'] = line_inedx
227
+ line_inedx += 1
194
228
 
195
229
  return fix_blocks
196
230
 
@@ -264,6 +298,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
264
298
  block['lines'].append({'bbox': line, 'spans': []})
265
299
  page_line_list.extend(lines)
266
300
 
301
+ if len(page_line_list) > 200: # layoutreader最高支持512line
302
+ return None
303
+
267
304
  # 使用layoutreader排序
268
305
  x_scale = 1000.0 / page_w
269
306
  y_scale = 1000.0 / page_h
@@ -4,4 +4,5 @@ weights:
4
4
  yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
5
5
  unimernet_small: MFR/unimernet_small
6
6
  struct_eqtable: TabRec/StructEqTable
7
- tablemaster: TabRec/TableMaster
7
+ tablemaster: TabRec/TableMaster
8
+ rapid_table: TabRec/RapidTable
magic_pdf/tools/common.py CHANGED
@@ -14,6 +14,9 @@ from magic_pdf.pipe.TXTPipe import TXTPipe
14
14
  from magic_pdf.pipe.UNIPipe import UNIPipe
15
15
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
16
  from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
17
+ import fitz
18
+ # from io import BytesIO
19
+ # from pypdf import PdfReader, PdfWriter
17
20
 
18
21
 
19
22
  def prepare_env(output_dir, pdf_file_name, method):
@@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method):
26
29
  return local_image_dir, local_md_dir
27
30
 
28
31
 
32
+ # def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
33
+ # # 将字节数据包装在 BytesIO 对象中
34
+ # pdf_file = BytesIO(pdf_bytes)
35
+ # # 读取 PDF 的字节数据
36
+ # reader = PdfReader(pdf_file)
37
+ # # 创建一个新的 PDF 写入器
38
+ # writer = PdfWriter()
39
+ # # 将所有页面添加到新的 PDF 写入器中
40
+ # end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
41
+ # if end_page_id > len(reader.pages) - 1:
42
+ # logger.warning("end_page_id is out of range, use pdf_docs length")
43
+ # end_page_id = len(reader.pages) - 1
44
+ # for i, page in enumerate(reader.pages):
45
+ # if start_page_id <= i <= end_page_id:
46
+ # writer.add_page(page)
47
+ # # 创建一个字节缓冲区来存储输出的 PDF 数据
48
+ # output_buffer = BytesIO()
49
+ # # 将 PDF 写入字节缓冲区
50
+ # writer.write(output_buffer)
51
+ # # 获取字节缓冲区的内容
52
+ # converted_pdf_bytes = output_buffer.getvalue()
53
+ # return converted_pdf_bytes
54
+
55
+
56
+ def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
57
+ document = fitz.open("pdf", pdf_bytes)
58
+ output_document = fitz.open()
59
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
60
+ if end_page_id > len(document) - 1:
61
+ logger.warning("end_page_id is out of range, use pdf_docs length")
62
+ end_page_id = len(document) - 1
63
+ output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
64
+ output_bytes = output_document.tobytes()
65
+ return output_bytes
66
+
67
+
29
68
  def do_parse(
30
69
  output_dir,
31
70
  pdf_file_name,
@@ -55,6 +94,8 @@ def do_parse(
55
94
  f_draw_model_bbox = True
56
95
  f_draw_line_sort_bbox = True
57
96
 
97
+ pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
98
+
58
99
  orig_model_list = copy.deepcopy(model_list)
59
100
  local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
60
101
  parse_method)
@@ -66,15 +107,18 @@ def do_parse(
66
107
  if parse_method == 'auto':
67
108
  jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
68
109
  pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
69
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
110
+ # start_page_id=start_page_id, end_page_id=end_page_id,
111
+ lang=lang,
70
112
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
71
113
  elif parse_method == 'txt':
72
114
  pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
73
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
115
+ # start_page_id=start_page_id, end_page_id=end_page_id,
116
+ lang=lang,
74
117
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
75
118
  elif parse_method == 'ocr':
76
119
  pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
77
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
120
+ # start_page_id=start_page_id, end_page_id=end_page_id,
121
+ lang=lang,
78
122
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
79
123
  else:
80
124
  logger.error('unknown parse method')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.9.2
3
+ Version: 0.9.3
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -26,6 +26,9 @@ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
26
26
  Requires-Dist: einops; extra == "full"
27
27
  Requires-Dist: accelerate; extra == "full"
28
28
  Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
29
+ Requires-Dist: rapidocr-paddle; extra == "full"
30
+ Requires-Dist: rapid-table; extra == "full"
31
+ Requires-Dist: PyYAML; extra == "full"
29
32
  Requires-Dist: detectron2; extra == "full"
30
33
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
31
34
  Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
@@ -80,6 +83,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
80
83
  </div>
81
84
 
82
85
  # Changelog
86
+ - 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
83
87
  - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
84
88
  - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
85
89
  - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
@@ -284,7 +288,7 @@ You can modify certain configurations in this file to enable or disable features
284
288
  "enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
285
289
  },
286
290
  "table-config": {
287
- "model": "tablemaster", // When using structEqTable, please change to "struct_eqtable".
291
+ "model": "rapid_table", // When using structEqTable, please change to "struct_eqtable".
288
292
  "enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
289
293
  "max_time": 400
290
294
  }
@@ -299,7 +303,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
299
303
  - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
300
304
  - Quick Deployment with Docker
301
305
  > [!IMPORTANT]
302
- > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
306
+ > Docker requires a GPU with at least 8GB of VRAM, and all acceleration features are enabled by default.
303
307
  >
304
308
  > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
305
309
  >
@@ -459,7 +463,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
459
463
  # Acknowledgments
460
464
 
461
465
  - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
466
+ - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
462
467
  - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
468
+ - [RapidTable](https://github.com/RapidAI/RapidTable)
463
469
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
464
470
  - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
465
471
  - [layoutreader](https://github.com/ppaanngggg/layoutreader)
@@ -2,7 +2,7 @@ magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  magic_pdf/pdf_parse_by_ocr.py,sha256=E-AYHUXjzorFli0CEtmnAi09SI2STJ7FX58yjU0c9PI,810
3
3
  magic_pdf/pdf_parse_by_txt.py,sha256=YeFYVAdfwF1CXOHq0LVE5131nqPHA14nt5t_sb-CMMk,709
4
4
  magic_pdf/pdf_parse_union_core.py,sha256=AGIrP7ahc6Ycku0PxAlbjZhwqsdJ8iuRPIn-PFASKWY,11772
5
- magic_pdf/pdf_parse_union_core_v2.py,sha256=18V8aYLz0gLQSMxGJDgUCWSltr15L8s8ClNqB17DNvw,23647
5
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=GAgSP0PqbPg4U_nJXUztr-uBmakIK5rKwuxv0o9nMK0,25228
6
6
  magic_pdf/user_api.py,sha256=gM-3RQYc6pMEsVvEPFXfWf5RBjAvHcUccugL6fXpP_U,3991
7
7
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
@@ -23,7 +23,7 @@ magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
23
23
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
24
24
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
26
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=ClxKUwrK7wlXKCcDfuTryztKl5e8pzcnh5x_fODFm2U,12928
26
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=lM5UBDueiZcm4_z-jtmcgbJH2jhaXhMVY5ubggaKqHU,12954
27
27
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
28
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
29
29
  magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -38,7 +38,7 @@ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl
38
38
  magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
39
39
  magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
40
40
  magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
41
- magic_pdf/libs/Constants.py,sha256=u9i-ivxxcGp7Hu_zvrLA1jdPcEnYhPb7wXOSUaOSwGQ,1140
41
+ magic_pdf/libs/Constants.py,sha256=ptiwMvWDUmzRZ0IbP1bM3PjGJ24BQVQQHO4sCeioPv8,1173
42
42
  magic_pdf/libs/MakeContentConfig.py,sha256=Do5VKNQp3gfUKyhrZStfzfBj7l-vbsYpsJFF1SsmEc0,248
43
43
  magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
44
44
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -46,11 +46,11 @@ magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,169
46
46
  magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
47
47
  magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
48
48
  magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
49
- magic_pdf/libs/config_reader.py,sha256=0uMOTSZE2crvqdGj2j21tlwV0nNRZha44Y9X98gQ3AQ,4067
49
+ magic_pdf/libs/config_reader.py,sha256=7QIeUPLb8CNa7E3n8TT3MN61lZdYVTylxn5cyXPsPfA,4066
50
50
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
51
51
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
52
52
  magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
53
- magic_pdf/libs/draw_bbox.py,sha256=J_RaG6sRkqNyUd75NiWbDk-3LIE8es-zNIbcwR9myvo,17196
53
+ magic_pdf/libs/draw_bbox.py,sha256=Ri_jbOv3Tgnx6s1IscRIWiIKNfUHPkGW8v4q4jPtgo8,17623
54
54
  magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw,2148
55
55
  magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
56
56
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
@@ -65,42 +65,63 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
65
65
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
66
66
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
67
67
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
68
- magic_pdf/libs/version.py,sha256=gqT-BGoeEItda9fICQDvLbxEjWRIBhFJxPxxKvmHLUo,22
68
+ magic_pdf/libs/version.py,sha256=xKd3pzbczuMsdB08eLAOqZDUd_q1IRxwZ_ccAFL4c4A,22
69
69
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
70
70
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
71
71
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
72
72
  magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
73
73
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
74
- magic_pdf/model/pdf_extract_kit.py,sha256=9pdtcQgwn-XMvyQ7yMfzqKgjPfxEuNXR7juCPx-OM-M,20929
75
- magic_pdf/model/ppTableModel.py,sha256=fqMuMahN2BW4sKGCgFLsi1X1OFaIG8Dab_eHUhKPcH4,2692
74
+ magic_pdf/model/pdf_extract_kit.py,sha256=6y8tQSwse8cAgqjDoJvJ-uSPdT8FYzyUeCW5g7j1Tyw,10126
76
75
  magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
77
- magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
- magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
79
- magic_pdf/model/pek_sub_modules/self_modify.py,sha256=NGUr8t4bXSeh38hwrfs6qxhf0IW-f3J96bnrg1xw8BA,14281
80
- magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
81
- magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
82
- magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
83
- magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
84
- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
85
- magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
86
- magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
87
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
88
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
89
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
90
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
91
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
92
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
93
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
94
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
95
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
96
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
97
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
98
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
99
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
100
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qQthlYDvDPah1mzzrnKXU4fYqlJdXOPBnJ8tYf-o_0k,1384
101
- magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
102
- magic_pdf/model/v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
- magic_pdf/model/v3/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
76
+ magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ magic_pdf/model/sub_modules/model_init.py,sha256=iFugp79H_QLi-P7t_6Ug0qIs2oOc4zSnf-8hhZhezHA,5021
78
+ magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
79
+ magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
81
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
+ magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
+ magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
84
+ magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
85
+ magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
86
+ magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
87
+ magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
88
+ magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
89
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
90
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
91
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
92
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
93
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
94
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
95
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
96
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
97
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
98
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
99
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
100
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
101
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
102
+ magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
+ magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=A0eABWvJLyRH6kENWU31g66D2QQos12S0hEmbOuoB0g,347
104
+ magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
+ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
+ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1PgpFrE0RcmCRl19oXbudxwgXc,3528
107
+ magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
+ magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
+ magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
+ magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=2QAxxs0awZ_osLMiL-oP8Ik6VQ3f2C4dgJ0EV93bxlQ,9202
111
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=BZ7wtkYvvcKtv8jUOI1n6wsSramt-Ob5faP7UeqrfCU,6710
112
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
113
+ magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
115
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
116
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2uLSaZSo,7349
117
+ magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
118
+ magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
119
+ magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
+ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=UT__wzKQ4tVxlxgFacDqJfTyBU911CTJXD_6CTw6iS8,516
121
+ magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
+ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
123
+ magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
+ magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=keSvrxuTVqc8PbNenwb43VDhJqqzp0ayxK691kxClac,2702
104
125
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
126
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
106
127
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -112,7 +133,7 @@ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG
112
133
  magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
113
134
  magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
114
135
  magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
115
- magic_pdf/para/para_split_v3.py,sha256=k02I9Rdc8jfYr3bMT_Gm38b5ginkl-ZIU5C_XcfAcs8,14704
136
+ magic_pdf/para/para_split_v3.py,sha256=vSJ5_QqGKP1rbTbGQg5ONNpybidpTdbgXZgTGd2bGsw,14539
116
137
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
117
138
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
118
139
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -155,7 +176,7 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
155
176
  magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
156
177
  magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
157
178
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
158
- magic_pdf/resources/model_config/model_configs.yaml,sha256=muwPuO6C8Z5oEStE-wWEt7vmkenmJiqTbkRfUfle_vU,257
179
+ magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
159
180
  magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
160
181
  magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
161
182
  magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
@@ -167,12 +188,12 @@ magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,
167
188
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
189
  magic_pdf/tools/cli.py,sha256=yl2E-DYxBN3XF7bWOBseYxptbmeE7tXWpwV-sp2aGIE,3140
169
190
  magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,4122
170
- magic_pdf/tools/common.py,sha256=2S8N60pcA6bFqAmdchoEmn22l9ntQxEfyaKpxfCKJ-Y,5465
191
+ magic_pdf/tools/common.py,sha256=oo6DsbriyQv0azRNZSt4B-13eXvsMsPgE_kwgO0-aM8,7364
171
192
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
172
193
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
173
- magic_pdf-0.9.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
174
- magic_pdf-0.9.2.dist-info/METADATA,sha256=CxyxzxwoOTK3GfaQCGAR8lcjQR3fK4teYf0pXLVDiNQ,39654
175
- magic_pdf-0.9.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
176
- magic_pdf-0.9.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
177
- magic_pdf-0.9.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
178
- magic_pdf-0.9.2.dist-info/RECORD,,
194
+ magic_pdf-0.9.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
195
+ magic_pdf-0.9.3.dist-info/METADATA,sha256=IpWvg-cnoZ9euLIh_3PYmPGh-DCQ8n8Lp2Ar4oyUfuc,40128
196
+ magic_pdf-0.9.3.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
197
+ magic_pdf-0.9.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
198
+ magic_pdf-0.9.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
199
+ magic_pdf-0.9.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.44.0)
2
+ Generator: bdist_wheel (0.45.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,36 +0,0 @@
1
- import re
2
-
3
- def layout_rm_equation(layout_res):
4
- rm_idxs = []
5
- for idx, ele in enumerate(layout_res['layout_dets']):
6
- if ele['category_id'] == 10:
7
- rm_idxs.append(idx)
8
-
9
- for idx in rm_idxs[::-1]:
10
- del layout_res['layout_dets'][idx]
11
- return layout_res
12
-
13
-
14
- def get_croped_image(image_pil, bbox):
15
- x_min, y_min, x_max, y_max = bbox
16
- croped_img = image_pil.crop((x_min, y_min, x_max, y_max))
17
- return croped_img
18
-
19
-
20
- def latex_rm_whitespace(s: str):
21
- """Remove unnecessary whitespace from LaTeX code.
22
- """
23
- text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
24
- letter = '[a-zA-Z]'
25
- noletter = '[\W_^\d]'
26
- names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
27
- s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
28
- news = s
29
- while True:
30
- s = news
31
- news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
32
- news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
33
- news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
34
- if news == s:
35
- break
36
- return s