magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
magic_pdf/tools/common.py CHANGED
@@ -3,17 +3,20 @@ import json as json_parse
3
3
  import os
4
4
 
5
5
  import click
6
+ import fitz
6
7
  from loguru import logger
7
8
 
8
9
  import magic_pdf.model as model_config
10
+ from magic_pdf.config.make_content_config import DropMode, MakeMode
11
+ from magic_pdf.data.data_reader_writer import FileBasedDataWriter
9
12
  from magic_pdf.libs.draw_bbox import (draw_layout_bbox, draw_line_sort_bbox,
10
13
  draw_model_bbox, draw_span_bbox)
11
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
12
14
  from magic_pdf.pipe.OCRPipe import OCRPipe
13
15
  from magic_pdf.pipe.TXTPipe import TXTPipe
14
16
  from magic_pdf.pipe.UNIPipe import UNIPipe
15
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
17
+
18
+ # from io import BytesIO
19
+ # from pypdf import PdfReader, PdfWriter
17
20
 
18
21
 
19
22
  def prepare_env(output_dir, pdf_file_name, method):
@@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method):
26
29
  return local_image_dir, local_md_dir
27
30
 
28
31
 
32
+ # def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
33
+ # # 将字节数据包装在 BytesIO 对象中
34
+ # pdf_file = BytesIO(pdf_bytes)
35
+ # # 读取 PDF 的字节数据
36
+ # reader = PdfReader(pdf_file)
37
+ # # 创建一个新的 PDF 写入器
38
+ # writer = PdfWriter()
39
+ # # 将所有页面添加到新的 PDF 写入器中
40
+ # end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
41
+ # if end_page_id > len(reader.pages) - 1:
42
+ # logger.warning("end_page_id is out of range, use pdf_docs length")
43
+ # end_page_id = len(reader.pages) - 1
44
+ # for i, page in enumerate(reader.pages):
45
+ # if start_page_id <= i <= end_page_id:
46
+ # writer.add_page(page)
47
+ # # 创建一个字节缓冲区来存储输出的 PDF 数据
48
+ # output_buffer = BytesIO()
49
+ # # 将 PDF 写入字节缓冲区
50
+ # writer.write(output_buffer)
51
+ # # 获取字节缓冲区的内容
52
+ # converted_pdf_bytes = output_buffer.getvalue()
53
+ # return converted_pdf_bytes
54
+
55
+
56
+ def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
57
+ document = fitz.open('pdf', pdf_bytes)
58
+ output_document = fitz.open()
59
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
60
+ if end_page_id > len(document) - 1:
61
+ logger.warning('end_page_id is out of range, use pdf_docs length')
62
+ end_page_id = len(document) - 1
63
+ output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
64
+ output_bytes = output_document.tobytes()
65
+ return output_bytes
66
+
67
+
29
68
  def do_parse(
30
69
  output_dir,
31
70
  pdf_file_name,
@@ -55,26 +94,34 @@ def do_parse(
55
94
  f_draw_model_bbox = True
56
95
  f_draw_line_sort_bbox = True
57
96
 
97
+ if lang == "":
98
+ lang = None
99
+
100
+ pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
101
+
58
102
  orig_model_list = copy.deepcopy(model_list)
59
103
  local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
60
104
  parse_method)
61
105
 
62
- image_writer, md_writer = DiskReaderWriter(
63
- local_image_dir), DiskReaderWriter(local_md_dir)
106
+ image_writer, md_writer = FileBasedDataWriter(
107
+ local_image_dir), FileBasedDataWriter(local_md_dir)
64
108
  image_dir = str(os.path.basename(local_image_dir))
65
109
 
66
110
  if parse_method == 'auto':
67
111
  jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
68
112
  pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
69
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
113
+ # start_page_id=start_page_id, end_page_id=end_page_id,
114
+ lang=lang,
70
115
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
71
116
  elif parse_method == 'txt':
72
117
  pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
73
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
118
+ # start_page_id=start_page_id, end_page_id=end_page_id,
119
+ lang=lang,
74
120
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
75
121
  elif parse_method == 'ocr':
76
122
  pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
77
- start_page_id=start_page_id, end_page_id=end_page_id, lang=lang,
123
+ # start_page_id=start_page_id, end_page_id=end_page_id,
124
+ lang=lang,
78
125
  layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
79
126
  else:
80
127
  logger.error('unknown parse method')
@@ -101,49 +148,36 @@ def do_parse(
101
148
  if f_draw_line_sort_bbox:
102
149
  draw_line_sort_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
103
150
 
104
- md_content = pipe.pipe_mk_markdown(image_dir,
105
- drop_mode=DropMode.NONE,
106
- md_make_mode=f_make_md_mode)
151
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
107
152
  if f_dump_md:
108
- md_writer.write(
109
- content=md_content,
110
- path=f'{pdf_file_name}.md',
111
- mode=AbsReaderWriter.MODE_TXT,
153
+ md_writer.write_string(
154
+ f'{pdf_file_name}.md',
155
+ md_content
112
156
  )
113
157
 
114
158
  if f_dump_middle_json:
115
- md_writer.write(
116
- content=json_parse.dumps(pipe.pdf_mid_data,
117
- ensure_ascii=False,
118
- indent=4),
119
- path=f'{pdf_file_name}_middle.json',
120
- mode=AbsReaderWriter.MODE_TXT,
159
+ md_writer.write_string(
160
+ f'{pdf_file_name}_middle.json',
161
+ json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
121
162
  )
122
163
 
123
164
  if f_dump_model_json:
124
- md_writer.write(
125
- content=json_parse.dumps(orig_model_list,
126
- ensure_ascii=False,
127
- indent=4),
128
- path=f'{pdf_file_name}_model.json',
129
- mode=AbsReaderWriter.MODE_TXT,
165
+ md_writer.write_string(
166
+ f'{pdf_file_name}_model.json',
167
+ json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4)
130
168
  )
131
169
 
132
170
  if f_dump_orig_pdf:
133
171
  md_writer.write(
134
- content=pdf_bytes,
135
- path=f'{pdf_file_name}_origin.pdf',
136
- mode=AbsReaderWriter.MODE_BIN,
172
+ f'{pdf_file_name}_origin.pdf',
173
+ pdf_bytes,
137
174
  )
138
175
 
139
176
  content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
140
177
  if f_dump_content_list:
141
- md_writer.write(
142
- content=json_parse.dumps(content_list,
143
- ensure_ascii=False,
144
- indent=4),
145
- path=f'{pdf_file_name}_content_list.json',
146
- mode=AbsReaderWriter.MODE_TXT,
178
+ md_writer.write_string(
179
+ f'{pdf_file_name}_content_list.json',
180
+ json_parse.dumps(content_list, ensure_ascii=False, indent=4)
147
181
  )
148
182
 
149
183
  logger.info(f'local output dir is {local_md_dir}')
magic_pdf/user_api.py CHANGED
@@ -1,36 +1,28 @@
1
- """
2
- 用户输入:
3
- model数组,每个元素代表一个页面
4
- pdf在s3的路径
5
- 截图保存的s3位置
1
+ """用户输入: model数组,每个元素代表一个页面 pdf在s3的路径 截图保存的s3位置.
6
2
 
7
3
  然后:
8
4
  1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
9
5
  2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
10
6
 
11
7
  其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
12
-
13
8
  """
14
- import re
15
9
 
16
10
  from loguru import logger
17
11
 
12
+ from magic_pdf.data.data_reader_writer import DataWriter
18
13
  from magic_pdf.libs.version import __version__
19
14
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
20
- from magic_pdf.rw import AbsReaderWriter
21
15
  from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
22
16
  from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt
23
17
 
24
- PARSE_TYPE_TXT = "txt"
25
- PARSE_TYPE_OCR = "ocr"
18
+ PARSE_TYPE_TXT = 'txt'
19
+ PARSE_TYPE_OCR = 'ocr'
26
20
 
27
21
 
28
- def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
22
+ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
29
23
  start_page_id=0, end_page_id=None, lang=None,
30
24
  *args, **kwargs):
31
- """
32
- 解析文本类pdf
33
- """
25
+ """解析文本类pdf."""
34
26
  pdf_info_dict = parse_pdf_by_txt(
35
27
  pdf_bytes,
36
28
  pdf_models,
@@ -38,24 +30,23 @@ def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
38
30
  start_page_id=start_page_id,
39
31
  end_page_id=end_page_id,
40
32
  debug_mode=is_debug,
33
+ lang=lang,
41
34
  )
42
35
 
43
- pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
36
+ pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
44
37
 
45
- pdf_info_dict["_version_name"] = __version__
38
+ pdf_info_dict['_version_name'] = __version__
46
39
 
47
40
  if lang is not None:
48
- pdf_info_dict["_lang"] = lang
41
+ pdf_info_dict['_lang'] = lang
49
42
 
50
43
  return pdf_info_dict
51
44
 
52
45
 
53
- def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
46
+ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
54
47
  start_page_id=0, end_page_id=None, lang=None,
55
48
  *args, **kwargs):
56
- """
57
- 解析ocr类pdf
58
- """
49
+ """解析ocr类pdf."""
59
50
  pdf_info_dict = parse_pdf_by_ocr(
60
51
  pdf_bytes,
61
52
  pdf_models,
@@ -63,25 +54,24 @@ def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWrit
63
54
  start_page_id=start_page_id,
64
55
  end_page_id=end_page_id,
65
56
  debug_mode=is_debug,
57
+ lang=lang,
66
58
  )
67
59
 
68
- pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
60
+ pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
69
61
 
70
- pdf_info_dict["_version_name"] = __version__
62
+ pdf_info_dict['_version_name'] = __version__
71
63
 
72
64
  if lang is not None:
73
- pdf_info_dict["_lang"] = lang
65
+ pdf_info_dict['_lang'] = lang
74
66
 
75
67
  return pdf_info_dict
76
68
 
77
69
 
78
- def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False,
70
+ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: DataWriter, is_debug=False,
79
71
  input_model_is_empty: bool = False,
80
72
  start_page_id=0, end_page_id=None, lang=None,
81
73
  *args, **kwargs):
82
- """
83
- ocr和文本混合的pdf,全部解析出来
84
- """
74
+ """ocr和文本混合的pdf,全部解析出来."""
85
75
 
86
76
  def parse_pdf(method):
87
77
  try:
@@ -92,18 +82,19 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
92
82
  start_page_id=start_page_id,
93
83
  end_page_id=end_page_id,
94
84
  debug_mode=is_debug,
85
+ lang=lang,
95
86
  )
96
87
  except Exception as e:
97
88
  logger.exception(e)
98
89
  return None
99
90
 
100
91
  pdf_info_dict = parse_pdf(parse_pdf_by_txt)
101
- if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
102
- logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
92
+ if pdf_info_dict is None or pdf_info_dict.get('_need_drop', False):
93
+ logger.warning('parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr')
103
94
  if input_model_is_empty:
104
- layout_model = kwargs.get("layout_model", None)
105
- formula_enable = kwargs.get("formula_enable", None)
106
- table_enable = kwargs.get("table_enable", None)
95
+ layout_model = kwargs.get('layout_model', None)
96
+ formula_enable = kwargs.get('formula_enable', None)
97
+ table_enable = kwargs.get('table_enable', None)
107
98
  pdf_models = doc_analyze(
108
99
  pdf_bytes,
109
100
  ocr=True,
@@ -116,15 +107,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
116
107
  )
117
108
  pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
118
109
  if pdf_info_dict is None:
119
- raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
110
+ raise Exception('Both parse_pdf_by_txt and parse_pdf_by_ocr failed.')
120
111
  else:
121
- pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
112
+ pdf_info_dict['_parse_type'] = PARSE_TYPE_OCR
122
113
  else:
123
- pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
114
+ pdf_info_dict['_parse_type'] = PARSE_TYPE_TXT
124
115
 
125
- pdf_info_dict["_version_name"] = __version__
116
+ pdf_info_dict['_version_name'] = __version__
126
117
 
127
118
  if lang is not None:
128
- pdf_info_dict["_lang"] = lang
119
+ pdf_info_dict['_lang'] = lang
129
120
 
130
121
  return pdf_info_dict
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.9.2
3
+ Version: 0.10.0
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -26,6 +26,9 @@ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
26
26
  Requires-Dist: einops; extra == "full"
27
27
  Requires-Dist: accelerate; extra == "full"
28
28
  Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
29
+ Requires-Dist: rapidocr-paddle; extra == "full"
30
+ Requires-Dist: rapid-table; extra == "full"
31
+ Requires-Dist: PyYAML; extra == "full"
29
32
  Requires-Dist: detectron2; extra == "full"
30
33
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
31
34
  Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
@@ -35,6 +38,8 @@ Provides-Extra: lite
35
38
  Requires-Dist: paddleocr==2.7.3; extra == "lite"
36
39
  Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
37
40
  Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
41
+ Provides-Extra: old_linux
42
+ Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
38
43
 
39
44
  <div align="center" xmlns="http://www.w3.org/1999/html">
40
45
  <!-- logo -->
@@ -80,6 +85,10 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
80
85
  </div>
81
86
 
82
87
  # Changelog
88
+ - 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities,
89
+ - Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
90
+ - Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
91
+ - 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
83
92
  - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
84
93
  - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
85
94
  - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
@@ -158,7 +167,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
158
167
  - Preserve the structure of the original document, including headings, paragraphs, lists, etc.
159
168
  - Extract images, image descriptions, tables, table titles, and footnotes.
160
169
  - Automatically recognize and convert formulas in the document to LaTeX format.
161
- - Automatically recognize and convert tables in the document to LaTeX or HTML format.
170
+ - Automatically recognize and convert tables in the document to HTML format.
162
171
  - Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
163
172
  - OCR supports detection and recognition of 84 languages.
164
173
  - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
@@ -222,17 +231,11 @@ There are three different ways to experience MinerU:
222
231
  </tr>
223
232
  <tr>
224
233
  <td rowspan="2">GPU Hardware Support List</td>
225
- <td colspan="2">Minimum Requirement 8G+ VRAM</td>
226
- <td colspan="2">3060ti/3070/4060<br>
227
- 8G VRAM enables layout, formula recognition acceleration and OCR acceleration</td>
234
+ <td colspan="2">GPU VRAM 8GB or more</td>
235
+ <td colspan="2">2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
236
+ 8G VRAM can enable all acceleration features</td>
228
237
  <td rowspan="2">None</td>
229
238
  </tr>
230
- <tr>
231
- <td colspan="2">Recommended Configuration 10G+ VRAM</td>
232
- <td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
233
- 10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
234
- </td>
235
- </tr>
236
239
  </table>
237
240
 
238
241
  ### Online Demo
@@ -284,7 +287,7 @@ You can modify certain configurations in this file to enable or disable features
284
287
  "enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
285
288
  },
286
289
  "table-config": {
287
- "model": "tablemaster", // When using structEqTable, please change to "struct_eqtable".
290
+ "model": "rapid_table", // Default to using "rapid_table", can be switched to "tablemaster" or "struct_eqtable".
288
291
  "enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
289
292
  "max_time": 400
290
293
  }
@@ -299,7 +302,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
299
302
  - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
300
303
  - Quick Deployment with Docker
301
304
  > [!IMPORTANT]
302
- > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
305
+ > Docker requires a GPU with at least 8GB of VRAM, and all acceleration features are enabled by default.
303
306
  >
304
307
  > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
305
308
  >
@@ -459,7 +462,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
459
462
  # Acknowledgments
460
463
 
461
464
  - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
465
+ - [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
462
466
  - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
467
+ - [RapidTable](https://github.com/RapidAI/RapidTable)
463
468
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
464
469
  - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
465
470
  - [layoutreader](https://github.com/ppaanngggg/layoutreader)
@@ -0,0 +1,198 @@
1
+ magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
3
+ magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
4
+ magic_pdf/pdf_parse_union_core.py,sha256=w90lFIMOYUMAq4iv8bpsbBtLXFphPV4HyYeqbOTYQUI,12420
5
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=EqEi9AahBBh2JbXoY8uOCmClvi9W_H_26U4jK8RwPwU,31308
6
+ magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
7
+ magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
9
+ magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
10
+ magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
11
+ magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
12
+ magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zkE,622
13
+ magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
14
+ magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
15
+ magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
16
+ magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
18
+ magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
19
+ magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
20
+ magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
21
+ magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
22
+ magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
23
+ magic_pdf/data/data_reader_writer/filebase.py,sha256=21RYy4m9MqJGqwd2HWICQJHM-PZXp7UYETCQQK390Kk,1988
24
+ magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
25
+ magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
26
+ magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
27
+ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
28
+ magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
29
+ magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
30
+ magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
+ magic_pdf/dict2md/mkcontent.py,sha256=bMQK7uiay76YaWA92VIK57YajINV20SnOs65wOEXyKE,18667
32
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=ohjhEFS9YFrzTCC9c9yrvi4QuZe9iZm1qlkQWB6xxIw,13038
33
+ magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
35
+ magic_pdf/filter/pdf_meta_scan.py,sha256=h4D4O0OeAlEy2A8mJ6E0aQ8wIizIfsIxEagbjaomnAo,17823
36
+ magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
+ magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
+ magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
39
+ magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCalhJdmk,3193
40
+ magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
41
+ magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
+ magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
43
+ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
44
+ magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
45
+ magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
46
+ magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
47
+ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
+ magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
49
+ magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
50
+ magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
51
+ magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
52
+ magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
53
+ magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
54
+ magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
55
+ magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
56
+ magic_pdf/libs/draw_bbox.py,sha256=NhAfqib5HYuGjjrAG_SvJR-yOHZTy6tzDxLXdxKlULQ,17676
57
+ magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
58
+ magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
59
+ magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
60
+ magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
61
+ magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
62
+ magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
63
+ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
64
+ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
65
+ magic_pdf/libs/pdf_image_tools.py,sha256=sh8hgBQu_83R71qBLodOFdByBUuQujsOMfgpSD9mrhE,1981
66
+ magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
67
+ magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
68
+ magic_pdf/libs/version.py,sha256=v4zmKjsKOPZbp6BrWoz7iK4ST0sdZdUh9bQSJmluZ5o,23
69
+ magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
70
+ magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
71
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=HOT6chGx2VPyH6O9WB0c6xGPeDs9m_6oZn3iOa745yw,7125
72
+ magic_pdf/model/magic_model.py,sha256=8nJLzNCa0Ag4JhMAQbjj5qrkj617qKPCXVJAiT9DnaA,43472
73
+ magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
74
+ magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
75
+ magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
76
+ magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
+ magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
78
+ magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
79
+ magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
81
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
+ magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
83
+ magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
84
+ magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
85
+ magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
86
+ magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
87
+ magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
88
+ magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
89
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
90
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
91
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
92
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
93
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
94
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
95
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
96
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
97
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
98
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
99
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
100
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
101
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
102
+ magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
+ magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=A0eABWvJLyRH6kENWU31g66D2QQos12S0hEmbOuoB0g,347
104
+ magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
+ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
+ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1PgpFrE0RcmCRl19oXbudxwgXc,3528
107
+ magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
+ magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
+ magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
+ magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=UP7fADPGoxAMj2SUKmeW-fe_AcAQxlT9Mfy4WF6vHmU,9796
111
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=a6xkQHqLMUL4NCaORp8oo4Tfa8GB8PN9MVvG7Pj6jIE,7316
112
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
113
+ magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
115
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
116
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2uLSaZSo,7349
117
+ magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
118
+ magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
119
+ magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
+ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx6DCnhqYzP-4b1zSWptrefimxFTmy8Q,583
121
+ magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
+ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
123
+ magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
+ magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=AdH3UGu4BEoII0uFjPKUf61W7HmG4fDlWgR1xxMeFlE,2775
125
+ magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
126
+ magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
127
+ magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
128
+ magic_pdf/para/commons.py,sha256=VdJ8SY9qJTtcRyx8HH-PFeZSJwL4Tsf50197RD_-dwc,5414
129
+ magic_pdf/para/denoise.py,sha256=J7dM2KNnbdzAd2A3agB04U6L1GL9RrhAs-MLrq-_Ftg,10443
130
+ magic_pdf/para/draw.py,sha256=KyWc03do_WuBKQ028HYzepYwbIkel9ID0uqRhuPVOHc,5643
131
+ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,4978
132
+ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
133
+ magic_pdf/para/para_split.py,sha256=z7nYeg86BjZOAdJNMwYKSu51W9evurtl3cy1ZUcQLlw,33222
134
+ magic_pdf/para/para_split_v2.py,sha256=vJJqqMMKbv8D702nODThL-5hjkgZ7Vl2BTmEIdwmmDw,39051
135
+ magic_pdf/para/para_split_v3.py,sha256=atfELVRx-90paAS3nZptgP0qG8UpTTaj3LG_2x3NAlQ,15977
136
+ magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
137
+ magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
138
+ magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
139
+ magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
140
+ magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
141
+ magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
142
+ magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
143
+ magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
144
+ magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
145
+ magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
146
+ magic_pdf/post_proc/pdf_post_filter.py,sha256=3EJDovQPckPKJaBY1wvAty-LGKyRG63WICY_bA_Kfbs,2501
147
+ magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
148
+ magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
+ magic_pdf/pre_proc/citationmarker_remove.py,sha256=IitOERaK9fGaktsYMyiaaL_71uMIrlG5ZdmpZaR6dsA,6640
150
+ magic_pdf/pre_proc/construct_page_dict.py,sha256=lp3zBmInlWYYIcGC1-NSqT9s44AjDvlnWxDPeZoBVSY,3043
151
+ magic_pdf/pre_proc/cut_image.py,sha256=TghshkDTgdUbyLSbKZoFI9-n-xaFub02IYPyu0IAnRY,2761
152
+ magic_pdf/pre_proc/detect_equation.py,sha256=9omDHKTI8QO9Qd46eVFHWhZeMmTNx7XDuWRgjXI-KFA,6627
153
+ magic_pdf/pre_proc/detect_footer_by_model.py,sha256=_EghAM_zWBcqVY8XBkbSoprKqKUa0mlN1U8YNWxNNLI,2848
154
+ magic_pdf/pre_proc/detect_footer_header_by_statistics.py,sha256=924soXZ51QVpitPgVgnwbC7BqOZI30j5hGW5zP86y-w,11250
155
+ magic_pdf/pre_proc/detect_footnote.py,sha256=UxFuTCRwXdAv3wKCgRQJJVt12hM9O9oPTwzPAChQXoM,8309
156
+ magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1itbY7g,2848
157
+ magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
158
+ magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
159
+ magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
160
+ magic_pdf/pre_proc/equations_replace.py,sha256=7mexRPwD9C_UJ-SbvO_-XnpcnN7YtGUUznmPjHbjhnw,20526
161
+ magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
162
+ magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
163
+ magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
164
+ magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=DMc2H2xGqVePBReZu5AQbPdvDw3sxOssmujCLlNW3Vs,14143
165
+ magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
166
+ magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Au8y1NBhbWpq_VuPLg3b9dAMUhyPS71xtTghtd21K5M,14273
167
+ magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=9DxEyy1pH87g4T_JEgI3cTVCL2TVrEBl38wsmqhQM4k,12758
168
+ magic_pdf/pre_proc/pdf_pre_filter.py,sha256=qvNlNyj4Mc3qa73mgfkp0PMR-ucABbx3mMcyVipaEpQ,2776
169
+ magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
170
+ magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
171
+ magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=WVKhgeWifRdO-u2ETYffkcMOFVYIbiaZu5pMr1RpEdA,4090
172
+ magic_pdf/pre_proc/remove_footer_header.py,sha256=Igdr4jH7BUGuTcapWPiKEGKxhWH12c3VVmX5xwUVn7w,5680
173
+ magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=di7geS7AFhSaAvkWZHT6J3dlXEq8uu9Z4oBYtolQjl0,8803
174
+ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=ABl0vo8kkcCPSTI8dpXQTOH1b9R-lbzsJDDFONU6ELk,7313
175
+ magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
176
+ magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
177
+ magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
178
+ magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
179
+ magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
180
+ magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
181
+ magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
182
+ magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
183
+ magic_pdf/rw/S3ReaderWriter.py,sha256=_DmL45Ubio-_VsKD84KrqOQ-VNDUTzcXSrXfNMb5vww,5310
184
+ magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
185
+ magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
186
+ magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
187
+ magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
188
+ magic_pdf/tools/cli.py,sha256=83a8p4_DvVdDOTuviE6WqexSXsDE_MUY-af3QDxXeoU,3067
189
+ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
190
+ magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
191
+ magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
192
+ magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
193
+ magic_pdf-0.10.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
194
+ magic_pdf-0.10.0.dist-info/METADATA,sha256=U_TtQjdODFjAADoZro_ipfGiasBCVq2_zZlF2DFyNpM,40300
195
+ magic_pdf-0.10.0.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
196
+ magic_pdf-0.10.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
197
+ magic_pdf-0.10.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
198
+ magic_pdf-0.10.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.44.0)
2
+ Generator: bdist_wheel (0.45.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5