magic-pdf 0.7.0b1__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -132,6 +132,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
132
132
  # if processed by table model
133
133
  if span.get('latex', ''):
134
134
  para_text += f"\n\n$\n {span['latex']}\n$\n\n"
135
+ elif span.get('html', ''):
136
+ para_text += f"\n\n{span['html']}\n\n"
135
137
  else:
136
138
  para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])}) \n"
137
139
  for block in para_block['blocks']: # 3rd.拼table_footnote
@@ -256,6 +258,8 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
256
258
  if block['type'] == BlockType.TableBody:
257
259
  if block["lines"][0]["spans"][0].get('latex', ''):
258
260
  para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
261
+ elif block["lines"][0]["spans"][0].get('html', ''):
262
+ para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
259
263
  para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
260
264
  if block['type'] == BlockType.TableCaption:
261
265
  para_content['table_caption'] = merge_para_with_text(block)
@@ -10,5 +10,31 @@ block维度自定义字段
10
10
  # block中lines是否被删除
11
11
  LINES_DELETED = "lines_deleted"
12
12
 
13
+ # struct eqtable
14
+ STRUCT_EQTABLE = "struct_eqtable"
15
+
13
16
  # table recognition max time default value
14
- TABLE_MAX_TIME_VALUE = 400
17
+ TABLE_MAX_TIME_VALUE = 400
18
+
19
+ # pp_table_result_max_length
20
+ TABLE_MAX_LEN = 480
21
+
22
+ # pp table structure algorithm
23
+ TABLE_MASTER = "TableMaster"
24
+
25
+ # table master structure dict
26
+ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
27
+
28
+ # table master dir
29
+ TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
30
+
31
+ # pp detect model dir
32
+ DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
33
+
34
+ # pp rec model dir
35
+ REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
36
+
37
+ # pp rec char dict path
38
+ REC_CHAR_DICT = "ppocr_keys_v1.txt"
39
+
40
+
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.7.0b1"
1
+ __version__ = "0.7.1"
@@ -562,8 +562,11 @@ class MagicModel:
562
562
  elif category_id == 5:
563
563
  # 获取table模型结果
564
564
  latex = layout_det.get("latex", None)
565
+ html = layout_det.get("html", None)
565
566
  if latex:
566
567
  span["latex"] = latex
568
+ elif html:
569
+ span["html"] = html
567
570
  span["type"] = ContentType.Table
568
571
  elif category_id == 13:
569
572
  span["content"] = layout_det["latex"]
@@ -2,7 +2,7 @@ from loguru import logger
2
2
  import os
3
3
  import time
4
4
 
5
- from magic_pdf.libs.Constants import TABLE_MAX_TIME_VALUE
5
+ from magic_pdf.libs.Constants import *
6
6
 
7
7
  os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新
8
8
  try:
@@ -34,10 +34,18 @@ from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Pre
34
34
  from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
35
35
  from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
36
36
  from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
37
-
38
-
39
- def table_model_init(model_path, max_time, _device_='cpu'):
40
- table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
37
+ from magic_pdf.model.ppTableModel import ppTableModel
38
+
39
+
40
+ def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
41
+ if table_model_type == STRUCT_EQTABLE:
42
+ table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
43
+ else:
44
+ config = {
45
+ "model_dir": model_path,
46
+ "device": _device_
47
+ }
48
+ table_model = ppTableModel(config)
41
49
  return table_model
42
50
 
43
51
 
@@ -104,9 +112,11 @@ class CustomPEKModel:
104
112
  # 初始化解析配置
105
113
  self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
106
114
  self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
115
+ # table config
107
116
  self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
108
117
  self.apply_table = self.table_config.get("is_table_recog_enable", False)
109
118
  self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
119
+ self.table_model_type = self.table_config.get("model", TABLE_MASTER)
110
120
  self.apply_ocr = ocr
111
121
  logger.info(
112
122
  "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
@@ -141,10 +151,11 @@ class CustomPEKModel:
141
151
  if self.apply_ocr:
142
152
  self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
143
153
 
144
- # init structeqtable
154
+ # init table model
145
155
  if self.apply_table:
146
- self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
147
- max_time = self.table_max_time, _device_=self.device)
156
+ table_model_dir = self.configs["weights"][self.table_model_type]
157
+ self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
158
+ max_time=self.table_max_time, _device_=self.device)
148
159
  logger.info('DocAnalysis init done!')
149
160
 
150
161
  def __call__(self, image):
@@ -278,16 +289,28 @@ class CustomPEKModel:
278
289
  new_image, _ = crop_img(res, pil_img)
279
290
  single_table_start_time = time.time()
280
291
  logger.info("------------------table recognition processing begins-----------------")
292
+ latex_code = None
293
+ html_code = None
281
294
  with torch.no_grad():
282
- latex_code = self.table_model.image2latex(new_image)[0]
295
+ if self.table_model_type == STRUCT_EQTABLE:
296
+ latex_code = self.table_model.image2latex(new_image)[0]
297
+ else:
298
+ html_code = self.table_model.img2html(new_image)
283
299
  run_time = time.time() - single_table_start_time
284
300
  logger.info(f"------------table recognition processing ends within {run_time}s-----")
285
301
  if run_time > self.table_max_time:
286
302
  logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
287
303
  # 判断是否返回正常
288
- expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
289
- if latex_code and expected_ending:
290
- res["latex"] = latex_code
304
+
305
+ if latex_code:
306
+ expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
307
+ 'end{table}')
308
+ if expected_ending:
309
+ res["latex"] = latex_code
310
+ else:
311
+ logger.warning(f"------------table recognition processing fails----------")
312
+ elif html_code:
313
+ res["html"] = html_code
291
314
  else:
292
315
  logger.warning(f"------------table recognition processing fails----------")
293
316
  table_cost = round(time.time() - table_start, 2)
@@ -12,7 +12,6 @@ class StructTableModel:
12
12
  self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
13
13
 
14
14
  def image2latex(self, image) -> str:
15
- #
16
15
  table_latex = self.model.forward(image)
17
16
  return table_latex
18
17
 
@@ -0,0 +1,67 @@
1
+ from paddleocr.ppstructure.table.predict_table import TableSystem
2
+ from paddleocr.ppstructure.utility import init_args
3
+ from magic_pdf.libs.Constants import *
4
+ import os
5
+ from PIL import Image
6
+ import numpy as np
7
+
8
+
9
+ class ppTableModel(object):
10
+ """
11
+ This class is responsible for converting image of table into HTML format using a pre-trained model.
12
+
13
+ Attributes:
14
+ - table_sys: An instance of TableSystem initialized with parsed arguments.
15
+
16
+ Methods:
17
+ - __init__(config): Initializes the model with configuration parameters.
18
+ - img2html(image): Converts a PIL Image or NumPy array to HTML string.
19
+ - parse_args(**kwargs): Parses configuration arguments.
20
+ """
21
+
22
+ def __init__(self, config):
23
+ """
24
+ Parameters:
25
+ - config (dict): Configuration dictionary containing model_dir and device.
26
+ """
27
+ args = self.parse_args(**config)
28
+ self.table_sys = TableSystem(args)
29
+
30
+ def img2html(self, image):
31
+ """
32
+ Parameters:
33
+ - image (PIL.Image or np.ndarray): The image of the table to be converted.
34
+
35
+ Return:
36
+ - HTML (str): A string representing the HTML structure with content of the table.
37
+ """
38
+ if isinstance(image, Image.Image):
39
+ image = np.array(image)
40
+ pred_res, _ = self.table_sys(image)
41
+ pred_html = pred_res["html"]
42
+ res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
43
+ "") + "</table></td>\n"
44
+ return res
45
+
46
+ def parse_args(self, **kwargs):
47
+ parser = init_args()
48
+ model_dir = kwargs.get("model_dir")
49
+ table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR)
50
+ table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT)
51
+ det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR)
52
+ rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
53
+ rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
54
+ device = kwargs.get("device", "cpu")
55
+ use_gpu = True if device == "cuda" else False
56
+ config = {
57
+ "use_gpu": use_gpu,
58
+ "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
59
+ "table_algorithm": TABLE_MASTER,
60
+ "table_model_dir": table_model_dir,
61
+ "table_char_dict_path": table_char_dict_path,
62
+ "det_model_dir": det_model_dir,
63
+ "rec_model_dir": rec_model_dir,
64
+ "rec_char_dict_path": rec_char_dict_path,
65
+ }
66
+ parser.set_defaults(**config)
67
+ return parser.parse_args([])
@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
100
100
 
101
101
  if lang != 'en':
102
102
  return lines, None
103
- else:
104
- total_lines = len(lines)
105
- line_fea_encode = []
106
- """
107
- 对每一行进行特征编码,编码规则如下:
108
- 1. 如果行顶格,且大写字母开头或者数字开头,编码为1
109
- 2. 如果顶格,其他非大写开头编码为4
110
- 3. 如果非顶格,首字符大写,编码为2
111
- 4. 如果非顶格,首字符非大写编码为3
112
- """
113
- if len(lines) > 0:
114
- x_map_tag_dict, min_x_tag = cluster_line_x(lines)
115
- for l in lines:
116
- span_text = __get_span_text(l['spans'][0])
117
- first_char = span_text[0]
118
- layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
119
- if not layout:
120
- line_fea_encode.append(0)
103
+
104
+ total_lines = len(lines)
105
+ line_fea_encode = []
106
+ """
107
+ 对每一行进行特征编码,编码规则如下:
108
+ 1. 如果行顶格,且大写字母开头或者数字开头,编码为1
109
+ 2. 如果顶格,其他非大写开头编码为4
110
+ 3. 如果非顶格,首字符大写,编码为2
111
+ 4. 如果非顶格,首字符非大写编码为3
112
+ """
113
+ if len(lines) > 0:
114
+ x_map_tag_dict, min_x_tag = cluster_line_x(lines)
115
+ for l in lines:
116
+ span_text = __get_span_text(l['spans'][0])
117
+ if not span_text:
118
+ line_fea_encode.append(0)
119
+ continue
120
+ first_char = span_text[0]
121
+ layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
122
+ if not layout:
123
+ line_fea_encode.append(0)
124
+ else:
125
+ #
126
+ if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
127
+ # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
128
+ if not first_char.isalnum() or if_match_reference_list(span_text):
129
+ line_fea_encode.append(1)
130
+ else:
131
+ line_fea_encode.append(4)
121
132
  else:
122
- #
123
- if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
124
- # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
125
- if not first_char.isalnum() or if_match_reference_list(span_text):
126
- line_fea_encode.append(1)
127
- else:
128
- line_fea_encode.append(4)
133
+ if first_char.isupper():
134
+ line_fea_encode.append(2)
129
135
  else:
130
- if first_char.isupper():
131
- line_fea_encode.append(2)
132
- else:
133
- line_fea_encode.append(3)
136
+ line_fea_encode.append(3)
134
137
 
135
- # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
138
+ # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。
136
139
 
137
- list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
138
- if len(list_indice) > 0:
140
+ list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
141
+ if len(list_indice) > 0:
142
+ if debug_able:
143
+ logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}")
144
+
145
+ # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
146
+ segments = []
147
+ for start, end in list_indice:
148
+ for i in range(start, end + 1):
149
+ if i > 0:
150
+ if line_fea_encode[i] == 4:
151
+ if debug_able:
152
+ logger.info(f"列表行的第{i}行不是顶格的")
153
+ break
154
+ else:
139
155
  if debug_able:
140
- logger.info(f"发现了列表,列表行数:{list_indice}{list_start_idx}")
141
-
142
- # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
143
- segments = []
144
- for start, end in list_indice:
145
- for i in range(start, end + 1):
146
- if i > 0:
147
- if line_fea_encode[i] == 4:
148
- if debug_able:
149
- logger.info(f"列表行的第{i}行不是顶格的")
150
- break
151
- else:
152
- if debug_able:
153
- logger.info(f"列表行的第{start}到第{end}行是列表")
156
+ logger.info(f"列表行的第{start}到第{end}行是列表")
154
157
 
155
- return split_indices(total_lines, list_indice), list_start_idx
158
+ return split_indices(total_lines, list_indice), list_start_idx
156
159
 
157
160
 
158
161
  def cluster_line_x(lines: list) -> dict:
@@ -3,6 +3,7 @@ config:
3
3
  layout: True
4
4
  formula: True
5
5
  table_config:
6
+ model: TableMaster
6
7
  is_table_recog_enable: False
7
8
  max_time: 400
8
9
 
@@ -10,4 +11,5 @@ weights:
10
11
  layout: Layout/model_final.pth
11
12
  mfd: MFD/weights.pt
12
13
  mfr: MFR/UniMERNet
13
- table: TabRec/StructEqTable
14
+ struct_eqtable: TabRec/StructEqTable
15
+ TableMaster: TabRec/TableMaster
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.7.0b1
3
+ Version: 0.7.1
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -64,6 +64,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
64
64
  </div>
65
65
 
66
66
  # Changelog
67
+ - 2024/08/30: Version 0.7.1 released, added Paddle TableMaster table recognition option
67
68
  - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
68
69
  - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
69
70
  - 2024/07/05: Initial open-source release
@@ -205,7 +206,7 @@ In non-mainline environments, due to the diversity of hardware and software conf
205
206
  ```bash
206
207
  conda create -n MinerU python=3.10
207
208
  conda activate MinerU
208
- pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com
209
+ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
209
210
  ```
210
211
  #### 2. Download model weight files
211
212
 
@@ -234,6 +235,7 @@ Find the `magic-pdf.json` file in your user directory and configure the "models-
234
235
  // other config
235
236
  "models-dir": "D:/models",
236
237
  "table-config": {
238
+ "model": "TableMaster", // Table recognition model to use; the other supported value is 'struct_eqtable'
237
239
  "is_table_recog_enable": false, // Table recognition is disabled by default, modify this value to enable it
238
240
  "max_time": 400
239
241
  }
@@ -345,13 +347,7 @@ TODO
345
347
  - Comic books, art books, elementary school textbooks, and exercise books are not well-parsed yet
346
348
  - Enabling OCR may produce better results in PDFs with a high density of formulas
347
349
  - If you are processing PDFs with a large number of formulas, it is strongly recommended to enable the OCR function. When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions.
348
- - **Table Recognition** is currently in the testing phase; recognition speed is slow, and accuracy needs improvement. Below are some performance test results in an Ubuntu 22.04 LTS + Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz + NVIDIA GeForce RTX 4090 environment for reference.
349
350
 
350
- | Table Size | Parsing Time |
351
- |---------------|----------------------------|
352
- | 6\*5 55kb | 37s |
353
- | 16\*12 284kb | 3m18s |
354
- | 44\*7 559kb | 4m12s |
355
351
 
356
352
  # FAQ
357
353
  [FAQ in Chinese](docs/FAQ_zh_cn.md)
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_union_core.py,sha256=jNly6l9pGcCf7wr6s6PgQhITJZ1m9PaI32Q26zx
5
5
  magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
6
6
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
8
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=mq6tACGkL383bdUla7xOkRXRTBBydRtdbgIHwkk_daM,16169
8
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=jg_v2Bj62xBObg0LDayvqUVX_O9DrIBli5Z9_i7Qduw,16479
9
9
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
11
11
  magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -15,7 +15,7 @@ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl
15
15
  magic_pdf/layout/layout_sort.py,sha256=ovqRX1xcRA7E7s8VvsI7ZNbaNSElJe07bApCh5hxwIE,33533
16
16
  magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
17
17
  magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
18
- magic_pdf/libs/Constants.py,sha256=aKdTHeK75qkVvxvE_2EA5LYis6Z6HLmiuk9o8ESOnNg,260
18
+ magic_pdf/libs/Constants.py,sha256=rdJVadmgN0UlIB-xcMQ9j7Qk9q1Qahxt3KEY-vL7hSU,774
19
19
  magic_pdf/libs/MakeContentConfig.py,sha256=UDZPpsv8q4DqTy8h0vRtrT2kHqWiVI205VnVhlUEQc0,206
20
20
  magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
21
21
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -41,13 +41,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
41
41
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
42
42
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
43
43
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
44
- magic_pdf/libs/version.py,sha256=95eHzU5LYX2l3ASu7OvUb95xo-2kfuwh1uUYnY54K90,24
44
+ magic_pdf/libs/version.py,sha256=2KJZDSMOG7KS82AxYOrZ4ZihYxX0wjfUjDsIZh3L024,22
45
45
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
46
46
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
47
47
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=SoT21VHy6ICXoRfC9V3XS6BMiX8EZI6zaqSNgoE17oo,4347
48
- magic_pdf/model/magic_model.py,sha256=xwKV9BrdjOJecJSzbErT54N6qeJu0tvFuJg2S1z_2kU,25413
48
+ magic_pdf/model/magic_model.py,sha256=3eAfmglKFkmIVPoz3TG8xAzkNK2g_VLI5rRMQAb_cK4,25544
49
49
  magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
50
- magic_pdf/model/pdf_extract_kit.py,sha256=21vBy8p6pI5a0b6V45ul52yE8zD1R0xrjv4Tx8r9gaw,13620
50
+ magic_pdf/model/pdf_extract_kit.py,sha256=WO54IoxX8XYXLGrjPts--84qRO1FQZm9f_yVyfpPi0s,14539
51
+ magic_pdf/model/ppTableModel.py,sha256=wWiui9VOjkKYlNX-viPqsWpzgkNJ-9_S2Se-j4oyLqU,2687
51
52
  magic_pdf/model/pp_structure_v2.py,sha256=1sn8IJK0d5ZmqJ2XFt9FdaSdI0RQf-iwNAWBrVrIeuc,2872
52
53
  magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
54
  magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
@@ -72,7 +73,7 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
72
73
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
73
74
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
74
75
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
75
- magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=BdrBZ_2B2jgF0vzn_ted8bE9Te-DC1Ea2UijqULNKjg,928
76
+ magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=C9WluPhwaqsFg154WsNxN2HlhFXVkAAw0prR7t8r5J4,918
76
77
  magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
78
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
79
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
@@ -84,7 +85,7 @@ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,
84
85
  magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
85
86
  magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
86
87
  magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
87
- magic_pdf/para/para_split_v2.py,sha256=jGOhsubdh_CEgSv9WMNmp1loq1YNlpcAj3yh3g0gPhw,37027
88
+ magic_pdf/para/para_split_v2.py,sha256=jJnn8numhxVgojGwKGCqBNIIYn2AYsucO-q-eQgsPb4,36911
88
89
  magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
89
90
  magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
90
91
  magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -127,7 +128,7 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
127
128
  magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
128
129
  magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
129
130
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
130
- magic_pdf/resources/model_config/model_configs.yaml,sha256=2MxCOJ5yNUupQqvrAvEuJKlygjxxV_o1qE64K_4NWKA,235
131
+ magic_pdf/resources/model_config/model_configs.yaml,sha256=_gOSxK9jxe1bFwtH_uwovsyZnRi1sEVNYb1OAexDmF4,301
131
132
  magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
132
133
  magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
133
134
  magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
@@ -140,9 +141,9 @@ magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
140
141
  magic_pdf/tools/cli.py,sha256=aVmurGAEyWT-MOv0MOaCRrfef1-jkRTpeVVWUsEVyeY,2157
141
142
  magic_pdf/tools/cli_dev.py,sha256=w-J4OixDzHjknnUuRW44PXsUlUqyiD4nPbBSSk9WkXM,4160
142
143
  magic_pdf/tools/common.py,sha256=XoSs19DD-4ubbjrDFQer83T9O6O_MmgEO61NbjlP_2M,3939
143
- magic_pdf-0.7.0b1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
144
- magic_pdf-0.7.0b1.dist-info/METADATA,sha256=47QGAd2iGc0i1osA_jbBS1QT_Jrfmofoyetsrh9KRy8,18571
145
- magic_pdf-0.7.0b1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
146
- magic_pdf-0.7.0b1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
147
- magic_pdf-0.7.0b1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
148
- magic_pdf-0.7.0b1.dist-info/RECORD,,
144
+ magic_pdf-0.7.1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
145
+ magic_pdf-0.7.1.dist-info/METADATA,sha256=SD5oVg3vUEuFg7IyAbwncQ_mtgXljhKiJCOwRCTSOVo,18232
146
+ magic_pdf-0.7.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
147
+ magic_pdf-0.7.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
148
+ magic_pdf-0.7.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
149
+ magic_pdf-0.7.1.dist-info/RECORD,,