magic-pdf 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -138,12 +138,9 @@ def full_to_half(text: str) -> str:
138
138
  result = []
139
139
  for char in text:
140
140
  code = ord(char)
141
- # Full-width ASCII variants (FF01-FF5E)
142
- if 0xFF01 <= code <= 0xFF5E:
141
+ # Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
142
+ if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
143
143
  result.append(chr(code - 0xFEE0)) # Shift to ASCII range
144
- # Full-width space
145
- elif code == 0x3000:
146
- result.append(' ')
147
144
  else:
148
145
  result.append(char)
149
146
  return ''.join(result)
@@ -0,0 +1,54 @@
1
+ import time
2
+ import functools
3
+ from collections import defaultdict
4
+ from typing import Dict, List
5
+
6
+
7
class PerformanceStats:
    """Shared registry of per-function execution times.

    Timings live in a class-level mapping, so every caller reads and writes
    the same data. All methods are classmethods; no instance is needed.
    """

    # Maps a function name to the list of durations recorded for it.
    # It is a defaultdict, so merely looking up an unknown name inserts an
    # empty list -- get_stats() has to tolerate such empty entries.
    _stats: Dict[str, List[float]] = defaultdict(list)

    @classmethod
    def add_execution_time(cls, func_name: str, execution_time: float):
        """Record one execution time for ``func_name``."""
        cls._stats[func_name].append(execution_time)

    @classmethod
    def get_stats(cls) -> Dict[str, dict]:
        """Return summary statistics for every function that has samples.

        Returns:
            A dict keyed by function name. Each value is a dict with the keys
            ``count``, ``total_time``, ``avg_time``, ``min_time`` and
            ``max_time``. Names without any recorded sample are omitted.
        """
        results = {}
        for func_name, times in cls._stats.items():
            if not times:
                # An empty entry would otherwise raise ZeroDivisionError on
                # the average (and ValueError on min/max).
                continue
            total_time = sum(times)
            results[func_name] = {
                'count': len(times),
                'total_time': total_time,
                'avg_time': total_time / len(times),
                'min_time': min(times),
                'max_time': max(times),
            }
        return results

    @classmethod
    def print_stats(cls):
        """Print a table of call count, total time and average time per function."""
        stats = cls.get_stats()
        print("\n性能统计结果:")
        print("-" * 80)
        print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}")
        print("-" * 80)
        for func_name, data in stats.items():
            print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}")
41
+
42
+
43
def measure_time(func):
    """Decorator that records how long each call to ``func`` takes.

    The elapsed time of every call that returns normally is passed to
    ``PerformanceStats.add_execution_time`` under ``func.__name__``. A call
    that raises is not recorded. The wrapped function's return value is
    passed through unchanged.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution; time.time() is the
        # wall clock and can jump when the system clock is adjusted, which
        # would yield wrong (even negative) durations.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        PerformanceStats.add_execution_time(func.__name__, execution_time)
        return result

    return wrapper
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.0"
1
+ __version__ = "1.2.2"
@@ -170,11 +170,7 @@ def doc_analyze(
170
170
  gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
171
171
  if gpu_memory is not None and gpu_memory >= 8:
172
172
 
173
- if gpu_memory >= 40:
174
- batch_ratio = 32
175
- elif gpu_memory >=20:
176
- batch_ratio = 16
177
- elif gpu_memory >= 16:
173
+ if gpu_memory >= 16:
178
174
  batch_ratio = 8
179
175
  elif gpu_memory >= 10:
180
176
  batch_ratio = 4
@@ -528,14 +528,13 @@ class MagicModel:
528
528
  pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox'])
529
529
  nearest_dis = float('inf')
530
530
  for i in range(N):
531
- if i in seen_idx:continue
531
+ if i in seen_idx or i == sub_idx:continue
532
532
  nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox']))
533
533
 
534
534
  if pair_dis >= 3*nearest_dis:
535
535
  seen_idx.add(sub_idx)
536
536
  continue
537
537
 
538
-
539
538
  seen_idx.add(sub_idx)
540
539
  seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
541
540
  seen_sub_idx.add(sub_idx)
@@ -100,20 +100,61 @@ class UnimernetModel(object):
100
100
  res["latex"] = latex_rm_whitespace(latex)
101
101
  return formula_list
102
102
 
103
- def batch_predict(
104
- self, images_mfd_res: list, images: list, batch_size: int = 64
105
- ) -> list:
103
+ # def batch_predict(
104
+ # self, images_mfd_res: list, images: list, batch_size: int = 64
105
+ # ) -> list:
106
+ # images_formula_list = []
107
+ # mf_image_list = []
108
+ # backfill_list = []
109
+ # for image_index in range(len(images_mfd_res)):
110
+ # mfd_res = images_mfd_res[image_index]
111
+ # pil_img = Image.fromarray(images[image_index])
112
+ # formula_list = []
113
+ #
114
+ # for xyxy, conf, cla in zip(
115
+ # mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
116
+ # ):
117
+ # xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
118
+ # new_item = {
119
+ # "category_id": 13 + int(cla.item()),
120
+ # "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
121
+ # "score": round(float(conf.item()), 2),
122
+ # "latex": "",
123
+ # }
124
+ # formula_list.append(new_item)
125
+ # bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
126
+ # mf_image_list.append(bbox_img)
127
+ #
128
+ # images_formula_list.append(formula_list)
129
+ # backfill_list += formula_list
130
+ #
131
+ # dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
132
+ # dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
133
+ # mfr_res = []
134
+ # for mf_img in dataloader:
135
+ # mf_img = mf_img.to(self.device)
136
+ # with torch.no_grad():
137
+ # output = self.model.generate({"image": mf_img})
138
+ # mfr_res.extend(output["pred_str"])
139
+ # for res, latex in zip(backfill_list, mfr_res):
140
+ # res["latex"] = latex_rm_whitespace(latex)
141
+ # return images_formula_list
142
+
143
+ def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list:
106
144
  images_formula_list = []
107
145
  mf_image_list = []
108
146
  backfill_list = []
147
+ image_info = [] # Store (area, original_index, image) tuples
148
+
149
+ # Collect images with their original indices
109
150
  for image_index in range(len(images_mfd_res)):
110
151
  mfd_res = images_mfd_res[image_index]
111
152
  pil_img = Image.fromarray(images[image_index])
112
153
  formula_list = []
113
154
 
114
- for xyxy, conf, cla in zip(
115
- mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
116
- ):
155
+ for idx, (xyxy, conf, cla) in enumerate(zip(
156
+ mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
157
+ )):
117
158
  xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
118
159
  new_item = {
119
160
  "category_id": 13 + int(cla.item()),
@@ -123,19 +164,43 @@ class UnimernetModel(object):
123
164
  }
124
165
  formula_list.append(new_item)
125
166
  bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
167
+ area = (xmax - xmin) * (ymax - ymin)
168
+
169
+ curr_idx = len(mf_image_list)
170
+ image_info.append((area, curr_idx, bbox_img))
126
171
  mf_image_list.append(bbox_img)
127
172
 
128
173
  images_formula_list.append(formula_list)
129
174
  backfill_list += formula_list
130
175
 
131
- dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
176
+ # Stable sort by area
177
+ image_info.sort(key=lambda x: x[0]) # sort by area
178
+ sorted_indices = [x[1] for x in image_info]
179
+ sorted_images = [x[2] for x in image_info]
180
+
181
+ # Create mapping for results
182
+ index_mapping = {new_idx: old_idx for new_idx, old_idx in enumerate(sorted_indices)}
183
+
184
+ # Create dataset with sorted images
185
+ dataset = MathDataset(sorted_images, transform=self.mfr_transform)
132
186
  dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
187
+
188
+ # Process batches and store results
133
189
  mfr_res = []
134
190
  for mf_img in dataloader:
135
191
  mf_img = mf_img.to(self.device)
136
192
  with torch.no_grad():
137
193
  output = self.model.generate({"image": mf_img})
138
194
  mfr_res.extend(output["pred_str"])
139
- for res, latex in zip(backfill_list, mfr_res):
140
- res["latex"] = latex_rm_whitespace(latex)
195
+
196
+ # Restore original order
197
+ unsorted_results = [""] * len(mfr_res)
198
+ for new_idx, latex in enumerate(mfr_res):
199
+ original_idx = index_mapping[new_idx]
200
+ unsorted_results[original_idx] = latex_rm_whitespace(latex)
201
+
202
+ # Fill results back
203
+ for res, latex in zip(backfill_list, unsorted_results):
204
+ res["latex"] = latex
205
+
141
206
  return images_formula_list
@@ -21,9 +21,12 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
21
21
  from magic_pdf.libs.convert_utils import dict_to_list
22
22
  from magic_pdf.libs.hash_utils import compute_md5
23
23
  from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
24
+ from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
24
25
  from magic_pdf.model.magic_model import MagicModel
25
26
  from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
26
27
 
28
+ from concurrent.futures import ThreadPoolExecutor
29
+
27
30
  try:
28
31
  import torchtext
29
32
 
@@ -215,7 +218,7 @@ def calculate_contrast(img, img_mode) -> float:
215
218
  # logger.info(f"contrast: {contrast}")
216
219
  return round(contrast, 2)
217
220
 
218
-
221
+ # @measure_time
219
222
  def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
220
223
  # cid用0xfffd表示,连字符拆开
221
224
  # text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
@@ -489,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
489
492
  else:
490
493
  return [[x0, y0, x1, y1]]
491
494
 
492
-
495
+ # @measure_time
493
496
  def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
494
497
  page_line_list = []
495
498
 
@@ -923,7 +926,6 @@ def pdf_parse_union(
923
926
  magic_model = MagicModel(model_list, dataset)
924
927
 
925
928
  """根据输入的起始范围解析pdf"""
926
- # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
927
929
  end_page_id = (
928
930
  end_page_id
929
931
  if end_page_id is not None and end_page_id >= 0
@@ -960,6 +962,8 @@ def pdf_parse_union(
960
962
  )
961
963
  pdf_info_dict[f'page_{page_id}'] = page_info
962
964
 
965
+ # PerformanceStats.print_stats()
966
+
963
967
  """分段"""
964
968
  para_split(pdf_info_dict)
965
969
 
@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
108
108
  ):
109
109
  multiple_para_flag = True
110
110
 
111
- for line in block['lines']:
112
- line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
113
- block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
114
- if (
115
- line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
116
- and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
117
- ):
118
- external_sides_not_close_num += 1
119
- if abs(line_mid_x - block_mid_x) < line_height / 2:
120
- center_close_num += 1
111
+ block_text = ''
121
112
 
113
+ for line in block['lines']:
122
114
  line_text = ''
123
115
 
124
116
  for span in line['spans']:
125
117
  span_type = span['type']
126
118
  if span_type == ContentType.Text:
127
119
  line_text += span['content'].strip()
128
-
129
120
  # 添加所有文本,包括空行,保持与block['lines']长度一致
130
121
  lines_text_list.append(line_text)
131
122
  block_text = ''.join(lines_text_list)
132
- block_lang = detect_lang(block_text)
133
- # logger.info(f"block_lang: {block_lang}")
123
+
124
+ block_lang = detect_lang(block_text)
125
+ # logger.info(f"block_lang: {block_lang}")
126
+
127
+ for line in block['lines']:
128
+ line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
129
+ block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
130
+ if (
131
+ line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
132
+ and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
133
+ ):
134
+ external_sides_not_close_num += 1
135
+ if abs(line_mid_x - block_mid_x) < line_height / 2:
136
+ center_close_num += 1
134
137
 
135
138
  # 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
136
139
  if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
@@ -64,7 +64,7 @@ def span_block_type_compatible(span_type, block_type):
64
64
  if span_type in [ContentType.Text, ContentType.InlineEquation]:
65
65
  return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
66
66
  elif span_type == ContentType.InterlineEquation:
67
- return block_type in [BlockType.InterlineEquation]
67
+ return block_type in [BlockType.InterlineEquation, BlockType.Text]
68
68
  elif span_type == ContentType.Image:
69
69
  return block_type in [BlockType.ImageBody]
70
70
  elif span_type == ContentType.Table:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 1.2.0
3
+ Version: 1.2.2
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -9,7 +9,7 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3>=1.28.43
10
10
  Requires-Dist: Brotli>=1.1.0
11
11
  Requires-Dist: click>=8.1.7
12
- Requires-Dist: fast-langdetect>=0.2.3
12
+ Requires-Dist: fast-langdetect<0.3.0,>=0.2.3
13
13
  Requires-Dist: loguru>=0.6.0
14
14
  Requires-Dist: numpy<2.0.0,>=1.21.6
15
15
  Requires-Dist: pydantic>=2.7.2
@@ -94,6 +94,10 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
94
94
  </div>
95
95
 
96
96
  # Changelog
97
+ - 2025/03/03 1.2.1 released, fixed several bugs:
98
+ - Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers
99
+ - Fixed caption matching inaccuracies in certain scenarios
100
+ - Fixed formula span loss issues in certain scenarios
97
101
  - 2025/02/24 1.2.0 released. This version includes several fixes and improvements to enhance parsing efficiency and accuracy:
98
102
  - Performance Optimization
99
103
  - Increased classification speed for PDF documents in auto mode.
@@ -1,5 +1,5 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- magic_pdf/pdf_parse_union_core_v2.py,sha256=jIrXgU_gKL4toJ6GsCoDxByszaN8mAr5vrEy_c63ewk,38310
2
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=Pt3UtPQgOrF2YudQqrwVVC767_271E-LRg2aUsiggXg,38435
3
3
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
5
5
  magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
@@ -24,7 +24,7 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
24
24
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
25
25
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
26
26
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=ZZTaiIn18OWuWKGbDdpoOZ3VMhe_3_JKwrKCfzDiSk0,13715
27
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=12WeBVxnBzzruk8CfYqqsV2dpH-mDWmE4Osl1RlRoc8,13741
28
28
  magic_pdf/filter/__init__.py,sha256=_7lSez_myu4b6cdzPpQ-NfREuqeBSq_QdyBPKVLyq2U,1505
29
29
  magic_pdf/filter/pdf_classify_by_type.py,sha256=YNYXamxYgEiSujwilCNHOtrwpgJGDiQ597qJfardDVc,42354
30
30
  magic_pdf/filter/pdf_meta_scan.py,sha256=eOuM0-JgaXvHolSgepGoNDJDmv_uITWLQpH_0MfnVQw,17478
@@ -49,12 +49,13 @@ magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg
49
49
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
50
50
  magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3396
51
51
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
52
+ magic_pdf/libs/performance_stats.py,sha256=BFi4NIsUYlanznYoTVq4hBpj4NOuShAlWBHzebBGVYM,1702
52
53
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
53
- magic_pdf/libs/version.py,sha256=MpAT5hgNoHnTtG1XRD_GV_A7QrHVU6vJjGSw_8qMGA4,22
54
+ magic_pdf/libs/version.py,sha256=uuf4VNtTNA93fMhoAur9YafzaKJFnczY-H1SSCSuRVQ,22
54
55
  magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
55
56
  magic_pdf/model/batch_analyze.py,sha256=sbrgOJWycb1Ep6e62CPi6jEyG6VSeklIxc4PmrqaLhM,11933
56
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=wma0aq6RyxAepEqnaiTJ9_pWWKLVBj39c6xWA85dxzA,8068
57
- magic_pdf/model/magic_model.py,sha256=OcKhSJ_PyAAldgpKPiPxi2uuvnj3Sf4SvXi_5Rv0a6Q,30667
57
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=T0-h4QmSIDXRzgF5uWO4jQrwIot221l26PXU52xeKiA,7933
58
+ magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
58
59
  magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
59
60
  magic_pdf/model/pdf_extract_kit.py,sha256=Rd51VNZPKRA_tUbDss-b44d84K6WDG2S87a37Ax7HUA,12224
60
61
  magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
@@ -92,7 +93,7 @@ magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
92
93
  magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=QfHbMr1br0pOJUu1NJEMgA6yw11G0yFImJv_AfW48_c,1008
93
94
  magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
95
  magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=R05qw54QuLl2btNWdkxf4yCjDeEj8o0786e-gz_Xv8k,5290
96
+ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=HPNetRfQeHoHfRTzFEaIjLSHfjrxRvS-EaApMUebZuQ,8020
96
97
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
98
  magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
98
99
  magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -116,12 +117,12 @@ magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio
116
117
  magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
117
118
  magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
118
119
  magic_pdf/post_proc/llm_aided.py,sha256=yzhu2cCpUZjdwf3v0swYDgSs9VWIfMAoXepYIP1EMZs,6367
119
- magic_pdf/post_proc/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
120
+ magic_pdf/post_proc/para_split_v3.py,sha256=SPN_VVGvFX5KpFMGw9OzgoE-kTZq-FF036i0cIImGH8,16975
120
121
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
121
122
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
122
123
  magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
123
124
  magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
124
- magic_pdf/pre_proc/ocr_dict_merge.py,sha256=vrbLIzNIjxrm7PonfHaFdY6qaicc0uIly62SJwgZ5UM,5496
125
+ magic_pdf/pre_proc/ocr_dict_merge.py,sha256=4Z3aHZ9sxzijkVpOCENslvUcpp7DXgNID4Gl3pxwIg4,5512
125
126
  magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=xrgC9vR0poklZuY4Og41pZVdXzuaGFg3BnQ01X60dpo,3102
126
127
  magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
127
128
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
@@ -138,9 +139,9 @@ magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,838
138
139
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
140
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
140
141
  magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
141
- magic_pdf-1.2.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
142
- magic_pdf-1.2.0.dist-info/METADATA,sha256=7iel3MItxKhJc1Bbfh_NMbDp8a23k9G1vA8LYEw2k_U,40720
143
- magic_pdf-1.2.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
144
- magic_pdf-1.2.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
145
- magic_pdf-1.2.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
146
- magic_pdf-1.2.0.dist-info/RECORD,,
142
+ magic_pdf-1.2.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
+ magic_pdf-1.2.2.dist-info/METADATA,sha256=FYzj0yWzmFAG4mQ22DH9F4KZfqexNg7YuhgiXMHc9Ug,41001
144
+ magic_pdf-1.2.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
+ magic_pdf-1.2.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
+ magic_pdf-1.2.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
+ magic_pdf-1.2.2.dist-info/RECORD,,