magic-pdf 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -103,54 +103,65 @@ def batch_build_dataset(pdf_paths, k, lang=None):
103
103
  all_images : list
104
104
  List of all processed images
105
105
  """
106
- # Get page counts for each PDF
107
- pdf_info = []
108
- total_pages = 0
109
106
 
107
+ results = []
110
108
  for pdf_path in pdf_paths:
111
- try:
112
- doc = fitz.open(pdf_path)
113
- num_pages = len(doc)
114
- pdf_info.append((pdf_path, num_pages))
115
- total_pages += num_pages
116
- doc.close()
117
- except Exception as e:
118
- print(f'Error opening {pdf_path}: {e}')
119
-
120
- # Partition the jobs based on page count. Each job has 1 page
121
- partitions = partition_array_greedy(pdf_info, k)
122
-
123
- # Process each partition in parallel
124
- all_images_h = {}
125
-
126
- with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
127
- # Submit one task per partition
128
- futures = []
129
- for sn, partition in enumerate(partitions):
130
- # Get the jobs for this partition
131
- partition_jobs = [pdf_info[idx] for idx in partition]
132
-
133
- # Submit the task
134
- future = executor.submit(
135
- process_pdf_batch,
136
- partition_jobs,
137
- sn
138
- )
139
- futures.append(future)
140
- # Process results as they complete
141
- for i, future in enumerate(concurrent.futures.as_completed(futures)):
142
- try:
143
- idx, images = future.result()
144
- all_images_h[idx] = images
145
- except Exception as e:
146
- print(f'Error processing partition: {e}')
147
- results = [None] * len(pdf_paths)
148
- for i in range(len(partitions)):
149
- partition = partitions[i]
150
- for j in range(len(partition)):
151
- with open(pdf_info[partition[j]][0], 'rb') as f:
152
- pdf_bytes = f.read()
153
- dataset = PymuDocDataset(pdf_bytes, lang=lang)
154
- dataset.set_images(all_images_h[i][j])
155
- results[partition[j]] = dataset
109
+ with open(pdf_path, 'rb') as f:
110
+ pdf_bytes = f.read()
111
+ dataset = PymuDocDataset(pdf_bytes, lang=lang)
112
+ results.append(dataset)
156
113
  return results
114
+
115
+
116
+ #
117
+ # # Get page counts for each PDF
118
+ # pdf_info = []
119
+ # total_pages = 0
120
+ #
121
+ # for pdf_path in pdf_paths:
122
+ # try:
123
+ # doc = fitz.open(pdf_path)
124
+ # num_pages = len(doc)
125
+ # pdf_info.append((pdf_path, num_pages))
126
+ # total_pages += num_pages
127
+ # doc.close()
128
+ # except Exception as e:
129
+ # print(f'Error opening {pdf_path}: {e}')
130
+ #
131
+ # # Partition the jobs based on page count. Each job has 1 page
132
+ # partitions = partition_array_greedy(pdf_info, k)
133
+ #
134
+ # # Process each partition in parallel
135
+ # all_images_h = {}
136
+ #
137
+ # with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
138
+ # # Submit one task per partition
139
+ # futures = []
140
+ # for sn, partition in enumerate(partitions):
141
+ # # Get the jobs for this partition
142
+ # partition_jobs = [pdf_info[idx] for idx in partition]
143
+ #
144
+ # # Submit the task
145
+ # future = executor.submit(
146
+ # process_pdf_batch,
147
+ # partition_jobs,
148
+ # sn
149
+ # )
150
+ # futures.append(future)
151
+ # # Process results as they complete
152
+ # for i, future in enumerate(concurrent.futures.as_completed(futures)):
153
+ # try:
154
+ # idx, images = future.result()
155
+ # all_images_h[idx] = images
156
+ # except Exception as e:
157
+ # print(f'Error processing partition: {e}')
158
+ # results = [None] * len(pdf_paths)
159
+ # for i in range(len(partitions)):
160
+ # partition = partitions[i]
161
+ # for j in range(len(partition)):
162
+ # with open(pdf_info[partition[j]][0], 'rb') as f:
163
+ # pdf_bytes = f.read()
164
+ # dataset = PymuDocDataset(pdf_bytes, lang=lang)
165
+ # dataset.set_images(all_images_h[i][j])
166
+ # results[partition[j]] = dataset
167
+ # return results
magic_pdf/data/dataset.py CHANGED
@@ -150,7 +150,7 @@ class PymuDocDataset(Dataset):
150
150
  elif lang == 'auto':
151
151
  from magic_pdf.model.sub_modules.language_detection.utils import \
152
152
  auto_detect_lang
153
- self._lang = auto_detect_lang(bits)
153
+ self._lang = auto_detect_lang(self._data_bits)
154
154
  logger.info(f'lang: {lang}, detect_lang: {self._lang}')
155
155
  else:
156
156
  self._lang = lang
@@ -249,7 +249,7 @@ class ImageDataset(Dataset):
249
249
  elif lang == 'auto':
250
250
  from magic_pdf.model.sub_modules.language_detection.utils import \
251
251
  auto_detect_lang
252
- self._lang = auto_detect_lang(bits)
252
+ self._lang = auto_detect_lang(self._data_bits)
253
253
  logger.info(f'lang: {lang}, detect_lang: {self._lang}')
254
254
  else:
255
255
  self._lang = lang
@@ -405,4 +405,4 @@ class Doc(PageableData):
405
405
  fontsize (int): font size of the text
406
406
  color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
407
407
  """
408
- self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
408
+ self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.3.1"
1
+ __version__ = "1.3.3"
@@ -30,8 +30,14 @@ class BatchAnalyze:
30
30
 
31
31
  images_layout_res = []
32
32
  layout_start_time = time.time()
33
- _, fst_ocr, fst_lang = images_with_extra_info[0]
34
- self.model = self.model_manager.get_model(fst_ocr, self.show_log, fst_lang, self.layout_model, self.formula_enable, self.table_enable)
33
+ self.model = self.model_manager.get_model(
34
+ ocr=True,
35
+ show_log=self.show_log,
36
+ lang = None,
37
+ layout_model = self.layout_model,
38
+ formula_enable = self.formula_enable,
39
+ table_enable = self.table_enable,
40
+ )
35
41
 
36
42
  images = [image for image, _, _ in images_with_extra_info]
37
43
 
@@ -143,14 +149,14 @@ class BatchAnalyze:
143
149
  if ocr_res:
144
150
  ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
145
151
  ocr_res_list_dict['layout_res'].extend(ocr_result_list)
146
- det_count += len(ocr_res_list_dict['ocr_res_list'])
152
+
153
+ # det_count += len(ocr_res_list_dict['ocr_res_list'])
147
154
  # logger.info(f'ocr-det time: {round(time.time()-det_start, 2)}, image num: {det_count}')
148
155
 
149
156
 
150
157
  # 表格识别 table recognition
151
158
  if self.model.apply_table:
152
159
  table_start = time.time()
153
- table_count = 0
154
160
  # for table_res_list_dict in table_res_list_all_page:
155
161
  for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"):
156
162
  _lang = table_res_dict['lang']
@@ -146,10 +146,8 @@ def doc_analyze(
146
146
  img_dict = page_data.get_image()
147
147
  images.append(img_dict['img'])
148
148
  page_wh_list.append((img_dict['width'], img_dict['height']))
149
- if lang is None or lang == 'auto':
150
- images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(dataset))]
151
- else:
152
- images_with_extra_info = [(images[index], ocr, lang) for index in range(len(dataset))]
149
+
150
+ images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
153
151
 
154
152
  if len(images) >= MIN_BATCH_INFERENCE_SIZE:
155
153
  batch_size = MIN_BATCH_INFERENCE_SIZE
@@ -158,8 +156,8 @@ def doc_analyze(
158
156
  batch_images = [images_with_extra_info]
159
157
 
160
158
  results = []
161
- for sn, batch_image in enumerate(batch_images):
162
- _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log,layout_model, formula_enable, table_enable)
159
+ for batch_image in batch_images:
160
+ result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
163
161
  results.extend(result)
164
162
 
165
163
  model_json = []
@@ -181,7 +179,7 @@ def doc_analyze(
181
179
 
182
180
  def batch_doc_analyze(
183
181
  datasets: list[Dataset],
184
- parse_method: str,
182
+ parse_method: str = 'auto',
185
183
  show_log: bool = False,
186
184
  lang=None,
187
185
  layout_model=None,
@@ -190,30 +188,37 @@ def batch_doc_analyze(
190
188
  ):
191
189
  MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
192
190
  batch_size = MIN_BATCH_INFERENCE_SIZE
193
- images = []
194
191
  page_wh_list = []
195
192
 
196
193
  images_with_extra_info = []
197
194
  for dataset in datasets:
198
- for index in range(len(dataset)):
199
- if lang is None or lang == 'auto':
200
- _lang = dataset._lang
201
- else:
202
- _lang = lang
203
195
 
196
+ ocr = False
197
+ if parse_method == 'auto':
198
+ if dataset.classify() == SupportedPdfParseMethod.TXT:
199
+ ocr = False
200
+ elif dataset.classify() == SupportedPdfParseMethod.OCR:
201
+ ocr = True
202
+ elif parse_method == 'ocr':
203
+ ocr = True
204
+ elif parse_method == 'txt':
205
+ ocr = False
206
+
207
+ _lang = dataset._lang
208
+
209
+ for index in range(len(dataset)):
204
210
  page_data = dataset.get_page(index)
205
211
  img_dict = page_data.get_image()
206
- images.append(img_dict['img'])
207
212
  page_wh_list.append((img_dict['width'], img_dict['height']))
208
- if parse_method == 'auto':
209
- images_with_extra_info.append((images[-1], dataset.classify() == SupportedPdfParseMethod.OCR, _lang))
210
- else:
211
- images_with_extra_info.append((images[-1], parse_method == 'ocr', _lang))
213
+ images_with_extra_info.append((img_dict['img'], ocr, _lang))
212
214
 
213
215
  batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
214
216
  results = []
215
- for sn, batch_image in enumerate(batch_images):
216
- _, result = may_batch_image_analyze(batch_image, sn, True, show_log, layout_model, formula_enable, table_enable)
217
+ processed_images_count = 0
218
+ for index, batch_image in enumerate(batch_images):
219
+ processed_images_count += len(batch_image)
220
+ logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
221
+ result = may_batch_image_analyze(batch_image, True, show_log, layout_model, formula_enable, table_enable)
217
222
  results.extend(result)
218
223
 
219
224
  infer_results = []
@@ -233,7 +238,6 @@ def batch_doc_analyze(
233
238
 
234
239
  def may_batch_image_analyze(
235
240
  images_with_extra_info: list[(np.ndarray, bool, str)],
236
- idx: int,
237
241
  ocr: bool,
238
242
  show_log: bool = False,
239
243
  layout_model=None,
@@ -291,4 +295,4 @@ def may_batch_image_analyze(
291
295
  # f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
292
296
  # f' speed: {doc_analyze_speed} pages/second'
293
297
  # )
294
- return idx, results
298
+ return results
@@ -29,22 +29,204 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
29
29
  return return_image, return_list
30
30
 
31
31
 
32
- # Select regions for OCR / formula regions / table regions
33
- def get_res_list_from_layout_res(layout_res):
32
+ def get_coords_and_area(table):
33
+ """Extract coordinates and area from a table."""
34
+ xmin, ymin = int(table['poly'][0]), int(table['poly'][1])
35
+ xmax, ymax = int(table['poly'][4]), int(table['poly'][5])
36
+ area = (xmax - xmin) * (ymax - ymin)
37
+ return xmin, ymin, xmax, ymax, area
38
+
39
+
40
+ def calculate_intersection(box1, box2):
41
+ """Calculate intersection coordinates between two boxes."""
42
+ intersection_xmin = max(box1[0], box2[0])
43
+ intersection_ymin = max(box1[1], box2[1])
44
+ intersection_xmax = min(box1[2], box2[2])
45
+ intersection_ymax = min(box1[3], box2[3])
46
+
47
+ # Check if intersection is valid
48
+ if intersection_xmax <= intersection_xmin or intersection_ymax <= intersection_ymin:
49
+ return None
50
+
51
+ return intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax
52
+
53
+
54
+ def calculate_iou(box1, box2):
55
+ """Calculate IoU between two boxes."""
56
+ intersection = calculate_intersection(box1[:4], box2[:4])
57
+
58
+ if not intersection:
59
+ return 0
60
+
61
+ intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax = intersection
62
+ intersection_area = (intersection_xmax - intersection_xmin) * (intersection_ymax - intersection_ymin)
63
+
64
+ area1, area2 = box1[4], box2[4]
65
+ union_area = area1 + area2 - intersection_area
66
+
67
+ return intersection_area / union_area if union_area > 0 else 0
68
+
69
+
70
+ def is_inside(small_box, big_box, overlap_threshold=0.8):
71
+ """Check if small_box is inside big_box by at least overlap_threshold."""
72
+ intersection = calculate_intersection(small_box[:4], big_box[:4])
73
+
74
+ if not intersection:
75
+ return False
76
+
77
+ intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax = intersection
78
+ intersection_area = (intersection_xmax - intersection_xmin) * (intersection_ymax - intersection_ymin)
79
+
80
+ # Check if overlap exceeds threshold
81
+ return intersection_area >= overlap_threshold * small_box[4]
82
+
83
+
84
+ def do_overlap(box1, box2):
85
+ """Check if two boxes overlap."""
86
+ return calculate_intersection(box1[:4], box2[:4]) is not None
87
+
88
+
89
+ def merge_high_iou_tables(table_res_list, layout_res, table_indices, iou_threshold=0.7):
90
+ """Merge tables with IoU > threshold."""
91
+ if len(table_res_list) < 2:
92
+ return table_res_list, table_indices
93
+
94
+ table_info = [get_coords_and_area(table) for table in table_res_list]
95
+ merged = True
96
+
97
+ while merged:
98
+ merged = False
99
+ i = 0
100
+ while i < len(table_res_list) - 1:
101
+ j = i + 1
102
+ while j < len(table_res_list):
103
+ iou = calculate_iou(table_info[i], table_info[j])
104
+
105
+ if iou > iou_threshold:
106
+ # Merge tables by taking their union
107
+ x1_min, y1_min, x1_max, y1_max, _ = table_info[i]
108
+ x2_min, y2_min, x2_max, y2_max, _ = table_info[j]
109
+
110
+ union_xmin = min(x1_min, x2_min)
111
+ union_ymin = min(y1_min, y2_min)
112
+ union_xmax = max(x1_max, x2_max)
113
+ union_ymax = max(y1_max, y2_max)
114
+
115
+ # Create merged table
116
+ merged_table = table_res_list[i].copy()
117
+ merged_table['poly'][0] = union_xmin
118
+ merged_table['poly'][1] = union_ymin
119
+ merged_table['poly'][2] = union_xmax
120
+ merged_table['poly'][3] = union_ymin
121
+ merged_table['poly'][4] = union_xmax
122
+ merged_table['poly'][5] = union_ymax
123
+ merged_table['poly'][6] = union_xmin
124
+ merged_table['poly'][7] = union_ymax
125
+
126
+ # Update layout_res
127
+ to_remove = [table_indices[j], table_indices[i]]
128
+ for idx in sorted(to_remove, reverse=True):
129
+ del layout_res[idx]
130
+ layout_res.append(merged_table)
131
+
132
+ # Update tracking lists
133
+ table_indices = [k if k < min(to_remove) else
134
+ k - 1 if k < max(to_remove) else
135
+ k - 2 if k > max(to_remove) else
136
+ len(layout_res) - 1
137
+ for k in table_indices
138
+ if k not in to_remove]
139
+ table_indices.append(len(layout_res) - 1)
140
+
141
+ # Update table lists
142
+ table_res_list.pop(j)
143
+ table_res_list.pop(i)
144
+ table_res_list.append(merged_table)
145
+
146
+ # Update table_info
147
+ table_info = [get_coords_and_area(table) for table in table_res_list]
148
+
149
+ merged = True
150
+ break
151
+ j += 1
152
+
153
+ if merged:
154
+ break
155
+ i += 1
156
+
157
+ return table_res_list, table_indices
158
+
159
+
160
+ def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0.8):
161
+ """Remove big tables containing multiple smaller tables within them."""
162
+ if len(table_res_list) < 3:
163
+ return table_res_list
164
+
165
+ table_info = [get_coords_and_area(table) for table in table_res_list]
166
+ big_tables_idx = []
167
+
168
+ for i in range(len(table_res_list)):
169
+ # Find tables inside this one
170
+ tables_inside = [j for j in range(len(table_res_list))
171
+ if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
172
+
173
+ # Continue if there are at least 2 tables inside
174
+ if len(tables_inside) >= 2:
175
+ # Check if inside tables overlap with each other
176
+ tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
177
+ for idx1 in range(len(tables_inside))
178
+ for idx2 in range(idx1 + 1, len(tables_inside)))
179
+
180
+ # If no overlaps, check area condition
181
+ if not tables_overlap:
182
+ total_inside_area = sum(table_info[j][4] for j in tables_inside)
183
+ big_table_area = table_info[i][4]
184
+
185
+ if total_inside_area > area_threshold * big_table_area:
186
+ big_tables_idx.append(i)
187
+
188
+ return [table for i, table in enumerate(table_res_list) if i not in big_tables_idx]
189
+
190
+
191
+ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold=0.8, area_threshold=0.8):
192
+ """Extract OCR, table and other regions from layout results."""
34
193
  ocr_res_list = []
35
194
  table_res_list = []
195
+ table_indices = []
36
196
  single_page_mfdetrec_res = []
37
- for res in layout_res:
38
- if int(res['category_id']) in [13, 14]:
197
+
198
+ # Categorize regions
199
+ for i, res in enumerate(layout_res):
200
+ category_id = int(res['category_id'])
201
+
202
+ if category_id in [13, 14]: # Formula regions
39
203
  single_page_mfdetrec_res.append({
40
204
  "bbox": [int(res['poly'][0]), int(res['poly'][1]),
41
205
  int(res['poly'][4]), int(res['poly'][5])],
42
206
  })
43
- elif int(res['category_id']) in [0, 1, 2, 4, 6, 7]:
207
+ elif category_id in [0, 1, 2, 4, 6, 7]: # OCR regions
44
208
  ocr_res_list.append(res)
45
- elif int(res['category_id']) in [5]:
209
+ elif category_id == 5: # Table regions
46
210
  table_res_list.append(res)
47
- return ocr_res_list, table_res_list, single_page_mfdetrec_res
211
+ table_indices.append(i)
212
+
213
+ # Process tables: merge high IoU tables first, then filter nested tables
214
+ table_res_list, table_indices = merge_high_iou_tables(
215
+ table_res_list, layout_res, table_indices, iou_threshold)
216
+
217
+ filtered_table_res_list = filter_nested_tables(
218
+ table_res_list, overlap_threshold, area_threshold)
219
+
220
+ # Remove filtered out tables from layout_res
221
+ if len(filtered_table_res_list) < len(table_res_list):
222
+ kept_tables = set(id(table) for table in filtered_table_res_list)
223
+ to_remove = [table_indices[i] for i, table in enumerate(table_res_list)
224
+ if id(table) not in kept_tables]
225
+
226
+ for idx in sorted(to_remove, reverse=True):
227
+ del layout_res[idx]
228
+
229
+ return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res
48
230
 
49
231
 
50
232
  def clean_vram(device, vram_threshold=8):
@@ -1,8 +1,12 @@
1
1
  lang:
2
- ch:
2
+ ch_lite:
3
3
  det: ch_PP-OCRv3_det_infer.pth
4
4
  rec: ch_PP-OCRv4_rec_infer.pth
5
5
  dict: ppocr_keys_v1.txt
6
+ ch:
7
+ det: ch_PP-OCRv3_det_infer.pth
8
+ rec: ch_PP-OCRv4_rec_server_infer.pth
9
+ dict: ppocr_keys_v1.txt
6
10
  en:
7
11
  det: en_PP-OCRv3_det_infer.pth
8
12
  rec: en_PP-OCRv4_rec_infer.pth
@@ -437,4 +437,10 @@ class TextRecognizer(BaseOCRV20):
437
437
  index += 1
438
438
  pbar.update(current_batch_size)
439
439
 
440
+ # Fix NaN values in recognition results
441
+ for i in range(len(rec_res)):
442
+ text, score = rec_res[i]
443
+ if isinstance(score, float) and math.isnan(score):
444
+ rec_res[i] = (text, 0.0)
445
+
440
446
  return rec_res, elapse
@@ -35,26 +35,63 @@ class RapidTableModel(object):
35
35
  # from rapidocr_onnxruntime import RapidOCR
36
36
  # self.ocr_engine = RapidOCR()
37
37
 
38
- self.ocr_model_name = "PaddleOCR"
38
+ # self.ocr_model_name = "PaddleOCR"
39
39
  self.ocr_engine = ocr_engine
40
40
 
41
41
 
42
42
  def predict(self, image):
43
+ bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
43
44
 
44
- if self.ocr_model_name == "RapidOCR":
45
- ocr_result, _ = self.ocr_engine(np.asarray(image))
46
- elif self.ocr_model_name == "PaddleOCR":
47
- bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
48
- ocr_result = self.ocr_engine.ocr(bgr_image)[0]
49
- if ocr_result:
50
- ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if
51
- len(item) == 2 and isinstance(item[1], tuple)]
52
- else:
53
- ocr_result = None
45
+ # First check the overall image aspect ratio (height/width)
46
+ img_height, img_width = bgr_image.shape[:2]
47
+ img_aspect_ratio = img_height / img_width if img_width > 0 else 1.0
48
+ img_is_portrait = img_aspect_ratio > 1.2
49
+
50
+ if img_is_portrait:
51
+
52
+ det_res = self.ocr_engine.ocr(bgr_image, rec=False)[0]
53
+ # Check if table is rotated by analyzing text box aspect ratios
54
+ is_rotated = False
55
+ if det_res:
56
+ vertical_count = 0
57
+
58
+ for box_ocr_res in det_res:
59
+ p1, p2, p3, p4 = box_ocr_res
60
+
61
+ # Calculate width and height
62
+ width = p3[0] - p1[0]
63
+ height = p3[1] - p1[1]
64
+
65
+ aspect_ratio = width / height if height > 0 else 1.0
66
+
67
+ # Count vertical vs horizontal text boxes
68
+ if aspect_ratio < 0.8: # Taller than wide - vertical text
69
+ vertical_count += 1
70
+ # elif aspect_ratio > 1.2: # Wider than tall - horizontal text
71
+ # horizontal_count += 1
72
+
73
+ # If we have more vertical text boxes than horizontal ones,
74
+ # and vertical ones are significant, table might be rotated
75
+ if vertical_count >= len(det_res) * 0.3:
76
+ is_rotated = True
77
+
78
+ # logger.debug(f"Text orientation analysis: vertical={vertical_count}, det_res={len(det_res)}, rotated={is_rotated}")
79
+
80
+ # Rotate image if necessary
81
+ if is_rotated:
82
+ # logger.debug("Table appears to be in portrait orientation, rotating 90 degrees clockwise")
83
+ image = cv2.rotate(np.asarray(image), cv2.ROTATE_90_CLOCKWISE)
84
+ bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
85
+
86
+ # Continue with OCR on potentially rotated image
87
+ ocr_result = self.ocr_engine.ocr(bgr_image)[0]
88
+ if ocr_result:
89
+ ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if
90
+ len(item) == 2 and isinstance(item[1], tuple)]
54
91
  else:
55
- logger.error("OCR model not supported")
56
92
  ocr_result = None
57
93
 
94
+
58
95
  if ocr_result:
59
96
  table_results = self.table_model(np.asarray(image), ocr_result)
60
97
  html_code = table_results.pred_html
@@ -99,11 +99,11 @@ def ocr_prepare_bboxes_for_layout_split_v2(
99
99
  all_discarded_blocks = []
100
100
  add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
101
101
 
102
- """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的"""
102
+ """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半30%区域的"""
103
103
  footnote_blocks = []
104
104
  for discarded in discarded_blocks:
105
105
  x0, y0, x1, y1 = discarded['bbox']
106
- if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
106
+ if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
107
107
  footnote_blocks.append([x0, y0, x1, y1])
108
108
 
109
109
  """移除在footnote下面的任何框"""
magic_pdf/tools/common.py CHANGED
@@ -109,9 +109,7 @@ def _do_parse(
109
109
  pdf_bytes = ds._raw_data
110
110
  local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
111
111
 
112
- image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
113
- local_md_dir
114
- )
112
+ image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
115
113
  image_dir = str(os.path.basename(local_image_dir))
116
114
 
117
115
  if len(model_list) == 0:
@@ -317,7 +315,26 @@ def batch_do_parse(
317
315
 
318
316
  infer_results = batch_doc_analyze(dss, parse_method, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
319
317
  for idx, infer_result in enumerate(infer_results):
320
- _do_parse(output_dir, pdf_file_names[idx], dss[idx], infer_result.get_infer_res(), parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox, lang=lang)
318
+ _do_parse(
319
+ output_dir = output_dir,
320
+ pdf_file_name = pdf_file_names[idx],
321
+ pdf_bytes_or_dataset = dss[idx],
322
+ model_list = infer_result.get_infer_res(),
323
+ parse_method = parse_method,
324
+ debug_able = debug_able,
325
+ f_draw_span_bbox = f_draw_span_bbox,
326
+ f_draw_layout_bbox = f_draw_layout_bbox,
327
+ f_dump_md=f_dump_md,
328
+ f_dump_middle_json=f_dump_middle_json,
329
+ f_dump_model_json=f_dump_model_json,
330
+ f_dump_orig_pdf=f_dump_orig_pdf,
331
+ f_dump_content_list=f_dump_content_list,
332
+ f_make_md_mode=MakeMode.MM_MD,
333
+ f_draw_model_bbox=f_draw_model_bbox,
334
+ f_draw_line_sort_bbox=f_draw_line_sort_bbox,
335
+ f_draw_char_bbox=f_draw_char_bbox,
336
+ lang=lang,
337
+ )
321
338
 
322
339
 
323
340
  parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 1.3.1
3
+ Version: 1.3.3
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: Home, https://mineru.net/
@@ -23,27 +23,26 @@ Requires-Dist: numpy >=1.21.6
23
23
  Requires-Dist: pdfminer.six ==20231228
24
24
  Requires-Dist: pydantic <2.11,>=2.7.2
25
25
  Requires-Dist: scikit-learn >=1.0.2
26
- Requires-Dist: torch !=2.5.0,!=2.5.1,<=2.6.0,>=2.2.2
26
+ Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
27
27
  Requires-Dist: torchvision
28
28
  Requires-Dist: tqdm >=4.67.1
29
29
  Requires-Dist: transformers !=4.51.0,<5.0.0,>=4.49.0
30
30
  Provides-Extra: full
31
31
  Requires-Dist: PyYAML <7,>=6.0.2 ; extra == 'full'
32
- Requires-Dist: dill <1,>=0.3.9 ; extra == 'full'
32
+ Requires-Dist: dill <1,>=0.3.8 ; extra == 'full'
33
33
  Requires-Dist: doclayout-yolo ==0.0.2b1 ; extra == 'full'
34
34
  Requires-Dist: ftfy <7,>=6.3.1 ; extra == 'full'
35
+ Requires-Dist: matplotlib <4,>=3.10 ; extra == 'full'
35
36
  Requires-Dist: omegaconf <3,>=2.3.0 ; extra == 'full'
36
37
  Requires-Dist: openai <2,>=1.70.0 ; extra == 'full'
37
38
  Requires-Dist: pyclipper <2,>=1.3.0 ; extra == 'full'
38
39
  Requires-Dist: rapid-table <2.0.0,>=1.0.5 ; extra == 'full'
39
40
  Requires-Dist: shapely <3,>=2.0.7 ; extra == 'full'
40
- Requires-Dist: ultralytics >=8.3.48 ; extra == 'full'
41
- Requires-Dist: matplotlib >=3.10 ; (platform_system == "Linux" or platform_system == "Darwin") and extra == 'full'
42
- Requires-Dist: matplotlib <=3.9.0 ; (platform_system == "Windows") and extra == 'full'
41
+ Requires-Dist: ultralytics <9,>=8.3.48 ; extra == 'full'
43
42
  Provides-Extra: full_old_linux
44
43
  Requires-Dist: PyYAML ==6.0.2 ; extra == 'full_old_linux'
45
44
  Requires-Dist: albumentations ==1.4.20 ; extra == 'full_old_linux'
46
- Requires-Dist: dill ==0.3.9 ; extra == 'full_old_linux'
45
+ Requires-Dist: dill ==0.3.8 ; extra == 'full_old_linux'
47
46
  Requires-Dist: doclayout-yolo ==0.0.2b1 ; extra == 'full_old_linux'
48
47
  Requires-Dist: ftfy ==6.3.1 ; extra == 'full_old_linux'
49
48
  Requires-Dist: matplotlib <=3.10.1,>=3.10 ; extra == 'full_old_linux'
@@ -108,9 +107,14 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
108
107
  </div>
109
108
 
110
109
  # Changelog
110
+ - 2025/04/12 1.3.2 released
111
+ - Fixed the issue of incompatible dependency package versions when installing in a Python 3.13 environment on Windows systems.
112
+ - Optimized memory usage during batch inference.
113
+ - Improved the parsing effect of tables rotated by 90 degrees.
114
+ - Enhanced the parsing accuracy for large tables in financial report samples.
115
+ - Fixed the occasional word concatenation issue in English text areas when the OCR language is not specified. (The model needs to be updated.)
111
116
  - 2025/04/08 1.3.1 released, fixed some compatibility issues
112
117
  - Supported Python 3.13
113
- - Resolved errors caused by `transformers 4.51.0`
114
118
  - Made the final adaptation for some outdated Linux systems (e.g., CentOS 7), and no further support will be guaranteed for subsequent versions. [Installation Instructions](https://github.com/opendatalab/MinerU/issues/1004)
115
119
  - 2025/04/03 1.3.0 released, in this version we made many optimizations and improvements:
116
120
  - Installation and compatibility optimization
@@ -129,59 +133,154 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
129
133
  - Usability Optimization
130
134
  - By using `paddleocr2torch`, completely replaced the use of the `paddle` framework and `paddleocr` in the project, resolving conflicts between `paddle` and `torch`, as well as thread safety issues caused by the `paddle` framework.
131
135
  - Added a real-time progress bar during the parsing process to accurately track progress, making the wait less painful.
132
- - 2025/03/03 1.2.1 released, fixed several bugs:
133
- - Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers
134
- - Fixed caption matching inaccuracies in certain scenarios
135
- - Fixed formula span loss issues in certain scenarios
136
- - 2025/02/24 1.2.0 released. This version includes several fixes and improvements to enhance parsing efficiency and accuracy:
137
- - Performance Optimization
138
- - Increased classification speed for PDF documents in auto mode.
139
- - Parsing Optimization
140
- - Improved parsing logic for documents containing watermarks, significantly enhancing the parsing results for such documents.
141
- - Enhanced the matching logic for multiple images/tables and captions within a single page, improving the accuracy of image-text matching in complex layouts.
142
- - Bug Fixes
143
- - Fixed an issue where image/table spans were incorrectly filled into text blocks under certain conditions.
144
- - Resolved an issue where title blocks were empty in some cases.
145
- - 2025/01/22 1.1.0 released. In this version we have focused on improving parsing accuracy and efficiency:
146
- - Model capability upgrade (requires re-executing the [model download process](docs/how_to_download_models_en.md) to obtain incremental updates of model files)
147
- - The layout recognition model has been upgraded to the latest `doclayout_yolo(2501)` model, improving layout recognition accuracy.
148
- - The formula parsing model has been upgraded to the latest `unimernet(2501)` model, improving formula recognition accuracy.
149
- - Performance optimization
150
- - On devices that meet certain configuration requirements (16GB+ VRAM), by optimizing resource usage and restructuring the processing pipeline, overall parsing speed has been increased by more than 50%.
151
- - Parsing effect optimization
152
- - Added a new heading classification feature (testing version, enabled by default) to the online demo([mineru.net](https://mineru.net/OpenSourceTools/Extractor)/[huggingface](https://huggingface.co/spaces/opendatalab/MinerU)/[modelscope](https://www.modelscope.cn/studios/OpenDataLab/MinerU)), which supports hierarchical classification of headings, thereby enhancing document structuring.
153
- - 2025/01/10 1.0.1 released. This is our first official release, where we have introduced a completely new API interface and enhanced compatibility through extensive refactoring, as well as a brand new automatic language identification feature:
154
- - New API Interface
155
- - For the data-side API, we have introduced the Dataset class, designed to provide a robust and flexible data processing framework. This framework currently supports a variety of document formats, including images (.jpg and .png), PDFs, Word documents (.doc and .docx), and PowerPoint presentations (.ppt and .pptx). It ensures effective support for data processing tasks ranging from simple to complex.
156
- - For the user-side API, we have meticulously designed the MinerU processing workflow as a series of composable Stages. Each Stage represents a specific processing step, allowing users to define new Stages according to their needs and creatively combine these stages to customize their data processing workflows.
157
- - Enhanced Compatibility
158
- - By optimizing the dependency environment and configuration items, we ensure stable and efficient operation on ARM architecture Linux systems.
159
- - We have deeply integrated with Huawei Ascend NPU acceleration, providing autonomous and controllable high-performance computing capabilities. This supports the localization and development of AI application platforms in China. [Ascend NPU Acceleration](docs/README_Ascend_NPU_Acceleration_zh_CN.md)
160
- - Automatic Language Identification
161
- - By introducing a new language recognition model, setting the `lang` configuration to `auto` during document parsing will automatically select the appropriate OCR language model, improving the accuracy of scanned document parsing.
162
- - 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities,
163
- - Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
164
- - Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
165
- - 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
166
- - 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
167
- - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
168
- - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
169
- - Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.
170
- - Refactored the list and table of contents recognition functions, significantly improving the accuracy of list blocks and table of contents blocks, as well as the parsing of corresponding text paragraphs.
171
- - Refactored the matching logic for figures, tables, and descriptive text, greatly enhancing the accuracy of matching captions and footnotes to figures and tables, and reducing the loss rate of descriptive text to near zero.
172
- - Added multi-language support for OCR, supporting detection and recognition of 84 languages.For the list of supported languages, see [OCR Language Support List](https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations).
173
- - Added memory recycling logic and other memory optimization measures, significantly reducing memory usage. The memory requirement for enabling all acceleration features except table acceleration (layout/formula/OCR) has been reduced from 16GB to 8GB, and the memory requirement for enabling all acceleration features has been reduced from 24GB to 10GB.
174
- - Optimized configuration file feature switches, adding an independent formula detection switch to significantly improve speed and parsing results when formula detection is not needed.
175
- - Integrated [PDF-Extract-Kit 1.0](https://github.com/opendatalab/PDF-Extract-Kit):
176
- - Added the self-developed `doclayout_yolo` model, which speeds up processing by more than 10 times compared to the original solution while maintaining similar parsing effects, and can be freely switched with `layoutlmv3` via the configuration file.
177
- - Upgraded formula parsing to `unimernet 0.2.1`, improving formula parsing accuracy while significantly reducing memory usage.
178
- - Due to the repository change for `PDF-Extract-Kit 1.0`, you need to re-download the model. Please refer to [How to Download Models](docs/how_to_download_models_en.md) for detailed steps.
179
- - 2024/09/27 Version 0.8.1 released, Fixed some bugs, and providing a [localized deployment version](projects/web_demo/README.md) of the [online demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) and the [front-end interface](projects/web/README.md).
180
- - 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
181
- - 2024/08/30: Version 0.7.1 released, add paddle tablemaster table recognition option
182
- - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
183
- - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
184
- - 2024/07/05: Initial open-source release
136
+ <details>
137
+ <summary>2025/03/03 1.2.1 released</summary>
138
+ <ul>
139
+ <li>Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers</li>
140
+ <li>Fixed caption matching inaccuracies in certain scenarios</li>
141
+ <li>Fixed formula span loss issues in certain scenarios</li>
142
+ </ul>
143
+ </details>
144
+
145
+ <details>
146
+ <summary>2025/02/24 1.2.0 released</summary>
147
+ <p>This version includes several fixes and improvements to enhance parsing efficiency and accuracy:</p>
148
+ <ul>
149
+ <li><strong>Performance Optimization</strong>
150
+ <ul>
151
+ <li>Increased classification speed for PDF documents in auto mode.</li>
152
+ </ul>
153
+ </li>
154
+ <li><strong>Parsing Optimization</strong>
155
+ <ul>
156
+ <li>Improved parsing logic for documents containing watermarks, significantly enhancing the parsing results for such documents.</li>
157
+ <li>Enhanced the matching logic for multiple images/tables and captions within a single page, improving the accuracy of image-text matching in complex layouts.</li>
158
+ </ul>
159
+ </li>
160
+ <li><strong>Bug Fixes</strong>
161
+ <ul>
162
+ <li>Fixed an issue where image/table spans were incorrectly filled into text blocks under certain conditions.</li>
163
+ <li>Resolved an issue where title blocks were empty in some cases.</li>
164
+ </ul>
165
+ </li>
166
+ </ul>
167
+ </details>
168
+
169
+ <details>
170
+ <summary>2025/01/22 1.1.0 released</summary>
171
+ <p>In this version we have focused on improving parsing accuracy and efficiency:</p>
172
+ <ul>
173
+ <li><strong>Model capability upgrade</strong> (requires re-executing the <a href="https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_en.md">model download process</a> to obtain incremental updates of model files)
174
+ <ul>
175
+ <li>The layout recognition model has been upgraded to the latest <code>doclayout_yolo(2501)</code> model, improving layout recognition accuracy.</li>
176
+ <li>The formula parsing model has been upgraded to the latest <code>unimernet(2501)</code> model, improving formula recognition accuracy.</li>
177
+ </ul>
178
+ </li>
179
+ <li><strong>Performance optimization</strong>
180
+ <ul>
181
+ <li>On devices that meet certain configuration requirements (16GB+ VRAM), by optimizing resource usage and restructuring the processing pipeline, overall parsing speed has been increased by more than 50%.</li>
182
+ </ul>
183
+ </li>
184
+ <li><strong>Parsing effect optimization</strong>
185
+ <ul>
186
+ <li>Added a new heading classification feature (testing version, enabled by default) to the online demo (<a href="https://mineru.net/OpenSourceTools/Extractor">mineru.net</a>/<a href="https://huggingface.co/spaces/opendatalab/MinerU">huggingface</a>/<a href="https://www.modelscope.cn/studios/OpenDataLab/MinerU">modelscope</a>), which supports hierarchical classification of headings, thereby enhancing document structuring.</li>
187
+ </ul>
188
+ </li>
189
+ </ul>
190
+ </details>
191
+
192
+ <details>
193
+ <summary>2025/01/10 1.0.1 released</summary>
194
+ <p>This is our first official release, where we have introduced a completely new API interface and enhanced compatibility through extensive refactoring, as well as a brand new automatic language identification feature:</p>
195
+ <ul>
196
+ <li><strong>New API Interface</strong>
197
+ <ul>
198
+ <li>For the data-side API, we have introduced the Dataset class, designed to provide a robust and flexible data processing framework. This framework currently supports a variety of document formats, including images (.jpg and .png), PDFs, Word documents (.doc and .docx), and PowerPoint presentations (.ppt and .pptx). It ensures effective support for data processing tasks ranging from simple to complex.</li>
199
+ <li>For the user-side API, we have meticulously designed the MinerU processing workflow as a series of composable Stages. Each Stage represents a specific processing step, allowing users to define new Stages according to their needs and creatively combine these stages to customize their data processing workflows.</li>
200
+ </ul>
201
+ </li>
202
+ <li><strong>Enhanced Compatibility</strong>
203
+ <ul>
204
+ <li>By optimizing the dependency environment and configuration items, we ensure stable and efficient operation on ARM architecture Linux systems.</li>
205
+ <li>We have deeply integrated with Huawei Ascend NPU acceleration, providing autonomous and controllable high-performance computing capabilities. This supports the localization and development of AI application platforms in China. <a href="https://github.com/opendatalab/MinerU/blob/master/docs/README_Ascend_NPU_Acceleration_zh_CN.md">Ascend NPU Acceleration</a></li>
206
+ </ul>
207
+ </li>
208
+ <li><strong>Automatic Language Identification</strong>
209
+ <ul>
210
+ <li>By introducing a new language recognition model, setting the <code>lang</code> configuration to <code>auto</code> during document parsing will automatically select the appropriate OCR language model, improving the accuracy of scanned document parsing.</li>
211
+ </ul>
212
+ </li>
213
+ </ul>
214
+ </details>
215
+
216
+ <details>
217
+ <summary>2024/11/22 0.10.0 released</summary>
218
+ <p>Introducing hybrid OCR text extraction capabilities:</p>
219
+ <ul>
220
+ <li>Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.</li>
221
+ <li>Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.</li>
222
+ </ul>
223
+ </details>
224
+
225
+ <details>
226
+ <summary>2024/11/15 0.9.3 released</summary>
227
+ <p>Integrated <a href="https://github.com/RapidAI/RapidTable">RapidTable</a> for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.</p>
228
+ </details>
229
+
230
+ <details>
231
+ <summary>2024/11/06 0.9.2 released</summary>
232
+ <p>Integrated the <a href="https://huggingface.co/U4R/StructTable-InternVL2-1B">StructTable-InternVL2-1B</a> model for table recognition functionality.</p>
233
+ </details>
234
+
235
+ <details>
236
+ <summary>2024/10/31 0.9.0 released</summary>
237
+ <p>This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:</p>
238
+ <ul>
239
+ <li>Refactored the sorting module code to use <a href="https://github.com/ppaanngggg/layoutreader">layoutreader</a> for reading order sorting, ensuring high accuracy in various layouts.</li>
240
+ <li>Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.</li>
241
+ <li>Refactored the list and table of contents recognition functions, significantly improving the accuracy of list blocks and table of contents blocks, as well as the parsing of corresponding text paragraphs.</li>
242
+ <li>Refactored the matching logic for figures, tables, and descriptive text, greatly enhancing the accuracy of matching captions and footnotes to figures and tables, and reducing the loss rate of descriptive text to near zero.</li>
243
+ <li>Added multi-language support for OCR, supporting detection and recognition of 84 languages. For the list of supported languages, see <a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations">OCR Language Support List</a>.</li>
244
+ <li>Added memory recycling logic and other memory optimization measures, significantly reducing memory usage. The memory requirement for enabling all acceleration features except table acceleration (layout/formula/OCR) has been reduced from 16GB to 8GB, and the memory requirement for enabling all acceleration features has been reduced from 24GB to 10GB.</li>
245
+ <li>Optimized configuration file feature switches, adding an independent formula detection switch to significantly improve speed and parsing results when formula detection is not needed.</li>
246
+ <li>Integrated <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit 1.0</a>:
247
+ <ul>
248
+ <li>Added the self-developed <code>doclayout_yolo</code> model, which speeds up processing by more than 10 times compared to the original solution while maintaining similar parsing effects, and can be freely switched with <code>layoutlmv3</code> via the configuration file.</li>
249
+ <li>Upgraded formula parsing to <code>unimernet 0.2.1</code>, improving formula parsing accuracy while significantly reducing memory usage.</li>
250
+ <li>Due to the repository change for <code>PDF-Extract-Kit 1.0</code>, you need to re-download the model. Please refer to <a href="https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_en.md">How to Download Models</a> for detailed steps.</li>
251
+ </ul>
252
+ </li>
253
+ </ul>
254
+ </details>
255
+
256
+ <details>
257
+ <summary>2024/09/27 Version 0.8.1 released</summary>
258
+ <p>Fixed some bugs and provided a <a href="https://github.com/opendatalab/MinerU/blob/master/projects/web_demo/README.md">localized deployment version</a> of the <a href="https://opendatalab.com/OpenSourceTools/Extractor/PDF/">online demo</a> and the <a href="https://github.com/opendatalab/MinerU/blob/master/projects/web/README.md">front-end interface</a>.</p>
259
+ </details>
260
+
261
+ <details>
262
+ <summary>2024/09/09 Version 0.8.0 released</summary>
263
+ <p>Supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.</p>
264
+ </details>
265
+
266
+ <details>
267
+ <summary>2024/08/30 Version 0.7.1 released</summary>
268
+ <p>Added paddle tablemaster table recognition option</p>
269
+ </details>
270
+
271
+ <details>
272
+ <summary>2024/08/09 Version 0.7.0b1 released</summary>
273
+ <p>Simplified installation process, added table recognition functionality</p>
274
+ </details>
275
+
276
+ <details>
277
+ <summary>2024/08/01 Version 0.6.2b1 released</summary>
278
+ <p>Optimized dependency conflict issues and installation documentation</p>
279
+ </details>
280
+
281
+ <details>
282
+ <summary>2024/07/05 Initial open-source release</summary>
283
+ </details>
185
284
 
186
285
  <!-- TABLE OF CONTENT -->
187
286
 
@@ -10,8 +10,8 @@ magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLx
10
10
  magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
11
11
  magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
12
12
  magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- magic_pdf/data/batch_build_dataset.py,sha256=rS4f50hBc7IvSqa_Gd84E_tSYpQ66BMaeZkCPd5Ajxw,4601
14
- magic_pdf/data/dataset.py,sha256=nsS507s1lPyfjnzEhfsQiBy_CdScPy79h3Fvjk_VKp0,12237
13
+ magic_pdf/data/batch_build_dataset.py,sha256=KQoWFJDqCwRQug8-fTuciSwff58AYRjCNP6GdiDhxLI,4953
14
+ magic_pdf/data/dataset.py,sha256=2v-a7kA6dRUDQpjlAVE5We1tMATR-MYKzQCcBhNci5g,12258
15
15
  magic_pdf/data/read_api.py,sha256=_faBnYE3iU_EiQLNFjVM6a8IQtOGAcSQNYBZsTSN1d8,5225
16
16
  magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
17
17
  magic_pdf/data/utils.py,sha256=dNWIJECPXaakKocI4z5Tq6vhDDSnR-bVWQV7DO2w_A8,5335
@@ -52,17 +52,17 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
52
52
  magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
53
53
  magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
54
54
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
55
- magic_pdf/libs/version.py,sha256=-ypEJktJToAL9by62JJKWEzDo_KPCQtmE5kwFgX24z4,22
55
+ magic_pdf/libs/version.py,sha256=Vi6om3KImlKsS_Wg5CjUgYffoi2zx7T-SRPnnGL0G7M,22
56
56
  magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
57
- magic_pdf/model/batch_analyze.py,sha256=6vRqGnZjDqznsifeDZhjD_v8RmDSdDNxOAci8GCFozo,11211
58
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=z1JWvM24poMd2SsziRJRzeqJ9rKXbqSwJprCheuXSGg,10282
57
+ magic_pdf/model/batch_analyze.py,sha256=yKhKQuZTh9GG83p61bw2BRqKMbnsjsmX73gfuTRk8xE,11272
58
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
59
59
  magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
60
60
  magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
61
61
  magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd05_fGU,10845
62
62
  magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
63
63
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
64
  magic_pdf/model/sub_modules/model_init.py,sha256=e2t95kxiuU47luOHByDokbQ2uob6oQhEA4b6UGVfMjY,8303
65
- magic_pdf/model/sub_modules/model_utils.py,sha256=GGkVqdGPTmPUaYTuPHxjzzxIizg1kmYo8voIdE7ETdg,2653
65
+ magic_pdf/model/sub_modules/model_utils.py,sha256=iNC-zuDLWkwUAwMZ0YcGxAwHn5SAAFRdZBQgTy9nmgY,9880
66
66
  magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
67
67
  magic_pdf/model/sub_modules/language_detection/utils.py,sha256=Q__v6DdNJztt8GhVSuSB0txahVq-aj8RLhWn2VScx4w,3047
68
68
  magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=7T8eFl8zlZe6F0j0jSB3jrwOapDft320JQ1fuWxpvAY,5230
@@ -141,7 +141,7 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_post
141
141
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py,sha256=HiHNr4bhW5U1j4pYoyi8fPOaFsn8TUc4nSB6q8chfV4,26899
142
142
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
143
143
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml,sha256=eEzg5D5L3MHFL4H02gZnxdDiqtSCUzZDnt5pqDAmgCI,6980
144
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=GOtAGMAretviqDXak409PPav7qHYMDBwSs9wxlSANRA,1388
144
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=M0vyAENxKIaPaSdRBDhH8ik5V71vcY1STkZoq-3iqD8,1504
145
145
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt,sha256=xbaXD14RWk0Vpc7fAHpephuszp1j-Qi3IWC4VrFKu70,407
146
146
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt,sha256=gyVR_uHy-8l1CHctgevcjboSwA3pejXHHJ3fQ92sGoM,33443
147
147
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt,sha256=NpqCxsjEeXhKXXJkSLg7Hq-1_vCkEppeqjkpYl3c0TI,410
@@ -158,7 +158,7 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py,sha256=xEqR6
158
158
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
159
159
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py,sha256=8RmKl1vejnZl65caHZNV2ta6hMsg5B_LE-FuqCO8T8A,4225
160
160
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py,sha256=cRBKE0blzryj3Ar6yM0FKKgxmZdgMc44NDNl1S2wiRs,9136
161
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py,sha256=_fLTWjEmDZwXC-zzPT37PHO-nNlEvafemo2CyPJS7_w,19216
161
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py,sha256=GZ1PhVZ6GCPedgzU02e4pC52jHPf7uNI1GTID2CkMHA,19444
162
162
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-fkTS8swRYSbZeoqmSI8,3822
163
163
  magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
164
164
  magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -168,7 +168,7 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
168
168
  magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
169
169
  magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
170
170
  magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=w9nTdoTV5EJsG8ZlshNig0cdaMwlQ3XlQF1MKVuMwD8,2785
171
+ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=6W6qBNFZ_ETyk7B7Figk2ekPT3YgM_CUGWlAbdJC6dQ,4399
172
172
  magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1REI,2630
173
173
  magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
174
174
  magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
@@ -178,7 +178,7 @@ magic_pdf/post_proc/para_split_v3.py,sha256=SPN_VVGvFX5KpFMGw9OzgoE-kTZq-FF036i0
178
178
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
179
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
180
180
  magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
181
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
181
+ magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=3_bEbZC_BDwbuaBLPdCIbkxz93-g9oCtvjuXD8qbklo,9330
182
182
  magic_pdf/pre_proc/ocr_dict_merge.py,sha256=PscKGF0uJIjMxZRM69FLUs1SZO_wOswDQQV1f0M2xAo,5627
183
183
  magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=bs5RLvk4kIyx9_Hqq0FU3AGPPxE8Sxs97Uwlf1sBryM,4725
184
184
  magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
@@ -191,13 +191,13 @@ magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,
191
191
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
192
192
  magic_pdf/tools/cli.py,sha256=_oa-M5Hcopa5RZudVzrEip2W8pa9422Lmat7tMBJO5M,5171
193
193
  magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
194
- magic_pdf/tools/common.py,sha256=LoUz6Y36_U2odZqzBNKXngFNa6plf01U7_5jlDAFXaQ,12313
194
+ magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,12623
195
195
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
196
196
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
197
197
  magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
198
- magic_pdf-1.3.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
199
- magic_pdf-1.3.1.dist-info/METADATA,sha256=PGXFggL8ni7iXJ5qUXfZLGZqXrbEi9TUhLYzCVxduWw,43499
200
- magic_pdf-1.3.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
201
- magic_pdf-1.3.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
202
- magic_pdf-1.3.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
203
- magic_pdf-1.3.1.dist-info/RECORD,,
198
+ magic_pdf-1.3.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
199
+ magic_pdf-1.3.3.dist-info/METADATA,sha256=1Y-a4UouLQRhsldrhz6UZLlx4KUFOdjSk5R1gK_oYjs,45615
200
+ magic_pdf-1.3.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
201
+ magic_pdf-1.3.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
202
+ magic_pdf-1.3.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
203
+ magic_pdf-1.3.3.dist-info/RECORD,,