PyPI - magic-pdf - Versions diffs - 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl - Mend

magic-pdf 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

magic_pdf/data/batch_build_dataset.py CHANGED Viewed

@@ -103,54 +103,65 @@ def batch_build_dataset(pdf_paths, k, lang=None):
     all_images : list
         List of all processed images
     """
-    # Get page counts for each PDF
-    pdf_info = []
-    total_pages = 0
+    results = []
     for pdf_path in pdf_paths:
-        try:
-            doc = fitz.open(pdf_path)
-            num_pages = len(doc)
-            pdf_info.append((pdf_path, num_pages))
-            total_pages += num_pages
-            doc.close()
-        except Exception as e:
-            print(f'Error opening {pdf_path}: {e}')
-    # Partition the jobs based on page countEach job has 1 page
-    partitions = partition_array_greedy(pdf_info, k)
-    # Process each partition in parallel
-    all_images_h = {}
-    with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
-        # Submit one task per partition
-        futures = []
-        for sn, partition in enumerate(partitions):
-            # Get the jobs for this partition
-            partition_jobs = [pdf_info[idx] for idx in partition]
-            # Submit the task
-            future = executor.submit(
-                process_pdf_batch,
-                partition_jobs,
-                sn
-            )
-            futures.append(future)
-        # Process results as they complete
-        for i, future in enumerate(concurrent.futures.as_completed(futures)):
-            try:
-                idx, images = future.result()
-                all_images_h[idx] = images
-            except Exception as e:
-                print(f'Error processing partition: {e}')
-    results = [None] * len(pdf_paths)
-    for i in range(len(partitions)):
-        partition = partitions[i]
-        for j in range(len(partition)):
-            with open(pdf_info[partition[j]][0], 'rb') as f:
-                pdf_bytes = f.read()
-            dataset = PymuDocDataset(pdf_bytes, lang=lang)
-            dataset.set_images(all_images_h[i][j])
-            results[partition[j]] = dataset
+        with open(pdf_path, 'rb') as f:
+            pdf_bytes = f.read()
+        dataset = PymuDocDataset(pdf_bytes, lang=lang)
+        results.append(dataset)
     return results
+    #
+    # # Get page counts for each PDF
+    # pdf_info = []
+    # total_pages = 0
+    #
+    # for pdf_path in pdf_paths:
+    #     try:
+    #         doc = fitz.open(pdf_path)
+    #         num_pages = len(doc)
+    #         pdf_info.append((pdf_path, num_pages))
+    #         total_pages += num_pages
+    #         doc.close()
+    #     except Exception as e:
+    #         print(f'Error opening {pdf_path}: {e}')
+    #
+    # # Partition the jobs based on page countEach job has 1 page
+    # partitions = partition_array_greedy(pdf_info, k)
+    #
+    # # Process each partition in parallel
+    # all_images_h = {}
+    #
+    # with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
+    #     # Submit one task per partition
+    #     futures = []
+    #     for sn, partition in enumerate(partitions):
+    #         # Get the jobs for this partition
+    #         partition_jobs = [pdf_info[idx] for idx in partition]
+    #
+    #         # Submit the task
+    #         future = executor.submit(
+    #             process_pdf_batch,
+    #             partition_jobs,
+    #             sn
+    #         )
+    #         futures.append(future)
+    #     # Process results as they complete
+    #     for i, future in enumerate(concurrent.futures.as_completed(futures)):
+    #         try:
+    #             idx, images = future.result()
+    #             all_images_h[idx] = images
+    #         except Exception as e:
+    #             print(f'Error processing partition: {e}')
+    # results = [None] * len(pdf_paths)
+    # for i in range(len(partitions)):
+    #     partition = partitions[i]
+    #     for j in range(len(partition)):
+    #         with open(pdf_info[partition[j]][0], 'rb') as f:
+    #             pdf_bytes = f.read()
+    #         dataset = PymuDocDataset(pdf_bytes, lang=lang)
+    #         dataset.set_images(all_images_h[i][j])
+    #         results[partition[j]] = dataset
+    # return results

magic_pdf/data/dataset.py CHANGED Viewed

@@ -150,7 +150,7 @@ class PymuDocDataset(Dataset):
         elif lang == 'auto':
             from magic_pdf.model.sub_modules.language_detection.utils import \
                 auto_detect_lang
-            self._lang = auto_detect_lang(bits)
+            self._lang = auto_detect_lang(self._data_bits)
             logger.info(f'lang: {lang}, detect_lang: {self._lang}')
         else:
             self._lang = lang
@@ -249,7 +249,7 @@ class ImageDataset(Dataset):
         elif lang == 'auto':
             from magic_pdf.model.sub_modules.language_detection.utils import \
                 auto_detect_lang
-            self._lang = auto_detect_lang(bits)
+            self._lang = auto_detect_lang(self._data_bits)
             logger.info(f'lang: {lang}, detect_lang: {self._lang}')
         else:
             self._lang = lang
@@ -405,4 +405,4 @@ class Doc(PageableData):
             fontsize (int): font size of the text
             color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
         """
-        self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
+        self._doc.insert_text(coord, content, fontsize=fontsize, color=color)

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.3.1"
1	+ __version__ = "1.3.3"

magic_pdf/model/batch_analyze.py CHANGED Viewed

@@ -30,8 +30,14 @@ class BatchAnalyze:
         images_layout_res = []
         layout_start_time = time.time()
-        _, fst_ocr, fst_lang = images_with_extra_info[0]
-        self.model = self.model_manager.get_model(fst_ocr, self.show_log, fst_lang, self.layout_model, self.formula_enable, self.table_enable)
+        self.model = self.model_manager.get_model(
+            ocr=True,
+            show_log=self.show_log,
+            lang = None,
+            layout_model = self.layout_model,
+            formula_enable = self.formula_enable,
+            table_enable = self.table_enable,
+        )
         images = [image for image, _, _ in images_with_extra_info]
@@ -143,14 +149,14 @@ class BatchAnalyze:
                 if ocr_res:
                     ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
                     ocr_res_list_dict['layout_res'].extend(ocr_result_list)
-            det_count += len(ocr_res_list_dict['ocr_res_list'])
+            # det_count += len(ocr_res_list_dict['ocr_res_list'])
         # logger.info(f'ocr-det time: {round(time.time()-det_start, 2)}, image num: {det_count}')
         # 表格识别 table recognition
         if self.model.apply_table:
             table_start = time.time()
-            table_count = 0
             # for table_res_list_dict in table_res_list_all_page:
             for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"):
                 _lang = table_res_dict['lang']

magic_pdf/model/doc_analyze_by_custom_model.py CHANGED Viewed

@@ -146,10 +146,8 @@ def doc_analyze(
             img_dict = page_data.get_image()
             images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
-    if lang is None or lang == 'auto':
-        images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(dataset))]
-    else:
-        images_with_extra_info = [(images[index], ocr, lang) for index in range(len(dataset))]
+    images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
     if len(images) >= MIN_BATCH_INFERENCE_SIZE:
         batch_size = MIN_BATCH_INFERENCE_SIZE
@@ -158,8 +156,8 @@ def doc_analyze(
         batch_images = [images_with_extra_info]
     results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, ocr, show_log,layout_model, formula_enable, table_enable)
+    for batch_image in batch_images:
+        result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
         results.extend(result)
     model_json = []
@@ -181,7 +179,7 @@ def doc_analyze(
 def batch_doc_analyze(
     datasets: list[Dataset],
-    parse_method: str,
+    parse_method: str = 'auto',
     show_log: bool = False,
     lang=None,
     layout_model=None,
@@ -190,30 +188,37 @@ def batch_doc_analyze(
 ):
     MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
     batch_size = MIN_BATCH_INFERENCE_SIZE
-    images = []
     page_wh_list = []
     images_with_extra_info = []
     for dataset in datasets:
-        for index in range(len(dataset)):
-            if lang is None or lang == 'auto':
-                _lang = dataset._lang
-            else:
-                _lang = lang
+        ocr = False
+        if parse_method == 'auto':
+            if dataset.classify() == SupportedPdfParseMethod.TXT:
+                ocr = False
+            elif dataset.classify() == SupportedPdfParseMethod.OCR:
+                ocr = True
+        elif parse_method == 'ocr':
+            ocr = True
+        elif parse_method == 'txt':
+            ocr = False
+        _lang = dataset._lang
+        for index in range(len(dataset)):
             page_data = dataset.get_page(index)
             img_dict = page_data.get_image()
-            images.append(img_dict['img'])
             page_wh_list.append((img_dict['width'], img_dict['height']))
-            if parse_method == 'auto':
-                images_with_extra_info.append((images[-1], dataset.classify() == SupportedPdfParseMethod.OCR, _lang))
-            else:
-                images_with_extra_info.append((images[-1], parse_method == 'ocr', _lang))
+            images_with_extra_info.append((img_dict['img'], ocr, _lang))
     batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
     results = []
-    for sn, batch_image in enumerate(batch_images):
-        _, result = may_batch_image_analyze(batch_image, sn, True, show_log, layout_model, formula_enable, table_enable)
+    processed_images_count = 0
+    for index, batch_image in enumerate(batch_images):
+        processed_images_count += len(batch_image)
+        logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
+        result = may_batch_image_analyze(batch_image, True, show_log, layout_model, formula_enable, table_enable)
         results.extend(result)
     infer_results = []
@@ -233,7 +238,6 @@ def batch_doc_analyze(
 def may_batch_image_analyze(
         images_with_extra_info: list[(np.ndarray, bool, str)],
-        idx: int,
         ocr: bool,
         show_log: bool = False,
         layout_model=None,
@@ -291,4 +295,4 @@ def may_batch_image_analyze(
     #     f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
     #     f' speed: {doc_analyze_speed} pages/second'
     # )
-    return idx, results
+    return results

magic_pdf/model/sub_modules/model_utils.py CHANGED Viewed

@@ -29,22 +29,204 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
     return return_image, return_list
-# Select regions for OCR / formula regions / table regions
-def get_res_list_from_layout_res(layout_res):
+def get_coords_and_area(table):
+    """Extract coordinates and area from a table."""
+    xmin, ymin = int(table['poly'][0]), int(table['poly'][1])
+    xmax, ymax = int(table['poly'][4]), int(table['poly'][5])
+    area = (xmax - xmin) * (ymax - ymin)
+    return xmin, ymin, xmax, ymax, area
+def calculate_intersection(box1, box2):
+    """Calculate intersection coordinates between two boxes."""
+    intersection_xmin = max(box1[0], box2[0])
+    intersection_ymin = max(box1[1], box2[1])
+    intersection_xmax = min(box1[2], box2[2])
+    intersection_ymax = min(box1[3], box2[3])
+    # Check if intersection is valid
+    if intersection_xmax <= intersection_xmin or intersection_ymax <= intersection_ymin:
+        return None
+    return intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax
+def calculate_iou(box1, box2):
+    """Calculate IoU between two boxes."""
+    intersection = calculate_intersection(box1[:4], box2[:4])
+    if not intersection:
+        return 0
+    intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax = intersection
+    intersection_area = (intersection_xmax - intersection_xmin) * (intersection_ymax - intersection_ymin)
+    area1, area2 = box1[4], box2[4]
+    union_area = area1 + area2 - intersection_area
+    return intersection_area / union_area if union_area > 0 else 0
+def is_inside(small_box, big_box, overlap_threshold=0.8):
+    """Check if small_box is inside big_box by at least overlap_threshold."""
+    intersection = calculate_intersection(small_box[:4], big_box[:4])
+    if not intersection:
+        return False
+    intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax = intersection
+    intersection_area = (intersection_xmax - intersection_xmin) * (intersection_ymax - intersection_ymin)
+    # Check if overlap exceeds threshold
+    return intersection_area >= overlap_threshold * small_box[4]
+def do_overlap(box1, box2):
+    """Check if two boxes overlap."""
+    return calculate_intersection(box1[:4], box2[:4]) is not None
+def merge_high_iou_tables(table_res_list, layout_res, table_indices, iou_threshold=0.7):
+    """Merge tables with IoU > threshold."""
+    if len(table_res_list) < 2:
+        return table_res_list, table_indices
+    table_info = [get_coords_and_area(table) for table in table_res_list]
+    merged = True
+    while merged:
+        merged = False
+        i = 0
+        while i < len(table_res_list) - 1:
+            j = i + 1
+            while j < len(table_res_list):
+                iou = calculate_iou(table_info[i], table_info[j])
+                if iou > iou_threshold:
+                    # Merge tables by taking their union
+                    x1_min, y1_min, x1_max, y1_max, _ = table_info[i]
+                    x2_min, y2_min, x2_max, y2_max, _ = table_info[j]
+                    union_xmin = min(x1_min, x2_min)
+                    union_ymin = min(y1_min, y2_min)
+                    union_xmax = max(x1_max, x2_max)
+                    union_ymax = max(y1_max, y2_max)
+                    # Create merged table
+                    merged_table = table_res_list[i].copy()
+                    merged_table['poly'][0] = union_xmin
+                    merged_table['poly'][1] = union_ymin
+                    merged_table['poly'][2] = union_xmax
+                    merged_table['poly'][3] = union_ymin
+                    merged_table['poly'][4] = union_xmax
+                    merged_table['poly'][5] = union_ymax
+                    merged_table['poly'][6] = union_xmin
+                    merged_table['poly'][7] = union_ymax
+                    # Update layout_res
+                    to_remove = [table_indices[j], table_indices[i]]
+                    for idx in sorted(to_remove, reverse=True):
+                        del layout_res[idx]
+                    layout_res.append(merged_table)
+                    # Update tracking lists
+                    table_indices = [k if k < min(to_remove) else
+                                     k - 1 if k < max(to_remove) else
+                                     k - 2 if k > max(to_remove) else
+                                     len(layout_res) - 1
+                                     for k in table_indices
+                                     if k not in to_remove]
+                    table_indices.append(len(layout_res) - 1)
+                    # Update table lists
+                    table_res_list.pop(j)
+                    table_res_list.pop(i)
+                    table_res_list.append(merged_table)
+                    # Update table_info
+                    table_info = [get_coords_and_area(table) for table in table_res_list]
+                    merged = True
+                    break
+                j += 1
+            if merged:
+                break
+            i += 1
+    return table_res_list, table_indices
+def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0.8):
+    """Remove big tables containing multiple smaller tables within them."""
+    if len(table_res_list) < 3:
+        return table_res_list
+    table_info = [get_coords_and_area(table) for table in table_res_list]
+    big_tables_idx = []
+    for i in range(len(table_res_list)):
+        # Find tables inside this one
+        tables_inside = [j for j in range(len(table_res_list))
+                         if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
+        # Continue if there are at least 2 tables inside
+        if len(tables_inside) >= 2:
+            # Check if inside tables overlap with each other
+            tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
+                                 for idx1 in range(len(tables_inside))
+                                 for idx2 in range(idx1 + 1, len(tables_inside)))
+            # If no overlaps, check area condition
+            if not tables_overlap:
+                total_inside_area = sum(table_info[j][4] for j in tables_inside)
+                big_table_area = table_info[i][4]
+                if total_inside_area > area_threshold * big_table_area:
+                    big_tables_idx.append(i)
+    return [table for i, table in enumerate(table_res_list) if i not in big_tables_idx]
+def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold=0.8, area_threshold=0.8):
+    """Extract OCR, table and other regions from layout results."""
     ocr_res_list = []
     table_res_list = []
+    table_indices = []
     single_page_mfdetrec_res = []
-    for res in layout_res:
-        if int(res['category_id']) in [13, 14]:
+    # Categorize regions
+    for i, res in enumerate(layout_res):
+        category_id = int(res['category_id'])
+        if category_id in [13, 14]:  # Formula regions
             single_page_mfdetrec_res.append({
                 "bbox": [int(res['poly'][0]), int(res['poly'][1]),
                          int(res['poly'][4]), int(res['poly'][5])],
             })
-        elif int(res['category_id']) in [0, 1, 2, 4, 6, 7]:
+        elif category_id in [0, 1, 2, 4, 6, 7]:  # OCR regions
             ocr_res_list.append(res)
-        elif int(res['category_id']) in [5]:
+        elif category_id == 5:  # Table regions
             table_res_list.append(res)
-    return ocr_res_list, table_res_list, single_page_mfdetrec_res
+            table_indices.append(i)
+    # Process tables: merge high IoU tables first, then filter nested tables
+    table_res_list, table_indices = merge_high_iou_tables(
+        table_res_list, layout_res, table_indices, iou_threshold)
+    filtered_table_res_list = filter_nested_tables(
+        table_res_list, overlap_threshold, area_threshold)
+    # Remove filtered out tables from layout_res
+    if len(filtered_table_res_list) < len(table_res_list):
+        kept_tables = set(id(table) for table in filtered_table_res_list)
+        to_remove = [table_indices[i] for i, table in enumerate(table_res_list)
+                     if id(table) not in kept_tables]
+        for idx in sorted(to_remove, reverse=True):
+            del layout_res[idx]
+    return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res
 def clean_vram(device, vram_threshold=8):

magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml CHANGED Viewed

@@ -1,8 +1,12 @@
 lang:
-  ch:
+  ch_lite:
     det: ch_PP-OCRv3_det_infer.pth
     rec: ch_PP-OCRv4_rec_infer.pth
     dict: ppocr_keys_v1.txt
+  ch:
+    det: ch_PP-OCRv3_det_infer.pth
+    rec: ch_PP-OCRv4_rec_server_infer.pth
+    dict: ppocr_keys_v1.txt
   en:
     det: en_PP-OCRv3_det_infer.pth
     rec: en_PP-OCRv4_rec_infer.pth

magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py CHANGED Viewed

@@ -437,4 +437,10 @@ class TextRecognizer(BaseOCRV20):
                 index += 1
                 pbar.update(current_batch_size)
+        # Fix NaN values in recognition results
+        for i in range(len(rec_res)):
+            text, score = rec_res[i]
+            if isinstance(score, float) and math.isnan(score):
+                rec_res[i] = (text, 0.0)
         return rec_res, elapse

magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py CHANGED Viewed

@@ -35,26 +35,63 @@ class RapidTableModel(object):
         #     from rapidocr_onnxruntime import RapidOCR
         #     self.ocr_engine = RapidOCR()
-        self.ocr_model_name = "PaddleOCR"
+        # self.ocr_model_name = "PaddleOCR"
         self.ocr_engine = ocr_engine
     def predict(self, image):
+        bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
-        if self.ocr_model_name == "RapidOCR":
-            ocr_result, _ = self.ocr_engine(np.asarray(image))
-        elif self.ocr_model_name == "PaddleOCR":
-            bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
-            ocr_result = self.ocr_engine.ocr(bgr_image)[0]
-            if ocr_result:
-                ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if
-                          len(item) == 2 and isinstance(item[1], tuple)]
-            else:
-                ocr_result = None
+        # First check the overall image aspect ratio (height/width)
+        img_height, img_width = bgr_image.shape[:2]
+        img_aspect_ratio = img_height / img_width if img_width > 0 else 1.0
+        img_is_portrait = img_aspect_ratio > 1.2
+        if img_is_portrait:
+            det_res = self.ocr_engine.ocr(bgr_image, rec=False)[0]
+            # Check if table is rotated by analyzing text box aspect ratios
+            is_rotated = False
+            if det_res:
+                vertical_count = 0
+                for box_ocr_res in det_res:
+                    p1, p2, p3, p4 = box_ocr_res
+                    # Calculate width and height
+                    width = p3[0] - p1[0]
+                    height = p3[1] - p1[1]
+                    aspect_ratio = width / height if height > 0 else 1.0
+                    # Count vertical vs horizontal text boxes
+                    if aspect_ratio < 0.8:  # Taller than wide - vertical text
+                        vertical_count += 1
+                    # elif aspect_ratio > 1.2:  # Wider than tall - horizontal text
+                    #     horizontal_count += 1
+                # If we have more vertical text boxes than horizontal ones,
+                # and vertical ones are significant, table might be rotated
+                if vertical_count >= len(det_res) * 0.3:
+                    is_rotated = True
+                # logger.debug(f"Text orientation analysis: vertical={vertical_count}, det_res={len(det_res)}, rotated={is_rotated}")
+            # Rotate image if necessary
+            if is_rotated:
+                # logger.debug("Table appears to be in portrait orientation, rotating 90 degrees clockwise")
+                image = cv2.rotate(np.asarray(image), cv2.ROTATE_90_CLOCKWISE)
+                bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+        # Continue with OCR on potentially rotated image
+        ocr_result = self.ocr_engine.ocr(bgr_image)[0]
+        if ocr_result:
+            ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if
+                      len(item) == 2 and isinstance(item[1], tuple)]
         else:
-            logger.error("OCR model not supported")
             ocr_result = None
         if ocr_result:
             table_results = self.table_model(np.asarray(image), ocr_result)
             html_code = table_results.pred_html

magic_pdf/pre_proc/ocr_detect_all_bboxes.py CHANGED Viewed

@@ -99,11 +99,11 @@ def ocr_prepare_bboxes_for_layout_split_v2(
     all_discarded_blocks = []
     add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
-    """footnote识别：宽度超过1/3页面宽度的，高度超过10的，处于页面下半50%区域的"""
+    """footnote识别：宽度超过1/3页面宽度的，高度超过10的，处于页面下半30%区域的"""
     footnote_blocks = []
     for discarded in discarded_blocks:
         x0, y0, x1, y1 = discarded['bbox']
-        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
+        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
             footnote_blocks.append([x0, y0, x1, y1])
     """移除在footnote下面的任何框"""

magic_pdf/tools/common.py CHANGED Viewed

@@ -109,9 +109,7 @@ def _do_parse(
     pdf_bytes = ds._raw_data
     local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
-    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
-        local_md_dir
-    )
+    image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
     image_dir = str(os.path.basename(local_image_dir))
     if len(model_list) == 0:
@@ -317,7 +315,26 @@ def batch_do_parse(
     infer_results = batch_doc_analyze(dss, parse_method, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
     for idx, infer_result in enumerate(infer_results):
-        _do_parse(output_dir, pdf_file_names[idx], dss[idx], infer_result.get_infer_res(), parse_method, debug_able, f_draw_span_bbox=f_draw_span_bbox, f_draw_layout_bbox=f_draw_layout_bbox, f_dump_md=f_dump_md, f_dump_middle_json=f_dump_middle_json, f_dump_model_json=f_dump_model_json, f_dump_orig_pdf=f_dump_orig_pdf, f_dump_content_list=f_dump_content_list, f_make_md_mode=f_make_md_mode, f_draw_model_bbox=f_draw_model_bbox, f_draw_line_sort_bbox=f_draw_line_sort_bbox, f_draw_char_bbox=f_draw_char_bbox, lang=lang)
+        _do_parse(
+            output_dir = output_dir,
+            pdf_file_name = pdf_file_names[idx],
+            pdf_bytes_or_dataset = dss[idx],
+            model_list = infer_result.get_infer_res(),
+            parse_method = parse_method,
+            debug_able = debug_able,
+            f_draw_span_bbox = f_draw_span_bbox,
+            f_draw_layout_bbox = f_draw_layout_bbox,
+            f_dump_md=f_dump_md,
+            f_dump_middle_json=f_dump_middle_json,
+            f_dump_model_json=f_dump_model_json,
+            f_dump_orig_pdf=f_dump_orig_pdf,
+            f_dump_content_list=f_dump_content_list,
+            f_make_md_mode=MakeMode.MM_MD,
+            f_draw_model_bbox=f_draw_model_bbox,
+            f_draw_line_sort_bbox=f_draw_line_sort_bbox,
+            f_draw_char_bbox=f_draw_char_bbox,
+            lang=lang,
+        )
 parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])

{magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: magic-pdf
-Version: 1.3.1
+Version: 1.3.3
 Summary: A practical tool for converting PDF to Markdown
 License: AGPL-3.0
 Project-URL: Home, https://mineru.net/
@@ -23,27 +23,26 @@ Requires-Dist: numpy >=1.21.6
 Requires-Dist: pdfminer.six ==20231228
 Requires-Dist: pydantic <2.11,>=2.7.2
 Requires-Dist: scikit-learn >=1.0.2
-Requires-Dist: torch !=2.5.0,!=2.5.1,<=2.6.0,>=2.2.2
+Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
 Requires-Dist: torchvision
 Requires-Dist: tqdm >=4.67.1
 Requires-Dist: transformers !=4.51.0,<5.0.0,>=4.49.0
 Provides-Extra: full
 Requires-Dist: PyYAML <7,>=6.0.2 ; extra == 'full'
-Requires-Dist: dill <1,>=0.3.9 ; extra == 'full'
+Requires-Dist: dill <1,>=0.3.8 ; extra == 'full'
 Requires-Dist: doclayout-yolo ==0.0.2b1 ; extra == 'full'
 Requires-Dist: ftfy <7,>=6.3.1 ; extra == 'full'
+Requires-Dist: matplotlib <4,>=3.10 ; extra == 'full'
 Requires-Dist: omegaconf <3,>=2.3.0 ; extra == 'full'
 Requires-Dist: openai <2,>=1.70.0 ; extra == 'full'
 Requires-Dist: pyclipper <2,>=1.3.0 ; extra == 'full'
 Requires-Dist: rapid-table <2.0.0,>=1.0.5 ; extra == 'full'
 Requires-Dist: shapely <3,>=2.0.7 ; extra == 'full'
-Requires-Dist: ultralytics >=8.3.48 ; extra == 'full'
-Requires-Dist: matplotlib >=3.10 ; (platform_system == "Linux" or platform_system == "Darwin") and extra == 'full'
-Requires-Dist: matplotlib <=3.9.0 ; (platform_system == "Windows") and extra == 'full'
+Requires-Dist: ultralytics <9,>=8.3.48 ; extra == 'full'
 Provides-Extra: full_old_linux
 Requires-Dist: PyYAML ==6.0.2 ; extra == 'full_old_linux'
 Requires-Dist: albumentations ==1.4.20 ; extra == 'full_old_linux'
-Requires-Dist: dill ==0.3.9 ; extra == 'full_old_linux'
+Requires-Dist: dill ==0.3.8 ; extra == 'full_old_linux'
 Requires-Dist: doclayout-yolo ==0.0.2b1 ; extra == 'full_old_linux'
 Requires-Dist: ftfy ==6.3.1 ; extra == 'full_old_linux'
 Requires-Dist: matplotlib <=3.10.1,>=3.10 ; extra == 'full_old_linux'
@@ -108,9 +107,14 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
 </div>
 # Changelog
+- 2025/04/12 1.3.2 released
+  - Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems.
+  - Optimized memory usage during batch inference.
+  - Improved the parsing effect of tables rotated by 90 degrees.
+  - Enhanced the parsing accuracy for large tables in financial report samples.
+  - Fixed the occasional word concatenation issue in English text areas when OCR language is not specified.(The model needs to be updated)
 - 2025/04/08 1.3.1 released, fixed some compatibility issues
   - Supported Python 3.13
-  - Resolved errors caused by `transformers 4.51.0`
   - Made the final adaptation for some outdated Linux systems (e.g., CentOS 7), and no further support will be guaranteed for subsequent versions. [Installation Instructions](https://github.com/opendatalab/MinerU/issues/1004)
 - 2025/04/03 1.3.0 released, in this version we made many optimizations and improvements:
   - Installation and compatibility optimization
@@ -129,59 +133,154 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
   - Usability Optimization
     - By using `paddleocr2torch`, completely replaced the use of the `paddle` framework and `paddleocr` in the project, resolving conflicts between `paddle` and `torch`, as well as thread safety issues caused by the `paddle` framework.
     - Added a real-time progress bar during the parsing process to accurately track progress, making the wait less painful.
-- 2025/03/03 1.2.1 released, fixed several bugs:
-  - Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers
-  - Fixed caption matching inaccuracies in certain scenarios
-  - Fixed formula span loss issues in certain scenarios
-- 2025/02/24 1.2.0 released. This version includes several fixes and improvements to enhance parsing efficiency and accuracy:
-  - Performance Optimization
-    - Increased classification speed for PDF documents in auto mode.
-  - Parsing Optimization
-    - Improved parsing logic for documents containing watermarks, significantly enhancing the parsing results for such documents.
-    - Enhanced the matching logic for multiple images/tables and captions within a single page, improving the accuracy of image-text matching in complex layouts.
-  - Bug Fixes
-    - Fixed an issue where image/table spans were incorrectly filled into text blocks under certain conditions.
-    - Resolved an issue where title blocks were empty in some cases.
-- 2025/01/22 1.1.0 released. In this version we have focused on improving parsing accuracy and efficiency:
-  - Model capability upgrade (requires re-executing the [model download process](docs/how_to_download_models_en.md) to obtain incremental updates of model files)
-    - The layout recognition model has been upgraded to the latest `doclayout_yolo(2501)` model, improving layout recognition accuracy.
-    - The formula parsing model has been upgraded to the latest `unimernet(2501)` model, improving formula recognition accuracy.
-  - Performance optimization
-    - On devices that meet certain configuration requirements (16GB+ VRAM), by optimizing resource usage and restructuring the processing pipeline, overall parsing speed has been increased by more than 50%.
-  - Parsing effect optimization
-    - Added a new heading classification feature (testing version, enabled by default) to the online demo([mineru.net](https://mineru.net/OpenSourceTools/Extractor)/[huggingface](https://huggingface.co/spaces/opendatalab/MinerU)/[modelscope](https://www.modelscope.cn/studios/OpenDataLab/MinerU)), which supports hierarchical classification of headings, thereby enhancing document structuring.
-- 2025/01/10 1.0.1 released. This is our first official release, where we have introduced a completely new API interface and enhanced compatibility through extensive refactoring, as well as a brand new automatic language identification feature:
-  - New API Interface
-    - For the data-side API, we have introduced the Dataset class, designed to provide a robust and flexible data processing framework. This framework currently supports a variety of document formats, including images (.jpg and .png), PDFs, Word documents (.doc and .docx), and PowerPoint presentations (.ppt and .pptx). It ensures effective support for data processing tasks ranging from simple to complex.
-    - For the user-side API, we have meticulously designed the MinerU processing workflow as a series of composable Stages. Each Stage represents a specific processing step, allowing users to define new Stages according to their needs and creatively combine these stages to customize their data processing workflows.
-  - Enhanced Compatibility
-    - By optimizing the dependency environment and configuration items, we ensure stable and efficient operation on ARM architecture Linux systems.
-    - We have deeply integrated with Huawei Ascend NPU acceleration, providing autonomous and controllable high-performance computing capabilities. This supports the localization and development of AI application platforms in China. [Ascend NPU Acceleration](docs/README_Ascend_NPU_Acceleration_zh_CN.md)
-  - Automatic Language Identification
-    - By introducing a new language recognition model, setting the `lang` configuration to `auto` during document parsing will automatically select the appropriate OCR language model, improving the accuracy of scanned document parsing.
-- 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities,
-  - Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
-  - Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
-- 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
-- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
-- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
-  - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
-  - Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.
-  - Refactored the list and table of contents recognition functions, significantly improving the accuracy of list blocks and table of contents blocks, as well as the parsing of corresponding text paragraphs.
-  - Refactored the matching logic for figures, tables, and descriptive text, greatly enhancing the accuracy of matching captions and footnotes to figures and tables, and reducing the loss rate of descriptive text to near zero.
-  - Added multi-language support for OCR, supporting detection and recognition of 84 languages.For the list of supported languages, see [OCR Language Support List](https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations).
-  - Added memory recycling logic and other memory optimization measures, significantly reducing memory usage. The memory requirement for enabling all acceleration features except table acceleration (layout/formula/OCR) has been reduced from 16GB to 8GB, and the memory requirement for enabling all acceleration features has been reduced from 24GB to 10GB.
-  - Optimized configuration file feature switches, adding an independent formula detection switch to significantly improve speed and parsing results when formula detection is not needed.
-  - Integrated [PDF-Extract-Kit 1.0](https://github.com/opendatalab/PDF-Extract-Kit):
-    - Added the self-developed `doclayout_yolo` model, which speeds up processing by more than 10 times compared to the original solution while maintaining similar parsing effects, and can be freely switched with `layoutlmv3` via the configuration file.
-    - Upgraded formula parsing to `unimernet 0.2.1`, improving formula parsing accuracy while significantly reducing memory usage.
-    - Due to the repository change for `PDF-Extract-Kit 1.0`, you need to re-download the model. Please refer to [How to Download Models](docs/how_to_download_models_en.md) for detailed steps.
-- 2024/09/27 Version 0.8.1 released, Fixed some bugs, and providing a [localized deployment version](projects/web_demo/README.md) of the [online demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF/) and the [front-end interface](projects/web/README.md).
-- 2024/09/09: Version 0.8.0 released, supporting fast deployment with Dockerfile, and launching demos on Huggingface and Modelscope.
-- 2024/08/30: Version 0.7.1 released, add paddle tablemaster table recognition option
-- 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
-- 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
-- 2024/07/05: Initial open-source release
+<details>
+<summary>2025/03/03 1.2.1 released</summary>
+<ul>
+  <li>Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers</li>
+  <li>Fixed caption matching inaccuracies in certain scenarios</li>
+  <li>Fixed formula span loss issues in certain scenarios</li>
+</ul>
+</details>
+<details>
+<summary>2025/02/24 1.2.0 released</summary>
+<p>This version includes several fixes and improvements to enhance parsing efficiency and accuracy:</p>
+<ul>
+  <li><strong>Performance Optimization</strong>
+    <ul>
+      <li>Increased classification speed for PDF documents in auto mode.</li>
+    </ul>
+  </li>
+  <li><strong>Parsing Optimization</strong>
+    <ul>
+      <li>Improved parsing logic for documents containing watermarks, significantly enhancing the parsing results for such documents.</li>
+      <li>Enhanced the matching logic for multiple images/tables and captions within a single page, improving the accuracy of image-text matching in complex layouts.</li>
+    </ul>
+  </li>
+  <li><strong>Bug Fixes</strong>
+    <ul>
+      <li>Fixed an issue where image/table spans were incorrectly filled into text blocks under certain conditions.</li>
+      <li>Resolved an issue where title blocks were empty in some cases.</li>
+    </ul>
+  </li>
+</ul>
+</details>
+<details>
+<summary>2025/01/22 1.1.0 released</summary>
+<p>In this version we have focused on improving parsing accuracy and efficiency:</p>
+<ul>
+  <li><strong>Model capability upgrade</strong> (requires re-executing the <a href="https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_en.md">model download process</a> to obtain incremental updates of model files)
+    <ul>
+      <li>The layout recognition model has been upgraded to the latest <code>doclayout_yolo(2501)</code> model, improving layout recognition accuracy.</li>
+      <li>The formula parsing model has been upgraded to the latest <code>unimernet(2501)</code> model, improving formula recognition accuracy.</li>
+    </ul>
+  </li>
+  <li><strong>Performance optimization</strong>
+    <ul>
+      <li>On devices that meet certain configuration requirements (16GB+ VRAM), by optimizing resource usage and restructuring the processing pipeline, overall parsing speed has been increased by more than 50%.</li>
+    </ul>
+  </li>
+  <li><strong>Parsing effect optimization</strong>
+    <ul>
+      <li>Added a new heading classification feature (testing version, enabled by default) to the online demo (<a href="https://mineru.net/OpenSourceTools/Extractor">mineru.net</a>/<a href="https://huggingface.co/spaces/opendatalab/MinerU">huggingface</a>/<a href="https://www.modelscope.cn/studios/OpenDataLab/MinerU">modelscope</a>), which supports hierarchical classification of headings, thereby enhancing document structuring.</li>
+    </ul>
+  </li>
+</ul>
+</details>
+<details>
+<summary>2025/01/10 1.0.1 released</summary>
+<p>This is our first official release, where we have introduced a completely new API interface and enhanced compatibility through extensive refactoring, as well as a brand new automatic language identification feature:</p>
+<ul>
+  <li><strong>New API Interface</strong>
+    <ul>
+      <li>For the data-side API, we have introduced the Dataset class, designed to provide a robust and flexible data processing framework. This framework currently supports a variety of document formats, including images (.jpg and .png), PDFs, Word documents (.doc and .docx), and PowerPoint presentations (.ppt and .pptx). It ensures effective support for data processing tasks ranging from simple to complex.</li>
+      <li>For the user-side API, we have meticulously designed the MinerU processing workflow as a series of composable Stages. Each Stage represents a specific processing step, allowing users to define new Stages according to their needs and creatively combine these stages to customize their data processing workflows.</li>
+    </ul>
+  </li>
+  <li><strong>Enhanced Compatibility</strong>
+    <ul>
+      <li>By optimizing the dependency environment and configuration items, we ensure stable and efficient operation on ARM architecture Linux systems.</li>
+      <li>We have deeply integrated with Huawei Ascend NPU acceleration, providing autonomous and controllable high-performance computing capabilities. This supports the localization and development of AI application platforms in China. <a href="https://github.com/opendatalab/MinerU/blob/master/docs/README_Ascend_NPU_Acceleration_zh_CN.md">Ascend NPU Acceleration</a></li>
+    </ul>
+  </li>
+  <li><strong>Automatic Language Identification</strong>
+    <ul>
+      <li>By introducing a new language recognition model, setting the <code>lang</code> configuration to <code>auto</code> during document parsing will automatically select the appropriate OCR language model, improving the accuracy of scanned document parsing.</li>
+    </ul>
+  </li>
+</ul>
+</details>
+<details>
+<summary>2024/11/22 0.10.0 released</summary>
+<p>Introducing hybrid OCR text extraction capabilities:</p>
+<ul>
+  <li>Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.</li>
+  <li>Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.</li>
+</ul>
+</details>
+<details>
+<summary>2024/11/15 0.9.3 released</summary>
+<p>Integrated <a href="https://github.com/RapidAI/RapidTable">RapidTable</a> for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.</p>
+</details>
+<details>
+<summary>2024/11/06 0.9.2 released</summary>
+<p>Integrated the <a href="https://huggingface.co/U4R/StructTable-InternVL2-1B">StructTable-InternVL2-1B</a> model for table recognition functionality.</p>
+</details>
+<details>
+<summary>2024/10/31 0.9.0 released</summary>
+<p>This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:</p>
+<ul>
+  <li>Refactored the sorting module code to use <a href="https://github.com/ppaanngggg/layoutreader">layoutreader</a> for reading order sorting, ensuring high accuracy in various layouts.</li>
+  <li>Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.</li>
+  <li>Refactored the list and table of contents recognition functions, significantly improving the accuracy of list blocks and table of contents blocks, as well as the parsing of corresponding text paragraphs.</li>
+  <li>Refactored the matching logic for figures, tables, and descriptive text, greatly enhancing the accuracy of matching captions and footnotes to figures and tables, and reducing the loss rate of descriptive text to near zero.</li>
+  <li>Added multi-language support for OCR, supporting detection and recognition of 84 languages. For the list of supported languages, see <a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations">OCR Language Support List</a>.</li>
+  <li>Added memory recycling logic and other memory optimization measures, significantly reducing memory usage. The memory requirement for enabling all acceleration features except table acceleration (layout/formula/OCR) has been reduced from 16GB to 8GB, and the memory requirement for enabling all acceleration features has been reduced from 24GB to 10GB.</li>
+  <li>Optimized configuration file feature switches, adding an independent formula detection switch to significantly improve speed and parsing results when formula detection is not needed.</li>
+  <li>Integrated <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit 1.0</a>:
+    <ul>
+      <li>Added the self-developed <code>doclayout_yolo</code> model, which speeds up processing by more than 10 times compared to the original solution while maintaining similar parsing effects, and can be freely switched with <code>layoutlmv3</code> via the configuration file.</li>
+      <li>Upgraded formula parsing to <code>unimernet 0.2.1</code>, improving formula parsing accuracy while significantly reducing memory usage.</li>
+      <li>Due to the repository change for <code>PDF-Extract-Kit 1.0</code>, you need to re-download the model. Please refer to <a href="https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_en.md">How to Download Models</a> for detailed steps.</li>
+    </ul>
+  </li>
+</ul>
+</details>
+<details>
+<summary>2024/09/27 Version 0.8.1 released</summary>
+<p>Fixed some bugs and provided a <a href="https://github.com/opendatalab/MinerU/blob/master/projects/web_demo/README.md">localized deployment version</a> of the <a href="https://opendatalab.com/OpenSourceTools/Extractor/PDF/">online demo</a> and the <a href="https://github.com/opendatalab/MinerU/blob/master/projects/web/README.md">front-end interface</a>.</p>
+</details>
+<details>
+<summary>2024/09/09 Version 0.8.0 released</summary>
+<p>Added support for fast deployment with Dockerfile, and launched demos on Huggingface and Modelscope.</p>
+</details>
+<details>
+<summary>2024/08/30 Version 0.7.1 released</summary>
+<p>Added the paddle tablemaster table recognition option.</p>
+</details>
+<details>
+<summary>2024/08/09 Version 0.7.0b1 released</summary>
+<p>Simplified installation process, added table recognition functionality</p>
+</details>
+<details>
+<summary>2024/08/01 Version 0.6.2b1 released</summary>
+<p>Optimized dependency conflict issues and installation documentation</p>
+</details>
+<details>
+<summary>2024/07/05 Initial open-source release</summary>
+</details>
 <!-- TABLE OF CONTENT -->

{magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/RECORD RENAMED Viewed

@@ -10,8 +10,8 @@ magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLx
 magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
 magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
 magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-magic_pdf/data/batch_build_dataset.py,sha256=rS4f50hBc7IvSqa_Gd84E_tSYpQ66BMaeZkCPd5Ajxw,4601
-magic_pdf/data/dataset.py,sha256=nsS507s1lPyfjnzEhfsQiBy_CdScPy79h3Fvjk_VKp0,12237
+magic_pdf/data/batch_build_dataset.py,sha256=KQoWFJDqCwRQug8-fTuciSwff58AYRjCNP6GdiDhxLI,4953
+magic_pdf/data/dataset.py,sha256=2v-a7kA6dRUDQpjlAVE5We1tMATR-MYKzQCcBhNci5g,12258
 magic_pdf/data/read_api.py,sha256=_faBnYE3iU_EiQLNFjVM6a8IQtOGAcSQNYBZsTSN1d8,5225
 magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
 magic_pdf/data/utils.py,sha256=dNWIJECPXaakKocI4z5Tq6vhDDSnR-bVWQV7DO2w_A8,5335
@@ -52,17 +52,17 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
 magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
 magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
 magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
-magic_pdf/libs/version.py,sha256=-ypEJktJToAL9by62JJKWEzDo_KPCQtmE5kwFgX24z4,22
+magic_pdf/libs/version.py,sha256=Vi6om3KImlKsS_Wg5CjUgYffoi2zx7T-SRPnnGL0G7M,22
 magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
-magic_pdf/model/batch_analyze.py,sha256=6vRqGnZjDqznsifeDZhjD_v8RmDSdDNxOAci8GCFozo,11211
-magic_pdf/model/doc_analyze_by_custom_model.py,sha256=z1JWvM24poMd2SsziRJRzeqJ9rKXbqSwJprCheuXSGg,10282
+magic_pdf/model/batch_analyze.py,sha256=yKhKQuZTh9GG83p61bw2BRqKMbnsjsmX73gfuTRk8xE,11272
+magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
 magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
 magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
 magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd05_fGU,10845
 magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
 magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/model/sub_modules/model_init.py,sha256=e2t95kxiuU47luOHByDokbQ2uob6oQhEA4b6UGVfMjY,8303
-magic_pdf/model/sub_modules/model_utils.py,sha256=GGkVqdGPTmPUaYTuPHxjzzxIizg1kmYo8voIdE7ETdg,2653
+magic_pdf/model/sub_modules/model_utils.py,sha256=iNC-zuDLWkwUAwMZ0YcGxAwHn5SAAFRdZBQgTy9nmgY,9880
 magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 magic_pdf/model/sub_modules/language_detection/utils.py,sha256=Q__v6DdNJztt8GhVSuSB0txahVq-aj8RLhWn2VScx4w,3047
 magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=7T8eFl8zlZe6F0j0jSB3jrwOapDft320JQ1fuWxpvAY,5230
@@ -141,7 +141,7 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_post
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py,sha256=HiHNr4bhW5U1j4pYoyi8fPOaFsn8TUc4nSB6q8chfV4,26899
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml,sha256=eEzg5D5L3MHFL4H02gZnxdDiqtSCUzZDnt5pqDAmgCI,6980
-magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=GOtAGMAretviqDXak409PPav7qHYMDBwSs9wxlSANRA,1388
+magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=M0vyAENxKIaPaSdRBDhH8ik5V71vcY1STkZoq-3iqD8,1504
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt,sha256=xbaXD14RWk0Vpc7fAHpephuszp1j-Qi3IWC4VrFKu70,407
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt,sha256=gyVR_uHy-8l1CHctgevcjboSwA3pejXHHJ3fQ92sGoM,33443
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt,sha256=NpqCxsjEeXhKXXJkSLg7Hq-1_vCkEppeqjkpYl3c0TI,410
@@ -158,7 +158,7 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py,sha256=xEqR6
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py,sha256=8RmKl1vejnZl65caHZNV2ta6hMsg5B_LE-FuqCO8T8A,4225
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py,sha256=cRBKE0blzryj3Ar6yM0FKKgxmZdgMc44NDNl1S2wiRs,9136
-magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py,sha256=_fLTWjEmDZwXC-zzPT37PHO-nNlEvafemo2CyPJS7_w,19216
+magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py,sha256=GZ1PhVZ6GCPedgzU02e4pC52jHPf7uNI1GTID2CkMHA,19444
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-fkTS8swRYSbZeoqmSI8,3822
 magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
 magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -168,7 +168,7 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
 magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
 magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=w9nTdoTV5EJsG8ZlshNig0cdaMwlQ3XlQF1MKVuMwD8,2785
+magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=6W6qBNFZ_ETyk7B7Figk2ekPT3YgM_CUGWlAbdJC6dQ,4399
 magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1REI,2630
 magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
 magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
@@ -178,7 +178,7 @@ magic_pdf/post_proc/para_split_v3.py,sha256=SPN_VVGvFX5KpFMGw9OzgoE-kTZq-FF036i0
 magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
 magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
-magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
+magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=3_bEbZC_BDwbuaBLPdCIbkxz93-g9oCtvjuXD8qbklo,9330
 magic_pdf/pre_proc/ocr_dict_merge.py,sha256=PscKGF0uJIjMxZRM69FLUs1SZO_wOswDQQV1f0M2xAo,5627
 magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=bs5RLvk4kIyx9_Hqq0FU3AGPPxE8Sxs97Uwlf1sBryM,4725
 magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
@@ -191,13 +191,13 @@ magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,
 magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/tools/cli.py,sha256=_oa-M5Hcopa5RZudVzrEip2W8pa9422Lmat7tMBJO5M,5171
 magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
-magic_pdf/tools/common.py,sha256=LoUz6Y36_U2odZqzBNKXngFNa6plf01U7_5jlDAFXaQ,12313
+magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,12623
 magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
 magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
-magic_pdf-1.3.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
-magic_pdf-1.3.1.dist-info/METADATA,sha256=PGXFggL8ni7iXJ5qUXfZLGZqXrbEi9TUhLYzCVxduWw,43499
-magic_pdf-1.3.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-magic_pdf-1.3.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
-magic_pdf-1.3.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
-magic_pdf-1.3.1.dist-info/RECORD,,
+magic_pdf-1.3.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
+magic_pdf-1.3.3.dist-info/METADATA,sha256=1Y-a4UouLQRhsldrhz6UZLlx4KUFOdjSk5R1gK_oYjs,45615
+magic_pdf-1.3.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+magic_pdf-1.3.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
+magic_pdf-1.3.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
+magic_pdf-1.3.3.dist-info/RECORD,,

{magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/LICENSE.md RENAMED Viewed

File without changes

{magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

magic-pdf 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl

magic-pdf 1.3.1py3-none-any.whl → 1.3.3py3-none-any.whl