magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  7. magic_pdf/data/read_api.py +1 -1
  8. magic_pdf/dict2md/mkcontent.py +226 -185
  9. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  10. magic_pdf/filter/pdf_meta_scan.py +101 -79
  11. magic_pdf/integrations/rag/utils.py +4 -5
  12. magic_pdf/libs/config_reader.py +5 -5
  13. magic_pdf/libs/draw_bbox.py +3 -2
  14. magic_pdf/libs/pdf_image_tools.py +36 -12
  15. magic_pdf/libs/version.py +1 -1
  16. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  17. magic_pdf/model/magic_model.py +13 -13
  18. magic_pdf/model/pdf_extract_kit.py +122 -76
  19. magic_pdf/model/sub_modules/model_init.py +40 -35
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  21. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  22. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  23. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  24. magic_pdf/para/para_split.py +411 -248
  25. magic_pdf/para/para_split_v2.py +352 -182
  26. magic_pdf/para/para_split_v3.py +110 -53
  27. magic_pdf/pdf_parse_by_ocr.py +2 -0
  28. magic_pdf/pdf_parse_by_txt.py +2 -0
  29. magic_pdf/pdf_parse_union_core.py +174 -100
  30. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  31. magic_pdf/pipe/AbsPipe.py +28 -44
  32. magic_pdf/pipe/OCRPipe.py +5 -5
  33. magic_pdf/pipe/TXTPipe.py +5 -6
  34. magic_pdf/pipe/UNIPipe.py +24 -25
  35. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  36. magic_pdf/pre_proc/cut_image.py +9 -11
  37. magic_pdf/pre_proc/equations_replace.py +203 -212
  38. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  39. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  40. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  41. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  42. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  43. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  44. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  45. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  46. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  47. magic_pdf/spark/spark_api.py +15 -17
  48. magic_pdf/tools/cli.py +3 -4
  49. magic_pdf/tools/cli_dev.py +6 -9
  50. magic_pdf/tools/common.py +26 -36
  51. magic_pdf/user_api.py +29 -38
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
  53. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
  54. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
  55. magic_pdf/libs/Constants.py +0 -55
  56. magic_pdf/libs/MakeContentConfig.py +0 -11
  57. magic_pdf/libs/drop_reason.py +0 -27
  58. magic_pdf/libs/drop_tag.py +0 -19
  59. magic_pdf/para/para_pipeline.py +0 -297
  60. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
  63. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,17 @@
1
1
  from loguru import logger
2
2
 
3
+ from magic_pdf.config.drop_reason import DropReason
3
4
  from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout
4
- from magic_pdf.libs.drop_reason import DropReason
5
5
 
6
6
 
7
7
  def __is_pseudo_single_column(page_info) -> bool:
8
- """
9
- 判断一个页面是否伪单列。
8
+ """判断一个页面是否伪单列。
10
9
 
11
10
  Args:
12
11
  page_info (dict): 页面信息字典,包括'_layout_tree'和'preproc_blocks'。
13
12
 
14
13
  Returns:
15
14
  Tuple[bool, Optional[str]]: 如果页面伪单列返回(True, extra_info),否则返回(False, None)。
16
-
17
15
  """
18
16
  layout_tree = page_info['_layout_tree']
19
17
  layout_column_width = get_columns_cnt_of_layout(layout_tree)
@@ -41,27 +39,22 @@ def __is_pseudo_single_column(page_info) -> bool:
41
39
  if num_lines > 20:
42
40
  radio = num_satisfying_lines / num_lines
43
41
  if radio >= 0.5:
44
- extra_info = f"{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}"
42
+ extra_info = f'{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}'
45
43
  block_text = []
46
44
  for line in lines:
47
45
  if line['spans']:
48
46
  for span in line['spans']:
49
47
  block_text.append(span['text'])
50
- logger.warning(f"pseudo_single_column block_text: {block_text}")
48
+ logger.warning(f'pseudo_single_column block_text: {block_text}')
51
49
  return True, extra_info
52
50
 
53
51
  return False, None
54
52
 
55
53
 
56
54
  def pdf_post_filter(page_info) -> tuple:
57
- """
58
- return:(True|False, err_msg)
59
- True, 如果pdf符合要求
60
- False, 如果pdf不符合要求
61
-
62
- """
55
+ """return:(True|False, err_msg) True, 如果pdf符合要求 False, 如果pdf不符合要求."""
63
56
  bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
64
57
  if bool_is_pseudo_single_column:
65
- return False, {"_need_drop": True, "_drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}
58
+ return False, {'_need_drop': True, '_drop_reason': DropReason.PSEUDO_SINGLE_COLUMN, 'extra_info': extra_info}
66
59
 
67
- return True, None
60
+ return True, None
@@ -1,7 +1,7 @@
1
1
  from loguru import logger
2
2
 
3
+ from magic_pdf.config.ocr_content_type import ContentType
3
4
  from magic_pdf.libs.commons import join_path
4
- from magic_pdf.libs.ocr_content_type import ContentType
5
5
  from magic_pdf.libs.pdf_image_tools import cut_image
6
6
 
7
7
 
@@ -29,9 +29,7 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
29
29
  image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
30
30
  equation_inline_bboxes: list,
31
31
  equation_interline_bboxes: list, imageWriter) -> dict:
32
- """
33
- 返回一个dict, key为bbox, 值是图片地址
34
- """
32
+ """返回一个dict, key为bbox, 值是图片地址."""
35
33
  image_info = []
36
34
  image_backup_info = []
37
35
  table_info = []
@@ -46,26 +44,26 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
46
44
  for bbox in image_bboxes:
47
45
  if not check_img_bbox(bbox):
48
46
  continue
49
- image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
50
- image_info.append({"bbox": bbox, "image_path": image_path})
47
+ image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
48
+ image_info.append({'bbox': bbox, 'image_path': image_path})
51
49
 
52
50
  for bbox in images_overlap_backup:
53
51
  if not check_img_bbox(bbox):
54
52
  continue
55
- image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter)
56
- image_backup_info.append({"bbox": bbox, "image_path": image_path})
53
+ image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
54
+ image_backup_info.append({'bbox': bbox, 'image_path': image_path})
57
55
 
58
56
  for bbox in table_bboxes:
59
57
  if not check_img_bbox(bbox):
60
58
  continue
61
- image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter)
62
- table_info.append({"bbox": bbox, "image_path": image_path})
59
+ image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
60
+ table_info.append({'bbox': bbox, 'image_path': image_path})
63
61
 
64
62
  return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
65
63
 
66
64
 
67
65
  def check_img_bbox(bbox) -> bool:
68
66
  if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
69
- logger.warning(f"image_bboxes: 错误的box, {bbox}")
67
+ logger.warning(f'image_bboxes: 错误的box, {bbox}')
70
68
  return False
71
69
  return True