magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (62)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +5 -5
  12. magic_pdf/libs/draw_bbox.py +3 -2
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +122 -76
  18. magic_pdf/model/sub_modules/model_init.py +40 -35
  19. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  21. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  22. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  23. magic_pdf/para/para_split.py +411 -248
  24. magic_pdf/para/para_split_v2.py +352 -182
  25. magic_pdf/para/para_split_v3.py +110 -53
  26. magic_pdf/pdf_parse_by_ocr.py +2 -0
  27. magic_pdf/pdf_parse_by_txt.py +2 -0
  28. magic_pdf/pdf_parse_union_core.py +174 -100
  29. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  30. magic_pdf/pipe/AbsPipe.py +28 -44
  31. magic_pdf/pipe/OCRPipe.py +5 -5
  32. magic_pdf/pipe/TXTPipe.py +5 -6
  33. magic_pdf/pipe/UNIPipe.py +24 -25
  34. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  35. magic_pdf/pre_proc/cut_image.py +9 -11
  36. magic_pdf/pre_proc/equations_replace.py +203 -212
  37. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  38. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  39. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  40. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  41. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  42. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  43. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  44. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  45. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  46. magic_pdf/spark/spark_api.py +15 -17
  47. magic_pdf/tools/cli.py +3 -4
  48. magic_pdf/tools/cli_dev.py +6 -9
  49. magic_pdf/tools/common.py +26 -36
  50. magic_pdf/user_api.py +29 -38
  51. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
  53. magic_pdf/libs/Constants.py +0 -55
  54. magic_pdf/libs/MakeContentConfig.py +0 -11
  55. magic_pdf/libs/drop_reason.py +0 -27
  56. magic_pdf/libs/drop_tag.py +0 -19
  57. magic_pdf/para/para_pipeline.py +0 -297
  58. magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  59. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  60. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,53 @@
1
+ """span维度自定义字段."""
2
+ # span是否是跨页合并的
3
+ CROSS_PAGE = 'cross_page'
4
+
5
+ """
6
+ block维度自定义字段
7
+ """
8
+ # block中lines是否被删除
9
+ LINES_DELETED = 'lines_deleted'
10
+
11
+ # table recognition max time default value
12
+ TABLE_MAX_TIME_VALUE = 400
13
+
14
+ # pp_table_result_max_length
15
+ TABLE_MAX_LEN = 480
16
+
17
+ # table master structure dict
18
+ TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
19
+
20
+ # table master dir
21
+ TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
22
+
23
+ # pp detect model dir
24
+ DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
25
+
26
+ # pp rec model dir
27
+ REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
28
+
29
+ # pp rec char dict path
30
+ REC_CHAR_DICT = 'ppocr_keys_v1.txt'
31
+
32
+ # pp rec copy rec directory
33
+ PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
34
+
35
+ # pp rec copy det directory
36
+ PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
37
+
38
+
39
+ class MODEL_NAME:
40
+ # pp table structure algorithm
41
+ TABLE_MASTER = 'tablemaster'
42
+ # struct eqtable
43
+ STRUCT_EQTABLE = 'struct_eqtable'
44
+
45
+ DocLayout_YOLO = 'doclayout_yolo'
46
+
47
+ LAYOUTLMv3 = 'layoutlmv3'
48
+
49
+ YOLO_V8_MFD = 'yolo_v8_mfd'
50
+
51
+ UniMerNet_v2_Small = 'unimernet_small'
52
+
53
+ RAPID_TABLE = 'rapid_table'
@@ -0,0 +1,35 @@
1
+ class DropReason:
2
+ TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap' # 文字块有水平互相覆盖,导致无法准确定位文字顺序
3
+ USEFUL_BLOCK_HOR_OVERLAP = (
4
+ 'useful_block_horizontal_overlap' # 需保留的block水平覆盖
5
+ )
6
+ COMPLICATED_LAYOUT = 'complicated_layout' # 复杂的布局,暂时不支持
7
+ TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns' # 目前不支持分栏超过2列的
8
+ COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box' # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
9
+ HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
10
+ 'high_computational_load_by_imgs' # 含特殊图片,计算量太大,从而丢弃
11
+ )
12
+ HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
13
+ 'high_computational_load_by_svgs' # 特殊的SVG图,计算量太大,从而丢弃
14
+ )
15
+ HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages' # 计算量超过负荷,当前方法下计算量消耗过大
16
+ MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result' # 版面分析失败
17
+ Exception = '_exception' # 解析中发生异常
18
+ ENCRYPTED = 'encrypted' # PDF是加密的
19
+ EMPTY_PDF = 'total_page=0' # PDF页面总数为0
20
+ NOT_IS_TEXT_PDF = 'not_is_text_pdf' # 不是文字版PDF,无法直接解析
21
+ DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block' # 无法清晰的分段
22
+ TITLE_DETECTION_FAILED = 'title_detection_failed' # 探测标题失败
23
+ TITLE_LEVEL_FAILED = (
24
+ 'title_level_failed' # 分析标题级别失败(例如一级、二级、三级标题)
25
+ )
26
+ PARA_SPLIT_FAILED = 'para_split_failed' # 识别段落失败
27
+ PARA_MERGE_FAILED = 'para_merge_failed' # 段落合并失败
28
+ NOT_ALLOW_LANGUAGE = 'not_allow_language' # 不支持的语种
29
+ SPECIAL_PDF = 'special_pdf'
30
+ PSEUDO_SINGLE_COLUMN = 'pseudo_single_column' # 无法精确判断文字分栏
31
+ CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout' # 无法分析页面的版面
32
+ NEGATIVE_BBOX_AREA = 'negative_bbox_area' # 缩放导致 bbox 面积为负
33
+ OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
34
+ 'overlap_blocks_can_t_separation' # 无法分离重叠的block
35
+ )
@@ -0,0 +1,19 @@
1
+
2
+ COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
3
+ PAGE_NO = 'page-no' # 页码
4
+ CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # 页眉页脚内的文本
5
+ VERTICAL_TEXT = 'vertical-text' # 垂直文本
6
+ ROTATE_TEXT = 'rotate-text' # 旋转文本
7
+ EMPTY_SIDE_BLOCK = 'empty-side-block' # 边缘上的空白没有任何内容的block
8
+ ON_IMAGE_TEXT = 'on-image-text' # 文本在图片上
9
+ ON_TABLE_TEXT = 'on-table-text' # 文本在表格上
10
+
11
+
12
+ class DropTag:
13
+ PAGE_NUMBER = 'page_no'
14
+ HEADER = 'header'
15
+ FOOTER = 'footer'
16
+ FOOTNOTE = 'footnote'
17
+ NOT_IN_LAYOUT = 'not_in_layout'
18
+ SPAN_OVERLAP = 'span_overlap'
19
+ BLOCK_OVERLAP = 'block_overlap'
@@ -0,0 +1,11 @@
1
+ class MakeMode:
2
+ MM_MD = 'mm_markdown'
3
+ NLP_MD = 'nlp_markdown'
4
+ STANDARD_FORMAT = 'standard_format'
5
+
6
+
7
+ class DropMode:
8
+ WHOLE_PDF = 'whole_pdf'
9
+ SINGLE_PAGE = 'single_page'
10
+ NONE = 'none'
11
+ NONE_WITH_REASON = 'none_with_reason'
@@ -1,9 +1,10 @@
1
1
  from enum import Enum
2
2
 
3
+
3
4
  class ModelBlockTypeEnum(Enum):
4
5
  TITLE = 0
5
6
  PLAIN_TEXT = 1
6
7
  ABANDON = 2
7
8
  ISOLATE_FORMULA = 8
8
9
  EMBEDDING = 13
9
- ISOLATED = 14
10
+ ISOLATED = 14
@@ -35,7 +35,7 @@ def read_jsonl(
35
35
  jsonl_d = [
36
36
  json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
37
37
  ]
38
- for d in jsonl_d[:5]:
38
+ for d in jsonl_d:
39
39
  pdf_path = d.get('file_location', '') or d.get('path', '')
40
40
  if len(pdf_path) == 0:
41
41
  raise EmptyData('pdf file location is empty')