magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,125 @@
1
+ import random
2
+
3
+ import fitz
4
+ import cv2
5
+ from paddleocr import PPStructure
6
+ from PIL import Image
7
+ from loguru import logger
8
+ import numpy as np
9
+
10
def region_to_bbox(region):
    """Build an ``[x0, y0, x1, y1]`` box from a polygon-style region.

    Only the first and third points of ``region`` are read; each point must
    be indexable with at least two entries (x at index 0, y at index 1).
    Presumably the points are the four corners of a text quadrilateral in
    clockwise order starting at the top-left -- confirm against the OCR
    engine's output format.
    """
    first_point = region[0]
    third_point = region[2]
    return [first_point[0], first_point[1], third_point[0], third_point[1]]
16
+
17
+
18
def dict_compare(d1, d2):
    """Return True when ``d1`` and ``d2`` hold the same key/value pairs.

    The comparison is made on the two ``items()`` views, so key order is
    irrelevant.
    """
    left_items = d1.items()
    right_items = d2.items()
    return left_items == right_items
20
+
21
+
22
def remove_duplicates_dicts(lst):
    """Return the dicts of ``lst`` with later duplicates dropped.

    The first occurrence of each distinct dict is kept and the original
    order is preserved. Equality is decided by ``dict_compare``, so values
    do not need to be hashable. The input list is not modified.
    """
    kept = []
    for candidate in lst:
        for seen in kept:
            if dict_compare(candidate, seen):
                # Already have an equal dict; skip this one.
                break
        else:
            # No earlier dict matched, so this is a first occurrence.
            kept.append(candidate)
    return kept
28
# Maps the layout type names produced by paddle's PPStructure onto the numeric
# category ids written into ``layout_dets``:
#   0 title, 1 text (``reference`` is treated as text), 2 abandon
#   (header / footer), 3 figure, 4 figure caption, 5 table, 6 table caption,
#   8 display-equation block.
# The original note also listed 14 (display-equation text); nothing in this
# module assigns it.
_PADDLE_TYPE_TO_CATEGORY_ID = {
    'title': 0,
    'text': 1,
    'reference': 1,
    'header': 2,
    'footer': 2,
    'figure': 3,
    'figure_caption': 4,
    'table': 5,
    'table_caption': 6,
    'equation': 8,
}

# Category id given to every OCR text span lifted out of a region's ``res``.
_OCR_TEXT_CATEGORY_ID = 15

# Resolution used to rasterise each page before layout analysis.
_RENDER_DPI = 200


def _render_page_bgr(page, dpi=_RENDER_DPI):
    """Rasterise one fitz page and return ``(bgr_image, width, height)``.

    Width and height are the pixel dimensions of the rendered pixmap.
    """
    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    zoom = dpi / 72
    # NOTE: very large pages are still rendered at full dpi; an earlier
    # fallback for pages over 2000 px had been disabled in the original code.
    pm = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
    rgb = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
    # The image is handed to the engine in OpenCV's BGR channel order.
    bgr = cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR)
    return bgr, pm.width, pm.height


def _annotate_regions(regions):
    """Tag layout regions in place and return their OCR spans as a flat list.

    Each region loses its ``img`` and ``res`` entries, gains a
    ``category_id`` (when its type is known) and is guaranteed a ``score``.
    """
    spans = []
    for region in regions:
        # Drop the cropped image so the result stays small and comparable.
        region.pop('img', None)

        region_type = region['type']
        category_id = _PADDLE_TYPE_TO_CATEGORY_ID.get(region_type)
        if category_id is None:
            # Unknown types are reported and left without a category_id.
            logger.warning(f"unknown type: {region_type}")
        else:
            region['category_id'] = category_id

        # Stay compatible with paddleocr versions that do not output a
        # score: substitute a random placeholder in [0.5, 1.0).
        if region.get("score") is None:
            region['score'] = 0.5 + random.random() * 0.5

        res = region.pop('res', None)
        if res:
            for span in res:
                spans.append({
                    'category_id': _OCR_TEXT_CATEGORY_ID,
                    'bbox': region_to_bbox(span['text_region']),
                    'score': span['confidence'],
                    'text': span['text'],
                })
    return spans


def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
    """Run PPStructure layout analysis on every page of a PDF.

    Pages are rendered and analysed one at a time, so only a single page
    bitmap is alive at any moment instead of the whole document.

    Args:
        pdf_bytes: Raw content of the PDF document.
        ocr: Forwarded to ``PPStructure(ocr=...)``.
        show_log: Forwarded to ``PPStructure(show_log=...)``.

    Returns:
        A list with one dict per page, in page order. Each dict has
        ``layout_dets`` (the de-duplicated layout regions followed by the OCR
        text spans extracted from them) and ``page_info`` (``page_no``,
        ``height`` and ``width``, the latter two in rendered pixels).
    """
    ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)

    model_json = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for page_no in range(doc.page_count):
            image, page_width, page_height = _render_page_bgr(doc[page_no])

            layout_dets = ocr_engine(image)
            # Regions come first, then the text spans found inside them.
            layout_dets.extend(_annotate_regions(layout_dets))
            layout_dets = remove_duplicates_dicts(layout_dets)

            model_json.append({
                "layout_dets": layout_dets,
                "page_info": {
                    "page_no": page_no,
                    "height": page_height,
                    "width": page_width,
                },
            })

    return model_json