magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,56 @@
1
+ from magic_pdf.pdf_parse_union_core import pdf_parse_union
2
+
3
+
4
def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Parse a PDF by delegating to ``pdf_parse_union`` with ``"txt"``.

    Every argument is forwarded unchanged; the only thing this wrapper adds
    is the literal ``"txt"`` as the fourth positional argument (presumably
    the parse-method selector -- confirm against ``pdf_parse_union``).

    Args:
        pdf_bytes: Forwarded as the first positional argument.
        model_list: Forwarded as the second positional argument.
        imageWriter: Forwarded as the third positional argument.
        start_page_id: Forwarded by keyword; defaults to ``0``.
        end_page_id: Forwarded by keyword; defaults to ``None``.
        debug_mode: Forwarded by keyword; defaults to ``False``.

    Returns:
        Whatever ``pdf_parse_union`` returns for these arguments.
    """
    # Keyword options are passed by name, exactly as the direct call did.
    forwarded_options = {
        "start_page_id": start_page_id,
        "end_page_id": end_page_id,
        "debug_mode": debug_mode,
    }
    return pdf_parse_union(
        pdf_bytes, model_list, imageWriter, "txt", **forwarded_options
    )
20
+
21
+
22
if __name__ == "__main__":
    # Running this module directly is intentionally a no-op; it exists to be
    # imported for parse_pdf_by_txt.
    pass