magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,562 +0,0 @@
1
- import os
2
- import unicodedata
3
-
4
- from magic_pdf.para.commons import *
5
-
6
-
7
import sys

# Switch stdout to UTF-8 so non-ASCII paragraph text can be printed.
# ``reconfigure`` only exists on ``io.TextIOWrapper`` (Python 3.7+); stdout may
# also have been replaced by another file-like object or be ``None``, so probe
# for the method instead of relying on the interpreter's major version.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
9
-
10
-
11
class BlockContinuationProcessor:
    """
    This class is used to process the blocks to detect block continuations.

    ``batch_tag_paras`` marks, for the last paragraph of every block, whether the
    first paragraph of the following block (possibly on a later page) continues
    it; ``batch_merge_paras`` then concatenates the tagged paragraph chains.
    """

    # Sentence-ending punctuation, ASCII and full-width/CJK forms.
    # NOTE(review): the previous list repeated ASCII "?" and "!" next to "。";
    # the full-width forms are assumed to be what was intended.
    _END_PUNCS = (".", "?", "!", "。", "？", "！", "…")
    # Characters that mark a word or clause broken across paragraphs.
    _HYPHENS = ("-", "—")

    def __init__(self) -> None:
        pass

    # ------------------------------------------------------------------ #
    # small private helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _most_common(values, default):
        """Return the most frequent item of `values` (first one wins ties), or `default` if empty."""
        return max(values, key=values.count) if values else default

    @staticmethod
    def _edge_font_size(font_sizes, index):
        """Return a scalar font size: `font_sizes[index]` for a list/tuple (0 if empty), else the value itself."""
        if isinstance(font_sizes, (list, tuple)):
            return font_sizes[index] if font_sizes else 0
        return font_sizes

    @staticmethod
    def _edge_bbox(bboxes, index):
        """Return one (x0, y0, x1, y1) box, taking `bboxes[index]` when nested; None if unavailable."""
        if bboxes and is_nested_list(bboxes):
            bboxes = bboxes[index]
        if not bboxes or len(bboxes) != 4:
            return None
        return tuple(bboxes)

    @staticmethod
    def _page_number(page_id, fallback):
        """Parse the trailing integer of a "page_<n>" key; return `fallback` when it is not numeric."""
        try:
            return int(page_id.split("_")[-1])
        except ValueError:
            return fallback

    # ------------------------------------------------------------------ #
    # font comparison
    # ------------------------------------------------------------------ #
    def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3):
        """
        This function checks if the two font types are similar.
        Definition of similar font types: the two font types have a common prefix,
        and the length of the common prefix is at least a certain ratio of the length of the shorter font type.

        Parameters
        ----------
        font_type1 : str or list
            font type 1 (for a list, its first element is used; empty list -> "")
        font_type2 : str or list
            font type 2 (same handling as font_type1)
        prefix_length_ratio : float
            minimum ratio of the common prefix length to the length of the shorter font type

        Returns
        -------
        bool
            True if the two font types are similar, False otherwise.
        """
        if isinstance(font_type1, list):
            font_type1 = font_type1[0] if font_type1 else ""
        if isinstance(font_type2, list):
            font_type2 = font_type2[0] if font_type2 else ""

        # A missing font name is treated like an empty one instead of crashing below.
        font_type1 = "" if font_type1 is None else font_type1
        font_type2 = "" if font_type2 is None else font_type2

        if font_type1 == font_type2:
            return True

        # commonprefix compares character by character, so it works on plain strings.
        common_prefix_length = len(os.path.commonprefix([font_type1, font_type2]))

        # The required length is truncated, so very short (or empty) names need
        # no common prefix at all; this mirrors the established behaviour.
        min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio)

        return common_prefix_length >= min_prefix_length

    def __is_same_block_font(self, block1, block2):
        """
        This function compares the font of block1 and block2

        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2

        Returns
        -------
        is_same : bool
            True if block1 and block2 have the same font, else False
        """
        font_type_1 = safe_get(block1, "block_font_type", "")
        font_type_2 = safe_get(block2, "block_font_type", "")
        font_size_1 = self._edge_font_size(safe_get(block1, "block_font_size", 0), 0)
        font_size_2 = self._edge_font_size(safe_get(block2, "block_font_size", 0), 0)
        char_width_1 = safe_get(block1, "avg_char_width", 0)
        char_width_2 = safe_get(block2, "avg_char_width", 0)
        text_1 = safe_get(block1, "text", "")
        text_2 = safe_get(block2, "text", "")

        # Without a character width or any text there is nothing to compare.
        if char_width_1 == 0 or char_width_2 == 0:
            return False
        if not text_1 or not text_2:
            return False

        # A much shorter second block gets a looser width tolerance.
        width_tolerance = 0.5 if len(text_2) / len(text_1) < 0.2 else 0.2
        relative_width_gap = abs(char_width_1 - char_width_2) / min(char_width_1, char_width_2)

        return (
            self.__is_similar_font_type(font_type_1, font_type_2)
            and relative_width_gap < width_tolerance
            and abs(font_size_1 - font_size_2) < 1
        )

    # ------------------------------------------------------------------ #
    # character classification
    # ------------------------------------------------------------------ #
    def _is_alphabet_char(self, char):
        """Return True if `char` lies in the ASCII ranges A-Z or a-z."""
        return "\u0041" <= char <= "\u005a" or "\u0061" <= char <= "\u007a"

    def _is_chinese_char(self, char):
        """Return True if `char` lies in the range U+4E00..U+9FA5."""
        return "\u4e00" <= char <= "\u9fa5"

    def _is_other_letter_char(self, char):
        """Return True for an upper/lower-case letter that is neither ASCII nor in the Chinese range."""
        try:
            category = unicodedata.category(char)
        except TypeError:
            print("The input to the function must be a single character.")
            return False
        if category not in ("Lu", "Ll"):
            return False
        return not self._is_alphabet_char(char) and not self._is_chinese_char(char)

    def _is_year(self, s: str):
        """Return True if `s` parses as an integer between 1900 and 2099 inclusive."""
        try:
            return 1900 <= int(s) <= 2099
        except ValueError:
            return False

    # ------------------------------------------------------------------ #
    # paragraph comparison
    # ------------------------------------------------------------------ #
    def __is_para_font_consistent(self, para_1, para_2):
        """
        This function compares the font of para1 and para2

        Parameters
        ----------
        para_1 : dict
            para1
        para_2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if para1 and para2 have the same font, else False
        """
        if para_1 is None or para_2 is None:
            return False

        font_type_1 = safe_get(para_1, "para_font_type", "")
        font_type_2 = safe_get(para_2, "para_font_type", "")
        font_size_1 = safe_get(para_1, "para_font_size", 0)
        font_size_2 = safe_get(para_2, "para_font_size", 0)

        # Lists of font types collapse to the most common entry. Iterating the
        # list (not a set) keeps ties deterministic; empty lists fall back to "".
        if isinstance(font_type_1, list):
            font_type_1 = self._most_common(font_type_1, "")
        if isinstance(font_type_2, list):
            font_type_2 = self._most_common(font_type_2, "")

        # Lists of font sizes collapse to their average; empty lists count as 0.
        if isinstance(font_size_1, list):
            font_size_1 = sum(font_size_1) / len(font_size_1) if font_size_1 else 0
        if isinstance(font_size_2, list):
            font_size_2 = sum(font_size_2) / len(font_size_2) if font_size_2 else 0

        return (
            self.__is_similar_font_type(font_type_1, font_type_2)
            and abs(font_size_1 - font_size_2) < 1.5
        )

    def _is_para_puncs_consistent(self, para_1, para_2):
        """
        This function determines whether para1 and para2 are originally from the same paragraph
        by checking the puncs of para1 (former) and para2 (latter)

        Parameters
        ----------
        para_1 : dict
            para1
        para_2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if para1 and para2 are from the same paragraph by using the puncs, else False
        """
        if para_1 is None or para_2 is None:
            return False

        para_1_text = safe_get(para_1, "para_text", "").strip()
        para_2_text = safe_get(para_2, "para_text", "").strip()

        # Decide on empty text before touching geometry, so a paragraph without
        # text never fails on a missing bounding box.
        if not para_1_text or not para_2_text:
            return False

        # Compare the last line of para_1 with the first line of para_2.
        bbox_1 = self._edge_bbox(safe_get(para_1, "para_bbox", []), -1)
        bbox_2 = self._edge_bbox(safe_get(para_2, "para_bbox", []), 0)
        font_size_1 = self._edge_font_size(safe_get(para_1, "para_font_size", 0), -1)
        font_size_2 = self._edge_font_size(safe_get(para_2, "para_font_size", 0), 0)

        # Alignment tolerance: 80% of the mean font size of the two paragraphs.
        tolerance = 0.5 * (font_size_1 + font_size_2) * 0.8
        if bbox_1 is None or bbox_2 is None:
            # No usable geometry: fall back to the punctuation rules alone.
            right_aligned = para_1_indented = para_2_indented = False
        else:
            x0_1, _, x1_1, _ = bbox_1
            x0_2, _, x1_2, _ = bbox_2
            right_aligned = abs(x1_1 - x1_2) < tolerance
            para_1_indented = x0_1 - x0_2 > tolerance
            para_2_indented = x0_2 - x0_1 > tolerance

        last_char = para_1_text[-1]
        first_char = para_2_text[0]
        starts_with_alpha = self._is_alphabet_char(first_char)

        # Both texts are stripped, so neither can start or end with a space;
        # the former space-based conditions could never fire and are omitted.
        if last_char in self._HYPHENS:
            # A broken word/clause continues with a hyphen or a lower-case letter.
            return bool(
                first_char in self._HYPHENS
                or (starts_with_alpha and first_char.islower())
                or self._is_chinese_char(first_char)
                or self._is_other_letter_char(first_char)
            )

        if last_char in self._END_PUNCS:
            # A finished sentence may still share the paragraph if the next one
            # starts a new sentence and is not indented relative to para_1.
            return bool(
                (
                    (starts_with_alpha and first_char.isupper())
                    or self._is_chinese_char(first_char)
                    or self._is_other_letter_char(first_char)
                )
                and not para_2_indented
            )

        # para_1 stops mid-sentence: accept any letter, a leading year, or a
        # layout that suggests the text simply wrapped.
        return bool(
            starts_with_alpha
            or self._is_year(para_2_text[0:4])
            or right_aligned
            or para_1_indented
            or self._is_chinese_char(first_char)
            or self._is_other_letter_char(first_char)
        )

    def _is_block_consistent(self, block1, block2):
        """
        This function determines whether block1 and block2 are originally from the same block

        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2

        Returns
        -------
        is_same : bool
            True if block1 and block2 are from the same block, else False
        """
        return self.__is_same_block_font(block1, block2)

    def _is_para_continued(self, para1, para2):
        """
        This function determines whether para1 and para2 are originally from the same paragraph

        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if para1 and para2 are from the same paragraph, else False
        """
        # Short-circuit: the punctuation check is skipped once the fonts differ
        # (this also covers a missing paragraph, which the font check rejects).
        return self.__is_para_font_consistent(para1, para2) and self._is_para_puncs_consistent(para1, para2)

    def _are_boundaries_of_block_consistent(self, block1, block2):
        """
        This function checks if the boundaries of block1 and block2 are consistent

        The first span of block1's last line is compared with the first span of
        block2's first line on font name, font size and font flags.

        Parameters
        ----------
        block1 : dict
            block1
        block2 : dict
            block2

        Returns
        -------
        is_consistent : bool
            True if the boundaries of block1 and block2 are consistent, else False
            (also False when either block has no lines or the boundary line has no spans)
        """
        lines_1 = block1["lines"]
        lines_2 = block2["lines"]
        if not lines_1 or not lines_2:
            return False

        spans_1 = lines_1[-1]["spans"]
        spans_2 = lines_2[0]["spans"]
        if not spans_1 or not spans_2:
            return False

        tail_span = spans_1[0]
        head_span = spans_2[0]

        return (
            self.__is_similar_font_type(tail_span["font"].lower(), head_span["font"].lower())
            and abs(tail_span["size"] - head_span["size"]) < 1
            and tail_span["flags"] == head_span["flags"]
        )

    # ------------------------------------------------------------------ #
    # paragraph lookup
    # ------------------------------------------------------------------ #
    def _get_last_paragraph(self, block):
        """
        Retrieves the last paragraph from a block.

        Parameters
        ----------
        block : dict
            The block from which to retrieve the paragraph.

        Returns
        -------
        dict
            The last paragraph of the block, or None if the block has no paragraphs.
        """
        paras = block["paras"]
        return list(paras.values())[-1] if paras else None

    def _get_first_paragraph(self, block):
        """
        Retrieves the first paragraph from a block.

        Parameters
        ----------
        block : dict
            The block from which to retrieve the paragraph.

        Returns
        -------
        dict
            The first paragraph of the block, or None if the block has no paragraphs.
        """
        paras = block["paras"]
        return next(iter(paras.values())) if paras else None

    def should_merge_next_para(self, curr_para, next_para):
        """Return True if `next_para` is judged to continue `curr_para`."""
        return bool(self._is_para_continued(curr_para, next_para))

    # ------------------------------------------------------------------ #
    # tagging
    # ------------------------------------------------------------------ #
    def _first_block_after(self, pdf_dict, page_number, last_page_number):
        """Return (page_number, first block) of the next page that has para_blocks, or (None, None)."""
        for candidate in range(page_number + 1, last_page_number + 1):
            candidate_content = pdf_dict.get(f"page_{candidate}") or {}
            candidate_blocks = candidate_content.get("para_blocks", [])
            if candidate_blocks:
                return candidate, candidate_blocks[0]
        return None, None

    def batch_tag_paras(self, pdf_dict):
        """
        Tag every paragraph with its location and with a link to its continuation.

        Each paragraph receives "curr_para_location" ([page, block_id, para index]),
        "next_para_location" (None by default) and "merge_next_para" (False by
        default). For the last paragraph of each block the link is filled in when
        the first paragraph of the following block continues it; for the last
        block of a page the following block is the first block of the next page
        that has any "para_blocks".

        Parameters
        ----------
        pdf_dict : dict
            mapping of "page_<n>" keys to page dicts; other keys are ignored

        Returns
        -------
        dict
            the same `pdf_dict`, modified in place
        """
        # Page numbers come from the "page_<n>" key itself. The position in the
        # dict is only a fallback, because unrelated keys would shift it away
        # from the ids used to look pages up.
        page_numbers = {}
        for position, page_id in enumerate(pdf_dict):
            if isinstance(page_id, str) and page_id.startswith("page_"):
                page_numbers[page_id] = self._page_number(page_id, position)
        last_page_number = max(page_numbers.values(), default=-1)

        for page_id, page_number in page_numbers.items():
            para_blocks = pdf_dict[page_id].get("para_blocks", [])
            if not para_blocks:
                continue

            for block_pos, current_block in enumerate(para_blocks):
                paras = current_block["paras"]
                for para_id, curr_para in paras.items():
                    curr_para["curr_para_location"] = [
                        page_number,
                        current_block["block_id"],
                        int(para_id.split("_")[-1]),
                    ]
                    curr_para["next_para_location"] = None  # default: no continuation
                    curr_para["merge_next_para"] = False  # default: do not merge

                # A block without paragraphs has nothing to link from.
                if not paras:
                    continue
                curr_blk_last_para = list(paras.values())[-1]

                if block_pos + 1 < len(para_blocks):
                    target_page, target_block = page_number, para_blocks[block_pos + 1]
                else:
                    # Last block of the page: continue on a following page.
                    target_page, target_block = self._first_block_after(pdf_dict, page_number, last_page_number)

                if target_block is None or not target_block["paras"]:
                    continue

                first_para_key = next(iter(target_block["paras"]))
                next_blk_first_para = target_block["paras"][first_para_key]

                if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                    curr_blk_last_para["next_para_location"] = [
                        target_page,
                        target_block["block_id"],
                        int(first_para_key.split("_")[-1]),
                    ]
                    curr_blk_last_para["merge_next_para"] = True

        return pdf_dict

    # ------------------------------------------------------------------ #
    # merging
    # ------------------------------------------------------------------ #
    def find_block_by_id(self, para_blocks, block_id):
        """Return the first block in `para_blocks` whose "block_id" equals `block_id`, or None."""
        return next((block for block in para_blocks if block.get("block_id") == block_id), None)

    def _absorb_following_paras(self, pdf_dict, curr_para, absorbed):
        """Fold the chain of tagged continuation paragraphs into `curr_para`, recording their ids in `absorbed`."""
        while curr_para.get("merge_next_para"):
            next_para_location = curr_para.get("next_para_location")
            if not next_para_location:
                break

            next_page_idx, next_block_id, next_para_id = next_para_location
            next_page_content = pdf_dict.get(f"page_{next_page_idx}")
            if not next_page_content:
                break

            next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
            if not next_block:
                break

            next_para = next_block["paras"].get(f"para_{next_para_id}")
            if not next_para or next_para.get("is_para_title"):
                break

            # A link back to this paragraph, or to one that was already folded
            # in, would otherwise loop forever or duplicate text.
            if next_para is curr_para or id(next_para) in absorbed:
                break

            # Merge the paragraph text.
            curr_para["para_text"] = curr_para.get("para_text", "") + " " + next_para.get("para_text", "")

            # Continue the chain from wherever the absorbed paragraph pointed.
            curr_para["next_para_location"] = next_para.get("next_para_location")

            # Empty text marks the paragraph as merged.
            next_para["para_text"] = ""

            # Inherit the merge flag of the absorbed paragraph.
            curr_para["merge_next_para"] = next_para.get("merge_next_para", False)

            absorbed.add(id(next_para))

    def batch_merge_paras(self, pdf_dict):
        """
        Merge paragraphs that were tagged as continued by `batch_tag_paras`.

        The text of each continuation is appended (separated by one space) to the
        paragraph that starts the chain, and the continuation's "para_text" is set
        to "". Title paragraphs neither start a chain nor get absorbed.

        Parameters
        ----------
        pdf_dict : dict
            mapping of "page_<n>" keys to page dicts; other keys are ignored

        Returns
        -------
        dict
            the same `pdf_dict`, modified in place
        """
        # ids of paragraphs already folded into a predecessor. They keep their
        # own merge flag, so without this they would be processed again and
        # their emptied text would turn into " ".
        absorbed = set()

        for page_id, page_content in pdf_dict.items():
            if not (page_id.startswith("page_") and page_content.get("para_blocks", [])):
                continue

            for current_block in page_content["para_blocks"]:
                for curr_para in list(current_block["paras"].values()):
                    # Skip title paragraphs and paragraphs that were merged away.
                    if curr_para.get("is_para_title") or id(curr_para) in absorbed:
                        continue
                    self._absorb_following_paras(pdf_dict, curr_para, absorbed)

        return pdf_dict