magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,480 +0,0 @@
1
- from magic_pdf.para.commons import *
2
-
3
-
4
# Switch stdout to UTF-8 so non-ASCII text can be printed safely.
# ``reconfigure`` is only available on ``io.TextIOWrapper`` streams; stdout may
# have been replaced (e.g. by ``io.StringIO``) or be ``None``, in which case the
# call is skipped instead of failing at import time.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
6
-
7
-
8
-
9
class BlockTerminationProcessor:
    """
    Cut the lines of each text block into paragraphs.

    Every line is scored with layout heuristics (indentation, distance to the
    block margins ``X0``/``X1``, vertical spacing, trailing punctuation) to
    decide whether it opens or closes a paragraph.

    The helpers ``is_line_left_aligned_from_neighbors``,
    ``is_line_right_aligned_from_neighbors``, ``end_with_punctuation``,
    ``calculate_para_bbox`` and ``safe_get`` are expected to be in scope through
    the module-level ``from magic_pdf.para.commons import *``.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _first_span_font(line):
        """Return ``(size, lower-cased font name)`` of the first span of ``line``."""
        first_span = line["spans"][0]
        return first_span["size"], first_span["font"].lower()

    def _is_consistent_lines(
        self,
        curr_line,
        prev_line,
        next_line,
        consistent_direction,  # 0 for prev, 1 for next, 2 for both
    ):
        """
        Check whether a line uses the same font as its neighbours.

        Only the first span of each line is compared (font size and
        case-insensitive font name).

        Parameters
        ----------
        curr_line : dict
            current line
        prev_line : dict or None
            previous line
        next_line : dict or None
            next line
        consistent_direction : int
            0 for prev, 1 for next, 2 for both

        Returns
        -------
        bool
            True if the line is consistent with the requested neighbours.
            False if a requested neighbour is missing or the direction is unknown.
        """
        curr_font = self._first_span_font(curr_line)

        if consistent_direction == 0:
            neighbours = (prev_line,)
        elif consistent_direction == 1:
            neighbours = (next_line,)
        elif consistent_direction == 2:
            neighbours = (prev_line, next_line)
        else:
            return False

        # Every requested neighbour must exist before fonts are compared.
        if not all(neighbours):
            return False
        return all(self._first_span_font(neighbour) == curr_font for neighbour in neighbours)

    def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
        """
        Check whether the line is a regular line.

        Parameters
        ----------
        curr_line_bbox : list
            bbox of the current line
        prev_line_bbox : list or None
            bbox of the previous line
        next_line_bbox : list or None
            bbox of the next line
        avg_char_width : float
            average of char widths
        X0 : float
            median of x0 values, which represents the left average boundary of the page
        X1 : float
            median of x1 values, which represents the right average boundary of the page
        avg_line_height : float
            average of line heights

        Returns
        -------
        bool
            True if the line is a regular line, False otherwise.  Always a real
            ``bool`` (the previous implementation could leak ``None``).
        """
        horizontal_thres = 0.5 * avg_char_width
        vertical_thres = 0.5 * avg_line_height

        x0, y0, x1, y1 = curr_line_bbox

        x0_near_X0 = abs(x0 - X0) < horizontal_thres
        x1_near_X1 = abs(x1 - X1) < horizontal_thres

        # The previous line stops clearly short of the right boundary.
        prev_line_is_end_of_para = bool(prev_line_bbox) and abs(prev_line_bbox[2] - X1) > avg_char_width

        # NOTE(review): spacing is measured bottom-to-bottom (above) and top-to-top
        # (below), i.e. a line pitch rather than the empty gap - confirm this is intended.
        sufficient_spacing_above = bool(prev_line_bbox) and (y1 - prev_line_bbox[3]) > vertical_thres
        sufficient_spacing_below = bool(next_line_bbox) and (next_line_bbox[1] - y0) > vertical_thres

        return bool(
            sufficient_spacing_above
            or sufficient_spacing_below
            or (not x0_near_X0 and not x1_near_X1)
            or prev_line_is_end_of_para
        )

    def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
        """
        Check whether the line is a possible start of a paragraph.

        Parameters
        ----------
        curr_line : dict
            current line
        prev_line : dict or None
            previous line
        next_line : dict or None
            next line
        X0 : float
            median of x0 values, which represents the left average boundary of the page
        X1 : float
            median of x1 values, which represents the right average boundary of the page
        avg_char_width : float
            average of char widths
        avg_font_size : float
            average font size; scales the vertical spacing threshold

        Returns
        -------
        tuple
            ``(is_start_of_para, start_confidence, decision_path)``: the boolean
            verdict, a confidence score starting at 0.5, and the list of
            heuristics that fired.
        """
        start_confidence = 0.5  # Initial confidence of the line being a start of a paragraph
        decision_path = []  # Record the decision path

        curr_line_bbox = curr_line["bbox"]
        prev_line_bbox = prev_line["bbox"] if prev_line else None
        next_line_bbox = next_line["bbox"] if next_line else None

        indent_thres = 1 * avg_char_width
        vertical_thres = 1.5 * avg_font_size
        left_horizontal_thres = 0.5 * avg_char_width
        right_horizontal_thres = 2.5 * avg_char_width

        x0, y0, x1, y1 = curr_line_bbox

        indent_condition = x0 > X0 + indent_thres
        if indent_condition:
            start_confidence += 0.2
            decision_path.append("indent_condition_met")

        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
        if x0_near_X0:
            start_confidence += 0.1
            decision_path.append("x0_near_X0")

        x1_near_X1 = abs(x1 - X1) < right_horizontal_thres
        if x1_near_X1:
            start_confidence += 0.1
            decision_path.append("x1_near_X1")

        if prev_line is None:
            prev_line_is_end_of_para = True
            start_confidence += 0.2
            decision_path.append("no_prev_line")
        else:
            # The line that follows ``prev_line`` is ``curr_line``.  Passing
            # ``next_line`` here (as before) compared the previous line with a
            # line two positions further down and made this check disagree with
            # the end-of-paragraph check performed for the same line pair.
            prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, curr_line, X0, X1, avg_char_width)
            if prev_line_is_end_of_para:
                start_confidence += 0.1
                decision_path.append("prev_line_is_end_of_para")

        # NOTE(review): as in ``_is_regular_line`` the spacing is a line pitch
        # (bottom-to-bottom / top-to-top) - confirm this is intended.
        sufficient_spacing_above = bool(prev_line_bbox) and (y1 - prev_line_bbox[3]) > vertical_thres
        if sufficient_spacing_above:
            start_confidence += 0.2
            decision_path.append("sufficient_spacing_above")

        sufficient_spacing_below = bool(next_line_bbox) and (next_line_bbox[1] - y0) > vertical_thres
        if sufficient_spacing_below:
            start_confidence += 0.2
            decision_path.append("sufficient_spacing_below")

        # ``avg_font_size`` stands in for the average line height here.
        is_regular_line = self._is_regular_line(
            curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size
        )
        if is_regular_line:
            start_confidence += 0.1
            decision_path.append("is_regular_line")

        is_start_of_para = bool(
            (sufficient_spacing_above or sufficient_spacing_below)
            or indent_condition
            or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line)
            or prev_line_is_end_of_para
        )
        return (is_start_of_para, start_confidence, decision_path)

    def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
        """
        Check whether the line is a possible end of a paragraph.

        Parameters
        ----------
        curr_line : dict
            current line; its ``"bbox"`` and ``"text"`` entries are read
        next_line : dict or None
            the line that directly follows ``curr_line``
        X0 : float
            median of x0 values, which represents the left average boundary of the page
        X1 : float
            median of x1 values, which represents the right average boundary of the page
        avg_char_width : float
            average of char widths

        Returns
        -------
        tuple
            ``(is_end_of_para, end_confidence, decision_path)``: the boolean
            verdict, a confidence score starting at 0.5, and the list of
            heuristics that fired.
        """
        end_confidence = 0.5  # Initial confidence of the line being a end of a paragraph
        decision_path = []  # Record the decision path

        curr_line_bbox = curr_line["bbox"]
        next_line_bbox = next_line["bbox"] if next_line else None

        left_horizontal_thres = 0.5 * avg_char_width
        right_horizontal_thres = 0.5 * avg_char_width

        x0, _, x1, _ = curr_line_bbox
        next_x0 = next_line_bbox[0] if next_line_bbox else 0

        x0_near_X0 = abs(x0 - X0) < left_horizontal_thres
        if x0_near_X0:
            end_confidence += 0.1
            decision_path.append("x0_near_X0")

        x1_smaller_than_X1 = x1 < X1 - right_horizontal_thres
        if x1_smaller_than_X1:
            end_confidence += 0.1
            decision_path.append("x1_smaller_than_X1")

        # The next line is indented and not left-aligned with this one.
        next_line_is_start_of_para = (
            bool(next_line_bbox)
            and next_x0 > X0 + left_horizontal_thres
            and not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1)
        )
        if next_line_is_start_of_para:
            end_confidence += 0.2
            decision_path.append("next_line_is_start_of_para")

        # NOTE(review): the two calls below use the helpers' default direction
        # (2 = both neighbours) while passing no previous line - confirm whether
        # ``direction=1`` was intended.
        left_aligned = is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width)
        if left_aligned:
            end_confidence += 0.1
            decision_path.append("line_is_left_aligned_from_neighbors")

        right_aligned = is_line_right_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width)
        if not right_aligned:
            end_confidence += 0.1
            decision_path.append("line_is_not_right_aligned_from_neighbors")

        # Terminal punctuation is mandatory; the layout only has to agree with it.
        is_end_of_para = end_with_punctuation(curr_line["text"]) and (
            (x0_near_X0 and x1_smaller_than_X1) or (left_aligned and not right_aligned)
        )

        return (is_end_of_para, end_confidence, decision_path)

    @staticmethod
    def _construct_para(lines, is_block_title, para_title_level):
        """Build the property dict of one paragraph from its lines."""
        spans = [span for line in lines for span in line["spans"]]

        font_sizes = [span["size"] for span in spans]
        avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0

        font_colors = [span["color"] for span in spans]
        most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None

        # The dominant font is the one covering the largest total span width,
        # not the one used by the largest number of spans.
        font_type_lengths = {}
        for span in spans:
            span_width = span["bbox"][2] - span["bbox"][0]
            font_type_lengths[span["font"]] = font_type_lengths.get(span["font"], 0) + span_width
        most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None  # type: ignore

        return {
            "para_bbox": calculate_para_bbox(lines),
            "para_text": " ".join(line["text"] for line in lines),
            "para_font_type": most_common_font_type,
            "para_font_size": avg_font_size,
            "para_font_color": most_common_font_color,
            "is_para_title": is_block_title,
            "para_title_level": para_title_level,
        }

    def _cut_paras_per_block(
        self,
        block,
    ):
        """
        Split the lines of one block into paragraphs.

        Parameters
        ----------
        block : dict
            Block with a ``"lines"`` list.  The optional keys ``"X0"``, ``"X1"``,
            ``"avg_char_width"``, ``"avg_font_size"``, ``"is_block_title"`` and
            ``"block_title_level"`` are read with defaults.

        Returns
        -------
        block : dict
            The same dict, with a new ``"paras"`` entry mapping ``"para_<i>"``
            to the properties of each paragraph.
        """
        block_lines = block["lines"]

        X0 = safe_get(block, "X0", 0)
        X1 = safe_get(block, "X1", 0)
        avg_char_width = safe_get(block, "avg_char_width", 0)
        avg_font_size = safe_get(block, "avg_font_size", 0)

        is_block_title = safe_get(block, "is_block_title", False)
        para_title_level = safe_get(block, "block_title_level", 0)

        # Segment into paragraphs: collect inclusive (start, end) line index pairs.
        para_ranges = []
        in_paragraph = False
        start_idx_of_para = None
        last_line_index = len(block_lines) - 1

        for line_index, curr_line in enumerate(block_lines):
            prev_line = block_lines[line_index - 1] if line_index > 0 else None
            next_line = block_lines[line_index + 1] if line_index < last_line_index else None

            # Check if the line is the start of a paragraph
            is_start_of_para, _, _ = self._is_possible_start_of_para(
                curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
            )
            if not in_paragraph and is_start_of_para:
                in_paragraph = True
                start_idx_of_para = line_index

            # Check if the line is the end of a paragraph
            is_end_of_para, _, _ = self._is_possible_end_of_para(curr_line, next_line, X0, X1, avg_char_width)
            # The last line has no successor, so an open paragraph is always
            # closed here and nothing can be left open after the loop.
            if in_paragraph and (is_end_of_para or not next_line):
                para_ranges.append((start_idx_of_para, line_index))
                start_idx_of_para = None
                in_paragraph = False

        # Create the processed paragraphs
        processed_paras = {}
        end_idx_of_para = 0
        for start_idx, end_idx in para_ranges:
            matched_lines = block_lines[start_idx : end_idx + 1]
            processed_paras[f"para_{len(processed_paras)}"] = self._construct_para(
                matched_lines, is_block_title, para_title_level
            )
            end_idx_of_para = end_idx + 1

        # Lines after the last detected paragraph form one final paragraph.
        if end_idx_of_para < len(block_lines):
            unmatched_lines = block_lines[end_idx_of_para:]
            processed_paras[f"para_{len(processed_paras)}"] = self._construct_para(
                unmatched_lines, is_block_title, para_title_level
            )

        block["paras"] = processed_paras

        return block

    def batch_process_blocks(self, pdf_dict):
        """
        Cut the blocks of all pages into paragraphs.

        Parameters
        ----------
        pdf_dict : dict
            PDF dictionary.  Entries whose key starts with ``"page_"`` are
            treated as pages; every other entry is left untouched.

        Returns
        -------
        result_dict : dict
            The same ``pdf_dict``.  Each page's ``"para_blocks"`` is replaced by
            the processed blocks (an empty list when the page had none) and
            ``pdf_dict["statistics"]["num_paras"]`` holds the total paragraph count.
        """
        num_paras = 0

        for page_id, page in pdf_dict.items():
            if not (isinstance(page_id, str) and page_id.startswith("page_")):
                continue

            para_blocks = []
            for input_block in page.get("para_blocks", []):
                new_block = self._cut_paras_per_block(input_block)
                para_blocks.append(new_block)
                num_paras += len(new_block["paras"])

            page["para_blocks"] = para_blocks

        # Create the statistics entry on demand instead of raising KeyError
        # after every page has already been rewritten.
        pdf_dict.setdefault("statistics", {})["num_paras"] = num_paras
        return pdf_dict
magic_pdf/para/commons.py DELETED
@@ -1,222 +0,0 @@
1
- import sys
2
-
3
- from magic_pdf.libs.commons import fitz
4
- from termcolor import cprint
5
-
6
-
7
# Switch stdout to UTF-8 so non-ASCII text can be printed safely.
# ``reconfigure`` is only available on ``io.TextIOWrapper`` streams; stdout may
# have been replaced (e.g. by ``io.StringIO``) or be ``None``, in which case the
# call is skipped instead of failing at import time.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
9
-
10
-
11
def open_pdf(pdf_path):
    """
    Open a PDF document with ``fitz``.

    Parameters
    ----------
    pdf_path : str
        path handed to ``fitz.open``

    Returns
    -------
    pdf_document
        whatever ``fitz.open`` returns for ``pdf_path``

    Raises
    ------
    Exception
        Any error raised by ``fitz.open`` is reported on stdout and re-raised
        unchanged.
    """
    try:
        return fitz.open(pdf_path)  # type: ignore
    except Exception as e:
        print(f"无法打开PDF文件:{pdf_path}。原因是:{e}")
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise
18
-
19
-
20
def print_green_on_red(text):
    """Print ``text`` in bold green on a red background, followed by a blank line."""
    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
22
-
23
-
24
def print_green(text):
    """Print an empty line, then ``text`` in bold green followed by a blank line."""
    print()
    cprint(text, color="green", attrs=["bold"], end="\n\n")
27
-
28
-
29
def print_red(text):
    """Print an empty line, then ``text`` in bold red followed by a blank line."""
    print()
    cprint(text, color="red", attrs=["bold"], end="\n\n")
32
-
33
-
34
def print_yellow(text):
    """Print an empty line, then ``text`` in bold yellow followed by a blank line."""
    print()
    cprint(text, color="yellow", attrs=["bold"], end="\n\n")
37
-
38
-
39
def safe_get(dict_obj, key, default):
    """
    Look up ``key`` in ``dict_obj``, treating a stored ``None`` like a missing key.

    Returns ``default`` when the key is absent or maps to ``None``; any other
    stored value (including falsy ones such as ``0``) is returned as is.
    """
    found = dict_obj.get(key)
    return default if found is None else found
45
-
46
-
47
def is_bbox_overlap(bbox1, bbox2):
    """
    Tell whether two boxes overlap; boxes that merely touch count as overlapping.

    Parameters
    ----------
    bbox1 : list
        first box as ``[x0, y0, x1, y1]``
    bbox2 : list
        second box as ``[x0, y0, x1, y1]``

    Returns
    -------
    bool
        True if bbox1 and bbox2 overlap, else False
    """
    left_a, top_a, right_a, bottom_a = bbox1
    left_b, top_b, right_b, bottom_b = bbox2

    # The boxes are disjoint as soon as they are separated along one axis.
    apart_horizontally = left_a > right_b or left_b > right_a
    apart_vertically = top_a > bottom_b or top_b > bottom_a
    return not (apart_horizontally or apart_vertically)
72
-
73
-
74
def is_in_bbox(bbox1, bbox2):
    """
    Tell whether ``bbox1`` lies completely inside ``bbox2`` (shared edges allowed).

    Parameters
    ----------
    bbox1 : list
        candidate inner box as ``[x0, y0, x1, y1]``
    bbox2 : list
        candidate outer box as ``[x0, y0, x1, y1]``

    Returns
    -------
    bool
        True if bbox1 is in bbox2, else False
    """
    inner_x0, inner_y0, inner_x1, inner_y1 = bbox1
    outer_x0, outer_y0, outer_x1, outer_y1 = bbox2

    starts_inside = inner_x0 >= outer_x0 and inner_y0 >= outer_y0
    ends_inside = inner_x1 <= outer_x1 and inner_y1 <= outer_y1
    return bool(starts_inside and ends_inside)
97
-
98
-
99
def calculate_para_bbox(lines):
    """
    Compute the smallest box that encloses every line of a paragraph.

    Parameters
    ----------
    lines : list
        lines, each a dict with a ``"bbox"`` entry ``[x0, y0, x1, y1]``

    Returns
    -------
    para_bbox : list
        bbox of the paragraph as ``[x0, y0, x1, y1]``
    """
    boxes = [line["bbox"] for line in lines]
    left = min(box[0] for box in boxes)
    top = min(box[1] for box in boxes)
    right = max(box[2] for box in boxes)
    bottom = max(box[3] for box in boxes)
    return [left, top, right, bottom]
118
-
119
-
120
def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """
    Check whether the line is right aligned with its neighbours.

    Two lines count as right aligned when their ``x1`` values differ by less
    than half an average character width.

    Parameters
    ----------
    curr_line_bbox : list
        bbox of the current line
    prev_line_bbox : list or None
        bbox of the previous line
    next_line_bbox : list or None
        bbox of the next line
    avg_char_width : float
        average of char widths
    direction : int
        0 for prev, 1 for next, 2 for both

    Returns
    -------
    bool
        True if the line is right aligned with every requested neighbour.
        False if a requested neighbour is missing or ``direction`` is unknown.
    """
    horizontal_thres = 0.5 * avg_char_width
    x1 = curr_line_bbox[2]

    def _aligned_with(neighbor_bbox):
        # A missing neighbour used to be replaced by a box at the origin, which
        # reported lines ending near x = 0 as aligned with nothing.
        if not neighbor_bbox:
            return False
        return abs(x1 - neighbor_bbox[2]) < horizontal_thres

    if direction == 0:
        return _aligned_with(prev_line_bbox)
    if direction == 1:
        return _aligned_with(next_line_bbox)
    if direction == 2:
        return _aligned_with(prev_line_bbox) and _aligned_with(next_line_bbox)
    return False
157
-
158
-
159
def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2):
    """
    Check whether the line is left aligned with its neighbours.

    Two lines count as left aligned when their ``x0`` values differ by less
    than half an average character width.

    Parameters
    ----------
    curr_line_bbox : list
        bbox of the current line
    prev_line_bbox : list or None
        bbox of the previous line
    next_line_bbox : list or None
        bbox of the next line
    avg_char_width : float
        average of char widths
    direction : int
        0 for prev, 1 for next, 2 for both

    Returns
    -------
    bool
        True if the line is left aligned with every requested neighbour.
        False if a requested neighbour is missing or ``direction`` is unknown.
    """
    horizontal_thres = 0.5 * avg_char_width
    x0 = curr_line_bbox[0]

    def _aligned_with(neighbor_bbox):
        # A missing neighbour used to be replaced by a box at the origin, which
        # reported lines starting near x = 0 as aligned with nothing.
        if not neighbor_bbox:
            return False
        return abs(x0 - neighbor_bbox[0]) < horizontal_thres

    if direction == 0:
        return _aligned_with(prev_line_bbox)
    if direction == 1:
        return _aligned_with(next_line_bbox)
    if direction == 2:
        return _aligned_with(prev_line_bbox) and _aligned_with(next_line_bbox)
    return False
196
-
197
-
198
def end_with_punctuation(line_text):
    """
    Check whether the line ends with a sentence-ending punctuation mark.

    Trailing whitespace is ignored.  Both the ASCII marks ``. ? !`` and the
    CJK marks (ideographic full stop, full-width question and exclamation
    mark) are recognised.

    Parameters
    ----------
    line_text : str
        text of the line

    Returns
    -------
    bool
        True if the last non-space character is an end mark; False otherwise,
        including for empty or whitespace-only text.
    """
    english_end_puncs = (".", "?", "!")
    # Written as escapes so the full-width forms cannot be folded into their
    # ASCII look-alikes: U+3002 ideographic full stop, U+FF1F full-width
    # question mark, U+FF01 full-width exclamation mark.
    chinese_end_puncs = ("\u3002", "\uff1f", "\uff01")

    return line_text.rstrip().endswith(english_end_puncs + chinese_end_puncs)
217
-
218
-
219
def is_nested_list(lst):
    """Return True when ``lst`` is a list that directly contains at least one list."""
    if not isinstance(lst, list):
        return False
    for element in lst:
        if isinstance(element, list):
            return True
    return False