mineru 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. mineru/backend/hybrid/__init__.py +1 -0
  2. mineru/backend/hybrid/hybrid_analyze.py +526 -0
  3. mineru/backend/hybrid/hybrid_magic_model.py +617 -0
  4. mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
  5. mineru/backend/pipeline/batch_analyze.py +9 -1
  6. mineru/backend/pipeline/model_init.py +96 -1
  7. mineru/backend/pipeline/pipeline_analyze.py +6 -4
  8. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
  9. mineru/backend/vlm/utils.py +3 -1
  10. mineru/backend/vlm/vlm_analyze.py +12 -12
  11. mineru/backend/vlm/vlm_magic_model.py +24 -89
  12. mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
  13. mineru/cli/client.py +17 -17
  14. mineru/cli/common.py +170 -20
  15. mineru/cli/fast_api.py +39 -13
  16. mineru/cli/gradio_app.py +232 -206
  17. mineru/model/mfd/yolo_v8.py +12 -6
  18. mineru/model/mfr/unimernet/Unimernet.py +71 -3
  19. mineru/resources/header.html +5 -1
  20. mineru/utils/boxbase.py +23 -0
  21. mineru/utils/char_utils.py +55 -0
  22. mineru/utils/engine_utils.py +74 -0
  23. mineru/utils/enum_class.py +18 -1
  24. mineru/utils/magic_model_utils.py +85 -2
  25. mineru/utils/span_pre_proc.py +5 -3
  26. mineru/utils/table_merge.py +5 -21
  27. mineru/version.py +1 -1
  28. mineru-2.7.0.dist-info/METADATA +433 -0
  29. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/RECORD +33 -27
  30. mineru-2.6.8.dist-info/METADATA +0 -954
  31. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/WHEEL +0 -0
  32. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/entry_points.txt +0 -0
  33. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/licenses/LICENSE.md +0 -0
  34. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,617 @@
1
+ import re
2
+ from typing import Literal
3
+
4
+ from loguru import logger
5
+
6
+ from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
7
+ from mineru.utils.enum_class import ContentType, BlockType, NotExtractType
8
+ from mineru.utils.guess_suffix_or_lang import guess_language_by_text
9
+ from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
10
+ from mineru.utils.span_block_fix import fix_text_block
11
+ from mineru.utils.span_pre_proc import txt_spans_extract
12
+
13
# Raw values of every NotExtractType member. In MagicModel.__init__, a block
# whose type is in this list (and VLM OCR is disabled) does not take its text
# from the block content; it is filled from page-level spans instead.
not_extract_list = [item.value for item in NotExtractType]
14
+
15
class MagicModel:
    """Sort one page's layout blocks into typed block lists.

    The constructor converts every block's bbox to page coordinates, builds
    spans/lines for each block, buckets the blocks by type and finally groups
    image/table/code bodies with their captions and footnotes. The results are
    read back through the ``get_*`` accessors.

    NOTE(review): the nesting of this class was reconstructed from a listing
    that had lost its indentation; confirm it against the packaged file.
    """

    def __init__(self,
        page_blocks: list,
        page_inline_formula,
        page_ocr_res,
        page,
        scale,
        page_pil_img,
        width,
        height,
        _ocr_enable,
        _vlm_ocr_enable,
    ):
        """Build all block lists for a single page.

        Args:
            page_blocks: Block dicts; each is read for the keys ``"bbox"``,
                ``"type"``, ``"content"`` and ``"angle"``. Blocks missing any
                of these are logged and skipped.
            page_inline_formula: Dicts with ``"bbox"``, ``"score"`` and an
                optional ``"latex"``. Mutated in place: ``"bbox"`` is
                overwritten and ``"latex"`` is popped. Only read when
                ``_vlm_ocr_enable`` is falsy.
            page_ocr_res: Dicts with ``"bbox"``, ``"category_id"``, ``"text"``
                and ``"score"``. ``"bbox"`` is overwritten in place. Only read
                when ``_vlm_ocr_enable`` is falsy.
            page, scale, page_pil_img: Only forwarded to ``txt_spans_extract``.
            width: Page width used to scale bbox x values.
            height: Page height used to scale bbox y values.
            _ocr_enable: When falsy (and VLM OCR is off) the collected spans
                are passed through ``txt_spans_extract``.
            _vlm_ocr_enable: When truthy every text block uses its own
                ``"content"``; the page-level span pool stays empty.

        Raises:
            ValueError: If a generated span is neither a dict with a
                ``"bbox"`` key nor a list.
        """
        self.page_blocks = page_blocks
        self.page_inline_formula = page_inline_formula
        self.page_ocr_res = page_ocr_res

        self.width = width
        self.height = height

        blocks = []
        self.all_spans = []

        # Page-level pool of text / inline-formula spans used by the span-fill path.
        page_text_inline_formula_spans = []
        if not _vlm_ocr_enable:
            for inline_formula in page_inline_formula:
                inline_formula["bbox"] = self.cal_real_bbox(inline_formula["bbox"])
                inline_formula_latex = inline_formula.pop("latex", "")
                # Formulas without latex text contribute no span.
                if inline_formula_latex:
                    page_text_inline_formula_spans.append({
                        "bbox": inline_formula["bbox"],
                        "type": ContentType.INLINE_EQUATION,
                        "content": inline_formula_latex,
                        "score": inline_formula["score"],
                    })
            for ocr_res in page_ocr_res:
                ocr_res["bbox"] = self.cal_real_bbox(ocr_res["bbox"])
                # NOTE(review): 15 is presumably the OCR text category id -- confirm.
                if ocr_res['category_id'] == 15:
                    page_text_inline_formula_spans.append({
                        "bbox": ocr_res["bbox"],
                        "type": ContentType.TEXT,
                        "content": ocr_res["text"],
                        "score": ocr_res["score"],
                    })
            if not _ocr_enable:
                # A single page-sized "text" block handed to txt_spans_extract.
                virtual_block = [0, 0, width, height, None, None, None, "text"]
                page_text_inline_formula_spans = txt_spans_extract(page, page_text_inline_formula_spans, page_pil_img, scale, [virtual_block],[])

        # Parse each block
        for index, block_info in enumerate(page_blocks):
            try:
                block_bbox = self.cal_real_bbox(block_info["bbox"])
                block_type = block_info["type"]
                block_content = block_info["content"]
                block_angle = block_info["angle"]

                # print(f"bbox: {block_bbox}")
                # print(f"type: {block_type}")
                # print(f"content: {block_content}")
                # print("-" * 50)
            except Exception as e:
                # Parsing failed, probably because the block is malformed; skip this block
                logger.warning(f"Invalid block format: {block_info}, error: {e}")
                continue

            span_type = "unknown"
            code_block_sub_type = None
            guess_lang = None

            if block_type in [
                "text",
                "title",
                "image_caption",
                "image_footnote",
                "table_caption",
                "table_footnote",
                "code_caption",
                "ref_text",
                "phonetic",
                "header",
                "footer",
                "page_number",
                "aside_text",
                "page_footnote",
                "list"
            ]:
                span_type = ContentType.TEXT
            elif block_type in ["image"]:
                block_type = BlockType.IMAGE_BODY
                span_type = ContentType.IMAGE
            elif block_type in ["table"]:
                block_type = BlockType.TABLE_BODY
                span_type = ContentType.TABLE
            elif block_type in ["code", "algorithm"]:
                block_content = code_content_clean(block_content)
                # Remember the original label before the type becomes CODE_BODY.
                code_block_sub_type = block_type
                block_type = BlockType.CODE_BODY
                span_type = ContentType.TEXT
                guess_lang = guess_language_by_text(block_content)
            elif block_type in ["equation"]:
                block_type = BlockType.INTERLINE_EQUATION
                span_type = ContentType.INTERLINE_EQUATION

            # For code / algorithm blocks: if the content contains inline formulas,
            # the sub-type must be switched to algorithm
            switch_code_to_algorithm = False

            span = None
            if span_type in ["image", "table"]:
                span = {
                    "bbox": block_bbox,
                    "type": span_type,
                }
                if span_type == ContentType.TABLE:
                    span["html"] = block_content
            elif span_type in [ContentType.INTERLINE_EQUATION]:
                span = {
                    "bbox": block_bbox,
                    "type": span_type,
                    "content": isolated_formula_clean(block_content),
                }
            elif _vlm_ocr_enable or block_type not in not_extract_list:
                # With vlm_ocr_enable, every text block uses the block's own content directly
                # Without vlm_ocr_enable, blocks whose type is in not_extract_list are
                # filled from spans instead (handled in the else branch further down)
                if block_content:
                    block_content = clean_content(block_content)

                # Split only when \( and \) are balanced and at least one pair exists.
                if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:

                    switch_code_to_algorithm = True

                    # Build a span list containing both text and formulas
                    spans = []
                    last_end = 0

                    # Find every formula
                    for match in re.finditer(r'\\\((.+?)\\\)', block_content):
                        start, end = match.span()

                        # Add the text preceding the formula
                        if start > last_end:
                            text_before = block_content[last_end:start]
                            if text_before.strip():
                                spans.append({
                                    "bbox": block_bbox,
                                    "type": ContentType.TEXT,
                                    "content": text_before
                                })

                        # Add the formula (without the \( and \) delimiters)
                        formula = match.group(1)
                        spans.append({
                            "bbox": block_bbox,
                            "type": ContentType.INLINE_EQUATION,
                            "content": formula.strip()
                        })

                        last_end = end

                    # Add the text after the last formula
                    if last_end < len(block_content):
                        text_after = block_content[last_end:]
                        if text_after.strip():
                            spans.append({
                                "bbox": block_bbox,
                                "type": ContentType.TEXT,
                                "content": text_after
                            })

                    span = spans
                else:
                    span = {
                        "bbox": block_bbox,
                        "type": span_type,
                        "content": block_content,
                    }

            if (
                span_type in ["image", "table", ContentType.INTERLINE_EQUATION]
                or (_vlm_ocr_enable or block_type not in not_extract_list)
            ):
                if span is None:
                    continue
                # Normalise span to a list and add it to all_spans
                if isinstance(span, dict) and "bbox" in span:
                    self.all_spans.append(span)
                    spans = [span]
                elif isinstance(span, list):
                    self.all_spans.extend(span)
                    spans = span
                else:
                    raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")

                # Build the line object
                if block_type in [BlockType.CODE_BODY]:
                    if switch_code_to_algorithm and code_block_sub_type == "code":
                        code_block_sub_type = "algorithm"
                    # "extra" is consumed (and deleted) after the code blocks are grouped below.
                    line = {"bbox": block_bbox, "spans": spans,
                            "extra": {"type": code_block_sub_type, "guess_lang": guess_lang}}
                else:
                    line = {"bbox": block_bbox, "spans": spans}

                block = {
                    "bbox": block_bbox,
                    "type": block_type,
                    "angle": block_angle,
                    "lines": [line],
                    "index": index,
                }

            else:  # span-fill path
                block_spans = []
                for span in page_text_inline_formula_spans:
                    # A span belongs to this block when more than half of it overlaps the block.
                    if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
                        block_spans.append(span)
                # Remove the spans already placed in block_spans from the page-level pool
                if len(block_spans) > 0:
                    for span in block_spans:
                        page_text_inline_formula_spans.remove(span)

                block = {
                    "bbox": block_bbox,
                    "type": block_type,
                    "angle": block_angle,
                    "spans": block_spans,
                    "index": index,
                }
                block = fix_text_block(block)

            blocks.append(block)

        self.image_blocks = []
        self.table_blocks = []
        self.interline_equation_blocks = []
        self.text_blocks = []
        self.title_blocks = []
        self.code_blocks = []
        self.discarded_blocks = []
        self.ref_text_blocks = []
        self.phonetic_blocks = []
        self.list_blocks = []
        # Bucket blocks by type; blocks of any other type are dropped.
        for block in blocks:
            if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
                self.image_blocks.append(block)
            elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
                self.table_blocks.append(block)
            elif block["type"] in [BlockType.CODE_BODY, BlockType.CODE_CAPTION]:
                self.code_blocks.append(block)
            elif block["type"] == BlockType.INTERLINE_EQUATION:
                self.interline_equation_blocks.append(block)
            elif block["type"] == BlockType.TEXT:
                self.text_blocks.append(block)
            elif block["type"] == BlockType.TITLE:
                self.title_blocks.append(block)
            elif block["type"] in [BlockType.REF_TEXT]:
                self.ref_text_blocks.append(block)
            elif block["type"] in [BlockType.PHONETIC]:
                self.phonetic_blocks.append(block)
            elif block["type"] in [BlockType.HEADER, BlockType.FOOTER, BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
                self.discarded_blocks.append(block)
            elif block["type"] == BlockType.LIST:
                self.list_blocks.append(block)
            else:
                continue

        self.list_blocks, self.text_blocks, self.ref_text_blocks = fix_list_blocks(self.list_blocks, self.text_blocks, self.ref_text_blocks)
        self.image_blocks, not_include_image_blocks = fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
        self.table_blocks, not_include_table_blocks = fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
        self.code_blocks, not_include_code_blocks = fix_two_layer_blocks(self.code_blocks, BlockType.CODE)
        # Lift the per-line "extra" metadata onto the grouped code block.
        for code_block in self.code_blocks:
            for block in code_block['blocks']:
                if block['type'] == BlockType.CODE_BODY:
                    if len(block["lines"]) > 0:
                        line = block["lines"][0]
                        code_block["sub_type"] = line["extra"]["type"]
                        # guess_lang is only copied for the "code" sub-type.
                        if code_block["sub_type"] in ["code"]:
                            code_block["guess_lang"] = line["extra"]["guess_lang"]
                        # NOTE(review): assumed to run for every sub-type (nesting
                        # reconstructed) -- confirm.
                        del line["extra"]
                    else:
                        code_block["sub_type"] = "code"
                        code_block["guess_lang"] = "txt"

        # Captions/footnotes that could not be attached to a body become plain text blocks.
        for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
            block["type"] = BlockType.TEXT
            self.text_blocks.append(block)

    def cal_real_bbox(self, bbox):
        """Scale a bbox by the page size and return it as an ordered int tuple.

        x values are multiplied by ``self.width`` and y values by
        ``self.height``, then truncated with ``int``. Coordinates are swapped
        where needed so that ``x1 <= x2`` and ``y1 <= y2``.
        """
        x1, y1, x2, y2 = bbox
        x_1, y_1, x_2, y_2 = (
            int(x1 * self.width),
            int(y1 * self.height),
            int(x2 * self.width),
            int(y2 * self.height),
        )
        if x_2 < x_1:
            x_1, x_2 = x_2, x_1
        if y_2 < y_1:
            y_1, y_2 = y_2, y_1
        bbox = (x_1, y_1, x_2, y_2)
        return bbox

    def get_list_blocks(self):
        """Return the list blocks built by the constructor."""
        return self.list_blocks

    def get_image_blocks(self):
        """Return the grouped image blocks built by the constructor."""
        return self.image_blocks

    def get_table_blocks(self):
        """Return the grouped table blocks built by the constructor."""
        return self.table_blocks

    def get_code_blocks(self):
        """Return the grouped code blocks built by the constructor."""
        return self.code_blocks

    def get_ref_text_blocks(self):
        """Return the ref_text blocks built by the constructor."""
        return self.ref_text_blocks

    def get_phonetic_blocks(self):
        """Return the phonetic blocks built by the constructor."""
        return self.phonetic_blocks

    def get_title_blocks(self):
        """Return the title blocks built by the constructor."""
        return self.title_blocks

    def get_text_blocks(self):
        """Return the text blocks built by the constructor."""
        return self.text_blocks

    def get_interline_equation_blocks(self):
        """Return the interline equation blocks built by the constructor."""
        return self.interline_equation_blocks

    def get_discarded_blocks(self):
        """Return header/footer/page-number/aside/page-footnote blocks."""
        return self.discarded_blocks

    def get_all_spans(self):
        """Return every span collected on the direct-content path."""
        return self.all_spans
347
+
348
+
349
def isolated_formula_clean(txt):
    r"""Return display-math LaTeX without its surrounding ``\[`` / ``\]``.

    Args:
        txt: Formula text, possibly wrapped in ``\[ ... \]`` and possibly
            padded with whitespace. ``None`` or an empty string is accepted.

    Returns:
        The formula with at most one leading ``\[`` and one trailing ``\]``
        removed and surrounding whitespace stripped; ``""`` for empty input.
    """
    if not txt:
        # A block without content yields an empty formula instead of a TypeError.
        return ""

    # Strip first so that padding around the delimiters does not hide them
    # from the prefix / suffix checks.
    latex = txt.strip()
    if latex.startswith("\\["):
        latex = latex[2:]
    if latex.endswith("\\]"):
        latex = latex[:-2]
    return latex.strip()
355
+
356
+
357
def code_content_clean(content):
    """Strip the opening and closing Markdown code-fence lines from code text.

    The first line is dropped when it starts with three backticks, and the
    last line is dropped when it is exactly three backticks (ignoring
    surrounding whitespace). Empty or missing input yields ``""``.
    """
    if not content:
        return ""

    rows = content.splitlines()

    # Opening fence (may carry a language tag, e.g. ```python).
    first = 1 if rows and rows[0].startswith("```") else 0

    # Closing fence; only looked at when something is left after the opener.
    last = len(rows)
    if rows and last > first and rows[last - 1].strip() == "```":
        last -= 1

    # Nothing left between the fences.
    if first >= last:
        return ""
    return "\n".join(rows[first:last]).strip()
378
+
379
+
380
def clean_content(content):
    r"""Turn balanced ``\[x\]`` pairs into plain ``[x]``.

    The text is returned untouched when it is empty/None, contains no ``\[``,
    or has a different number of ``\[`` and ``\]``.
    """
    if not content:
        return content

    opening_count = content.count("\\[")
    if opening_count == 0 or opening_count != content.count("\\]"):
        return content

    # Keep what sits between \[ and \] and wrap it in plain brackets.
    return re.sub(r'\\\[(.*?)\\\]', lambda found: f"[{found.group(1)}]", content)
393
+
394
+
395
def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
    """Index-based subject/object association wrapper.

    Builds two zero-argument callables -- one selecting the blocks of
    ``subject_block_type``, one selecting the blocks of ``object_block_type``
    -- and hands them to ``tie_up_category_by_index``. Each callable projects
    the matching blocks to their ``bbox``/``lines``/``index``/``angle`` keys
    and passes the result through ``reduct_overlap``.
    """
    def make_getter(wanted_type):
        # Selection is deferred until the returned callable is invoked.
        def getter():
            projected = [
                {"bbox": item["bbox"], "lines": item["lines"], "index": item["index"], "angle": item["angle"]}
                for item in blocks
                if item["type"] == wanted_type
            ]
            return reduct_overlap(projected)
        return getter

    # Delegate to the shared implementation
    return tie_up_category_by_index(
        make_getter(subject_block_type),
        make_getter(object_block_type)
    )
429
+
430
+
431
def get_type_blocks_by_index(blocks, block_type: Literal["image", "table", "code"]):
    """Organise blocks using the index-based matching strategy.

    Bodies of ``block_type`` are associated once with their captions and once
    with their footnotes; the two results are merged per body via ``sub_idx``.

    Args:
        blocks: Block dicts to group.
        block_type: Prefix used to build the ``*_body``, ``*_caption`` and
            ``*_footnote`` type names.

    Returns:
        One dict per caption association with the keys ``{block_type}_body``,
        ``{block_type}_caption_list`` and ``{block_type}_footnote_list``.
    """
    body_key = f"{block_type}_body"
    with_captions = __tie_up_category_by_index(blocks, body_key, f"{block_type}_caption")
    with_footnotes = __tie_up_category_by_index(blocks, body_key, f"{block_type}_footnote")

    ret = []
    for caption_group in with_captions:
        subject_idx = caption_group["sub_idx"]
        # First footnote association for the same body. A default is supplied so
        # a missing entry gives an empty footnote list rather than a bare
        # StopIteration escaping from next().
        footnote_group = next(
            (group for group in with_footnotes if group["sub_idx"] == subject_idx),
            None,
        )
        ret.append({
            body_key: caption_group["sub_bbox"],
            f"{block_type}_caption_list": caption_group["obj_bboxes"],
            f"{block_type}_footnote_list": footnote_group["obj_bboxes"] if footnote_group is not None else [],
        })
    return ret
446
+
447
+
448
def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
    """Group bodies with their captions/footnotes into two-layer blocks.

    Args:
        blocks: Flat block dicts of one family (body, caption, footnote).
            Side effect: the ``"type"`` key is removed from every element.
        fix_type: Family name; used to build the ``*_body``, ``*_caption``,
            ``*_caption_list`` and ``*_footnote_list`` keys.

    Returns:
        ``(fixed_blocks, not_include_blocks)``. Each fixed block is a dict with
        ``"type"`` (= ``fix_type``), ``"bbox"`` and ``"index"`` taken from the
        body, and ``"blocks"`` (body, captions and footnotes sorted by index).
        ``not_include_blocks`` holds rejected captions/footnotes plus every
        input block whose index was not placed into a fixed block.

    NOTE(review): nesting was reconstructed from a listing without
    indentation; in particular confirm that "step 4" belongs inside the
    table/image branch.
    """
    need_fix_blocks = get_type_blocks_by_index(blocks, fix_type)
    fixed_blocks = []
    not_include_blocks = []
    processed_indices = set()

    # Special handling for tables and images: footnotes must not precede their body
    if fix_type in ["table", "image"]:
        # Collect every misplaced footnote
        misplaced_footnotes = []  # holds (footnote, original block position)

        # Step 1: remove footnotes that violate the position requirement
        for block_idx, block in enumerate(need_fix_blocks):
            body = block[f"{fix_type}_body"]
            body_index = body["index"]

            # A footnote must sit at or after the body
            valid_footnotes = []
            for footnote in block[f"{fix_type}_footnote_list"]:
                if footnote["index"] >= body_index:
                    valid_footnotes.append(footnote)
                else:
                    misplaced_footnotes.append((footnote, block_idx))
            block[f"{fix_type}_footnote_list"] = valid_footnotes

        # Step 3 (there is no step 2 here): reassign misplaced footnotes to a suitable body
        for footnote, original_block_idx in misplaced_footnotes:
            footnote_index = footnote["index"]
            best_block_idx = None
            min_distance = float('inf')

            # Look for the nearest body whose index is <= footnote_index
            for idx, block in enumerate(need_fix_blocks):
                body_index = block[f"{fix_type}_body"]["index"]
                if body_index <= footnote_index and idx != original_block_idx:
                    distance = footnote_index - body_index
                    if distance < min_distance:
                        min_distance = distance
                        best_block_idx = idx

            if best_block_idx is not None:
                # Found a suitable body; append to that block's footnote_list
                need_fix_blocks[best_block_idx][f"{fix_type}_footnote_list"].append(footnote)
            else:
                # No suitable body; treat the footnote as an ordinary block
                not_include_blocks.append(footnote)

        # Step 4: pull elements with non-consecutive indices out of each block's
        # caption_list and footnote_list and treat them as ordinary blocks
        for block in need_fix_blocks:
            caption_list = block[f"{fix_type}_caption_list"]
            footnote_list = block[f"{fix_type}_footnote_list"]
            body_index = block[f"{fix_type}_body"]["index"]

            # Handle caption_list (walk the indices in descending order)
            if caption_list:
                # Sort by index descending and check from the highest index downwards
                caption_list.sort(key=lambda x: x["index"], reverse=True)
                filtered_captions = [caption_list[0]]
                for i in range(1, len(caption_list)):
                    prev_index = caption_list[i - 1]["index"]
                    curr_index = caption_list[i]["index"]

                    # Check whether the indices are consecutive
                    if curr_index == prev_index - 1:
                        filtered_captions.append(caption_list[i])
                    else:
                        # Check whether the gap contains only body_index
                        gap_indices = set(range(curr_index + 1, prev_index))
                        if gap_indices == {body_index}:
                            # Only the body sits in the gap, so it is not a real gap
                            filtered_captions.append(caption_list[i])
                        else:
                            # A real gap: all remaining captions become ordinary blocks
                            not_include_blocks.extend(caption_list[i:])
                            break
                # Restore ascending order
                filtered_captions.reverse()
                block[f"{fix_type}_caption_list"] = filtered_captions

            # Handle footnote_list (walk the indices in ascending order)
            if footnote_list:
                # Sort by index ascending and check from the lowest index upwards
                footnote_list.sort(key=lambda x: x["index"])
                filtered_footnotes = [footnote_list[0]]
                for i in range(1, len(footnote_list)):
                    # Check whether it directly follows the previous footnote
                    if footnote_list[i]["index"] == footnote_list[i - 1]["index"] + 1:
                        filtered_footnotes.append(footnote_list[i])
                    else:
                        # A gap: all remaining footnotes become ordinary blocks
                        not_include_blocks.extend(footnote_list[i:])
                        break
                block[f"{fix_type}_footnote_list"] = filtered_footnotes

    # Build the two-layer blocks
    for block in need_fix_blocks:
        body = block[f"{fix_type}_body"]
        caption_list = block[f"{fix_type}_caption_list"]
        footnote_list = block[f"{fix_type}_footnote_list"]

        body["type"] = f"{fix_type}_body"
        for caption in caption_list:
            caption["type"] = f"{fix_type}_caption"
            processed_indices.add(caption["index"])
        for footnote in footnote_list:
            footnote["type"] = f"{fix_type}_footnote"
            processed_indices.add(footnote["index"])

        processed_indices.add(body["index"])

        two_layer_block = {
            "type": fix_type,
            "bbox": body["bbox"],
            "blocks": [body],
            "index": body["index"],
        }
        two_layer_block["blocks"].extend([*caption_list, *footnote_list])
        # Sort the child blocks by index
        two_layer_block["blocks"].sort(key=lambda x: x["index"])

        fixed_blocks.append(two_layer_block)

    # Add the blocks that were not processed
    for block in blocks:
        # "type" is dropped from every input block, whether or not it is kept below.
        block.pop("type", None)
        if block["index"] not in processed_indices and block not in not_include_blocks:
            not_include_blocks.append(block)

    return fixed_blocks, not_include_blocks
577
+
578
+
579
def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
    """Move text / ref_text blocks that lie inside a list block into that list.

    Each list block gets a fresh ``"blocks"`` list and loses its ``"lines"``
    key. A text or ref_text block is attached to the first list block that
    covers at least 80% of it and is removed from its source list (both input
    lists are modified in place). List blocks that end up empty are dropped;
    the rest receive a ``"sub_type"`` equal to the most frequent child type.

    Returns:
        ``(list_blocks, text_blocks, ref_text_blocks)`` after the fix.
    """
    for container in list_blocks:
        container["blocks"] = []
        container.pop("lines", None)

    absorbed = []
    for candidate in text_blocks + ref_text_blocks:
        # First list block covering the candidate wins; later ones are not evaluated.
        owner = next(
            (
                container
                for container in list_blocks
                if calculate_overlap_area_in_bbox1_area_ratio(candidate["bbox"], container["bbox"]) >= 0.8
            ),
            None,
        )
        if owner is not None:
            owner["blocks"].append(candidate)
            absorbed.append(candidate)

    for candidate in absorbed:
        if candidate in text_blocks:
            text_blocks.remove(candidate)
        elif candidate in ref_text_blocks:
            ref_text_blocks.remove(candidate)

    # Drop list blocks that received no children
    kept_lists = [container for container in list_blocks if container["blocks"]]

    for container in kept_lists:
        # Tally the child types; the mode becomes the list block's sub_type
        tally = {}
        for child in container["blocks"]:
            child_type = child["type"]
            tally[child_type] = tally.get(child_type, 0) + 1

        container["sub_type"] = max(tally, key=tally.get) if tally else "unknown"

    return kept_lists, text_blocks, ref_text_blocks