docling-ibm-models 3.4.0__py3-none-any.whl → 3.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_ibm_models/reading_order/reading_order_rb.py +2 -2
- docling_ibm_models/tableformer/data_management/matching_post_processor.py +61 -96
- {docling_ibm_models-3.4.0.dist-info → docling_ibm_models-3.4.2.dist-info}/METADATA +1 -1
- {docling_ibm_models-3.4.0.dist-info → docling_ibm_models-3.4.2.dist-info}/RECORD +6 -6
- {docling_ibm_models-3.4.0.dist-info → docling_ibm_models-3.4.2.dist-info}/LICENSE +0 -0
- {docling_ibm_models-3.4.0.dist-info → docling_ibm_models-3.4.2.dist-info}/WHEEL +0 -0
@@ -201,8 +201,8 @@ class ReadingOrderPredictor:
|
|
201
201
|
)
|
202
202
|
):
|
203
203
|
|
204
|
-
m1 = re.fullmatch(".+([a-z
|
205
|
-
m2 = re.fullmatch("(\s*[a-z])(.+)", sorted_elements[ind_p1].text)
|
204
|
+
m1 = re.fullmatch(r".+([a-z,\-])(\s*)", elem.text)
|
205
|
+
m2 = re.fullmatch(r"(\s*[a-z])(.+)", sorted_elements[ind_p1].text)
|
206
206
|
|
207
207
|
if m1 and m2:
|
208
208
|
merges[elem.cid] = [sorted_elements[ind_p1].cid]
|
@@ -468,112 +468,77 @@ class MatchingPostProcessor:
|
|
468
468
|
return table_cells
|
469
469
|
|
470
470
|
def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
|
471
|
-
r"""
|
472
|
-
USED in 8.a step
|
473
|
-
NOT USED in 6. step
|
474
|
-
|
475
|
-
Align table cell bboxes with good matches
|
476
|
-
to encapsulate matching pdf cells
|
477
|
-
|
478
|
-
Parameters
|
479
|
-
----------
|
480
|
-
table_cells : list of dict
|
481
|
-
Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
|
482
|
-
pdf_cells : list of dict
|
483
|
-
List of PDF cells as defined by Docling
|
484
|
-
matches : dictionary of lists of table_cells
|
485
|
-
A dictionary which is indexed by the pdf_cell_id as key and the value is a list
|
486
|
-
of the table_cells that fall inside that pdf cell
|
487
|
-
|
488
|
-
Returns
|
489
|
-
-------
|
490
|
-
clean_table_cells : list of dict
|
491
|
-
Aligned and cleaned table cells
|
492
471
|
"""
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
for pdf_cell_id in matches:
|
498
|
-
match_list = matches[pdf_cell_id]
|
499
|
-
one_table_cells = []
|
500
|
-
for i in range(len(match_list)):
|
501
|
-
otc = int(match_list[i]["table_cell_id"])
|
502
|
-
if otc not in one_table_cells:
|
503
|
-
one_table_cells.append(otc)
|
504
|
-
|
505
|
-
# Get bbox of pdf_cell:
|
506
|
-
pdf_cell_bbox = []
|
507
|
-
for pdf_cell in pdf_cells:
|
508
|
-
if pdf_cell["id"] == int(pdf_cell_id):
|
509
|
-
pdf_cell_bbox = pdf_cell["bbox"]
|
510
|
-
|
511
|
-
# Get bbox of pdf_cell:
|
512
|
-
for table_cell in table_cells:
|
513
|
-
if table_cell["cell_id"] in one_table_cells:
|
514
|
-
# Align bbox vertically to cover PDF cell
|
515
|
-
new_bbox = [
|
516
|
-
pdf_cell_bbox[0],
|
517
|
-
pdf_cell_bbox[1],
|
518
|
-
pdf_cell_bbox[2],
|
519
|
-
pdf_cell_bbox[3],
|
520
|
-
]
|
521
|
-
# We are sure cell is not empty,
|
522
|
-
# because we assign PDF cell to it
|
523
|
-
new_table_cell_class = "2"
|
524
|
-
|
525
|
-
if "cell_class" in table_cell:
|
526
|
-
new_table_cell_class = table_cell["cell_class"]
|
527
|
-
|
528
|
-
new_table_cell = {
|
529
|
-
"bbox": new_bbox,
|
530
|
-
"cell_id": table_cell["cell_id"],
|
531
|
-
"column_id": table_cell["column_id"],
|
532
|
-
"label": table_cell["label"],
|
533
|
-
"row_id": table_cell["row_id"],
|
534
|
-
"cell_class": new_table_cell_class,
|
535
|
-
}
|
472
|
+
Align table cell bboxes with good matches to encapsulate matching pdf cells
|
473
|
+
"""
|
474
|
+
pdf_cell_dict = {pdf_cell["id"]: pdf_cell["bbox"] for pdf_cell in pdf_cells}
|
475
|
+
table_cell_dict = {cell["cell_id"]: cell for cell in table_cells}
|
536
476
|
|
537
|
-
|
538
|
-
|
539
|
-
if "rowspan_val" in table_cell:
|
540
|
-
new_table_cell["rowspan_val"] = table_cell["rowspan_val"]
|
541
|
-
new_table_cells.append(new_table_cell)
|
477
|
+
# Track unique cells we're going to add
|
478
|
+
processed_cells = set()
|
542
479
|
|
543
|
-
#
|
544
|
-
|
480
|
+
# First pass - create initial new_table_cells with aligned bboxes
|
481
|
+
new_table_cells = []
|
545
482
|
|
546
|
-
for
|
547
|
-
|
483
|
+
for pdf_cell_id, match_list in matches.items():
|
484
|
+
# Extract unique table cell ids from match_list
|
485
|
+
table_cell_ids = set(int(match["table_cell_id"]) for match in match_list)
|
548
486
|
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
487
|
+
# Get bbox of pdf_cell
|
488
|
+
pdf_cell_bbox = pdf_cell_dict.get(int(pdf_cell_id))
|
489
|
+
if not pdf_cell_bbox:
|
490
|
+
continue
|
553
491
|
|
554
|
-
|
492
|
+
# Process each unique table cell
|
493
|
+
for cell_id in table_cell_ids:
|
494
|
+
if cell_id in processed_cells:
|
495
|
+
continue
|
496
|
+
|
497
|
+
table_cell = table_cell_dict.get(cell_id)
|
498
|
+
if not table_cell:
|
499
|
+
continue
|
500
|
+
|
501
|
+
# Create new table cell with aligned bbox
|
502
|
+
new_table_cell = table_cell.copy()
|
503
|
+
new_table_cell["bbox"] = list(pdf_cell_bbox)
|
504
|
+
|
505
|
+
# Set cell class
|
506
|
+
if "cell_class" not in new_table_cell:
|
507
|
+
new_table_cell["cell_class"] = "2"
|
508
|
+
|
509
|
+
new_table_cells.append(new_table_cell)
|
510
|
+
processed_cells.add(cell_id)
|
511
|
+
|
512
|
+
# Second pass - aggregate bboxes for duplicate cells
|
513
|
+
cell_to_bboxes = {}
|
514
|
+
for cell in new_table_cells:
|
515
|
+
cell_id = cell["cell_id"]
|
516
|
+
if cell_id not in cell_to_bboxes:
|
517
|
+
cell_to_bboxes[cell_id] = []
|
518
|
+
cell_to_bboxes[cell_id].append(cell["bbox"])
|
519
|
+
|
520
|
+
# Create final clean table cells
|
521
|
+
clean_table_cells = []
|
522
|
+
processed_ids = set()
|
523
|
+
|
524
|
+
for cell in new_table_cells:
|
525
|
+
cell_id = cell["cell_id"]
|
526
|
+
if cell_id in processed_ids:
|
527
|
+
continue
|
555
528
|
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
529
|
+
bboxes = cell_to_bboxes[cell_id]
|
530
|
+
if len(bboxes) > 1:
|
531
|
+
# Merge bboxes
|
532
|
+
x1s = [bbox[0] for bbox in bboxes]
|
533
|
+
y1s = [bbox[1] for bbox in bboxes]
|
534
|
+
x2s = [bbox[2] for bbox in bboxes]
|
535
|
+
y2s = [bbox[3] for bbox in bboxes]
|
563
536
|
|
564
|
-
|
565
|
-
min_y1 = min(y1s)
|
566
|
-
max_x2 = max(x2s)
|
567
|
-
max_y2 = max(y2s)
|
537
|
+
cell["bbox"] = [min(x1s), min(y1s), max(x2s), max(y2s)]
|
568
538
|
|
569
|
-
|
570
|
-
|
539
|
+
clean_table_cells.append(cell)
|
540
|
+
processed_ids.add(cell_id)
|
571
541
|
|
572
|
-
clean_table_cells = [
|
573
|
-
i
|
574
|
-
for n, i in enumerate(new_table_cells)
|
575
|
-
if i not in new_table_cells[n + 1 :]
|
576
|
-
]
|
577
542
|
return clean_table_cells
|
578
543
|
|
579
544
|
def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling-ibm-models
|
3
|
-
Version: 3.4.0
|
3
|
+
Version: 3.4.2
|
4
4
|
Summary: This package contains the AI models used by the Docling PDF conversion package
|
5
5
|
License: MIT
|
6
6
|
Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
|
@@ -11,12 +11,12 @@ docling_ibm_models/layoutmodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
|
|
11
11
|
docling_ibm_models/layoutmodel/layout_predictor.py,sha256=ArVgs7FBOiu23TC-JoybcaTp7F7a4BgYC8uRVxTgx4E,5681
|
12
12
|
docling_ibm_models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
13
|
docling_ibm_models/reading_order/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
docling_ibm_models/reading_order/reading_order_rb.py,sha256=
|
14
|
+
docling_ibm_models/reading_order/reading_order_rb.py,sha256=Vk3ufc47w2FnVaLI5UGpxoBTZFcpWuIrSAaNGa9c5Rg,20416
|
15
15
|
docling_ibm_models/tableformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
16
|
docling_ibm_models/tableformer/common.py,sha256=2zgGZBFf4fXytEaXrZR2NU6FWdX2kxO0DHlGZmuvpNQ,3230
|
17
17
|
docling_ibm_models/tableformer/data_management/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
docling_ibm_models/tableformer/data_management/functional.py,sha256=kJntHEXFz2SP7obEcHyjAqZNZC9qh-U75MwUJALLADI,3143
|
19
|
-
docling_ibm_models/tableformer/data_management/matching_post_processor.py,sha256=
|
19
|
+
docling_ibm_models/tableformer/data_management/matching_post_processor.py,sha256=_MVbsm0l5aKP3ChvKhXFeZ2Gz_DHGLlyMbqbKTan_MU,56721
|
20
20
|
docling_ibm_models/tableformer/data_management/tf_cell_matcher.py,sha256=IdZTaWIRhPpyEwzZgCmviZnYacR6kbcUqBvx7ilmkKY,21250
|
21
21
|
docling_ibm_models/tableformer/data_management/tf_predictor.py,sha256=BHd6KdAX0-b9TbX01m0872MO10zWDMValyf4UTIRkAU,39008
|
22
22
|
docling_ibm_models/tableformer/data_management/transforms.py,sha256=NNaz_7GI7FCVmu_rJuenqH5VfzRSljJHUHpNQQ8Mq3Q,2983
|
@@ -34,7 +34,7 @@ docling_ibm_models/tableformer/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeu
|
|
34
34
|
docling_ibm_models/tableformer/utils/app_profiler.py,sha256=Pb7o1zcikKXh7ninaNt4_nVa1xuUrogZxbTr6U6jkEE,8392
|
35
35
|
docling_ibm_models/tableformer/utils/mem_monitor.py,sha256=NFZUnrfLThXNZQrm3ESRmPSJmPF2J1z3E2v_72O4dRw,6408
|
36
36
|
docling_ibm_models/tableformer/utils/utils.py,sha256=8Bxf1rEn977lFbY9NX0r5xh9PvxIRipQZX_EZW92XfA,10980
|
37
|
-
docling_ibm_models-3.4.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
38
|
-
docling_ibm_models-3.4.
|
39
|
-
docling_ibm_models-3.4.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
40
|
-
docling_ibm_models-3.4.0.dist-info/RECORD,,
|
37
|
+
docling_ibm_models-3.4.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
38
|
+
docling_ibm_models-3.4.2.dist-info/METADATA,sha256=AC30CNriUSKcviE24Sn1eIEkrwvXzoM5jiP7ImYR4VU,7434
|
39
|
+
docling_ibm_models-3.4.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
40
|
+
docling_ibm_models-3.4.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|