docling-ibm-models 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. docling_ibm_models/layoutmodel/layout_predictor.py +171 -0
  2. docling_ibm_models/tableformer/__init__.py +0 -0
  3. docling_ibm_models/tableformer/common.py +200 -0
  4. docling_ibm_models/tableformer/data_management/__init__.py +0 -0
  5. docling_ibm_models/tableformer/data_management/data_transformer.py +504 -0
  6. docling_ibm_models/tableformer/data_management/functional.py +574 -0
  7. docling_ibm_models/tableformer/data_management/matching_post_processor.py +1325 -0
  8. docling_ibm_models/tableformer/data_management/tf_cell_matcher.py +596 -0
  9. docling_ibm_models/tableformer/data_management/tf_dataset.py +1233 -0
  10. docling_ibm_models/tableformer/data_management/tf_predictor.py +1020 -0
  11. docling_ibm_models/tableformer/data_management/transforms.py +396 -0
  12. docling_ibm_models/tableformer/models/__init__.py +0 -0
  13. docling_ibm_models/tableformer/models/common/__init__.py +0 -0
  14. docling_ibm_models/tableformer/models/common/base_model.py +279 -0
  15. docling_ibm_models/tableformer/models/table04_rs/__init__.py +0 -0
  16. docling_ibm_models/tableformer/models/table04_rs/bbox_decoder_rs.py +163 -0
  17. docling_ibm_models/tableformer/models/table04_rs/encoder04_rs.py +72 -0
  18. docling_ibm_models/tableformer/models/table04_rs/tablemodel04_rs.py +324 -0
  19. docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py +203 -0
  20. docling_ibm_models/tableformer/otsl.py +541 -0
  21. docling_ibm_models/tableformer/settings.py +90 -0
  22. docling_ibm_models/tableformer/test_dataset_cache.py +37 -0
  23. docling_ibm_models/tableformer/test_prepare_image.py +99 -0
  24. docling_ibm_models/tableformer/utils/__init__.py +0 -0
  25. docling_ibm_models/tableformer/utils/app_profiler.py +243 -0
  26. docling_ibm_models/tableformer/utils/torch_utils.py +216 -0
  27. docling_ibm_models/tableformer/utils/utils.py +376 -0
  28. docling_ibm_models/tableformer/utils/variance.py +175 -0
  29. docling_ibm_models-0.1.0.dist-info/LICENSE +21 -0
  30. docling_ibm_models-0.1.0.dist-info/METADATA +172 -0
  31. docling_ibm_models-0.1.0.dist-info/RECORD +32 -0
  32. docling_ibm_models-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,1325 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+ import json
6
+ import logging
7
+ import statistics
8
+
9
+ import docling_ibm_models.tableformer.settings as s
10
+ from docling_ibm_models.tableformer.data_management.tf_cell_matcher import CellMatcher
11
+
12
+ LOG_LEVEL = logging.INFO
13
+ # LOG_LEVEL = logging.DEBUG
14
+
15
+
16
+ class MatchingPostProcessor:
17
+ r"""
18
+ The MatchingPostProcessor aims to improve the matchings between the predicted table cells and
19
+ the pdf cells
20
+ """
21
+
22
def __init__(self, config):
    r"""
    Store the configuration and create the cell matcher used by the matching steps

    Parameters
    ----------
    config : object
        Configuration object, also handed unchanged to CellMatcher
    """
    matcher = CellMatcher(config)
    self._config = config
    self._cell_matcher = matcher
25
+
26
def _log(self):
    r"""
    Return the logger obtained from settings.get_custom_logger
    for this class name and the module level LOG_LEVEL
    """
    logger_name = type(self).__name__
    return s.get_custom_logger(logger_name, LOG_LEVEL)
29
+
30
+ def _get_table_dimension(self, table_cells):
31
+ r"""
32
+ Get dimensions (columns, rows) of a table from table_cells
33
+
34
+ Parameters
35
+ ----------
36
+ table_cells : list of dict
37
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
38
+
39
+ Returns
40
+ -------
41
+ columns : integer,
42
+ rows : integer,
43
+ max_cell_id : integer,
44
+ highest cell_id in table_cells
45
+ """
46
+ columns = 1
47
+ rows = 1
48
+ max_cell_id = 0
49
+
50
+ for cell in table_cells:
51
+ if cell["column_id"] > columns:
52
+ columns = cell["column_id"]
53
+ if cell["row_id"] > rows:
54
+ rows = cell["row_id"]
55
+ if cell["cell_id"] > max_cell_id:
56
+ max_cell_id = cell["cell_id"]
57
+
58
+ return columns + 1, rows + 1, max_cell_id
59
+
60
+ def _get_good_bad_cells_in_column(self, table_cells, column, matches):
61
+ r"""
62
+ 1. step
63
+ Get good/bad IOU predicted cells for each structural column (of minimal grid)
64
+
65
+ Parameters
66
+ ----------
67
+ table_cells : list of dict
68
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
69
+ column : integer
70
+ Index of a column
71
+ matches : dictionary of lists of table_cells
72
+ A dictionary which is indexed by the pdf_cell_id as key and the value is a list
73
+ of the table_cells that fall inside that pdf cell
74
+
75
+ Returns
76
+ -------
77
+ good_table_cells : list of dict
78
+ cells in a column that have match
79
+ bad_table_cells : list of dict
80
+ cells in a column that don't have match
81
+ """
82
+ good_table_cells = []
83
+ bad_table_cells = []
84
+
85
+ for cell in table_cells:
86
+ if cell["column_id"] == column:
87
+ table_cell_id = cell["cell_id"]
88
+
89
+ bad_match = True
90
+ allow_class = True
91
+
92
+ for pdf_cell_id in matches:
93
+ # CHECK IF CELL CLASS TO BE VERIFIED HERE
94
+ if "cell_class" in cell:
95
+ if cell["cell_class"] <= 1:
96
+ allow_class = False
97
+ else:
98
+ print("***")
99
+ print("no cell_class in...")
100
+ print(cell)
101
+ print("***")
102
+ if allow_class:
103
+ match_list = matches[pdf_cell_id]
104
+ for match in match_list:
105
+ if match["table_cell_id"] == table_cell_id:
106
+ good_table_cells.append(cell)
107
+ bad_match = False
108
+ if bad_match:
109
+ bad_table_cells.append(cell)
110
+
111
+ return good_table_cells, bad_table_cells
112
+
113
+ def _delete_column_from_table(self, table_cells, column):
114
+ r"""
115
+ 1.a. step
116
+ If all IOU in a column are bad - eliminate column (from bboxes and structure)
117
+
118
+ Parameters
119
+ ----------
120
+ table_cells : list of dict
121
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
122
+ column : integer
123
+ Index of a column
124
+
125
+ Returns
126
+ -------
127
+ new_table_cells : list of dict
128
+ """
129
+ new_table_cells = []
130
+
131
+ for cell in table_cells:
132
+ if cell["column_id"] < column:
133
+ new_table_cells.append(cell)
134
+ if cell["column_id"] > column:
135
+ new_cell = {
136
+ "bbox": cell["bbox"],
137
+ "cell_id": cell["cell_id"],
138
+ "column_id": cell["column_id"] - 1,
139
+ "label": cell["label"],
140
+ "row_id": cell["row_id"],
141
+ "cell_class": cell["cell_class"],
142
+ }
143
+ new_table_cells.append(new_cell)
144
+
145
+ return new_table_cells
146
+
147
+ def _find_alignment_in_column(self, cells):
148
+ r"""
149
+ 2. step
150
+ Find alignment of good IOU cells per column
151
+
152
+ Parameters
153
+ ----------
154
+ cells : list of dict
155
+ Cells in a column
156
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
157
+
158
+ Returns
159
+ -------
160
+ alignment : string
161
+ column general alignment can be: "left", "right", "center"
162
+ """
163
+ possible_alignments = ["left", "middle", "right"]
164
+ alignment = "left" # left / right / center
165
+
166
+ lefts = []
167
+ rights = []
168
+ middles = []
169
+
170
+ for cell in cells:
171
+ x_left = cell["bbox"][0]
172
+ x_right = cell["bbox"][2]
173
+ x_middle = (x_left + x_right) / 2
174
+ lefts.append(x_left)
175
+ rights.append(x_right)
176
+ middles.append(x_middle)
177
+
178
+ if len(lefts) > 0:
179
+ delta_left = max(lefts) - min(lefts)
180
+ delta_middle = max(middles) - min(middles)
181
+ delta_right = max(rights) - min(rights)
182
+
183
+ deltas = [delta_left, delta_middle, delta_right]
184
+ align_index = deltas.index(min(deltas))
185
+ alignment = possible_alignments[align_index]
186
+
187
+ return alignment
188
+
189
+ def _get_median_pos_size(self, cells, alignment):
190
+ r"""
191
+ 3. step
192
+ Get median* (according to alignment) "bbox left/middle/right X" coord
193
+ for good IOU cells, get median* cell size in a column.
194
+
195
+ Parameters
196
+ ----------
197
+ cells : list of dict
198
+ Cells in a column
199
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
200
+ alignment : string
201
+ column general alignment can be: "left", "right", "center"
202
+
203
+ Returns
204
+ -------
205
+ median_x : number
206
+ Median X position of a cell (according to alignment)
207
+ median_y : number
208
+ Median Y position of a cell (according to alignment)
209
+ median_width : number
210
+ Median width of a cell
211
+ median_height : number
212
+ Median height of a cell
213
+ """
214
+ median_x = 0
215
+ median_y = 0
216
+ median_width = 1
217
+ median_height = 1
218
+
219
+ coords_x = []
220
+ coords_y = []
221
+ widths = []
222
+ heights = []
223
+
224
+ for cell in cells:
225
+ if "rowspan_val" not in cell:
226
+ if "colspan_val" not in cell:
227
+ if cell["cell_class"] > 1:
228
+ # Use left alignment
229
+ x_coord = cell["bbox"][0]
230
+ if alignment == "middle":
231
+ # Use middle alignment
232
+ x_coord = (cell["bbox"][2] + cell["bbox"][0]) / 2
233
+ if alignment == "right":
234
+ # Use right alignment
235
+ x_coord = cell["bbox"][2]
236
+
237
+ coords_x.append(x_coord)
238
+ y_coord = cell["bbox"][1]
239
+ coords_y.append(y_coord)
240
+
241
+ width = cell["bbox"][2] - cell["bbox"][0]
242
+ widths.append(width)
243
+ height = cell["bbox"][3] - cell["bbox"][1]
244
+ heights.append(height)
245
+ else:
246
+ self._log().debug("Empty cells not considered in medians")
247
+ self._log().debug(cell)
248
+ else:
249
+ self._log().debug("Colspans not considered in medians")
250
+ self._log().debug(cell)
251
+ else:
252
+ self._log().debug("Rowspans not considered in medians")
253
+ self._log().debug(cell)
254
+
255
+ if len(coords_x) > 0:
256
+ median_x = statistics.median(coords_x)
257
+ if len(coords_y) > 0:
258
+ median_y = statistics.median(coords_y)
259
+ if len(widths) > 0:
260
+ median_width = statistics.median(widths)
261
+ if len(heights) > 0:
262
+ median_height = statistics.median(heights)
263
+ return median_x, median_y, median_width, median_height
264
+
265
+ def _move_cells_to_left_pos(
266
+ self, cells, median_x, rescale, median_width, median_height, alignment
267
+ ):
268
+ r"""
269
+ 4. step
270
+ Move bad cells to the median* (left/middle/right) good in a column
271
+ (Additionally), re-size cell to median* size of cells in a column
272
+
273
+ Parameters
274
+ ----------
275
+ cells : list of dict
276
+ Cells in a column
277
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
278
+ median_x : number
279
+ Median X position of a cell (according to alignment)
280
+ rescale : boolean
281
+ should cells be re-sized to median or not
282
+ median_width : number
283
+ Median width of a cell
284
+ median_height : number
285
+ Median height of a cell
286
+ alignment : string
287
+ column general alignment can be: "left", "right", "center"
288
+
289
+ Returns
290
+ -------
291
+
292
+ new_table_cells : list of dict
293
+ Cells in a column
294
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
295
+ """
296
+ new_table_cells = []
297
+
298
+ for cell in cells:
299
+ new_cell = {
300
+ "bbox": [],
301
+ "cell_id": 0,
302
+ "column_id": 0,
303
+ "label": "",
304
+ "row_id": 0,
305
+ "cell_class": 0,
306
+ }
307
+ x1 = cell["bbox"][0]
308
+ y1 = cell["bbox"][1]
309
+ x2 = cell["bbox"][2]
310
+ y2 = cell["bbox"][3]
311
+ original_width = x2 - x1
312
+ # original_height = y2 - y1
313
+
314
+ # Move to left by default
315
+ new_x1 = median_x
316
+ new_y1 = y1
317
+ new_x2 = median_x + original_width
318
+ new_y2 = y2
319
+
320
+ if rescale:
321
+ new_x2 = median_x + median_width
322
+ # Next line does vertical resizing of BBOX:
323
+ new_y2 = y1 + median_height
324
+
325
+ # Move to middle
326
+ if alignment == "middle":
327
+ # TODO
328
+ new_x1 = median_x - (original_width / 2)
329
+ new_x2 = new_x1 + original_width
330
+ if rescale:
331
+ new_x1 = median_x - (median_width / 2)
332
+ new_x2 = median_x + (median_width / 2)
333
+
334
+ # Move to right
335
+ if alignment == "right":
336
+ new_x1 = median_x - original_width
337
+ new_x2 = median_x
338
+ if rescale:
339
+ new_x1 = median_x - median_width
340
+
341
+ new_cell["bbox"] = [new_x1, new_y1, new_x2, new_y2]
342
+ new_cell["cell_id"] = cell["cell_id"]
343
+ new_cell["column_id"] = cell["column_id"]
344
+ new_cell["label"] = cell["label"]
345
+ new_cell["row_id"] = cell["row_id"]
346
+ new_cell["cell_class"] = cell["cell_class"]
347
+ # Add spans if present
348
+ if "rowspan_val" in cell:
349
+ new_cell["rowspan_val"] = cell["rowspan_val"]
350
+ if "colspan_val" in cell:
351
+ new_cell["colspan_val"] = cell["colspan_val"]
352
+ new_table_cells.append(new_cell)
353
+ return new_table_cells
354
+
355
+ def _run_intersection_match(self, cell_matcher, table_cells, pdf_cells):
356
+ r"""
357
+ 5. step
358
+ Generate new matches, run Intersection over cell(pdf) on a table cells
359
+
360
+ Parameters
361
+ ----------
362
+ cell_matcher : CellMatcher
363
+ src.data_management.cell_matcher
364
+ table_cells : list of dict
365
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
366
+ pdf_cells : list of dict
367
+ List of PDF cells as defined by Docling
368
+
369
+ Returns
370
+ -------
371
+ clean_matches : dictionary of lists of table_cells
372
+ A dictionary which is indexed by the pdf_cell_id as key and the value is a list
373
+ of the table_cells that fall inside that pdf cell
374
+ """
375
+ new_matches = {}
376
+ clean_matches = {}
377
+ new_matches, matches_counter = cell_matcher._intersection_over_pdf_match(
378
+ table_cells, pdf_cells
379
+ )
380
+ clean_matches = new_matches
381
+ # Convert to JSON and back to have string keys in the dictionary
382
+ clean_matches_string = json.dumps(clean_matches)
383
+ clean_matches = json.loads(clean_matches_string)
384
+ return clean_matches
385
+
386
+ def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
387
+ r"""
388
+ USED in 8.a step
389
+ NOT USED in 6. step
390
+
391
+ Align table cell bboxes with good matches
392
+ to encapsulate matching pdf cells
393
+
394
+ Parameters
395
+ ----------
396
+ table_cells : list of dict
397
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
398
+ pdf_cells : list of dict
399
+ List of PDF cells as defined by Docling
400
+ matches : dictionary of lists of table_cells
401
+ A dictionary which is indexed by the pdf_cell_id as key and the value is a list
402
+ of the table_cells that fall inside that pdf cell
403
+
404
+ Returns
405
+ -------
406
+ clean_table_cells : list of dict
407
+ Aligned and cleaned table cells
408
+ """
409
+ # 6
410
+ # align table cells with matching pdf cells
411
+ new_table_cells = []
412
+
413
+ for pdf_cell_id in matches:
414
+ match_list = matches[pdf_cell_id]
415
+ one_table_cells = []
416
+ for i in range(len(match_list)):
417
+ otc = int(match_list[i]["table_cell_id"])
418
+ if otc not in one_table_cells:
419
+ one_table_cells.append(otc)
420
+
421
+ # Get bbox of pdf_cell:
422
+ pdf_cell_bbox = []
423
+ for pdf_cell in pdf_cells:
424
+ if pdf_cell["id"] == int(pdf_cell_id):
425
+ pdf_cell_bbox = pdf_cell["bbox"]
426
+
427
+ # Get bbox of pdf_cell:
428
+ for table_cell in table_cells:
429
+ if table_cell["cell_id"] in one_table_cells:
430
+ # Align bbox vertically to cover PDF cell
431
+ new_bbox = [
432
+ pdf_cell_bbox[0],
433
+ pdf_cell_bbox[1],
434
+ pdf_cell_bbox[2],
435
+ pdf_cell_bbox[3],
436
+ ]
437
+ # We are sure cell is not empty,
438
+ # because we assign PDF cell to it
439
+ new_table_cell_class = "2"
440
+
441
+ if "cell_class" in table_cell:
442
+ new_table_cell_class = table_cell["cell_class"]
443
+
444
+ new_table_cell = {
445
+ "bbox": new_bbox,
446
+ "cell_id": table_cell["cell_id"],
447
+ "column_id": table_cell["column_id"],
448
+ "label": table_cell["label"],
449
+ "row_id": table_cell["row_id"],
450
+ "cell_class": new_table_cell_class,
451
+ }
452
+
453
+ if "colspan_val" in table_cell:
454
+ new_table_cell["colspan_val"] = table_cell["colspan_val"]
455
+ if "rowspan_val" in table_cell:
456
+ new_table_cell["rowspan_val"] = table_cell["rowspan_val"]
457
+ new_table_cells.append(new_table_cell)
458
+
459
+ # Rebuild table_cells list deduplicating repeating cells,
460
+ # encapsulating all duplicate cells dimensions
461
+
462
+ for new_table_cell in new_table_cells:
463
+ cell_id_to_find = new_table_cell["cell_id"]
464
+
465
+ x1s = []
466
+ y1s = []
467
+ x2s = []
468
+ y2s = []
469
+
470
+ found = 0
471
+
472
+ for found_cell in new_table_cells:
473
+ if found_cell["cell_id"] == cell_id_to_find:
474
+ found += 1
475
+ x1s.append(found_cell["bbox"][0])
476
+ y1s.append(found_cell["bbox"][1])
477
+ x2s.append(found_cell["bbox"][2])
478
+ y2s.append(found_cell["bbox"][3])
479
+
480
+ min_x1 = min(x1s)
481
+ min_y1 = min(y1s)
482
+ max_x2 = max(x2s)
483
+ max_y2 = max(y2s)
484
+
485
+ if found > 1:
486
+ new_table_cell["bbox"] = [min_x1, min_y1, max_x2, max_y2]
487
+
488
+ clean_table_cells = [
489
+ i
490
+ for n, i in enumerate(new_table_cells)
491
+ if i not in new_table_cells[n + 1 :]
492
+ ]
493
+ return clean_table_cells
494
+
495
+ def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):
496
+ r"""
497
+ 7. step
498
+
499
+ De-duplicate columns in table_cells according to highest column score
500
+ in: matches + intersection_pdf_matches
501
+
502
+ Parameters
503
+ ----------
504
+ tab_columns : integer
505
+ Number of table columns
506
+ table_cells : list of dict
507
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
508
+ iou_matches : dictionary of lists of table_cells
509
+ Cell matches done using Intersection Over Union (IOU) method
510
+ ioc_matches : dictionary of lists of table_cells
511
+ Cell matches done using Intersection Over (PDF) Cell method
512
+
513
+ Returns
514
+ -------
515
+ new_table_cells : list of dict
516
+ New table cells with removed column duplicates
517
+ new_matches : dictionary of lists of table_cells
518
+ Matches that are in sync with new_table_cells
519
+ new_tab_columns : integer
520
+ New number of table columns
521
+ """
522
+ pdf_cells_in_columns = []
523
+ total_score_in_columns = []
524
+
525
+ for col in range(tab_columns):
526
+ column_table_cells = []
527
+ column_pdf_cells_iou = []
528
+ column_pdf_cells_ioc = []
529
+ column_pdf_cells = []
530
+ column_iou_score = 0
531
+ column_ioc_score = 0
532
+
533
+ for cell in table_cells:
534
+ if cell["column_id"] == col:
535
+ table_cell_id = cell["cell_id"]
536
+ column_table_cells.append(table_cell_id)
537
+
538
+ # SUM IOU + IOC Scores for column, collect all pdf_cell_id
539
+ for iou_key in iou_matches:
540
+ iou_match_list = iou_matches[iou_key]
541
+ for uk in range(len(iou_match_list)):
542
+ t_cell_id = iou_match_list[uk]["table_cell_id"]
543
+ if t_cell_id in column_table_cells:
544
+ if "iou" in iou_match_list[uk]:
545
+ # In case initial match was IOU
546
+ column_iou_score += iou_match_list[uk]["iou"]
547
+ elif "iopdf" in iou_match_list[uk]:
548
+ # Otherwise it's intersection over PDF match
549
+ column_iou_score += iou_match_list[uk]["iopdf"]
550
+ column_pdf_cells_iou.append(iou_key)
551
+
552
+ for ioc_key in ioc_matches:
553
+ ioc_match_list = ioc_matches[ioc_key]
554
+ for k in range(len(ioc_match_list)):
555
+ t_cell_id = ioc_match_list[k]["table_cell_id"]
556
+ if t_cell_id in column_table_cells:
557
+ column_ioc_score += ioc_match_list[k]["iopdf"]
558
+ column_pdf_cells_ioc.append(ioc_key)
559
+
560
+ column_pdf_cells = column_pdf_cells_iou
561
+ column_pdf_cells += list(
562
+ set(column_pdf_cells_ioc) - set(column_pdf_cells_iou)
563
+ )
564
+ column_total_score = column_iou_score + column_ioc_score
565
+
566
+ pdf_cells_in_columns.append(column_pdf_cells)
567
+ total_score_in_columns.append(column_total_score)
568
+ self._log().debug(
569
+ "Column: {}, Score:{}, PDF cells: {}".format(
570
+ col, column_total_score, column_pdf_cells
571
+ )
572
+ )
573
+
574
+ # Eliminate duplicates in the pdf_cells_in_columns and ensure int content
575
+ # pdf_cells_in_columns:
576
+ # - initially: list of lists of str with duplicates in the inner lists
577
+ # - afterwards: list of lists of int (unique)
578
+ pdf_cells_in_columns = [
579
+ list(set([x for x in map(lambda x: int(x), le)]))
580
+ for le in pdf_cells_in_columns
581
+ ]
582
+ cols_to_eliminate = []
583
+ # Pairwise comparison of all columns, finding intersection, and it's length
584
+ for cl in range(tab_columns - 1):
585
+ col_a = pdf_cells_in_columns[cl]
586
+ col_b = pdf_cells_in_columns[cl + 1]
587
+ score_a = total_score_in_columns[cl]
588
+ score_b = total_score_in_columns[cl + 1]
589
+ intsct = list(set(col_a).intersection(col_b))
590
+ int_prc = 0
591
+ if len(col_a) > 0:
592
+ int_prc = len(intsct) / len(col_a)
593
+ logstring = "Col A: {}, Col B: {}, Int: {}, %: {}, Score A: {}, Score B: {}"
594
+ self._log().debug(
595
+ logstring.format(cl, cl + 1, len(intsct), int_prc, score_a, score_b)
596
+ )
597
+
598
+ # Consider structural column elimination
599
+ # if 60% of two columns pointing to the same pdf cells
600
+ if int_prc > 0.6:
601
+ if score_a >= score_b:
602
+ # Elliminate B
603
+ cols_to_eliminate.append(cl + 1)
604
+ if score_b > score_a:
605
+ # Elliminate A
606
+ cols_to_eliminate.append(cl)
607
+
608
+ self._log().debug("Columns to eliminate: {}".format(cols_to_eliminate))
609
+ new_table_cells = []
610
+ new_matches = {}
611
+
612
+ removed_table_cell_ids = []
613
+ new_tab_columns = tab_columns - len(cols_to_eliminate)
614
+
615
+ # Clean table_cells structure
616
+ for tab_cell in table_cells:
617
+ add_cell = True
618
+ for col_del in cols_to_eliminate:
619
+ if tab_cell["column_id"] == col_del:
620
+ removed_table_cell_ids.append(tab_cell["cell_id"])
621
+ add_cell = False
622
+ if add_cell:
623
+ new_table_cells.append(tab_cell)
624
+ # Clean ioc_matches structure
625
+ for pdf_cell_id, pdf_cell_matches in ioc_matches.items():
626
+ new_cell_match = []
627
+ for pdf_match in pdf_cell_matches:
628
+ if pdf_match["table_cell_id"] not in removed_table_cell_ids:
629
+ new_cell_match.append(pdf_match)
630
+
631
+ if len(new_cell_match) > 0:
632
+ new_matches[pdf_cell_id] = new_cell_match
633
+
634
+ return new_table_cells, new_matches, new_tab_columns
635
+
636
+ def _do_final_asignment(self, table_cells, iou_matches, ioc_matches):
637
+ r"""
638
+ 8. step
639
+
640
+ Do final assignment of table bbox to pdf cell based on saved scores,
641
+ either preferring IOU over PDF Intersection, and higher Intersection over lower,
642
+ or just use PDF Intersection
643
+ Rule: 1 Table cell can contain many PDF cells,
644
+ but each PDF cell has to be asigned to one Table cell
645
+ Rule: Do not discard table bboxes at this point, asign all of them
646
+
647
+ Iterate over matches, if PDF cell has more than 1 table cell match:
648
+ Go over all other matches and delete tab_cell match of lower score
649
+ (prefer iou match over ioc match)
650
+
651
+ Parameters
652
+ ----------
653
+ table_cells : list of dict
654
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
655
+ iou_matches : dictionary of lists of table_cells
656
+ Cell matches done using Intersection Over Union (IOU) method
657
+ ioc_matches : dictionary of lists of table_cells
658
+ Cell matches done using Intersection Over (PDF) Cell method
659
+
660
+ Returns
661
+ -------
662
+ new_matches : dictionary of lists of table_cells
663
+ New matches with final table cell asignments
664
+ """
665
+ new_matches = {}
666
+
667
+ for pdf_cell_id, pdf_cell_matches in ioc_matches.items():
668
+ max_ioc_match = max(pdf_cell_matches, key=lambda x: x["iopdf"])
669
+ new_matches[pdf_cell_id] = [max_ioc_match]
670
+
671
+ return new_matches
672
+
673
+ def _merge_two_bboxes(self, bbox1, bbox2):
674
+ r"""
675
+ Merge two bboxes into one bboxes that encompasses the two
676
+
677
+ Parameters
678
+ ----------
679
+ bbox1 : list of numbers
680
+ bbox to be merged described as two corners [x1, y1, x2, y2]
681
+ bbox1 : list of numbers
682
+ bbox to be merged described as two corners [x1, y1, x2, y2]
683
+
684
+ Returns
685
+ -------
686
+ bbox_result : list of numbers
687
+ bbox that encompasses two input bboxes
688
+ """
689
+ bbox_result = [-1, -1, -1, -1]
690
+ bbox_result[0] = min([bbox1[0], bbox2[0]])
691
+ bbox_result[1] = min([bbox1[1], bbox2[1]])
692
+ bbox_result[2] = max([bbox1[2], bbox2[2]])
693
+ bbox_result[3] = max([bbox1[3], bbox2[3]])
694
+ return bbox_result
695
+
696
+ def _pick_orphan_cells(
697
+ self, tab_rows, tab_cols, max_cell_id, table_cells, pdf_cells, matches
698
+ ):
699
+ # 9.
700
+ # new_matches, new_table_cells, max_cell_id
701
+ r"""
702
+ 9. step
703
+
704
+ Pick up remaining orphan cells (pdf cells that don't have any matches or intersections)
705
+ 9.a. Determine row banding per row (min/max Y per row)
706
+ match with orphan cells (intersection with band)
707
+ 9.b. Determine column banding per column (min/max X per column)
708
+ match with orphan cells (intersection with band)
709
+ 9.c. Decide to which column/row each orphan PDF cell belongs
710
+ append match to existing structural cell, or create cell
711
+
712
+ Parameters
713
+ ----------
714
+ tab_rows : number
715
+ Total number of rows
716
+ tab_cols : number
717
+ Total number of columns
718
+ max_cell_id : number
719
+ Highest table cell id
720
+ table_cells : list of dict
721
+ Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
722
+ pdf_cells : list of dict
723
+ List of PDF cells as defined by Docling
724
+ matches : dictionary of lists of table_cells
725
+ A dictionary which is indexed by the pdf_cell_id as key and the value is a list
726
+ of the table_cells that fall inside that pdf cell
727
+
728
+ Returns
729
+ -------
730
+ new_matches : dictionary of lists of table_cells
731
+ updated matches
732
+ new_table_cells : list of dict
733
+ updated table cells
734
+ max_cell_id : number
735
+ New highest table cell id, accounting freshly added table cells (if any)
736
+ """
737
+
738
+ new_matches = matches
739
+ new_table_cells = table_cells
740
+
741
+ # Identify orphan rows (START)
742
+ orphan_rows = []
743
+ orphan_rows_depth = []
744
+ orphan_rows_bbox = []
745
+
746
+ # List with pdf_ids which are used in some (any) row
747
+ used_row_pdf_ids = []
748
+ used_row_rowid = []
749
+
750
+ for row in range(tab_rows):
751
+ bbox_y1s = [] # y2 > y1
752
+ bbox_y2s = []
753
+ row_y1 = -1
754
+ row_y2 = -1
755
+ row_table_cells = []
756
+ row_table_cell_ids = []
757
+ orphan_cells_in_row = []
758
+ orphan_cells_in_row_depth = []
759
+ orphan_cells_in_row_bbox = []
760
+
761
+ for cell in table_cells:
762
+ if cell["row_id"] == row:
763
+ # Do not consider spanned cells
764
+ if "rowspan_val" not in cell:
765
+ # Do not consider empty cells
766
+ if cell["cell_class"] > 1:
767
+ table_cell_id = cell["cell_id"]
768
+ row_table_cells.append(cell)
769
+ row_table_cell_ids.append(table_cell_id)
770
+ bbox_y1s.append(cell["bbox"][1])
771
+ bbox_y2s.append(cell["bbox"][3])
772
+
773
+ # Y coordinates that define band of rows
774
+ if len(bbox_y1s) > 0:
775
+ row_y1 = min(bbox_y1s)
776
+ if len(bbox_y2s) > 0:
777
+ row_y2 = max(bbox_y2s)
778
+
779
+ # Find "orphan" cells that intersect the band
780
+ for pdf_cell in pdf_cells:
781
+ pdf_str_id = str(pdf_cell["id"])
782
+ if pdf_str_id not in matches:
783
+ within_band = False
784
+ depth = -1
785
+
786
+ centroid_band = (row_y2 + row_y1) / 2
787
+ centroid_cell = (pdf_cell["bbox"][3] + pdf_cell["bbox"][1]) / 2
788
+
789
+ # pdf_cell - Orphan, and don't have any match
790
+ if pdf_cell["bbox"][1] >= row_y1 and pdf_cell["bbox"][1] <= row_y2:
791
+ depth = abs(centroid_band - centroid_cell)
792
+ within_band = True
793
+
794
+ if pdf_cell["bbox"][3] >= row_y1 and pdf_cell["bbox"][3] <= row_y2:
795
+ depth = abs(centroid_band - centroid_cell)
796
+ within_band = True
797
+
798
+ if pdf_cell["bbox"][1] <= row_y1 and pdf_cell["bbox"][3] >= row_y2:
799
+ depth = abs(centroid_band - centroid_cell)
800
+ within_band = True
801
+
802
+ if within_band:
803
+ if pdf_str_id not in used_row_pdf_ids:
804
+ used_row_pdf_ids.append(pdf_str_id)
805
+ used_row_rowid.append(row)
806
+ orphan_cells_in_row.append(pdf_str_id)
807
+ orphan_cells_in_row_depth.append(round(depth))
808
+ orphan_cells_in_row_bbox.append(pdf_cell["bbox"])
809
+ else:
810
+ self._log().debug("Found duplicate: {}".format(pdf_str_id))
811
+ # Get index of a row where pdf cell was already detected
812
+ used_ind = used_row_pdf_ids.index(pdf_str_id)
813
+ duplicate_id_found_in_row = used_row_rowid[used_ind]
814
+ valid_duplicate = False
815
+ if len(orphan_rows) > duplicate_id_found_in_row:
816
+ index_in_row_list = orphan_rows[
817
+ duplicate_id_found_in_row
818
+ ].index(pdf_str_id)
819
+ valid_duplicate = True
820
+
821
+ if valid_duplicate:
822
+ i1 = duplicate_id_found_in_row
823
+ i2 = index_in_row_list
824
+ score1 = orphan_rows_depth[i1][i2]
825
+ score2 = round(depth)
826
+ # If new cell better than the old one
827
+ if score2 < score1:
828
+ # Delete old record about the pdf cell...
829
+ orphan_rows[i1].pop(index_in_row_list)
830
+ orphan_rows_depth[i1].pop(index_in_row_list)
831
+ orphan_rows_bbox[i1].pop(index_in_row_list)
832
+
833
+ used_row_pdf_ids.pop(used_ind)
834
+ used_row_rowid.pop(used_ind)
835
+ # Then proceed adding new cell
836
+ used_row_pdf_ids.append(pdf_str_id)
837
+ used_row_rowid.append(row)
838
+ orphan_cells_in_row.append(pdf_str_id)
839
+ orphan_cells_in_row_depth.append(round(depth))
840
+ orphan_cells_in_row_bbox.append(pdf_cell["bbox"])
841
+ msg = "Resolved duplicate: {} in favor of new one"
842
+ self._log().debug(msg.format(pdf_str_id))
843
+ else:
844
+ msg = "Resolved duplicate: {} in favor of old one"
845
+ self._log().debug(msg.format(pdf_str_id))
846
+
847
+ log_msg = "Row: {}, Band: {}/{}, Orphan PDF cells: {}"
848
+ self._log().debug(log_msg.format(row, row_y1, row_y2, orphan_cells_in_row))
849
+ orphan_rows.append(orphan_cells_in_row)
850
+ orphan_rows_depth.append(orphan_cells_in_row_depth)
851
+ orphan_rows_bbox.append(orphan_cells_in_row_bbox)
852
+
853
+ # Identify orphan rows (END)
854
+ self._log().debug("...")
855
+ # Identify orphan columns
856
+ orphan_columns = []
857
+ orphan_columns_depth = []
858
+ orphan_columns_bbox = []
859
+ used_col_pdf_ids = []
860
+ used_col_columnid = []
861
+
862
+ for col in range(tab_cols):
863
+ bbox_x1s = [] # y2 > y1
864
+ bbox_x2s = []
865
+ col_x1 = -1
866
+ col_x2 = -1
867
+ col_table_cells = []
868
+ col_table_cell_ids = []
869
+ orphan_cells_in_col = []
870
+ orphan_cells_in_col_depth = []
871
+ orphan_cells_in_col_bbox = []
872
+
873
+ for cell in table_cells:
874
+ if cell["column_id"] == col:
875
+ # Do not consider spanned cells
876
+ if "colspan_val" not in cell:
877
+ # Do not consider empty cells
878
+ if cell["cell_class"] > 1:
879
+ table_cell_id = cell["cell_id"]
880
+ col_table_cells.append(cell)
881
+ col_table_cell_ids.append(table_cell_id)
882
+ bbox_x1s.append(cell["bbox"][0])
883
+ bbox_x2s.append(cell["bbox"][2])
884
+ else:
885
+ wrn_txt = "Orphan matching skipped cell in column {} because of colspan"
886
+ self._log().debug(wrn_txt.format(col))
887
+ # self._log().info(cell)
888
+
889
+ # X coordinates that define band of columns
890
+ if len(bbox_x1s) > 0:
891
+ col_x1 = min(bbox_x1s)
892
+ if len(bbox_x2s) > 0:
893
+ col_x2 = max(bbox_x2s)
894
+
895
+ # Find "orphan" cells that intersect the band
896
+ for pdf_cell in pdf_cells:
897
+ pdf_str_id = str(pdf_cell["id"])
898
+ if pdf_str_id not in matches:
899
+ within_band = False
900
+ depth = -1
901
+
902
+ centroid_band = (col_x2 + col_x1) / 2
903
+ centroid_cell = (pdf_cell["bbox"][2] + pdf_cell["bbox"][0]) / 2
904
+
905
+ if pdf_cell["bbox"][0] >= col_x1 and pdf_cell["bbox"][0] <= col_x2:
906
+ depth = abs(centroid_band - centroid_cell)
907
+ within_band = True
908
+
909
+ if pdf_cell["bbox"][2] >= col_x1 and pdf_cell["bbox"][2] <= col_x2:
910
+ depth = abs(centroid_band - centroid_cell)
911
+ within_band = True
912
+
913
+ if pdf_cell["bbox"][0] < col_x1 and pdf_cell["bbox"][2] > col_x2:
914
+ depth = abs(centroid_band - centroid_cell)
915
+ within_band = True
916
+
917
+ if within_band:
918
+ if pdf_str_id not in used_col_pdf_ids:
919
+ used_col_pdf_ids.append(pdf_str_id)
920
+ used_col_columnid.append(col)
921
+ orphan_cells_in_col.append(pdf_str_id)
922
+ orphan_cells_in_col_depth.append(round(depth))
923
+ orphan_cells_in_col_bbox.append(pdf_cell["bbox"])
924
+ else:
925
+ self._log().debug("Found duplicate: {}".format(pdf_str_id))
926
+ # Get index of a column where pdf cell was already detected
927
+ used_ind = used_col_pdf_ids.index(pdf_str_id)
928
+ duplicate_id_found_in_column = used_col_columnid[used_ind]
929
+
930
+ valid_col_duplicate = False
931
+ if len(orphan_columns) > duplicate_id_found_in_column:
932
+ index_in_col_list = orphan_columns[
933
+ duplicate_id_found_in_column
934
+ ].index(pdf_str_id)
935
+ valid_col_duplicate = True
936
+
937
+ if valid_col_duplicate:
938
+ i1 = duplicate_id_found_in_column
939
+ i2 = index_in_col_list
940
+ score1 = orphan_columns_depth[i1][i2]
941
+ score2 = round(depth)
942
+ # If new cell better than the old one
943
+ if score2 < score1:
944
+ # Delete old record about the pdf cell...
945
+ orphan_columns[i1].pop(index_in_col_list)
946
+ orphan_columns_depth[i1].pop(index_in_col_list)
947
+ orphan_columns_bbox[i1].pop(index_in_col_list)
948
+
949
+ used_col_pdf_ids.pop(used_ind)
950
+ used_col_columnid.pop(used_ind)
951
+ # Then proceed adding new cell
952
+ used_col_pdf_ids.append(pdf_str_id)
953
+ used_col_columnid.append(col)
954
+ orphan_cells_in_col.append(pdf_str_id)
955
+ orphan_cells_in_col_depth.append(round(depth))
956
+ orphan_cells_in_col_bbox.append(pdf_cell["bbox"])
957
+ msg = "Resolved duplicate: {} in favor of new one"
958
+ self._log().debug(msg.format(pdf_str_id))
959
+ else:
960
+ msg = "Resolved duplicate: {} in favor of old one"
961
+ self._log().debug(msg.format(pdf_str_id))
962
+
963
+ orphan_columns.append(orphan_cells_in_col)
964
+ orphan_columns_depth.append(orphan_cells_in_col_depth)
965
+ orphan_columns_bbox.append(orphan_cells_in_col_bbox)
966
+
967
+ # Assign to structural cells and/or create new cells when absent
968
+
969
+ for col_ind in range(len(orphan_columns)):
970
+ self._log().debug(
971
+ "Col: {}, Orphan PDF cells: {}".format(col_ind, orphan_columns[col_ind])
972
+ )
973
+ self._log().debug(
974
+ "Col: {}, Orphan Depth: {}".format(
975
+ col_ind, orphan_columns_depth[col_ind]
976
+ )
977
+ )
978
+ self._log().debug("...")
979
+
980
+ # Collect the pdf_ids from the orphan_rows and sort them in order to produce the same
981
+ # results with the c++ implementation
982
+ orphan_rows_pdf_ids = []
983
+ row_id_per_pdf_id = {} # pdf_cell_id -> row_id
984
+ for row_id, row_pdf_ids in enumerate(orphan_rows):
985
+ if len(row_pdf_ids) == 0:
986
+ continue
987
+ # Extend the orphan_rows_pdf_ids with the pdf_ids as int
988
+ orphan_rows_pdf_ids.extend([int(x) for x in row_pdf_ids])
989
+ # Set the row_id for the pdf_ids
990
+ for pdf_cell_id in row_pdf_ids:
991
+ row_id_per_pdf_id[int(pdf_cell_id)] = row_id
992
+
993
+ orphan_rows_pdf_ids.sort()
994
+
995
+ # Assign Table cell Row ID / Table cell Column ID to orphans,
996
+ # Check if Table cell doesn't exist in the table_cells, create one,
997
+ # add match to new_matches
998
+ for pdf_cell_id_int in orphan_rows_pdf_ids:
999
+ new_row_id = row_id_per_pdf_id[pdf_cell_id_int]
1000
+ new_column_id = 0
1001
+ pdf_cell_id = str(pdf_cell_id_int)
1002
+
1003
+ if pdf_cell_id in used_col_pdf_ids:
1004
+ new_column_id = used_col_columnid[used_col_pdf_ids.index(pdf_cell_id)]
1005
+
1006
+ self._log().debug(
1007
+ "new_column_id {}, pdf_cell_id {}".format(
1008
+ new_column_id, pdf_cell_id
1009
+ )
1010
+ )
1011
+ self._log().debug(orphan_columns[new_column_id])
1012
+ depth_index = orphan_columns[new_column_id].index(pdf_cell_id)
1013
+ confidence = orphan_columns_depth[new_column_id][depth_index]
1014
+ pdf_bbox = orphan_columns_bbox[new_column_id][depth_index]
1015
+
1016
+ # 1. Find table_cell_id by new_row_id / new_column_id
1017
+ new_table_cell_id = -1
1018
+ tcell = list(
1019
+ filter(
1020
+ lambda table_cell: table_cell["row_id"] == new_row_id
1021
+ and table_cell["column_id"] == new_column_id,
1022
+ table_cells,
1023
+ )
1024
+ )
1025
+
1026
+ if len(tcell) > 0:
1027
+ new_table_cell_id = tcell[0]["cell_id"]
1028
+ self._log().debug(
1029
+ "reusing table_cell_id: {}".format(new_table_cell_id)
1030
+ )
1031
+
1032
+ for i in range(len(new_table_cells)):
1033
+ if new_table_cells[i]["cell_id"] == new_table_cell_id:
1034
+ bbox_tmp = self._merge_two_bboxes(
1035
+ new_table_cells[i]["bbox"], pdf_bbox
1036
+ )
1037
+ new_table_cells[i]["bbox"] = bbox_tmp
1038
+
1039
+ if new_table_cell_id < 0:
1040
+ max_cell_id += 1
1041
+ new_table_cell_id = max_cell_id
1042
+
1043
+ new_table_cell = {
1044
+ "bbox": pdf_bbox,
1045
+ "cell_id": new_table_cell_id,
1046
+ "column_id": new_column_id,
1047
+ "label": "body",
1048
+ "row_id": new_row_id,
1049
+ "cell_class": 2,
1050
+ }
1051
+ self._log().debug(
1052
+ "making new table_cell_id: {}".format(new_table_cell_id)
1053
+ )
1054
+ new_table_cells.append(new_table_cell)
1055
+
1056
+ # And then add new match to the new_matches
1057
+ new_matches[str(pdf_cell_id)] = [
1058
+ {"post": confidence, "table_cell_id": new_table_cell_id}
1059
+ ]
1060
+ return new_matches, new_table_cells, max_cell_id
1061
+
1062
def _clear_pdf_cells(self, pdf_cells):
    r"""
    Drop the PDF cells whose text is an empty string

    Only cells whose "text" is exactly "" are removed; cells holding
    whitespace-only text are kept. The input is not modified and the
    surviving cell dicts are reused (not copied), in their original order.

    Parameters
    ----------
    pdf_cells : iterable of dict
        PDF cells as defined by Docling; every cell must have a "text" key

    Returns
    -------
    new_pdf_cells : list of dict
        New list with the cells that have non-empty text

    Raises
    ------
    KeyError
        If a cell has no "text" key
    """
    return [pdf_cell for pdf_cell in pdf_cells if pdf_cell["text"] != ""]
1081
+
1082
def process(self, matching_details):
    r"""
    Post-process the initial (IOU) matches between table cells and PDF cells

    Reads "table_cells", "pdf_cells" and "matches" from matching_details,
    runs the steps described in the comments below and writes the results
    back under the same three keys. The dictionary is updated in place and
    also returned. Note that "pdf_cells" is replaced by the list without
    empty-text cells.

    Parameters
    ----------
    matching_details : dictionary
        contains all the necessary information for Docling processing
        already has predictions and initial (IOU) matches

    Returns
    -------
    matching_details : dictionary
        the same object, now holding the post-processed table cells,
        the post-processed matches and the cleaned PDF cells
    """

    # ====================================================================================
    # Start post-processing
    # ====================================================================================

    # General description of post-processing algorithm to improve matching

    # Uses: (IOU) matches, pdf_cells, table_cells
    # Generates: new_table_cells, new_matches

    # +0. Get minimal grid table dimension (cols/rows)
    # +1. Get good/bad IOU predicted cells for each structural column (of minimal grid)
    # +1.a. If all IOU in a column are bad - eliminate column (from bboxes and structure)
    # +2. Find alignment of good IOU cells per column
    # +2.a. For this, measure min-max distance for left, min-max distance for centroid,
    #       min-max distance for right side of cell rectangles
    #       smallest distance would determine Left / Center / Right alignment
    # +3. Get median* (according to alignment) "bbox left/middle/right X" coord
    #     for good IOU cells, get median* cell size in a column.
    # +4. Move bad cells to the median* (left/middle/right) good in a column
    # +4.a. (Additionally), re-size cell to median* size of cells in a column
    # +5. Generate new matches, run Intersection over cell(pdf) on a table cells

    # NOT USED +6. Align table cell bboxes with good matches
    #              to encapsulate matching pdf cells

    # +7. De-duplicate columns in table_cells according to highest column score
    #     in: matches + intersection_pdf_matches
    # +8. Do final assignment of table bbox to pdf cell based on saved scores,
    #     either preferring IOU over PDF Intersection, and higher Intersection over lower,
    #     or just use PDF Intersection
    #     Rule: 1 Table cell can contain many PDF cells,
    #           but each PDF cell has to be assigned to one Table cell
    #     Rule: Do not discard table bboxes at this point, assign all of them
    # +8.a. Align table cell bboxes with matched pdf cells bboxes (instead of step 6)

    # +9. Pick up remaining orphan cells
    # +9.a. Determine row banding per row (min/max Y per row)
    #       match with orphan cells (intersection with band)
    # +9.b. Determine column banding per column (min/max X per column)
    #       match with orphan cells (intersection with band)
    # +9.c. Decide to which column/row each orphan PDF cell belongs
    #       append match to existing structural cell,
    #       or create cell
    # ?9.d. For each pdf-cell from the list find closest and MATCH with row-table-cell
    #       (minimal rectangle to-rectangle distance)

    # *Use median instead of average to account for occasional
    #  colspans that would span across columns,
    #  as they are rare they shouldn't influence much a median position
    #  of other cells in a minimal-grid column

    self._log().debug("Start prediction post-processing...")
    table_cells = matching_details["table_cells"]
    pdf_cells = self._clear_pdf_cells(matching_details["pdf_cells"])
    matches = matching_details["matches"]

    # ------------------------------------------------------------------------------------------
    # -1. If initial (IOU) matches are empty,
    #     generate new ones based on intersection over cell

    if not matches:
        banner = "-----------------------------------------------------------------"
        self._log().debug(banner)
        self._log().debug(banner)
        self._log().debug(
            "- NO INITIAL MATCHES TO POST PROCESS, GENERATING NEW ONES... -"
        )
        self._log().debug(banner)
        self._log().debug(banner)
        matches = self._run_intersection_match(
            self._cell_matcher, table_cells, pdf_cells
        )

    # ------------------------------------------------------------------------------------------
    # 0. Get minimal grid table dimension (cols/rows)
    tab_columns, tab_rows, max_cell_id = self._get_table_dimension(table_cells)
    self._log().debug(
        "COLS {}/ ROWS {}/ MAX CELL ID {}".format(
            tab_columns, tab_rows, max_cell_id
        )
    )

    # Good cells plus repositioned bad cells, accumulated column by column
    fixed_table_cells = []

    # 1. Get good/bad IOU predicted cells for each structural column (of minimal grid)
    for col in range(tab_columns):
        good_table_cells, bad_table_cells = self._get_good_bad_cells_in_column(
            table_cells, col, matches
        )
        self._log().debug(
            "COLUMN {}, Good table cells: {}".format(col, len(good_table_cells))
        )
        self._log().debug(
            "COLUMN {}, Bad table cells: {}".format(col, len(bad_table_cells))
        )

        # 2. Find alignment of good IOU cells per column
        alignment = self._find_alignment_in_column(good_table_cells)
        self._log().debug("COLUMN {}, Alignment: {}".format(col, alignment))

        # 3. Get median (according to alignment) "bbox left/middle/right X"
        #    coordinate for good IOU cells, get median* cell size in a column.
        #    The second returned value (median Y) is not used here.
        median_x, _, median_width, median_height = self._get_median_pos_size(
            good_table_cells, alignment
        )
        self._log().debug("Median good X = {}".format(median_x))

        # 4. Move bad cells to the median* (left/middle/right) good in a column
        # NOTE(review): the third positional argument is passed as False; the
        # original carried an open TODO and a commented-out variant passing
        # True - confirm the intended value against _move_cells_to_left_pos
        new_bad_table_cells = self._move_cells_to_left_pos(
            bad_table_cells, median_x, False, median_width, median_height, alignment
        )
        fixed_table_cells.extend(good_table_cells)
        fixed_table_cells.extend(new_bad_table_cells)

    # ====================================================================================
    # Sort table_cells by cell_id before running IOU, to have correct indexes on the output
    fixed_table_cells_sorted = sorted(fixed_table_cells, key=lambda k: k["cell_id"])

    # 5. Generate new matches, run Intersection over cell(pdf) on a table cells
    intersection_pdf_matches = self._run_intersection_match(
        self._cell_matcher, fixed_table_cells_sorted, pdf_cells
    )

    # 6. NOT USED

    # 7. De-duplicate columns in aligned_table_cells
    #    according to highest column score in: matches + intersection_pdf_matches
    #    (this is easier now, because duplicated cells will have same bboxes)
    #    The third returned value is not needed by the following steps.
    dedupl_table_cells, dedupl_matches, _ = self._deduplicate_cells(
        tab_columns, fixed_table_cells_sorted, matches, intersection_pdf_matches
    )

    self._log().debug("...")

    # 8. Do final assignment of table bbox to pdf cell based on saved scores,
    #    preferring IOU over PDF Intersection, and higher Intersection over lower
    #    ! IOU matches currently disabled,
    #    and final assignment is done only on IOC matches
    final_matches = self._do_final_asignment(
        dedupl_table_cells, matches, dedupl_matches
    )

    # 8.a. Re-align bboxes / re-run matching
    dedupl_table_cells_sorted = sorted(
        dedupl_table_cells, key=lambda k: k["cell_id"]
    )

    # NOTE(review): tables with more than this many PDF cells skip the
    # re-alignment step; presumably a runtime guard - confirm
    max_pdf_cells_for_alignment = 300
    if len(pdf_cells) > max_pdf_cells_for_alignment:
        aligned_table_cells2 = dedupl_table_cells_sorted
    else:
        aligned_table_cells2 = self._align_table_cells_to_pdf(
            dedupl_table_cells_sorted, pdf_cells, final_matches
        )

    # 9. Distance-match orphans
    final_matches_wo, table_cells_wo, max_cell_id = self._pick_orphan_cells(
        tab_rows,
        tab_columns,
        max_cell_id,
        aligned_table_cells2,
        pdf_cells,
        final_matches,
    )

    self._log().debug("*** final_matches_wo")
    self._log().debug(final_matches_wo)
    self._log().debug("*** table_cells_wo")
    self._log().debug(table_cells_wo)

    # Report the outcome per PDF cell. Matches are keyed by str(pdf cell id),
    # and those ids are not guaranteed to be 0..N-1 (empty-text cells were
    # dropped above), so walk the real ids instead of a plain counter.
    for pdf_cell in pdf_cells:
        pdf_cell_id = pdf_cell["id"]
        match_key = str(pdf_cell_id)
        if match_key not in final_matches_wo:
            self._log().debug(
                "!!! pdf cell doesn't have match: {}".format(pdf_cell_id)
            )
            continue

        pdf_cell_match = final_matches_wo[match_key]
        if len(pdf_cell_match) > 1:
            l1 = "!!! Multiple - {}x pdf cell match with id: {}"
            self._log().info(l1.format(len(pdf_cell_match), pdf_cell_id))
        if pdf_cell_match:
            tcellid = pdf_cell_match[0]["table_cell_id"]
            for tcell in table_cells_wo:
                if tcell["cell_id"] == tcellid:
                    mrow = tcell["row_id"]
                    mcol = tcell["column_id"]
                    l2 = "pdf cell: {} -> row: {} | col:{}"
                    self._log().debug(l2.format(pdf_cell_id, mrow, mcol))

    # Example of an object:
    # matching_details = {
    #     "iou_threshold": self._iou_thres,
    #     "table_bbox": table_bbox,
    #     "prediction": prediction,
    #     "pdf_cells": pdf_cells,
    #     "page_height": docling_table["page_height"],
    #     "page_width": docling_table["page_width"],
    #     "table_cells": table_cells,
    #     "matches": matches
    # }

    matching_details["table_cells"] = table_cells_wo
    matching_details["matches"] = final_matches_wo
    matching_details["pdf_cells"] = pdf_cells

    self._log().debug("Done prediction matching and post-processing!")
    return matching_details