docling_ibm_models-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_ibm_models/layoutmodel/layout_predictor.py +171 -0
- docling_ibm_models/tableformer/__init__.py +0 -0
- docling_ibm_models/tableformer/common.py +200 -0
- docling_ibm_models/tableformer/data_management/__init__.py +0 -0
- docling_ibm_models/tableformer/data_management/data_transformer.py +504 -0
- docling_ibm_models/tableformer/data_management/functional.py +574 -0
- docling_ibm_models/tableformer/data_management/matching_post_processor.py +1325 -0
- docling_ibm_models/tableformer/data_management/tf_cell_matcher.py +596 -0
- docling_ibm_models/tableformer/data_management/tf_dataset.py +1233 -0
- docling_ibm_models/tableformer/data_management/tf_predictor.py +1020 -0
- docling_ibm_models/tableformer/data_management/transforms.py +396 -0
- docling_ibm_models/tableformer/models/__init__.py +0 -0
- docling_ibm_models/tableformer/models/common/__init__.py +0 -0
- docling_ibm_models/tableformer/models/common/base_model.py +279 -0
- docling_ibm_models/tableformer/models/table04_rs/__init__.py +0 -0
- docling_ibm_models/tableformer/models/table04_rs/bbox_decoder_rs.py +163 -0
- docling_ibm_models/tableformer/models/table04_rs/encoder04_rs.py +72 -0
- docling_ibm_models/tableformer/models/table04_rs/tablemodel04_rs.py +324 -0
- docling_ibm_models/tableformer/models/table04_rs/transformer_rs.py +203 -0
- docling_ibm_models/tableformer/otsl.py +541 -0
- docling_ibm_models/tableformer/settings.py +90 -0
- docling_ibm_models/tableformer/test_dataset_cache.py +37 -0
- docling_ibm_models/tableformer/test_prepare_image.py +99 -0
- docling_ibm_models/tableformer/utils/__init__.py +0 -0
- docling_ibm_models/tableformer/utils/app_profiler.py +243 -0
- docling_ibm_models/tableformer/utils/torch_utils.py +216 -0
- docling_ibm_models/tableformer/utils/utils.py +376 -0
- docling_ibm_models/tableformer/utils/variance.py +175 -0
- docling_ibm_models-0.1.0.dist-info/LICENSE +21 -0
- docling_ibm_models-0.1.0.dist-info/METADATA +172 -0
- docling_ibm_models-0.1.0.dist-info/RECORD +32 -0
- docling_ibm_models-0.1.0.dist-info/WHEEL +4 -0
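
For orientation before the diff below: the constructor and the process() entry point visible in matching_post_processor.py suggest the calling pattern sketched here. This is a minimal sketch inferred from the code, not documented API; the contents of config (whatever CellMatcher expects) and the field values are placeholders.

# Minimal sketch, inferred from the diff below (not documented API).
# The `config` keys are an assumption: whatever CellMatcher / the TableFormer
# predictor expect; the cell values are placeholders.
from docling_ibm_models.tableformer.data_management.matching_post_processor import (
    MatchingPostProcessor,
)

config = {}  # hypothetical predictor config, passed through to CellMatcher
post_processor = MatchingPostProcessor(config)

matching_details = {
    "table_cells": [  # predicted cells, as used throughout the module
        {"cell_id": 0, "row_id": 0, "column_id": 0, "bbox": [0, 0, 10, 10],
         "label": "body", "cell_class": 2},
    ],
    "pdf_cells": [  # programmatic PDF cells: id, bbox, text
        {"id": 0, "bbox": [1, 1, 9, 9], "text": "value"},
    ],
    "matches": {},  # initial IOU matches; an empty dict triggers intersection matching
}
matching_details = post_processor.process(matching_details)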
docling_ibm_models/tableformer/data_management/matching_post_processor.py
@@ -0,0 +1,1325 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#
import json
import logging
import statistics

import docling_ibm_models.tableformer.settings as s
from docling_ibm_models.tableformer.data_management.tf_cell_matcher import CellMatcher

LOG_LEVEL = logging.INFO
# LOG_LEVEL = logging.DEBUG


class MatchingPostProcessor:
    r"""
    The MatchingPostProcessor aims to improve the matchings between the predicted table cells and
    the pdf cells
    """

    def __init__(self, config):
        self._config = config
        self._cell_matcher = CellMatcher(config)

    def _log(self):
        # Setup a custom logger
        return s.get_custom_logger(self.__class__.__name__, LOG_LEVEL)

    def _get_table_dimension(self, table_cells):
        r"""
        Get dimensions (columns, rows) of a table from table_cells

        Parameters
        ----------
        table_cells : list of dict
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"

        Returns
        -------
        columns : integer,
        rows : integer,
        max_cell_id : integer,
            highest cell_id in table_cells
        """
        columns = 1
        rows = 1
        max_cell_id = 0

        for cell in table_cells:
            if cell["column_id"] > columns:
                columns = cell["column_id"]
            if cell["row_id"] > rows:
                rows = cell["row_id"]
            if cell["cell_id"] > max_cell_id:
                max_cell_id = cell["cell_id"]

        return columns + 1, rows + 1, max_cell_id

    def _get_good_bad_cells_in_column(self, table_cells, column, matches):
        r"""
        1. step
        Get good/bad IOU predicted cells for each structural column (of minimal grid)

        Parameters
        ----------
        table_cells : list of dict
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        column : integer
            Index of a column
        matches : dictionary of lists of table_cells
            A dictionary which is indexed by the pdf_cell_id as key and the value is a list
            of the table_cells that fall inside that pdf cell

        Returns
        -------
        good_table_cells : list of dict
            cells in a column that have match
        bad_table_cells : list of dict
            cells in a column that don't have match
        """
        good_table_cells = []
        bad_table_cells = []

        for cell in table_cells:
            if cell["column_id"] == column:
                table_cell_id = cell["cell_id"]

                bad_match = True
                allow_class = True

                for pdf_cell_id in matches:
                    # CHECK IF CELL CLASS TO BE VERIFIED HERE
                    if "cell_class" in cell:
                        if cell["cell_class"] <= 1:
                            allow_class = False
                    else:
                        print("***")
                        print("no cell_class in...")
                        print(cell)
                        print("***")
                    if allow_class:
                        match_list = matches[pdf_cell_id]
                        for match in match_list:
                            if match["table_cell_id"] == table_cell_id:
                                good_table_cells.append(cell)
                                bad_match = False
                if bad_match:
                    bad_table_cells.append(cell)

        return good_table_cells, bad_table_cells

    def _delete_column_from_table(self, table_cells, column):
        r"""
        1.a. step
        If all IOU in a column are bad - eliminate column (from bboxes and structure)

        Parameters
        ----------
        table_cells : list of dict
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        column : integer
            Index of a column

        Returns
        -------
        new_table_cells : list of dict
        """
        new_table_cells = []

        for cell in table_cells:
            if cell["column_id"] < column:
                new_table_cells.append(cell)
            if cell["column_id"] > column:
                new_cell = {
                    "bbox": cell["bbox"],
                    "cell_id": cell["cell_id"],
                    "column_id": cell["column_id"] - 1,
                    "label": cell["label"],
                    "row_id": cell["row_id"],
                    "cell_class": cell["cell_class"],
                }
                new_table_cells.append(new_cell)

        return new_table_cells

    def _find_alignment_in_column(self, cells):
        r"""
        2. step
        Find alignment of good IOU cells per column

        Parameters
        ----------
        cells : list of dict
            Cells in a column
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"

        Returns
        -------
        alignment : string
            column general alignment can be: "left", "right", "center"
        """
        possible_alignments = ["left", "middle", "right"]
        alignment = "left"  # left / right / center

        lefts = []
        rights = []
        middles = []

        for cell in cells:
            x_left = cell["bbox"][0]
            x_right = cell["bbox"][2]
            x_middle = (x_left + x_right) / 2
            lefts.append(x_left)
            rights.append(x_right)
            middles.append(x_middle)

        if len(lefts) > 0:
            delta_left = max(lefts) - min(lefts)
            delta_middle = max(middles) - min(middles)
            delta_right = max(rights) - min(rights)

            deltas = [delta_left, delta_middle, delta_right]
            align_index = deltas.index(min(deltas))
            alignment = possible_alignments[align_index]

        return alignment

    def _get_median_pos_size(self, cells, alignment):
        r"""
        3. step
        Get median* (according to alignment) "bbox left/middle/right X" coord
        for good IOU cells, get median* cell size in a column.

        Parameters
        ----------
        cells : list of dict
            Cells in a column
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        alignment : string
            column general alignment can be: "left", "right", "center"

        Returns
        -------
        median_x : number
            Median X position of a cell (according to alignment)
        median_y : number
            Median Y position of a cell (according to alignment)
        median_width : number
            Median width of a cell
        median_height : number
            Median height of a cell
        """
        median_x = 0
        median_y = 0
        median_width = 1
        median_height = 1

        coords_x = []
        coords_y = []
        widths = []
        heights = []

        for cell in cells:
            if "rowspan_val" not in cell:
                if "colspan_val" not in cell:
                    if cell["cell_class"] > 1:
                        # Use left alignment
                        x_coord = cell["bbox"][0]
                        if alignment == "middle":
                            # Use middle alignment
                            x_coord = (cell["bbox"][2] + cell["bbox"][0]) / 2
                        if alignment == "right":
                            # Use right alignment
                            x_coord = cell["bbox"][2]

                        coords_x.append(x_coord)
                        y_coord = cell["bbox"][1]
                        coords_y.append(y_coord)

                        width = cell["bbox"][2] - cell["bbox"][0]
                        widths.append(width)
                        height = cell["bbox"][3] - cell["bbox"][1]
                        heights.append(height)
                    else:
                        self._log().debug("Empty cells not considered in medians")
                        self._log().debug(cell)
                else:
                    self._log().debug("Colspans not considered in medians")
                    self._log().debug(cell)
            else:
                self._log().debug("Rowspans not considered in medians")
                self._log().debug(cell)

        if len(coords_x) > 0:
            median_x = statistics.median(coords_x)
        if len(coords_y) > 0:
            median_y = statistics.median(coords_y)
        if len(widths) > 0:
            median_width = statistics.median(widths)
        if len(heights) > 0:
            median_height = statistics.median(heights)
        return median_x, median_y, median_width, median_height

    def _move_cells_to_left_pos(
        self, cells, median_x, rescale, median_width, median_height, alignment
    ):
        r"""
        4. step
        Move bad cells to the median* (left/middle/right) good in a column
        (Additionally), re-size cell to median* size of cells in a column

        Parameters
        ----------
        cells : list of dict
            Cells in a column
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        median_x : number
            Median X position of a cell (according to alignment)
        rescale : boolean
            should cells be re-sized to median or not
        median_width : number
            Median width of a cell
        median_height : number
            Median height of a cell
        alignment : string
            column general alignment can be: "left", "right", "center"

        Returns
        -------

        new_table_cells : list of dict
            Cells in a column
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        """
        new_table_cells = []

        for cell in cells:
            new_cell = {
                "bbox": [],
                "cell_id": 0,
                "column_id": 0,
                "label": "",
                "row_id": 0,
                "cell_class": 0,
            }
            x1 = cell["bbox"][0]
            y1 = cell["bbox"][1]
            x2 = cell["bbox"][2]
            y2 = cell["bbox"][3]
            original_width = x2 - x1
            # original_height = y2 - y1

            # Move to left by default
            new_x1 = median_x
            new_y1 = y1
            new_x2 = median_x + original_width
            new_y2 = y2

            if rescale:
                new_x2 = median_x + median_width
                # Next line does vertical resizing of BBOX:
                new_y2 = y1 + median_height

            # Move to middle
            if alignment == "middle":
                # TODO
                new_x1 = median_x - (original_width / 2)
                new_x2 = new_x1 + original_width
                if rescale:
                    new_x1 = median_x - (median_width / 2)
                    new_x2 = median_x + (median_width / 2)

            # Move to right
            if alignment == "right":
                new_x1 = median_x - original_width
                new_x2 = median_x
                if rescale:
                    new_x1 = median_x - median_width

            new_cell["bbox"] = [new_x1, new_y1, new_x2, new_y2]
            new_cell["cell_id"] = cell["cell_id"]
            new_cell["column_id"] = cell["column_id"]
            new_cell["label"] = cell["label"]
            new_cell["row_id"] = cell["row_id"]
            new_cell["cell_class"] = cell["cell_class"]
            # Add spans if present
            if "rowspan_val" in cell:
                new_cell["rowspan_val"] = cell["rowspan_val"]
            if "colspan_val" in cell:
                new_cell["colspan_val"] = cell["colspan_val"]
            new_table_cells.append(new_cell)
        return new_table_cells

    def _run_intersection_match(self, cell_matcher, table_cells, pdf_cells):
        r"""
        5. step
        Generate new matches, run Intersection over cell(pdf) on a table cells

        Parameters
        ----------
        cell_matcher : CellMatcher
            src.data_management.cell_matcher
        table_cells : list of dict
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        pdf_cells : list of dict
            List of PDF cells as defined by Docling

        Returns
        -------
        clean_matches : dictionary of lists of table_cells
            A dictionary which is indexed by the pdf_cell_id as key and the value is a list
            of the table_cells that fall inside that pdf cell
        """
        new_matches = {}
        clean_matches = {}
        new_matches, matches_counter = cell_matcher._intersection_over_pdf_match(
            table_cells, pdf_cells
        )
        clean_matches = new_matches
        # Convert to JSON and back to have string keys in the dictionary
        clean_matches_string = json.dumps(clean_matches)
        clean_matches = json.loads(clean_matches_string)
        return clean_matches

    def _align_table_cells_to_pdf(self, table_cells, pdf_cells, matches):
        r"""
        USED in 8.a step
        NOT USED in 6. step

        Align table cell bboxes with good matches
        to encapsulate matching pdf cells

        Parameters
        ----------
        table_cells : list of dict
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        pdf_cells : list of dict
            List of PDF cells as defined by Docling
        matches : dictionary of lists of table_cells
            A dictionary which is indexed by the pdf_cell_id as key and the value is a list
            of the table_cells that fall inside that pdf cell

        Returns
        -------
        clean_table_cells : list of dict
            Aligned and cleaned table cells
        """
        # 6
        # align table cells with matching pdf cells
        new_table_cells = []

        for pdf_cell_id in matches:
            match_list = matches[pdf_cell_id]
            one_table_cells = []
            for i in range(len(match_list)):
                otc = int(match_list[i]["table_cell_id"])
                if otc not in one_table_cells:
                    one_table_cells.append(otc)

            # Get bbox of pdf_cell:
            pdf_cell_bbox = []
            for pdf_cell in pdf_cells:
                if pdf_cell["id"] == int(pdf_cell_id):
                    pdf_cell_bbox = pdf_cell["bbox"]

            # Get bbox of pdf_cell:
            for table_cell in table_cells:
                if table_cell["cell_id"] in one_table_cells:
                    # Align bbox vertically to cover PDF cell
                    new_bbox = [
                        pdf_cell_bbox[0],
                        pdf_cell_bbox[1],
                        pdf_cell_bbox[2],
                        pdf_cell_bbox[3],
                    ]
                    # We are sure cell is not empty,
                    # because we assign PDF cell to it
                    new_table_cell_class = "2"

                    if "cell_class" in table_cell:
                        new_table_cell_class = table_cell["cell_class"]

                    new_table_cell = {
                        "bbox": new_bbox,
                        "cell_id": table_cell["cell_id"],
                        "column_id": table_cell["column_id"],
                        "label": table_cell["label"],
                        "row_id": table_cell["row_id"],
                        "cell_class": new_table_cell_class,
                    }

                    if "colspan_val" in table_cell:
                        new_table_cell["colspan_val"] = table_cell["colspan_val"]
                    if "rowspan_val" in table_cell:
                        new_table_cell["rowspan_val"] = table_cell["rowspan_val"]
                    new_table_cells.append(new_table_cell)

        # Rebuild table_cells list deduplicating repeating cells,
        # encapsulating all duplicate cells dimensions

        for new_table_cell in new_table_cells:
            cell_id_to_find = new_table_cell["cell_id"]

            x1s = []
            y1s = []
            x2s = []
            y2s = []

            found = 0

            for found_cell in new_table_cells:
                if found_cell["cell_id"] == cell_id_to_find:
                    found += 1
                    x1s.append(found_cell["bbox"][0])
                    y1s.append(found_cell["bbox"][1])
                    x2s.append(found_cell["bbox"][2])
                    y2s.append(found_cell["bbox"][3])

            min_x1 = min(x1s)
            min_y1 = min(y1s)
            max_x2 = max(x2s)
            max_y2 = max(y2s)

            if found > 1:
                new_table_cell["bbox"] = [min_x1, min_y1, max_x2, max_y2]

        clean_table_cells = [
            i
            for n, i in enumerate(new_table_cells)
            if i not in new_table_cells[n + 1 :]
        ]
        return clean_table_cells

    def _deduplicate_cells(self, tab_columns, table_cells, iou_matches, ioc_matches):
        r"""
        7. step

        De-duplicate columns in table_cells according to highest column score
        in: matches + intersection_pdf_matches

        Parameters
        ----------
        tab_columns : integer
            Number of table columns
        table_cells : list of dict
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        iou_matches : dictionary of lists of table_cells
            Cell matches done using Intersection Over Union (IOU) method
        ioc_matches : dictionary of lists of table_cells
            Cell matches done using Intersection Over (PDF) Cell method

        Returns
        -------
        new_table_cells : list of dict
            New table cells with removed column duplicates
        new_matches : dictionary of lists of table_cells
            Matches that are in sync with new_table_cells
        new_tab_columns : integer
            New number of table columns
        """
        pdf_cells_in_columns = []
        total_score_in_columns = []

        for col in range(tab_columns):
            column_table_cells = []
            column_pdf_cells_iou = []
            column_pdf_cells_ioc = []
            column_pdf_cells = []
            column_iou_score = 0
            column_ioc_score = 0

            for cell in table_cells:
                if cell["column_id"] == col:
                    table_cell_id = cell["cell_id"]
                    column_table_cells.append(table_cell_id)

            # SUM IOU + IOC Scores for column, collect all pdf_cell_id
            for iou_key in iou_matches:
                iou_match_list = iou_matches[iou_key]
                for uk in range(len(iou_match_list)):
                    t_cell_id = iou_match_list[uk]["table_cell_id"]
                    if t_cell_id in column_table_cells:
                        if "iou" in iou_match_list[uk]:
                            # In case initial match was IOU
                            column_iou_score += iou_match_list[uk]["iou"]
                        elif "iopdf" in iou_match_list[uk]:
                            # Otherwise it's intersection over PDF match
                            column_iou_score += iou_match_list[uk]["iopdf"]
                        column_pdf_cells_iou.append(iou_key)

            for ioc_key in ioc_matches:
                ioc_match_list = ioc_matches[ioc_key]
                for k in range(len(ioc_match_list)):
                    t_cell_id = ioc_match_list[k]["table_cell_id"]
                    if t_cell_id in column_table_cells:
                        column_ioc_score += ioc_match_list[k]["iopdf"]
                        column_pdf_cells_ioc.append(ioc_key)

            column_pdf_cells = column_pdf_cells_iou
            column_pdf_cells += list(
                set(column_pdf_cells_ioc) - set(column_pdf_cells_iou)
            )
            column_total_score = column_iou_score + column_ioc_score

            pdf_cells_in_columns.append(column_pdf_cells)
            total_score_in_columns.append(column_total_score)
            self._log().debug(
                "Column: {}, Score:{}, PDF cells: {}".format(
                    col, column_total_score, column_pdf_cells
                )
            )

        # Eliminate duplicates in the pdf_cells_in_columns and ensure int content
        # pdf_cells_in_columns:
        # - initially: list of lists of str with duplicates in the inner lists
        # - afterwards: list of lists of int (unique)
        pdf_cells_in_columns = [
            list(set([x for x in map(lambda x: int(x), le)]))
            for le in pdf_cells_in_columns
        ]
        cols_to_eliminate = []
        # Pairwise comparison of all columns, finding intersection, and it's length
        for cl in range(tab_columns - 1):
            col_a = pdf_cells_in_columns[cl]
            col_b = pdf_cells_in_columns[cl + 1]
            score_a = total_score_in_columns[cl]
            score_b = total_score_in_columns[cl + 1]
            intsct = list(set(col_a).intersection(col_b))
            int_prc = 0
            if len(col_a) > 0:
                int_prc = len(intsct) / len(col_a)
            logstring = "Col A: {}, Col B: {}, Int: {}, %: {}, Score A: {}, Score B: {}"
            self._log().debug(
                logstring.format(cl, cl + 1, len(intsct), int_prc, score_a, score_b)
            )

            # Consider structural column elimination
            # if 60% of two columns pointing to the same pdf cells
            if int_prc > 0.6:
                if score_a >= score_b:
                    # Elliminate B
                    cols_to_eliminate.append(cl + 1)
                if score_b > score_a:
                    # Elliminate A
                    cols_to_eliminate.append(cl)

        self._log().debug("Columns to eliminate: {}".format(cols_to_eliminate))
        new_table_cells = []
        new_matches = {}

        removed_table_cell_ids = []
        new_tab_columns = tab_columns - len(cols_to_eliminate)

        # Clean table_cells structure
        for tab_cell in table_cells:
            add_cell = True
            for col_del in cols_to_eliminate:
                if tab_cell["column_id"] == col_del:
                    removed_table_cell_ids.append(tab_cell["cell_id"])
                    add_cell = False
            if add_cell:
                new_table_cells.append(tab_cell)
        # Clean ioc_matches structure
        for pdf_cell_id, pdf_cell_matches in ioc_matches.items():
            new_cell_match = []
            for pdf_match in pdf_cell_matches:
                if pdf_match["table_cell_id"] not in removed_table_cell_ids:
                    new_cell_match.append(pdf_match)

            if len(new_cell_match) > 0:
                new_matches[pdf_cell_id] = new_cell_match

        return new_table_cells, new_matches, new_tab_columns

    def _do_final_asignment(self, table_cells, iou_matches, ioc_matches):
        r"""
        8. step

        Do final assignment of table bbox to pdf cell based on saved scores,
        either preferring IOU over PDF Intersection, and higher Intersection over lower,
        or just use PDF Intersection
        Rule: 1 Table cell can contain many PDF cells,
        but each PDF cell has to be asigned to one Table cell
        Rule: Do not discard table bboxes at this point, asign all of them

        Iterate over matches, if PDF cell has more than 1 table cell match:
        Go over all other matches and delete tab_cell match of lower score
        (prefer iou match over ioc match)

        Parameters
        ----------
        table_cells : list of dict
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        iou_matches : dictionary of lists of table_cells
            Cell matches done using Intersection Over Union (IOU) method
        ioc_matches : dictionary of lists of table_cells
            Cell matches done using Intersection Over (PDF) Cell method

        Returns
        -------
        new_matches : dictionary of lists of table_cells
            New matches with final table cell asignments
        """
        new_matches = {}

        for pdf_cell_id, pdf_cell_matches in ioc_matches.items():
            max_ioc_match = max(pdf_cell_matches, key=lambda x: x["iopdf"])
            new_matches[pdf_cell_id] = [max_ioc_match]

        return new_matches

    def _merge_two_bboxes(self, bbox1, bbox2):
        r"""
        Merge two bboxes into one bboxes that encompasses the two

        Parameters
        ----------
        bbox1 : list of numbers
            bbox to be merged described as two corners [x1, y1, x2, y2]
        bbox1 : list of numbers
            bbox to be merged described as two corners [x1, y1, x2, y2]

        Returns
        -------
        bbox_result : list of numbers
            bbox that encompasses two input bboxes
        """
        bbox_result = [-1, -1, -1, -1]
        bbox_result[0] = min([bbox1[0], bbox2[0]])
        bbox_result[1] = min([bbox1[1], bbox2[1]])
        bbox_result[2] = max([bbox1[2], bbox2[2]])
        bbox_result[3] = max([bbox1[3], bbox2[3]])
        return bbox_result

    def _pick_orphan_cells(
        self, tab_rows, tab_cols, max_cell_id, table_cells, pdf_cells, matches
    ):
        # 9.
        # new_matches, new_table_cells, max_cell_id
        r"""
        9. step

        Pick up remaining orphan cells (pdf cells that don't have any matches or intersections)
        9.a. Determine row banding per row (min/max Y per row)
             match with orphan cells (intersection with band)
        9.b. Determine column banding per column (min/max X per column)
             match with orphan cells (intersection with band)
        9.c. Decide to which column/row each orphan PDF cell belongs
             append match to existing structural cell, or create cell

        Parameters
        ----------
        tab_rows : number
            Total number of rows
        tab_cols : number
            Total number of columns
        max_cell_id : number
            Highest table cell id
        table_cells : list of dict
            Each value is a dictionary with keys: "cell_id", "row_id", "column_id", "bbox", "label"
        pdf_cells : list of dict
            List of PDF cells as defined by Docling
        matches : dictionary of lists of table_cells
            A dictionary which is indexed by the pdf_cell_id as key and the value is a list
            of the table_cells that fall inside that pdf cell

        Returns
        -------
        new_matches : dictionary of lists of table_cells
            updated matches
        new_table_cells : list of dict
            updated table cells
        max_cell_id : number
            New highest table cell id, accounting freshly added table cells (if any)
        """

        new_matches = matches
        new_table_cells = table_cells

        # Identify orphan rows (START)
        orphan_rows = []
        orphan_rows_depth = []
        orphan_rows_bbox = []

        # List with pdf_ids which are used in some (any) row
        used_row_pdf_ids = []
        used_row_rowid = []

        for row in range(tab_rows):
            bbox_y1s = []  # y2 > y1
            bbox_y2s = []
            row_y1 = -1
            row_y2 = -1
            row_table_cells = []
            row_table_cell_ids = []
            orphan_cells_in_row = []
            orphan_cells_in_row_depth = []
            orphan_cells_in_row_bbox = []

            for cell in table_cells:
                if cell["row_id"] == row:
                    # Do not consider spanned cells
                    if "rowspan_val" not in cell:
                        # Do not consider empty cells
                        if cell["cell_class"] > 1:
                            table_cell_id = cell["cell_id"]
                            row_table_cells.append(cell)
                            row_table_cell_ids.append(table_cell_id)
                            bbox_y1s.append(cell["bbox"][1])
                            bbox_y2s.append(cell["bbox"][3])

            # Y coordinates that define band of rows
            if len(bbox_y1s) > 0:
                row_y1 = min(bbox_y1s)
            if len(bbox_y2s) > 0:
                row_y2 = max(bbox_y2s)

            # Find "orphan" cells that intersect the band
            for pdf_cell in pdf_cells:
                pdf_str_id = str(pdf_cell["id"])
                if pdf_str_id not in matches:
                    within_band = False
                    depth = -1

                    centroid_band = (row_y2 + row_y1) / 2
                    centroid_cell = (pdf_cell["bbox"][3] + pdf_cell["bbox"][1]) / 2

                    # pdf_cell - Orphan, and don't have any match
                    if pdf_cell["bbox"][1] >= row_y1 and pdf_cell["bbox"][1] <= row_y2:
                        depth = abs(centroid_band - centroid_cell)
                        within_band = True

                    if pdf_cell["bbox"][3] >= row_y1 and pdf_cell["bbox"][3] <= row_y2:
                        depth = abs(centroid_band - centroid_cell)
                        within_band = True

                    if pdf_cell["bbox"][1] <= row_y1 and pdf_cell["bbox"][3] >= row_y2:
                        depth = abs(centroid_band - centroid_cell)
                        within_band = True

                    if within_band:
                        if pdf_str_id not in used_row_pdf_ids:
                            used_row_pdf_ids.append(pdf_str_id)
                            used_row_rowid.append(row)
                            orphan_cells_in_row.append(pdf_str_id)
                            orphan_cells_in_row_depth.append(round(depth))
                            orphan_cells_in_row_bbox.append(pdf_cell["bbox"])
                        else:
                            self._log().debug("Found duplicate: {}".format(pdf_str_id))
                            # Get index of a row where pdf cell was already detected
                            used_ind = used_row_pdf_ids.index(pdf_str_id)
                            duplicate_id_found_in_row = used_row_rowid[used_ind]
                            valid_duplicate = False
                            if len(orphan_rows) > duplicate_id_found_in_row:
                                index_in_row_list = orphan_rows[
                                    duplicate_id_found_in_row
                                ].index(pdf_str_id)
                                valid_duplicate = True

                            if valid_duplicate:
                                i1 = duplicate_id_found_in_row
                                i2 = index_in_row_list
                                score1 = orphan_rows_depth[i1][i2]
                                score2 = round(depth)
                                # If new cell better than the old one
                                if score2 < score1:
                                    # Delete old record about the pdf cell...
                                    orphan_rows[i1].pop(index_in_row_list)
                                    orphan_rows_depth[i1].pop(index_in_row_list)
                                    orphan_rows_bbox[i1].pop(index_in_row_list)

                                    used_row_pdf_ids.pop(used_ind)
                                    used_row_rowid.pop(used_ind)
                                    # Then proceed adding new cell
                                    used_row_pdf_ids.append(pdf_str_id)
                                    used_row_rowid.append(row)
                                    orphan_cells_in_row.append(pdf_str_id)
                                    orphan_cells_in_row_depth.append(round(depth))
                                    orphan_cells_in_row_bbox.append(pdf_cell["bbox"])
                                    msg = "Resolved duplicate: {} in favor of new one"
                                    self._log().debug(msg.format(pdf_str_id))
                                else:
                                    msg = "Resolved duplicate: {} in favor of old one"
                                    self._log().debug(msg.format(pdf_str_id))

            log_msg = "Row: {}, Band: {}/{}, Orphan PDF cells: {}"
            self._log().debug(log_msg.format(row, row_y1, row_y2, orphan_cells_in_row))
            orphan_rows.append(orphan_cells_in_row)
            orphan_rows_depth.append(orphan_cells_in_row_depth)
            orphan_rows_bbox.append(orphan_cells_in_row_bbox)

        # Identify orphan rows (END)
        self._log().debug("...")
        # Identify orphan columns
        orphan_columns = []
        orphan_columns_depth = []
        orphan_columns_bbox = []
        used_col_pdf_ids = []
        used_col_columnid = []

        for col in range(tab_cols):
            bbox_x1s = []  # y2 > y1
            bbox_x2s = []
            col_x1 = -1
            col_x2 = -1
            col_table_cells = []
            col_table_cell_ids = []
            orphan_cells_in_col = []
            orphan_cells_in_col_depth = []
            orphan_cells_in_col_bbox = []

            for cell in table_cells:
                if cell["column_id"] == col:
                    # Do not consider spanned cells
                    if "colspan_val" not in cell:
                        # Do not consider empty cells
                        if cell["cell_class"] > 1:
                            table_cell_id = cell["cell_id"]
                            col_table_cells.append(cell)
                            col_table_cell_ids.append(table_cell_id)
                            bbox_x1s.append(cell["bbox"][0])
                            bbox_x2s.append(cell["bbox"][2])
                    else:
                        wrn_txt = "Orphan matching skipped cell in column {} because of colspan"
                        self._log().debug(wrn_txt.format(col))
                        # self._log().info(cell)

            # X coordinates that define band of columns
            if len(bbox_x1s) > 0:
                col_x1 = min(bbox_x1s)
            if len(bbox_x2s) > 0:
                col_x2 = max(bbox_x2s)

            # Find "orphan" cells that intersect the band
            for pdf_cell in pdf_cells:
                pdf_str_id = str(pdf_cell["id"])
                if pdf_str_id not in matches:
                    within_band = False
                    depth = -1

                    centroid_band = (col_x2 + col_x1) / 2
                    centroid_cell = (pdf_cell["bbox"][2] + pdf_cell["bbox"][0]) / 2

                    if pdf_cell["bbox"][0] >= col_x1 and pdf_cell["bbox"][0] <= col_x2:
                        depth = abs(centroid_band - centroid_cell)
                        within_band = True

                    if pdf_cell["bbox"][2] >= col_x1 and pdf_cell["bbox"][2] <= col_x2:
                        depth = abs(centroid_band - centroid_cell)
                        within_band = True

                    if pdf_cell["bbox"][0] < col_x1 and pdf_cell["bbox"][2] > col_x2:
                        depth = abs(centroid_band - centroid_cell)
                        within_band = True

                    if within_band:
                        if pdf_str_id not in used_col_pdf_ids:
                            used_col_pdf_ids.append(pdf_str_id)
                            used_col_columnid.append(col)
                            orphan_cells_in_col.append(pdf_str_id)
                            orphan_cells_in_col_depth.append(round(depth))
                            orphan_cells_in_col_bbox.append(pdf_cell["bbox"])
                        else:
                            self._log().debug("Found duplicate: {}".format(pdf_str_id))
                            # Get index of a column where pdf cell was already detected
                            used_ind = used_col_pdf_ids.index(pdf_str_id)
                            duplicate_id_found_in_column = used_col_columnid[used_ind]

                            valid_col_duplicate = False
                            if len(orphan_columns) > duplicate_id_found_in_column:
                                index_in_col_list = orphan_columns[
                                    duplicate_id_found_in_column
                                ].index(pdf_str_id)
                                valid_col_duplicate = True

                            if valid_col_duplicate:
                                i1 = duplicate_id_found_in_column
                                i2 = index_in_col_list
                                score1 = orphan_columns_depth[i1][i2]
                                score2 = round(depth)
                                # If new cell better than the old one
                                if score2 < score1:
                                    # Delete old record about the pdf cell...
                                    orphan_columns[i1].pop(index_in_col_list)
                                    orphan_columns_depth[i1].pop(index_in_col_list)
                                    orphan_columns_bbox[i1].pop(index_in_col_list)

                                    used_col_pdf_ids.pop(used_ind)
                                    used_col_columnid.pop(used_ind)
                                    # Then proceed adding new cell
                                    used_col_pdf_ids.append(pdf_str_id)
                                    used_col_columnid.append(col)
                                    orphan_cells_in_col.append(pdf_str_id)
                                    orphan_cells_in_col_depth.append(round(depth))
                                    orphan_cells_in_col_bbox.append(pdf_cell["bbox"])
                                    msg = "Resolved duplicate: {} in favor of new one"
                                    self._log().debug(msg.format(pdf_str_id))
                                else:
                                    msg = "Resolved duplicate: {} in favor of old one"
                                    self._log().debug(msg.format(pdf_str_id))

            orphan_columns.append(orphan_cells_in_col)
            orphan_columns_depth.append(orphan_cells_in_col_depth)
            orphan_columns_bbox.append(orphan_cells_in_col_bbox)

        # Assign to structural cells and/or create new cells when absent

        for col_ind in range(len(orphan_columns)):
            self._log().debug(
                "Col: {}, Orphan PDF cells: {}".format(col_ind, orphan_columns[col_ind])
            )
            self._log().debug(
                "Col: {}, Orphan Depth: {}".format(
                    col_ind, orphan_columns_depth[col_ind]
                )
            )
        self._log().debug("...")

        # Collect the pdf_ids from the orphan_rows and sort them in order to produce the same
        # results with the c++ implementation
        orphan_rows_pdf_ids = []
        row_id_per_pdf_id = {}  # pdf_cell_id -> row_id
        for row_id, row_pdf_ids in enumerate(orphan_rows):
            if len(row_pdf_ids) == 0:
                continue
            # Extend the orphan_rows_pdf_ids with the pdf_ids as int
            orphan_rows_pdf_ids.extend([int(x) for x in row_pdf_ids])
            # Set the row_id for the pdf_ids
            for pdf_cell_id in row_pdf_ids:
                row_id_per_pdf_id[int(pdf_cell_id)] = row_id

        orphan_rows_pdf_ids.sort()

        # Assign Table cell Row ID / Table cell Column ID to orphans,
        # Check if Table cell doesn't exist in the table_cells, create one,
        # add match to new_matches
        for pdf_cell_id_int in orphan_rows_pdf_ids:
            new_row_id = row_id_per_pdf_id[pdf_cell_id_int]
            new_column_id = 0
            pdf_cell_id = str(pdf_cell_id_int)

            if pdf_cell_id in used_col_pdf_ids:
                new_column_id = used_col_columnid[used_col_pdf_ids.index(pdf_cell_id)]

            self._log().debug(
                "new_column_id {}, pdf_cell_id {}".format(
                    new_column_id, pdf_cell_id
                )
            )
            self._log().debug(orphan_columns[new_column_id])
            depth_index = orphan_columns[new_column_id].index(pdf_cell_id)
            confidence = orphan_columns_depth[new_column_id][depth_index]
            pdf_bbox = orphan_columns_bbox[new_column_id][depth_index]

            # 1. Find table_cell_id by new_row_id / new_column_id
            new_table_cell_id = -1
            tcell = list(
                filter(
                    lambda table_cell: table_cell["row_id"] == new_row_id
                    and table_cell["column_id"] == new_column_id,
                    table_cells,
                )
            )

            if len(tcell) > 0:
                new_table_cell_id = tcell[0]["cell_id"]
                self._log().debug(
                    "reusing table_cell_id: {}".format(new_table_cell_id)
                )

                for i in range(len(new_table_cells)):
                    if new_table_cells[i]["cell_id"] == new_table_cell_id:
                        bbox_tmp = self._merge_two_bboxes(
                            new_table_cells[i]["bbox"], pdf_bbox
                        )
                        new_table_cells[i]["bbox"] = bbox_tmp

            if new_table_cell_id < 0:
                max_cell_id += 1
                new_table_cell_id = max_cell_id

                new_table_cell = {
                    "bbox": pdf_bbox,
                    "cell_id": new_table_cell_id,
                    "column_id": new_column_id,
                    "label": "body",
                    "row_id": new_row_id,
                    "cell_class": 2,
                }
                self._log().debug(
                    "making new table_cell_id: {}".format(new_table_cell_id)
                )
                new_table_cells.append(new_table_cell)

            # And then add new match to the new_matches
            new_matches[str(pdf_cell_id)] = [
                {"post": confidence, "table_cell_id": new_table_cell_id}
            ]
        return new_matches, new_table_cells, max_cell_id

    def _clear_pdf_cells(self, pdf_cells):
        r"""
        Clean PDF cells from cells that have an empty string as text

        Parameters
        ----------
        pdf_cells : list of dict
            List of PDF cells as defined by Docling

        Returns
        -------
        new_pdf_cells : list of dict
            updated, cleaned list of pdf_cells
        """
        new_pdf_cells = []
        for i in range(len(pdf_cells)):
            if pdf_cells[i]["text"] != "":
                new_pdf_cells.append(pdf_cells[i])
        return new_pdf_cells

    def process(self, matching_details):
        r"""
        Do post processing, see details in the comments below

        Parameters
        ----------
        matching_details : dictionary
            contains all the necessary information for Docling processing
            already has predictions and initial (IOU) matches

        Returns
        -------
        matching_details : dictionary
            matching_details that contain post-processed matches
        """

        # ====================================================================================
        # Start post-processing
        # ====================================================================================

        # General description of post-processing algorithm to improve matching

        # Uses: (IOU) matches, pdf_cells, table_cells
        # Generates: new_table_cells, new_matches

        # +0. Get minimal grid table dimension (cols/rows)
        # +1. Get good/bad IOU predicted cells for each structural column (of minimal grid)
        # +1.a. If all IOU in a column are bad - eliminate column (from bboxes and structure)
        # +2. Find alignment of good IOU cells per column
        # +2.a. For this, measure min-max distance for left, min-max distance for centroid,
        #       min-max distance for right side of cell rectangles
        #       smallest distance would determine Left / Center / Right alignment
        # +3. Get median* (according to alignment) "bbox left/middle/right X" coord
        #     for good IOU cells, get median* cell size in a column.
        # +4. Move bad cells to the median* (left/middle/right) good in a column
        # +4.a. (Additionally), re-size cell to median* size of cells in a column
        # +5. Generate new matches, run Intersection over cell(pdf) on a table cells

        # NOT USED +6. Align table cell bboxes with good matches
        #              to encapsulate matching pdf cells

        # +7. De-duplicate columns in table_cells according to highest column score
        #     in: matches + intersection_pdf_matches
        # +8. Do final assignment of table bbox to pdf cell based on saved scores,
        #     either preferring IOU over PDF Intersection, and higher Intersection over lower,
        #     or just use PDF Intersection
        #     Rule: 1 Table cell can contain many PDF cells,
        #     but each PDF cell has to be asigned to one Table cell
        #     Rule: Do not discard table bboxes at this point, asign all of them
        # +8.a. Align table cell bboxes with matched pdf cells bboxes (instead of step 6)

        # +9. Pick up remaining orphan cells
        # +9.a. Determine row banding per row (min/max Y per row)
        #       match with orphan cells (intersection with band)
        # +9.b. Determine column banding per column (min/max X per column)
        #       match with orphan cells (intersection with band)
        # +9.c. Decide to which column/row each orphan PDF cell belongs
        #       append match to existing structural cell,
        #       or create cell
        # ?9.d. For each pdf-cell from the list find closest and MATCH with row-table-cell
        #       (minimal rectangle to-rectangle distance)

        # *Use median instead of average to account for occasional
        #  colspans that would span across columns,
        #  as they are rare they shouldn't indfluence much a median position
        #  of other cells in a minimal-grid column

        self._log().debug("Start prediction post-processing...")
        table_cells = matching_details["table_cells"]
        pdf_cells = self._clear_pdf_cells(matching_details["pdf_cells"])
        matches = matching_details["matches"]

        # ------------------------------------------------------------------------------------------
        # -1. If initial (IOU) matches are empty,
        #     generate new ones based on intersection over cell

        if not matches:
            self._log().debug(
                "-----------------------------------------------------------------"
            )
            self._log().debug(
                "-----------------------------------------------------------------"
            )
            self._log().debug(
                "- NO INITIAL MATCHES TO POST PROCESS, GENERATING NEW ONES... -"
            )
            self._log().debug(
                "-----------------------------------------------------------------"
            )
            self._log().debug(
                "-----------------------------------------------------------------"
            )
            matches = self._run_intersection_match(
                self._cell_matcher, table_cells, pdf_cells
            )

        # ------------------------------------------------------------------------------------------
        # 0. Get minimal grid table dimension (cols/rows)
        tab_columns, tab_rows, max_cell_id = self._get_table_dimension(table_cells)
        self._log().debug(
            "COLS {}/ ROWS {}/ MAX CELL ID {}".format(
                tab_columns, tab_rows, max_cell_id
            )
        )

        good_table_cells = []
        bad_table_cells = []
        new_bad_table_cells = []
        fixed_table_cells = []

        # 1. Get good/bad IOU predicted cells for each structural column (of minimal grid)
        for col in range(tab_columns):
            g1, g2 = self._get_good_bad_cells_in_column(table_cells, col, matches)
            good_table_cells = g1
            bad_table_cells = g2
            self._log().debug(
                "COLUMN {}, Good table cells: {}".format(col, len(good_table_cells))
            )
            self._log().debug(
                "COLUMN {}, Bad table cells: {}".format(col, len(bad_table_cells))
            )

            # 2. Find alignment of good IOU cells per column
            alignment = self._find_alignment_in_column(good_table_cells)
            self._log().debug("COLUMN {}, Alignment: {}".format(col, alignment))
            # alignment = "left"

            # 3. Get median (according to alignment) "bbox left/middle/right X"
            #    coordinate for good IOU cells, get median* cell size in a column.
            gm1, gm2, gm3, gm4 = self._get_median_pos_size(good_table_cells, alignment)
            median_x = gm1
            # median_y = gm2
            median_width = gm3
            median_height = gm4
            self._log().debug("Median good X = {}".format(median_x))

            # 4. Move bad cells to the median* (left/middle/right) good in a column
            # nc = self._move_cells_to_left_pos(bad_table_cells, median_x, True,
            # TODO:
            nc = self._move_cells_to_left_pos(
                bad_table_cells, median_x, False, median_width, median_height, alignment
            )
            new_bad_table_cells = nc
            fixed_table_cells.extend(good_table_cells)
            fixed_table_cells.extend(new_bad_table_cells)

        # ====================================================================================
        # Sort table_cells by cell_id before running IOU, to have correct indexes on the output
        fixed_table_cells_sorted = sorted(fixed_table_cells, key=lambda k: k["cell_id"])

        # 5. Generate new matches, run Intersection over cell(pdf) on a table cells
        ip = self._run_intersection_match(
            self._cell_matcher, fixed_table_cells_sorted, pdf_cells
        )
        intersection_pdf_matches = ip

        # 6. NOT USED

        # 7. De-duplicate columns in aligned_table_cells
        #    according to highest column score in: matches + intersection_pdf_matches
        #    (this is easier now, because duplicated cells will have same bboxes)
        dd1, dd2, dd3 = self._deduplicate_cells(
            tab_columns, fixed_table_cells_sorted, matches, intersection_pdf_matches
        )
        dedupl_table_cells = dd1
        dedupl_matches = dd2

        self._log().debug("...")

        # 8. Do final assignment of table bbox to pdf cell based on saved scores,
        #    preferring IOU over PDF Intersection, and higher Intersection over lower
        #    ! IOU matches currently disabled,
        #    and final assigment is done only on IOC matches
        final_matches = self._do_final_asignment(
            dedupl_table_cells, matches, dedupl_matches
        )

        # 8.a. Re-align bboxes / re-run matching
        dedupl_table_cells_sorted = sorted(
            dedupl_table_cells, key=lambda k: k["cell_id"]
        )

        if len(pdf_cells) > 300:
            aligned_table_cells2 = dedupl_table_cells_sorted
        else:
            aligned_table_cells2 = self._align_table_cells_to_pdf(
                dedupl_table_cells_sorted, pdf_cells, final_matches
            )

        # 9. Distance-match orphans
        po1, po2, po3 = self._pick_orphan_cells(
            tab_rows,
            tab_columns,
            max_cell_id,
            aligned_table_cells2,
            pdf_cells,
            final_matches,
        )
        final_matches_wo = po1
        table_cells_wo = po2
        max_cell_id = po3

        self._log().debug("*** final_matches_wo")
        self._log().debug(final_matches_wo)
        self._log().debug("*** table_cells_wo")
        self._log().debug(table_cells_wo)

        for pdf_cell_id in range(len(final_matches_wo)):
            if str(pdf_cell_id) in final_matches_wo:
                pdf_cell_match = final_matches_wo[str(pdf_cell_id)]
                if len(pdf_cell_match) > 1:
                    l1 = "!!! Multiple - {}x pdf cell match with id: {}"
                    self._log().info(l1.format(len(pdf_cell_match), pdf_cell_id))
                if pdf_cell_match:
                    tcellid = pdf_cell_match[0]["table_cell_id"]
                    for tcell in table_cells_wo:
                        if tcell["cell_id"] == tcellid:
                            mrow = tcell["row_id"]
                            mcol = tcell["column_id"]
                            l2 = "pdf cell: {} -> row: {} | col:{}"
                            self._log().debug(l2.format(pdf_cell_id, mrow, mcol))
            else:
                self._log().debug(
                    "!!! pdf cell doesn't have match: {}".format(pdf_cell_id)
                )

        # Example of an object:
        # matching_details = {
        #     "iou_threshold": self._iou_thres,
        #     "table_bbox": table_bbox,
        #     "prediction": prediction,
        #     "pdf_cells": pdf_cells,
        #     "page_height": docling_table["page_height"],
        #     "page_width": docling_table["page_width"],
        #     "table_cells": table_cells,
        #     "matches": matches
        # }

        matching_details["table_cells"] = table_cells_wo
        matching_details["matches"] = final_matches_wo
        matching_details["pdf_cells"] = pdf_cells

        self._log().debug("Done prediction matching and post-processing!")
        return matching_details
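
As a closing note on step 2 of the algorithm above: the column-alignment choice in _find_alignment_in_column reduces to picking the edge (left, middle, or right) whose coordinate varies least across the good cells of a column. The snippet below is a standalone paraphrase for illustration only, not code from the package.

# Illustration of the step-2 alignment heuristic (paraphrase, not package code).
def find_alignment(bboxes):
    lefts = [b[0] for b in bboxes]
    middles = [(b[0] + b[2]) / 2 for b in bboxes]
    rights = [b[2] for b in bboxes]
    # Spread of each candidate edge across the column; the smallest spread wins.
    deltas = [max(v) - min(v) for v in (lefts, middles, rights)]
    return ["left", "middle", "right"][deltas.index(min(deltas))]

# Three cells sharing (almost) the same right edge -> prints "right"
print(find_alignment([[10, 0, 50, 10], [30, 12, 50, 22], [22, 24, 51, 34]]))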