docling 2.12.0__py3-none-any.whl → 2.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/pubmed_backend.py +592 -0
- docling/backend/xml/uspto_backend.py +1888 -0
- docling/datamodel/base_models.py +21 -4
- docling/datamodel/document.py +88 -14
- docling/datamodel/pipeline_options.py +3 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +20 -3
- docling/models/ds_glm_model.py +34 -4
- docling/models/easyocr_model.py +2 -0
- docling/models/layout_model.py +134 -280
- docling/models/page_assemble_model.py +11 -1
- docling/models/table_structure_model.py +25 -29
- docling/pipeline/base_pipeline.py +3 -1
- docling/utils/glm_utils.py +11 -3
- docling/utils/layout_postprocessor.py +666 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/METADATA +2 -2
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/RECORD +21 -18
- docling/utils/layout_utils.py +0 -812
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/LICENSE +0 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/WHEEL +0 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/entry_points.txt +0 -0
docling/utils/layout_utils.py
DELETED
@@ -1,812 +0,0 @@
import copy
import logging

import networkx as nx
from docling_core.types.doc import DocItemLabel

logger = logging.getLogger("layout_utils")


## -------------------------------
## Geometric helper functions
## The coordinates grow left to right, and bottom to top.
## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.


def area(bbox):
    return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])


def contains(bbox_i, bbox_j):
    ## Returns True if bbox_i contains bbox_j, else False
    return (
        bbox_i[0] <= bbox_j[0]
        and bbox_i[1] <= bbox_j[1]
        and bbox_i[2] >= bbox_j[2]
        and bbox_i[3] >= bbox_j[3]
    )


def is_intersecting(bbox_i, bbox_j):
    return not (
        bbox_i[2] < bbox_j[0]
        or bbox_i[0] > bbox_j[2]
        or bbox_i[3] < bbox_j[1]
        or bbox_i[1] > bbox_j[3]
    )


def bb_iou(boxA, boxB):
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # compute the area of intersection rectangle
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the interesection area
    iou = interArea / float(boxAArea + boxBArea - interArea)
    # return the intersection over union value
    return iou


def compute_intersection(bbox_i, bbox_j):
    ## Returns the size of the intersection area of the two boxes
    if not is_intersecting(bbox_i, bbox_j):
        return 0
    ## Determine the (x, y)-coordinates of the intersection rectangle:
    xA = max(bbox_i[0], bbox_j[0])
    yA = max(bbox_i[1], bbox_j[1])
    xB = min(bbox_i[2], bbox_j[2])
    yB = min(bbox_i[3], bbox_j[3])
    ## Compute the area of intersection rectangle:
    interArea = (xB - xA) * (yB - yA)
    if interArea < 0:
        logger.debug("Warning: Negative intersection detected!")
        return 0
    return interArea


def surrounding(bbox_i, bbox_j):
    ## Computes minimal box that contains both input boxes
    sbox = []
    sbox.append(min(bbox_i[0], bbox_j[0]))
    sbox.append(min(bbox_i[1], bbox_j[1]))
    sbox.append(max(bbox_i[2], bbox_j[2]))
    sbox.append(max(bbox_i[3], bbox_j[3]))
    return sbox


def surrounding_list(bbox_list):
    ## Computes minimal box that contains all boxes in the input list
    ## The list should be non-empty, but just in case it's not:
    if len(bbox_list) == 0:
        sbox = [0, 0, 0, 0]
    else:
        sbox = []
        sbox.append(min([bbox[0] for bbox in bbox_list]))
        sbox.append(min([bbox[1] for bbox in bbox_list]))
        sbox.append(max([bbox[2] for bbox in bbox_list]))
        sbox.append(max([bbox[3] for bbox in bbox_list]))
    return sbox


def vertical_overlap(bboxA, bboxB):
    ## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
    if bboxB[3] < bboxA[1]:  ## B below A
        return False
    elif bboxA[3] < bboxB[1]:  ## A below B
        return False
    else:
        return True


def vertical_overlap_fraction(bboxA, bboxB):
    ## Returns the vertical overlap as fraction of the lower bbox height.
    ## bbox[1] is the lower bound, bbox[3] the upper bound (larger number)
    ## Height 0 is permitted in the input.
    heightA = bboxA[3] - bboxA[1]
    heightB = bboxB[3] - bboxB[1]
    min_height = min(heightA, heightB)
    if bboxA[3] >= bboxB[3]:  ## A starts higher or equal
        if (
            bboxA[1] <= bboxB[1]
        ):  ## B is completely in A; this can include height of B = 0:
            fraction = 1
        else:
            overlap = max(bboxB[3] - bboxA[1], 0)
            fraction = overlap / max(min_height, 0.001)
    else:
        if (
            bboxB[1] <= bboxA[1]
        ):  ## A is completely in B; this can include height of A = 0:
            fraction = 1
        else:
            overlap = max(bboxA[3] - bboxB[1], 0)
            fraction = overlap / max(min_height, 0.001)
    return fraction


## -------------------------------
## Cluster-and-cell relations


def compute_enclosed_cells(
    cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
):
    cells_in_cluster = []
    cells_in_cluster_int = []
    for ix, cell in enumerate(raw_cells):
        cell_bbox = cell["bbox"]
        intersection = compute_intersection(cell_bbox, cluster_bbox)
        frac_area = area(cell_bbox) * min_cell_intersection_with_cluster

        if (
            intersection > frac_area and frac_area > 0
        ):  # intersect > certain fraction of cell
            cells_in_cluster.append(ix)
            cells_in_cluster_int.append(intersection)
        elif contains(
            cluster_bbox,
            [cell_bbox[0] + 3, cell_bbox[1] + 3, cell_bbox[2] - 3, cell_bbox[3] - 3],
        ):
            cells_in_cluster.append(ix)
    return cells_in_cluster, cells_in_cluster_int


def find_clusters_around_cells(cell_count, clusters):
    ## Per raw cell, find to which clusters it belongs.
    ## Return list of these indices in the raw-cell order.
    clusters_around_cells = [[] for _ in range(cell_count)]
    for cl_ix, cluster in enumerate(clusters):
        for ix in cluster["cell_ids"]:
            clusters_around_cells[ix].append(cl_ix)
    return clusters_around_cells


def find_cell_index(raw_ix, cell_array):
    ## "raw_ix" is a rawcell_id.
    ## "cell_array" has the structure of an (annotation) cells array.
    ## Returns index of cell in cell_array that has this rawcell_id.
    for ix, cell in enumerate(cell_array):
        if cell["rawcell_id"] == raw_ix:
            return ix


def find_cell_indices(cluster, cell_array):
    ## "cluster" must have the structure as in a clusters array in a prediction,
    ## "cell_array" that of a cells array.
    ## Returns list of indices of cells in cell_array that have the rawcell_ids as in the cluster,
    ## in the order of the rawcell_ids.
    result = []
    for raw_ix in sorted(cluster["cell_ids"]):
        ## Find the cell with this rawcell_id (if any)
        for ix, cell in enumerate(cell_array):
            if cell["rawcell_id"] == raw_ix:
                result.append(ix)
    return result


def find_first_cell_index(cluster, cell_array):
    ## "cluster" must be a dict with key "cell_ids"; it can also be a line.
    ## "cell_array" has the structure of a cells array in an annotation.
    ## Returns index of cell in cell_array that has the lowest rawcell_id from the cluster.
    result = []  ## We keep it a list as it can be empty (picture without text cells)
    if len(cluster["cell_ids"]) == 0:
        return result
    raw_ix = min(cluster["cell_ids"])
    ## Find the cell with this rawcell_id (if any)
    for ix, cell in enumerate(cell_array):
        if cell["rawcell_id"] == raw_ix:
            result.append(ix)
            break  ## One is enough; should be only one anyway.
    if result == []:
        logger.debug(
            " Warning: Raw cell " + str(raw_ix) + " not found in annotation cells"
        )
    return result


## -------------------------------
## Cluster labels and text


def relabel_cluster(cluster, cl_ix, new_label, target_pred):
    ## "cluster" must have the structure as in a clusters array in a prediction,
    ## "cl_ix" is its index in target_pred,
    ## "new_label" is the intended new label,
    ## "target_pred" is the entire current target prediction.
    ## Sets label on the cluster itself, and on the cells in the target_pred.
    ## Returns new_label so that also the cl_label variable in the main code is easily set.
    target_pred["clusters"][cl_ix]["type"] = new_label
    cluster_target_cells = find_cell_indices(cluster, target_pred["cells"])
    for ix in cluster_target_cells:
        target_pred["cells"][ix]["label"] = new_label
    return new_label


def find_cluster_text(cluster, raw_cells):
    ## "cluster" must be a dict with "cell_ids"; it can also be a line.
    ## "raw_cells" must have the format of item["raw"]["cells"]
    ## Returns the text of the cluster, with blanks between the cell contents
    ## (which seem to be words or phrases without starting or trailing blanks).
    ## Note that in formulas, this may give a lot more blanks than originally
    cluster_text = ""
    for raw_ix in sorted(cluster["cell_ids"]):
        cluster_text = cluster_text + raw_cells[raw_ix]["text"] + " "
    return cluster_text.rstrip()


def find_cluster_text_without_blanks(cluster, raw_cells):
    ## "cluster" must be a dict with "cell_ids"; it can also be a line.
    ## "raw_cells" must have the format of item["raw"]["cells"]
    ## Returns the text of the cluster, without blanks between the cell contents
    ## Interesting in formula analysis.
    cluster_text = ""
    for raw_ix in sorted(cluster["cell_ids"]):
        cluster_text = cluster_text + raw_cells[raw_ix]["text"]
    return cluster_text.rstrip()


## -------------------------------
## Clusters and lines
## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
## but this one also in FormulaAnalysis)


def build_cluster_from_lines(lines, label, id):
    ## Lines must be a non-empty list of dicts (lines) with elements "cell_ids" and "bbox"
    ## (There is no condition that they are really geometrically lines)
    ## A cluster in standard format is returned with given label and id
    local_lines = copy.deepcopy(
        lines
    )  ## without this, it changes "lines" also outside this function
    first_line = local_lines.pop(0)
    cluster = {
        "id": id,
        "type": label,
        "cell_ids": first_line["cell_ids"],
        "bbox": first_line["bbox"],
        "confidence": 0,
        "created_by": "merged_cells",
    }
    confidence = 0
    counter = 0
    for line in local_lines:
        new_cell_ids = cluster["cell_ids"] + line["cell_ids"]
        cluster["cell_ids"] = new_cell_ids
        cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"])
        counter += 1
        confidence += line["confidence"]
        confidence = confidence / counter
        cluster["confidence"] = confidence
    return cluster


## -------------------------------
## Reading order


def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
    ## In:
    ## Clusters: list as in predictions.
    ## cluster_sort_type: string, currently only "raw_cells".
    ## cell_sort_type: string, currently only "raw_cells".
    ## sort_ids: Boolean, whether the cluster ids should be adapted to their new position
    ## Out: Another clusters list, sorted according to the type.

    logger.debug("---- Start cluster sorting ------")

    if cell_sort_type == "raw_cell_ids":
        for cl in clusters:
            sorted_cell_ids = sorted(cl["cell_ids"])
            cl["cell_ids"] = sorted_cell_ids
    else:
        logger.debug(
            "Unknown cell_sort_type `"
            + cell_sort_type
            + "`, no cell sorting will happen."
        )

    if cluster_sort_type == "raw_cell_ids":
        clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
        clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
        logger.debug(
            "Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
        )
        logger.debug(
            " Their first cell ids: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells])
        )
        logger.debug(
            "Clusters without cells: "
            + str([cl["id"] for cl in clusters_without_cells])
        )
        clusters_with_cells_sorted = sorted(
            clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
        )
        logger.debug(
            " First cell ids after sorting: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
        )
        sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
    else:
        logger.debug(
            "Unknown cluster_sort_type: `"
            + cluster_sort_type
            + "`, no cluster sorting will happen."
        )

    if sort_ids:
        for i, cl in enumerate(sorted_clusters):
            cl["id"] = i
    return sorted_clusters


## -------------------------------
## Line Splitting


def sort_cells_horizontal(line_cell_ids, raw_cells):
    ## "line_cells" should be a non-empty list of (raw) cell_ids
    ## "raw_cells" has the structure of item["raw"]["cells"].
    ## Sorts the cells in the line by x0 (left start).
    new_line_cell_ids = sorted(
        line_cell_ids, key=lambda cell_id: raw_cells[cell_id]["bbox"][0]
    )
    return new_line_cell_ids


def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
    new_clusters = []
    for ix, cluster in enumerate(clusters):
        new_cluster = copy.deepcopy(cluster)
        logger.debug(
            "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
        )
        logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
        if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
            logger.debug(" Empty non-picture, removed")
            continue  ## Skip this former cluster, now without cells.
        new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
        new_cluster["bbox"] = new_bbox
        new_clusters.append(new_cluster)
    return new_clusters


def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
    if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
        ## A text-like cluster. The bbox only needs to be around the text cells:
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" New bounding box:" + str(new_bbox))
    if cluster["type"] == DocItemLabel.PICTURE:
        ## We only make the bbox completely comprise included text cells:
        logger.debug(" Picture")
        if len(cluster["cell_ids"]) != 0:
            min_bbox = surrounding_list(
                [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
            )
            logger.debug(" Minimum bbox: " + str(min_bbox))
            logger.debug(" Initial bbox: " + str(cluster["bbox"]))
            new_bbox = surrounding(min_bbox, cluster["bbox"])
            logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
        else:
            logger.debug(" without text cells, no change.")
            new_bbox = cluster["bbox"]
    else:  ## A table
        ## At least we have to keep the included text cells, and we make the bbox completely comprise them
        min_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" Minimum bbox: " + str(min_bbox))
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding(min_bbox, cluster["bbox"])
        logger.debug(" Possibly increased bbox: " + str(new_bbox))

        ## Now we look which non-belonging cells are covered.
        ## (To decrease dependencies, we don't make use of which cells we actually removed.)
        ## We don't worry about orphan cells, those could still be added to the table.
        enclosed_cells = compute_enclosed_cells(
            new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
        )[0]
        additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
        logger.debug(
            " Additional cells enclosed by Table bbox: " + str(additional_cells)
        )
        spurious_cells = additional_cells - set(orphan_cell_indices)
        logger.debug(
            " Spurious cells enclosed by Table bbox (additional minus orphans): "
            + str(spurious_cells)
        )
        if len(spurious_cells) == 0:
            return new_bbox

        ## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
        ## We initialize possible cuts with the current bbox.
        left_cut = new_bbox[0]
        right_cut = new_bbox[2]
        upper_cut = new_bbox[3]
        lower_cut = new_bbox[1]

        for cell_ix in spurious_cells:
            cell = raw_cells[cell_ix]
            # logger.debug("  Spurious cell bbox: " + str(cell["bbox"]))
            is_left = cell["bbox"][2] < min_bbox[0]
            is_right = cell["bbox"][0] > min_bbox[2]
            is_above = cell["bbox"][1] > min_bbox[3]
            is_below = cell["bbox"][3] < min_bbox[1]
            # logger.debug("  Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))

            if is_left:
                if cell["bbox"][2] > left_cut:
                    ## We move the left cut to exclude this cell:
                    left_cut = cell["bbox"][2]
            if is_right:
                if cell["bbox"][0] < right_cut:
                    ## We move the right cut to exclude this cell:
                    right_cut = cell["bbox"][0]
            if is_above:
                if cell["bbox"][1] < upper_cut:
                    ## We move the upper cut to exclude this cell:
                    upper_cut = cell["bbox"][1]
            if is_below:
                if cell["bbox"][3] > lower_cut:
                    ## We move the left cut to exclude this cell:
                    lower_cut = cell["bbox"][3]
            # logger.debug("  Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))

        new_bbox = [left_cut, lower_cut, right_cut, upper_cut]

        logger.debug(" Final bbox: " + str(new_bbox))
    return new_bbox


def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
    DuplicateDeletedClusterIDs = []
    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] != cluster_2["id"]:
                if_conf = False
                if cluster_1["confidence"] > cluster_2["confidence"]:
                    if_conf = True
                if if_conf == True:
                    if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > threshold:
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])
                    elif contains(
                        cluster_1["bbox"],
                        [
                            cluster_2["bbox"][0] + 3,
                            cluster_2["bbox"][1] + 3,
                            cluster_2["bbox"][2] - 3,
                            cluster_2["bbox"][3] - 3,
                        ],
                    ):
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])

    DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

    for cl_id in DuplicateDeletedClusterIDs:
        for cluster in cluster_predictions:
            if cl_id == cluster["id"]:
                cluster_predictions.remove(cluster)
    return cluster_predictions


# Assign orphan cells by a low confidence prediction that is below the assigned confidence
def assign_orphans_with_low_conf_pred(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    for orph_id in orphan_cell_indices:
        cluster_chosen = {}
        iou_thresh = 0.05
        confidence = 0.05

        # Loop over all predictions, and find the one with the highest IOU, and confidence
        for cluster in cluster_predictions_low:
            calc_iou = bb_iou(cluster["bbox"], raw_cells[orph_id]["bbox"])
            cluster_area = (cluster["bbox"][3] - cluster["bbox"][1]) * (
                cluster["bbox"][2] - cluster["bbox"][0]
            )
            cell_area = (
                raw_cells[orph_id]["bbox"][3] - raw_cells[orph_id]["bbox"][1]
            ) * (raw_cells[orph_id]["bbox"][2] - raw_cells[orph_id]["bbox"][0])

            if (
                (iou_thresh < calc_iou)
                and (cluster["confidence"] > confidence)
                and (cell_area * 3 > cluster_area)
            ):
                cluster_chosen = cluster
                iou_thresh = calc_iou
                confidence = cluster["confidence"]
        # If a candidate is found, assign to it the PDF cell ids, and tag that it was created by this function for tracking
        if iou_thresh != 0.05 and confidence != 0.05:
            cluster_chosen["cell_ids"].append(orph_id)
            cluster_chosen["created_by"] = "orph_low_conf"
            cluster_predictions.append(cluster_chosen)
            orphan_cell_indices.remove(orph_id)
    return cluster_predictions, orphan_cell_indices


def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
    for amb_cell_id in amb_cell_idxs:
        highest_conf = 0
        highest_bbox_iou = 0
        cluster_chosen = None
        problamatic_clusters = []

        # Find clusters in question
        for cluster in cluster_predictions:

            if amb_cell_id in cluster["cell_ids"]:
                problamatic_clusters.append(amb_cell_id)

                # If the cell_id is in a cluster of high conf, and highest iou score, and smaller in area
                bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])

                if (
                    cluster["confidence"] > highest_conf
                    and bbox_iou_val > highest_bbox_iou
                ):
                    cluster_chosen = cluster
                    highest_conf = cluster["confidence"]
                    highest_bbox_iou = bbox_iou_val
                    if cluster["id"] in problamatic_clusters:
                        problamatic_clusters.remove(cluster["id"])

        # now remove the assigning of cell id from lower confidence, and threshold
        for cluster in cluster_predictions:
            for prob_amb_id in problamatic_clusters:
                if prob_amb_id in cluster["cell_ids"]:
                    cluster["cell_ids"].remove(prob_amb_id)
        amb_cell_idxs.remove(amb_cell_id)

    return cluster_predictions, amb_cell_idxs


def ranges(nums):
    # Find if consecutive numbers exist within pdf cells
    # Used to remove line numbers for review manuscripts
    nums = sorted(set(nums))
    gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s + 1 < e]
    edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
    return list(zip(edges, edges))


def set_orphan_as_text(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    max_id = -1
    figures = []
    for cluster in cluster_predictions:
        if cluster["type"] == DocItemLabel.PICTURE:
            figures.append(cluster)

        if cluster["id"] > max_id:
            max_id = cluster["id"]
    max_id += 1

    lines_detector = False
    content_of_orphans = []
    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        content_of_orphans.append(raw_cells[orph_id]["text"])

    fil_content_of_orphans = []
    for cell_content in content_of_orphans:
        if cell_content.isnumeric():
            try:
                num = int(cell_content)
                fil_content_of_orphans.append(num)
            except ValueError:  # ignore the cell
                pass

    # line_orphans = []
    # Check if there are more than 2 pdf orphan cells, if there are more than 2,
    # then check between the orphan cells if they are numeric
    # and if they are a consecutive series of numbers (using ranges function) to decide

    if len(fil_content_of_orphans) > 2:
        out_ranges = ranges(fil_content_of_orphans)
        if len(out_ranges) > 1:
            cnt_range = 0
            for ranges_ in out_ranges:
                if ranges_[0] != ranges_[1]:
                    # If there are more than 75 (half the total line number of a review manuscript page)
                    # decide that there are line numbers on page to be ignored.
                    if len(list(range(ranges_[0], ranges_[1]))) > 75:
                        lines_detector = True
                        # line_orphans = line_orphans + list(range(ranges_[0], ranges_[1]))

    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        if bool(orph_cell["text"] and not orph_cell["text"].isspace()):
            fig_flag = False
            # Do not assign orphan cells if they are inside a figure
            for fig in figures:
                if contains(fig["bbox"], orph_cell["bbox"]):
                    fig_flag = True

            # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
            if fig_flag == False and lines_detector == False:
                # get class from low confidence detections if not set as text:
                class_type = DocItemLabel.TEXT

                for cluster in cluster_predictions_low:
                    intersection = compute_intersection(
                        orph_cell["bbox"], cluster["bbox"]
                    )
                    class_type = DocItemLabel.TEXT
                    if (
                        cluster["confidence"] > 0.1
                        and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
                    ):
                        class_type = cluster["type"]
                    elif contains(
                        cluster["bbox"],
                        [
                            orph_cell["bbox"][0] + 3,
                            orph_cell["bbox"][1] + 3,
                            orph_cell["bbox"][2] - 3,
                            orph_cell["bbox"][3] - 3,
                        ],
                    ):
                        class_type = cluster["type"]
                    elif intersection > area(orph_cell["bbox"]) * 0.2:
                        class_type = cluster["type"]

                new_cluster = {
                    "id": max_id,
                    "bbox": orph_cell["bbox"],
                    "type": class_type,
                    "cell_ids": [orph_id],
                    "confidence": -1,
                    "created_by": "orphan_default",
                }
                max_id += 1
                cluster_predictions.append(new_cluster)
    return cluster_predictions, orphan_cell_indices


def merge_cells(cluster_predictions):
    # Using graph component creates clusters if orphan cells are touching or too close.
    G = nx.Graph()
    for cluster in cluster_predictions:
        if cluster["created_by"] == "orphan_default":
            G.add_node(cluster["id"])

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if (
                cluster_1["id"] != cluster_2["id"]
                and cluster_2["created_by"] == "orphan_default"
                and cluster_1["created_by"] == "orphan_default"
            ):
                cl1 = copy.deepcopy(cluster_1["bbox"])
                cl2 = copy.deepcopy(cluster_2["bbox"])
                cl1[0] = cl1[0] - 2
                cl1[1] = cl1[1] - 2
                cl1[2] = cl1[2] + 2
                cl1[3] = cl1[3] + 2
                cl2[0] = cl2[0] - 2
                cl2[1] = cl2[1] - 2
                cl2[2] = cl2[2] + 2
                cl2[3] = cl2[3] + 2
                if is_intersecting(cl1, cl2):
                    G.add_edge(cluster_1["id"], cluster_2["id"])

    component = sorted(map(sorted, nx.k_edge_components(G, k=1)))
    max_id = -1
    for cluster_1 in cluster_predictions:
        if cluster_1["id"] > max_id:
            max_id = cluster_1["id"]

    for nodes in component:
        if len(nodes) > 1:
            max_id += 1
            lines = []
            for node in nodes:
                for cluster in cluster_predictions:
                    if cluster["id"] == node:
                        lines.append(cluster)
                        cluster_predictions.remove(cluster)
            new_merged_cluster = build_cluster_from_lines(
                lines, DocItemLabel.TEXT, max_id
            )
            cluster_predictions.append(new_merged_cluster)
    return cluster_predictions


def clean_up_clusters(
    cluster_predictions,
    raw_cells,
    merge_cells=False,
    img_table=False,
    one_cell_table=False,
):
    DuplicateDeletedClusterIDs = []

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] != cluster_2["id"]:
                # remove any artifcats created by merging clusters
                if merge_cells == True:
                    if contains(
                        cluster_1["bbox"],
                        [
                            cluster_2["bbox"][0] + 3,
                            cluster_2["bbox"][1] + 3,
                            cluster_2["bbox"][2] - 3,
                            cluster_2["bbox"][3] - 3,
                        ],
                    ):
                        cluster_1["cell_ids"] = (
                            cluster_1["cell_ids"] + cluster_2["cell_ids"]
                        )
                        DuplicateDeletedClusterIDs.append(cluster_2["id"])
                # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
                elif img_table == True:
                    if (
                        cluster_1["type"] == DocItemLabel.TEXT
                        and cluster_2["type"] == DocItemLabel.PICTURE
                        or cluster_2["type"] == DocItemLabel.TABLE
                    ):
                        if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
                        elif contains(
                            [
                                cluster_2["bbox"][0] - 3,
                                cluster_2["bbox"][1] - 3,
                                cluster_2["bbox"][2] + 3,
                                cluster_2["bbox"][3] + 3,
                            ],
                            cluster_1["bbox"],
                        ):
                            DuplicateDeletedClusterIDs.append(cluster_1["id"])
                # remove tables that have one pdf cell
                if one_cell_table == True:
                    if (
                        cluster_1["type"] == DocItemLabel.TABLE
                        and len(cluster_1["cell_ids"]) < 2
                    ):
                        DuplicateDeletedClusterIDs.append(cluster_1["id"])

    DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

    for cl_id in DuplicateDeletedClusterIDs:
        for cluster in cluster_predictions:
            if cl_id == cluster["id"]:
                cluster_predictions.remove(cluster)
    return cluster_predictions


def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
    for cluster in clusters:
        cells_in_cluster, _ = compute_enclosed_cells(
            cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
        )
        cluster["cell_ids"] = cells_in_cluster
    ## These cell_ids are ids of the raw cells.
    ## They are often, but not always, the same as the "id" or the index of the "cells" list in a prediction.
    return clusters


# Creates a map of cell_id->cluster_id
def cell_id_state_map(clusters, cell_count):
    clusters_around_cells = find_clusters_around_cells(cell_count, clusters)
    orphan_cell_indices = [
        ix for ix in range(cell_count) if len(clusters_around_cells[ix]) == 0
    ]  # which cells are assigned no cluster?
    ambiguous_cell_indices = [
        ix for ix in range(cell_count) if len(clusters_around_cells[ix]) > 1
    ]  # which cells are assigned > 1 clusters?
    return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices