docling 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,806 @@
1
+ import copy
2
+ import logging
3
+
4
+ import networkx as nx
5
+
6
+ logger = logging.getLogger("layout_utils")
7
+
8
+
9
+ ## -------------------------------
10
+ ## Geometric helper functions
11
+ ## The coordinates grow left to right, and bottom to top.
12
+ ## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
13
+
14
+
15
def area(bbox):
    """Return width times height of ``bbox`` ([x_left, y_bottom, x_right, y_top])."""
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
17
+
18
+
19
def contains(bbox_i, bbox_j):
    """Return True if ``bbox_i`` encloses ``bbox_j``; shared edges count as enclosed."""
    lower_left_inside = bbox_i[0] <= bbox_j[0] and bbox_i[1] <= bbox_j[1]
    if not lower_left_inside:
        return False
    return bbox_i[2] >= bbox_j[2] and bbox_i[3] >= bbox_j[3]
27
+
28
+
29
def is_intersecting(bbox_i, bbox_j):
    """Return True unless the boxes are strictly separated on the x or y axis.

    Boxes that merely touch along an edge or corner count as intersecting.
    """
    separated_horizontally = bbox_i[2] < bbox_j[0] or bbox_i[0] > bbox_j[2]
    if separated_horizontally:
        return False
    separated_vertically = bbox_i[3] < bbox_j[1] or bbox_i[1] > bbox_j[3]
    return not separated_vertically
36
+
37
+
38
def bb_iou(boxA, boxB):
    """Return the intersection-over-union of two boxes.

    Widths and heights are measured with a ``+ 1`` added to each coordinate
    difference, both for the overlap rectangle and for the two boxes.
    """
    # Corners of the overlap rectangle.
    inter_left = max(boxA[0], boxB[0])
    inter_bottom = max(boxA[1], boxB[1])
    inter_right = min(boxA[2], boxB[2])
    inter_top = min(boxA[3], boxB[3])

    # A negative extent means no overlap on that axis, hence the clamp to 0.
    inter_width = max(0, inter_right - inter_left + 1)
    inter_height = max(0, inter_top - inter_bottom + 1)
    intersection = inter_width * inter_height

    area_a = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_b = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    union = float(area_a + area_b - intersection)
    return intersection / union
56
+
57
+
58
def compute_intersection(bbox_i, bbox_j):
    """Return the area of the overlap of the two boxes, or 0 if they do not intersect."""
    if not is_intersecting(bbox_i, bbox_j):
        return 0

    # Extent of the overlap rectangle along each axis.
    overlap_width = min(bbox_i[2], bbox_j[2]) - max(bbox_i[0], bbox_j[0])
    overlap_height = min(bbox_i[3], bbox_j[3]) - max(bbox_i[1], bbox_j[1])
    overlap_area = overlap_width * overlap_height

    if not overlap_area < 0:
        return overlap_area

    # Defensive fallback: report and treat a negative product as no overlap.
    logger.debug("Warning: Negative intersection detected!")
    return 0
73
+
74
+
75
def surrounding(bbox_i, bbox_j):
    """Return the smallest box, as a new list, that covers both input boxes."""
    return [
        min(bbox_i[0], bbox_j[0]),
        min(bbox_i[1], bbox_j[1]),
        max(bbox_i[2], bbox_j[2]),
        max(bbox_i[3], bbox_j[3]),
    ]
83
+
84
+
85
def surrounding_list(bbox_list):
    """Return the smallest box covering every box in ``bbox_list``.

    An empty list yields ``[0, 0, 0, 0]`` instead of raising.
    """
    if len(bbox_list) == 0:
        return [0, 0, 0, 0]
    return [
        min(box[0] for box in bbox_list),
        min(box[1] for box in bbox_list),
        max(box[2] for box in bbox_list),
        max(box[3] for box in bbox_list),
    ]
97
+
98
+
99
def vertical_overlap(bboxA, bboxB):
    """Return True if the y-ranges of the boxes overlap or touch.

    Index 1 is the lower bound and index 3 the upper bound of each box.
    """
    b_entirely_below_a = bboxB[3] < bboxA[1]
    if b_entirely_below_a:
        return False
    a_entirely_below_b = bboxA[3] < bboxB[1]
    return not a_entirely_below_b
107
+
108
+
109
def vertical_overlap_fraction(bboxA, bboxB):
    """Return the vertical overlap relative to the smaller of the two box heights.

    Index 1 is the lower bound and index 3 the upper bound of each box.
    Zero-height boxes are allowed: the divisor is floored at 0.001, and a box
    whose y-range lies completely inside the other's yields 1.
    """
    shorter_height = min(bboxA[3] - bboxA[1], bboxB[3] - bboxB[1])

    # "upper" is the box whose top edge is higher (A wins ties).
    if bboxA[3] >= bboxB[3]:
        upper, lower = bboxA, bboxB
    else:
        upper, lower = bboxB, bboxA

    # The lower-topped box sits completely within the vertical span of the other.
    if upper[1] <= lower[1]:
        return 1

    shared = max(lower[3] - upper[1], 0)
    return shared / max(shorter_height, 0.001)
133
+
134
+
135
+ ## -------------------------------
136
+ ## Cluster-and-cell relations
137
+
138
+
139
def compute_enclosed_cells(
    cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
):
    """Find the raw cells that belong to a cluster bounding box.

    A cell is accepted when either
      * its overlap with ``cluster_bbox`` exceeds
        ``min_cell_intersection_with_cluster`` times the cell's own
        (non-zero) area, or
      * the cell, shrunk by 3 units on every side, lies inside ``cluster_bbox``.

    Returns a pair ``(cell_indices, intersections)`` of parallel lists: the
    indices into ``raw_cells`` and the overlap area of each accepted cell.
    Previously the second list was only filled for cells accepted by the first
    rule, so the two lists could get out of step; now every accepted cell
    contributes its overlap area.
    """
    cells_in_cluster = []
    cells_in_cluster_int = []
    for ix, cell in enumerate(raw_cells):
        cell_bbox = cell["bbox"]
        intersection = compute_intersection(cell_bbox, cluster_bbox)
        frac_area = area(cell_bbox) * min_cell_intersection_with_cluster

        overlaps_enough = intersection > frac_area and frac_area > 0
        if not overlaps_enough:
            # Fallback rule: tolerate up to 3 units of overhang on each side.
            shrunk_bbox = [
                cell_bbox[0] + 3,
                cell_bbox[1] + 3,
                cell_bbox[2] - 3,
                cell_bbox[3] - 3,
            ]
            if not contains(cluster_bbox, shrunk_bbox):
                continue

        cells_in_cluster.append(ix)
        cells_in_cluster_int.append(intersection)
    return cells_in_cluster, cells_in_cluster_int
160
+
161
+
162
def find_clusters_around_cells(cell_count, clusters):
    """Return, for each raw cell, the indices of the clusters that list it.

    The result has ``cell_count`` entries in raw-cell order; entry ``i`` holds
    the positions in ``clusters`` whose ``"cell_ids"`` include ``i``.
    """
    membership = [[] for _ in range(cell_count)]
    for cluster_position, cluster in enumerate(clusters):
        for cell_id in cluster["cell_ids"]:
            membership[cell_id].append(cluster_position)
    return membership
170
+
171
+
172
def find_cell_index(raw_ix, cell_array):
    """Return the index of the first cell whose ``"rawcell_id"`` equals ``raw_ix``.

    Returns None when no cell in ``cell_array`` matches.
    """
    matches = (
        position
        for position, cell in enumerate(cell_array)
        if cell["rawcell_id"] == raw_ix
    )
    return next(matches, None)
179
+
180
+
181
def find_cell_indices(cluster, cell_array):
    """Return indices into ``cell_array`` for the cluster's raw cell ids.

    The cluster's ``"cell_ids"`` are visited in ascending order; every cell of
    ``cell_array`` whose ``"rawcell_id"`` matches contributes its index. Ids
    without a matching cell contribute nothing.
    """
    return [
        position
        for raw_id in sorted(cluster["cell_ids"])
        for position, cell in enumerate(cell_array)
        if cell["rawcell_id"] == raw_id
    ]
193
+
194
+
195
def find_first_cell_index(cluster, cell_array):
    """Locate the cell carrying the cluster's lowest raw cell id.

    ``cluster`` is any dict with a ``"cell_ids"`` key. The result is a list so
    that "nothing found" can be expressed: it holds the index of the first
    matching cell in ``cell_array``, or is empty when the cluster has no cells
    or the id is absent from ``cell_array`` (the latter is logged).
    """
    cell_ids = cluster["cell_ids"]
    if len(cell_ids) == 0:
        return []

    lowest_raw_id = min(cell_ids)
    for position, cell in enumerate(cell_array):
        if cell["rawcell_id"] == lowest_raw_id:
            # The first hit is enough.
            return [position]

    logger.debug(
        " Warning: Raw cell " + str(lowest_raw_id) + " not found in annotation cells"
    )
    return []
213
+
214
+
215
+ ## -------------------------------
216
+ ## Cluster labels and text
217
+
218
+
219
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
    """Apply ``new_label`` to a cluster and to its cells inside ``target_pred``.

    ``cl_ix`` is the cluster's position in ``target_pred["clusters"]``; its
    ``"type"`` is overwritten. Every cell of ``target_pred["cells"]`` that
    belongs to ``cluster`` gets ``"label"`` set to the new label.
    Returns ``new_label`` for convenient assignment by the caller.
    """
    target_pred["clusters"][cl_ix]["type"] = new_label
    cells = target_pred["cells"]
    for cell_position in find_cell_indices(cluster, cells):
        cells[cell_position]["label"] = new_label
    return new_label
231
+
232
+
233
def find_cluster_text(cluster, raw_cells):
    """Return the cluster's text with one blank between consecutive cell texts.

    ``cluster`` is any dict with ``"cell_ids"``; the ids index ``raw_cells`` and
    are visited in ascending order. Trailing whitespace is stripped.
    """
    texts = [raw_cells[raw_id]["text"] for raw_id in sorted(cluster["cell_ids"])]
    return " ".join(texts).rstrip()
243
+
244
+
245
def find_cluster_text_without_blanks(cluster, raw_cells):
    """Return the cluster's cell texts concatenated without separators.

    ``cluster`` is any dict with ``"cell_ids"``; the ids index ``raw_cells`` and
    are visited in ascending order. Trailing whitespace is stripped.
    """
    texts = [raw_cells[raw_id]["text"] for raw_id in sorted(cluster["cell_ids"])]
    return "".join(texts).rstrip()
254
+
255
+
256
+ ## -------------------------------
257
+ ## Clusters and lines
258
+ ## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
259
+ ## but this one also in FormulaAnalysis)
260
+
261
+
262
def build_cluster_from_lines(lines, label, id):
    """Merge a non-empty list of lines into one cluster dict.

    Each line is a dict with ``"cell_ids"``, ``"bbox"`` and ``"confidence"``
    (they need not be geometric lines). The returned cluster has the given
    ``id`` and ``label`` (stored under ``"type"``), the concatenated cell ids,
    the box surrounding all line boxes, ``"created_by": "merged_cells"`` and
    the mean confidence of all lines.

    The input is never modified. Earlier behaviour divided by the number of
    lines after the first, which raised ZeroDivisionError for a single line and
    left the first line's confidence out of the mean; both are fixed here.

    Raises IndexError if ``lines`` is empty.
    """
    first_line = lines[0]

    # Build fresh containers so the caller's lines stay untouched.
    cell_ids = []
    for line in lines:
        cell_ids.extend(line["cell_ids"])

    bbox = copy.deepcopy(first_line["bbox"])
    for line in lines[1:]:
        bbox = surrounding(bbox, line["bbox"])

    confidence = sum(line["confidence"] for line in lines) / len(lines)

    return {
        "id": id,
        "type": label,
        "cell_ids": cell_ids,
        "bbox": bbox,
        "confidence": confidence,
        "created_by": "merged_cells",
    }
289
+
290
+
291
+ ## -------------------------------
292
+ ## Reading order
293
+
294
+
295
def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
    """Return the clusters in reading order.

    Args:
        clusters: list of cluster dicts with ``"id"`` and ``"cell_ids"``.
        cluster_sort_type: only ``"raw_cell_ids"`` is recognised. Clusters are
            ordered by their first cell id, and clusters without cells go last.
        cell_sort_type: only ``"raw_cell_ids"`` is recognised. Each cluster's
            ``"cell_ids"`` is replaced by its ascending sort.
        sort_ids: if true, each cluster's ``"id"`` is overwritten with its
            position in the returned list.

    Returns:
        A new list holding the same cluster dicts. With an unrecognised
        ``cluster_sort_type`` the input order is kept; this case used to fail
        with an unbound local variable.

    Note: cluster ordering reads ``cell_ids[0]``, which is the smallest id only
    if the cell ids are already sorted (e.g. by ``cell_sort_type``).
    """
    logger.debug("---- Start cluster sorting ------")

    if cell_sort_type == "raw_cell_ids":
        for cl in clusters:
            cl["cell_ids"] = sorted(cl["cell_ids"])
    else:
        # Lazy formatting also copes with non-string sort types.
        logger.debug(
            "Unknown cell_sort_type `%s`, no cell sorting will happen.",
            cell_sort_type,
        )

    if cluster_sort_type == "raw_cell_ids":
        clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
        clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
        logger.debug(
            "Clusters with cells: %s", [cl["id"] for cl in clusters_with_cells]
        )
        logger.debug(
            " Their first cell ids: %s",
            [cl["cell_ids"][0] for cl in clusters_with_cells],
        )
        logger.debug(
            "Clusters without cells: %s",
            [cl["id"] for cl in clusters_without_cells],
        )
        clusters_with_cells_sorted = sorted(
            clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
        )
        logger.debug(
            " First cell ids after sorting: %s",
            [cl["cell_ids"][0] for cl in clusters_with_cells_sorted],
        )
        sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
    else:
        logger.debug(
            "Unknown cluster_sort_type: `%s`, no cluster sorting will happen.",
            cluster_sort_type,
        )
        # Keep the incoming order, but still hand back a new list.
        sorted_clusters = list(clusters)

    if sort_ids:
        for i, cl in enumerate(sorted_clusters):
            cl["id"] = i
    return sorted_clusters
349
+
350
+
351
+ ## -------------------------------
352
+ ## Line Splitting
353
+
354
+
355
def sort_cells_horizontal(line_cell_ids, raw_cells):
    """Return the cell ids ordered by the left edge (bbox[0]) of their raw cell.

    ``line_cell_ids`` index into ``raw_cells``; a new list is returned.
    """

    def left_edge(cell_id):
        return raw_cells[cell_id]["bbox"][0]

    return sorted(line_cell_ids, key=left_edge)
363
+
364
+
365
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
    """Return deep copies of the clusters with their bounding boxes re-fitted.

    Clusters that have no cells are dropped unless their type is "Picture".
    Every kept copy gets the bbox computed by ``adapt_bbox``; the input
    clusters are left unchanged.
    """
    adapted = []
    for position, original in enumerate(clusters):
        candidate = copy.deepcopy(original)
        logger.debug(
            "Treating cluster " + str(position) + ", type " + str(candidate["type"])
        )
        logger.debug(" with cells: " + str(candidate["cell_ids"]))

        if len(original["cell_ids"]) == 0 and original["type"] != "Picture":
            # A former cluster that lost all its cells: leave it out.
            logger.debug(" Empty non-picture, removed")
            continue

        candidate["bbox"] = adapt_bbox(raw_cells, candidate, orphan_cell_indices)
        adapted.append(candidate)
    return adapted
380
+
381
+
382
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
    """Compute an adjusted bounding box for one cluster.

    * Text-like clusters (neither "Table" nor "Picture"): the box tightly
      surrounding the cluster's cells.
    * "Picture": the original box, grown so it fully covers its cells; left
      unchanged when the picture has no cells.
    * "Table": the original box grown to cover its cells, then cut back to
      exclude "spurious" cells where possible. Spurious cells are enclosed raw
      cells that are neither in the cluster nor orphans, and they are only cut
      when they lie fully outside the cells' own box.

    Fix: the three cases are now mutually exclusive. Before, the Picture test
    was a separate ``if``, so text-like clusters fell through to the table
    branch and their tight box was overwritten.

    Args:
        raw_cells: list of raw cells, each with a ``"bbox"``.
        cluster: dict with ``"type"``, ``"bbox"`` and ``"cell_ids"``.
        orphan_cell_indices: raw-cell indices currently not in any cluster.

    Returns:
        The new bounding box ``[x_left, y_bottom, x_right, y_top]``.
    """
    cluster_type = cluster["type"]
    cell_bboxes = [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]

    if cluster_type not in ("Table", "Picture"):
        # Text-like: the bbox only needs to be around the text cells.
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding_list(cell_bboxes)
        logger.debug(" New bounding box:" + str(new_bbox))
    elif cluster_type == "Picture":
        # Only make sure the bbox completely comprises included text cells.
        logger.debug(" Picture")
        if cell_bboxes:
            min_bbox = surrounding_list(cell_bboxes)
            logger.debug(" Minimum bbox: " + str(min_bbox))
            logger.debug(" Initial bbox: " + str(cluster["bbox"]))
            new_bbox = surrounding(min_bbox, cluster["bbox"])
            logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
        else:
            logger.debug(" without text cells, no change.")
            new_bbox = cluster["bbox"]
    else:
        # Table: keep all included text cells inside the bbox.
        min_bbox = surrounding_list(cell_bboxes)
        logger.debug(" Minimum bbox: " + str(min_bbox))
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding(min_bbox, cluster["bbox"])
        logger.debug(" Possibly increased bbox: " + str(new_bbox))

        # Which non-member cells does the box cover? Orphans are tolerated,
        # since they could still be added to the table later.
        enclosed_cells = compute_enclosed_cells(
            new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
        )[0]
        additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
        logger.debug(
            " Additional cells enclosed by Table bbox: " + str(additional_cells)
        )
        spurious_cells = additional_cells - set(orphan_cell_indices)
        logger.debug(
            " Spurious cells enclosed by Table bbox (additional minus orphans): "
            + str(spurious_cells)
        )

        if spurious_cells:
            # Keep as much as possible (e.g. grid lines) while cutting away
            # spurious cells. The cuts start at the current bbox edges.
            left_cut, lower_cut, right_cut, upper_cut = new_bbox
            for cell_ix in spurious_cells:
                cell_bbox = raw_cells[cell_ix]["bbox"]
                if cell_bbox[2] < min_bbox[0]:  # entirely left of the cells
                    left_cut = max(left_cut, cell_bbox[2])
                if cell_bbox[0] > min_bbox[2]:  # entirely right of the cells
                    right_cut = min(right_cut, cell_bbox[0])
                if cell_bbox[1] > min_bbox[3]:  # entirely above the cells
                    upper_cut = min(upper_cut, cell_bbox[1])
                if cell_bbox[3] < min_bbox[1]:  # entirely below the cells
                    lower_cut = max(lower_cut, cell_bbox[3])
            new_bbox = [left_cut, lower_cut, right_cut, upper_cut]

    logger.debug(" Final bbox: " + str(new_bbox))
    return new_bbox
470
+
471
+
472
def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
    """Drop clusters that duplicate a more confident cluster.

    A cluster is dropped when some other cluster has strictly higher
    ``"confidence"`` and either their IoU exceeds ``threshold`` or the more
    confident box contains the other box shrunk by 3 units on every side.
    Clusters of equal confidence never remove each other.

    The list is filtered in place and also returned. The earlier version
    removed items from the list while looping over it, which can skip elements;
    the filter below avoids that.
    """
    duplicate_ids = set()
    for stronger in cluster_predictions:
        for weaker in cluster_predictions:
            if stronger["id"] == weaker["id"]:
                continue
            if not stronger["confidence"] > weaker["confidence"]:
                continue

            weaker_bbox = weaker["bbox"]
            if bb_iou(stronger["bbox"], weaker_bbox) > threshold:
                duplicate_ids.add(weaker["id"])
            elif contains(
                stronger["bbox"],
                [
                    weaker_bbox[0] + 3,
                    weaker_bbox[1] + 3,
                    weaker_bbox[2] - 3,
                    weaker_bbox[3] - 3,
                ],
            ):
                duplicate_ids.add(weaker["id"])

    # Slice assignment keeps the caller's list object while filtering it.
    cluster_predictions[:] = [
        cluster
        for cluster in cluster_predictions
        if cluster["id"] not in duplicate_ids
    ]
    return cluster_predictions
501
+
502
+
503
+ # Assign orphan cells by a low confidence prediction that is below the assigned confidence
504
def assign_orphans_with_low_conf_pred(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    """Attach orphan cells to matching low-confidence clusters.

    For each orphan cell the low-confidence clusters are scanned in order. A
    cluster becomes the current choice when its IoU with the cell and its
    confidence both exceed the best values so far (both start at 0.05) and its
    area is less than three times the cell's area. The chosen cluster receives
    the cell id, is tagged ``"created_by": "orph_low_conf"``, is added to
    ``cluster_predictions`` and the cell is removed from
    ``orphan_cell_indices``.

    Both lists are modified in place and returned as a pair.

    Fixes over the earlier version:
      * iterates over a snapshot, because removing from the orphan list while
        looping over it skipped the element after each assigned orphan;
      * a cluster chosen for several orphans is appended only once.
    """
    for orph_id in list(orphan_cell_indices):
        cell_bbox = raw_cells[orph_id]["bbox"]
        cell_area = (cell_bbox[3] - cell_bbox[1]) * (cell_bbox[2] - cell_bbox[0])

        cluster_chosen = None
        best_iou = 0.05
        best_confidence = 0.05

        for cluster in cluster_predictions_low:
            cluster_bbox = cluster["bbox"]
            calc_iou = bb_iou(cluster_bbox, cell_bbox)
            cluster_area = (cluster_bbox[3] - cluster_bbox[1]) * (
                cluster_bbox[2] - cluster_bbox[0]
            )
            if (
                calc_iou > best_iou
                and cluster["confidence"] > best_confidence
                and cell_area * 3 > cluster_area
            ):
                cluster_chosen = cluster
                best_iou = calc_iou
                best_confidence = cluster["confidence"]

        if cluster_chosen is None:
            continue

        # Record the assignment and mark where the cluster came from.
        cluster_chosen.setdefault("cell_ids", []).append(orph_id)
        cluster_chosen["created_by"] = "orph_low_conf"
        if not any(existing is cluster_chosen for existing in cluster_predictions):
            cluster_predictions.append(cluster_chosen)
        orphan_cell_indices.remove(orph_id)
    return cluster_predictions, orphan_cell_indices
537
+
538
+
539
def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
    """Resolve cells that are claimed by more than one cluster.

    For every ambiguous cell, the clusters listing it are scanned in order. A
    cluster becomes the current choice when both its confidence and its IoU
    with the cell exceed the best values so far (both start at 0). The cell id
    is then removed from every other cluster that lists it; if no cluster
    qualifies, it is removed from all of them.

    ``cluster_predictions`` and ``amb_cell_idxs`` are modified in place and
    returned as a pair; every processed cell id is removed from
    ``amb_cell_idxs``.

    Fixes over the earlier version:
      * the bookkeeping list mixed cell ids with cluster ids, so the cell was
        stripped from every cluster, the chosen one included;
      * the ambiguous list was mutated while being iterated, which skipped
        every other entry.
    """
    for amb_cell_id in list(amb_cell_idxs):
        # Clusters in question: those that currently list this cell.
        owners = [
            cluster
            for cluster in cluster_predictions
            if amb_cell_id in cluster["cell_ids"]
        ]

        highest_conf = 0
        highest_bbox_iou = 0
        cluster_chosen = None
        for cluster in owners:
            bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])
            if (
                cluster["confidence"] > highest_conf
                and bbox_iou_val > highest_bbox_iou
            ):
                cluster_chosen = cluster
                highest_conf = cluster["confidence"]
                highest_bbox_iou = bbox_iou_val

        # Withdraw the cell from every cluster that was not chosen.
        for cluster in owners:
            if cluster is not cluster_chosen:
                cluster["cell_ids"].remove(amb_cell_id)

        amb_cell_idxs.remove(amb_cell_id)

    return cluster_predictions, amb_cell_idxs
573
+
574
+
575
def ranges(nums):
    """Collapse numbers into a list of ``(start, end)`` runs.

    Duplicates are ignored and the values are sorted. A new run starts
    whenever a value exceeds its predecessor by more than 1. An empty input
    gives an empty list. Used to spot consecutive line numbers among pdf cells.
    """
    ordered = sorted(set(nums))
    if not ordered:
        return []

    runs = []
    run_start = ordered[0]
    for previous, current in zip(ordered, ordered[1:]):
        if previous + 1 < current:
            # Gap found: close the running span and open a new one.
            runs.append((run_start, previous))
            run_start = current
    runs.append((run_start, ordered[-1]))
    return runs
582
+
583
+
584
def set_orphan_as_text(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    """Turn remaining orphan cells into single-cell clusters.

    Each orphan cell with non-blank text gets its own cluster appended to
    ``cluster_predictions`` (``"confidence": -1``,
    ``"created_by": "orphan_default"``, ids continuing after the current
    maximum), unless the cell lies inside a "Picture" cluster or the orphans
    look like manuscript line numbers.

    Line numbers are assumed when more than two orphans are numeric, they form
    more than one consecutive run, and some run spans more than 75 numbers; in
    that case no cluster is created at all.

    The new cluster's type defaults to "Text". If low-confidence clusters are
    given, the type is decided by the last one in the list: its type is used
    when it matches the cell (confidence > 0.1 with IoU > 0.4, or it contains
    the cell shrunk by 3 units, or it overlaps more than 20% of the cell's
    area), otherwise "Text".

    Returns ``(cluster_predictions, orphan_cell_indices)``; the orphan list is
    returned unchanged.
    """
    figures = [
        cluster for cluster in cluster_predictions if cluster["type"] == "Picture"
    ]
    next_id = max([-1] + [cluster["id"] for cluster in cluster_predictions]) + 1

    # Collect the orphan texts that parse as integers.
    orphan_texts = [raw_cells[orph_id]["text"] for orph_id in orphan_cell_indices]
    numeric_orphans = []
    for text in orphan_texts:
        if not text.isnumeric():
            continue
        try:
            numeric_orphans.append(int(text))
        except ValueError:  # numeric per str.isnumeric but not an int: ignore
            pass

    # More than 75 is half the total line number count of a review manuscript
    # page; such a run means the page carries line numbers to be ignored.
    line_numbers_detected = False
    if len(numeric_orphans) > 2:
        runs = ranges(numeric_orphans)
        if len(runs) > 1:
            line_numbers_detected = any(
                first != last and len(range(first, last)) > 75
                for first, last in runs
            )

    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        text = orph_cell["text"]
        if not text or text.isspace():
            continue

        # Orphans inside a figure are not turned into clusters.
        inside_figure = any(
            [contains(fig["bbox"], orph_cell["bbox"]) for fig in figures]
        )
        if inside_figure or line_numbers_detected:
            continue

        cell_bbox = orph_cell["bbox"]
        class_type = "Text"
        for low_cluster in cluster_predictions_low:
            intersection = compute_intersection(cell_bbox, low_cluster["bbox"])
            # The label is recomputed for every cluster, so the last one wins.
            matches = (
                (
                    low_cluster["confidence"] > 0.1
                    and bb_iou(low_cluster["bbox"], cell_bbox) > 0.4
                )
                or contains(
                    low_cluster["bbox"],
                    [
                        cell_bbox[0] + 3,
                        cell_bbox[1] + 3,
                        cell_bbox[2] - 3,
                        cell_bbox[3] - 3,
                    ],
                )
                or intersection > area(cell_bbox) * 0.2
            )
            class_type = low_cluster["type"] if matches else "Text"

        cluster_predictions.append(
            {
                "id": next_id,
                "bbox": cell_bbox,
                "type": class_type,
                "cell_ids": [orph_id],
                "confidence": -1,
                "created_by": "orphan_default",
            }
        )
        next_id += 1
    return cluster_predictions, orphan_cell_indices
677
+
678
+
679
def merge_cells(cluster_predictions):
    """Merge orphan-derived clusters that touch or nearly touch each other.

    Clusters with ``"created_by" == "orphan_default"`` become graph nodes. Two
    of them are linked when their boxes, each padded by 2 units on every side,
    intersect. Every connected group of more than one cluster is replaced by a
    single "Text" cluster built with ``build_cluster_from_lines``; it gets a
    fresh id above the current maximum and is appended at the end.

    The list is modified in place and returned.

    Changes over the earlier version: members of a group are removed with a
    filter instead of deleting from the list while iterating over it, padded
    boxes are computed once per cluster, and each unordered pair is tested
    once (the intersection test is symmetric).
    """
    padded_boxes = [
        (
            cluster["id"],
            [
                cluster["bbox"][0] - 2,
                cluster["bbox"][1] - 2,
                cluster["bbox"][2] + 2,
                cluster["bbox"][3] + 2,
            ],
        )
        for cluster in cluster_predictions
        if cluster["created_by"] == "orphan_default"
    ]

    G = nx.Graph()
    G.add_nodes_from(cluster_id for cluster_id, _ in padded_boxes)
    for position, (id_1, box_1) in enumerate(padded_boxes):
        for id_2, box_2 in padded_boxes[position + 1 :]:
            if id_1 != id_2 and is_intersecting(box_1, box_2):
                G.add_edge(id_1, id_2)

    # With k=1 these are the connected groups of the graph.
    component = sorted(map(sorted, nx.k_edge_components(G, k=1)))

    max_id = -1
    for cluster in cluster_predictions:
        if cluster["id"] > max_id:
            max_id = cluster["id"]

    for nodes in component:
        if len(nodes) <= 1:
            continue
        max_id += 1
        member_ids = set(nodes)
        # Collect members in ascending id order, as the merged cell order.
        lines = [
            cluster
            for node in nodes
            for cluster in cluster_predictions
            if cluster["id"] == node
        ]
        cluster_predictions[:] = [
            cluster
            for cluster in cluster_predictions
            if cluster["id"] not in member_ids
        ]
        cluster_predictions.append(build_cluster_from_lines(lines, "Text", max_id))
    return cluster_predictions
724
+
725
+
726
def clean_up_clusters(
    cluster_predictions,
    raw_cells,
    merge_cells=False,
    img_table=False,
    one_cell_table=False,
):
    """Remove redundant clusters according to the enabled rules.

    Every ordered pair of clusters with different ids is examined:

    * ``merge_cells``: if cluster_1 contains cluster_2 (shrunk by 3 units per
      side), cluster_2's cell ids are added to cluster_1 and cluster_2 is
      removed.
    * ``img_table`` (only checked when ``merge_cells`` is off): a "Text"
      cluster_1 is removed when cluster_2 is a "Picture" or "Table" and either
      their IoU exceeds 0.5 or cluster_2 (grown by 3 units per side) contains
      cluster_1.
    * ``one_cell_table``: a "Table" cluster_1 with fewer than two cells is
      removed.

    ``raw_cells`` is accepted for interface compatibility and not used.
    The list is filtered in place and returned.

    Fixes over the earlier version:
      * the ``img_table`` condition read ``A and B or C``, which Python groups
        as ``(A and B) or C``; any cluster overlapping a table was removed,
        not just text clusters;
      * clusters are no longer removed from the list while iterating over it.
    """
    ids_to_remove = set()

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] == cluster_2["id"]:
                continue

            bbox_1 = cluster_1["bbox"]
            bbox_2 = cluster_2["bbox"]

            if merge_cells:
                # Remove any artifacts created by merging clusters.
                if contains(
                    bbox_1,
                    [bbox_2[0] + 3, bbox_2[1] + 3, bbox_2[2] - 3, bbox_2[3] - 3],
                ):
                    cluster_1["cell_ids"] = (
                        cluster_1["cell_ids"] + cluster_2["cell_ids"]
                    )
                    ids_to_remove.add(cluster_2["id"])
            elif img_table:
                # Remove text clusters that sit inside tables or images
                # (such as pdf cells in graphs).
                if cluster_1["type"] == "Text" and cluster_2["type"] in (
                    "Picture",
                    "Table",
                ):
                    if bb_iou(bbox_1, bbox_2) > 0.5:
                        ids_to_remove.add(cluster_1["id"])
                    elif contains(
                        [bbox_2[0] - 3, bbox_2[1] - 3, bbox_2[2] + 3, bbox_2[3] + 3],
                        bbox_1,
                    ):
                        ids_to_remove.add(cluster_1["id"])

            # Remove tables that have at most one pdf cell.
            if one_cell_table:
                if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
                    ids_to_remove.add(cluster_1["id"])

    cluster_predictions[:] = [
        cluster
        for cluster in cluster_predictions
        if cluster["id"] not in ids_to_remove
    ]
    return cluster_predictions
784
+
785
+
786
def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
    """Set each cluster's ``"cell_ids"`` to the raw cells its bbox encloses.

    ``threshold`` is passed to ``compute_enclosed_cells`` as the minimum
    fraction of a cell that must overlap the cluster. The stored ids are
    indices into ``raw_cells``; they are often, but not always, the same as
    the "id" or the index of the "cells" list in a prediction.
    The clusters are updated in place and the same list is returned.
    """
    for cluster in clusters:
        enclosed = compute_enclosed_cells(
            cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
        )
        cluster["cell_ids"] = enclosed[0]
    return clusters
795
+
796
+
797
+ # Creates a map of cell_id->cluster_id
798
def cell_id_state_map(clusters, cell_count):
    """Classify raw cells by how many clusters claim them.

    Returns a triple:
      * per-cell lists of cluster indices (from ``find_clusters_around_cells``),
      * indices of cells claimed by no cluster (orphans),
      * indices of cells claimed by more than one cluster (ambiguous).
    """
    clusters_around_cells = find_clusters_around_cells(cell_count, clusters)

    orphan_cell_indices = []
    ambiguous_cell_indices = []
    for ix in range(cell_count):
        owner_count = len(clusters_around_cells[ix])
        if owner_count == 0:
            orphan_cell_indices.append(ix)
        elif owner_count > 1:
            ambiguous_cell_indices.append(ix)

    return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices