docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,812 +0,0 @@
1
- import copy
2
- import logging
3
-
4
- import networkx as nx
5
- from docling_core.types.doc import DocItemLabel
6
-
7
- logger = logging.getLogger("layout_utils")
8
-
9
-
10
- ## -------------------------------
11
- ## Geometric helper functions
12
- ## The coordinates grow left to right, and bottom to top.
13
- ## The bounding box list elements 0 to 3 are x_left, y_bottom, x_right, y_top.
14
-
15
-
16
def area(bbox):
    """Return the area of a box given as [x_left, y_bottom, x_right, y_top].

    No clamping is applied: an inverted box yields a negative value.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
18
-
19
-
20
def contains(bbox_i, bbox_j):
    """Return True if bbox_i fully contains bbox_j (shared edges count), else False."""
    # The lower-left corner of bbox_j must not lie left of / below bbox_i ...
    if not (bbox_i[0] <= bbox_j[0] and bbox_i[1] <= bbox_j[1]):
        return False
    # ... and its upper-right corner must not lie right of / above bbox_i.
    return bbox_i[2] >= bbox_j[2] and bbox_i[3] >= bbox_j[3]
28
-
29
-
30
def is_intersecting(bbox_i, bbox_j):
    """Return True if the two boxes overlap or touch, else False."""
    # Separated along x: one box ends before the other starts.
    if bbox_i[2] < bbox_j[0] or bbox_i[0] > bbox_j[2]:
        return False
    # Separated along y.
    if bbox_i[3] < bbox_j[1] or bbox_i[1] > bbox_j[3]:
        return False
    return True
37
-
38
-
39
def bb_iou(boxA, boxB):
    """Return the intersection-over-union of two boxes.

    Widths and heights are computed with a "+ 1" on every extent, both for
    the overlap rectangle and for the two box areas.
    """
    # Corners of the overlap rectangle.
    left = max(boxA[0], boxB[0])
    bottom = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    top = min(boxA[3], boxB[3])

    # Overlap area, clamped to zero per axis when the boxes are disjoint.
    overlap_w = max(0, right - left + 1)
    overlap_h = max(0, top - bottom + 1)
    overlap = overlap_w * overlap_h

    area_a = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_b = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # Union = sum of both areas minus the part counted twice.
    union = float(area_a + area_b - overlap)
    return overlap / union
57
-
58
-
59
def compute_intersection(bbox_i, bbox_j):
    """Return the area of the overlap of the two boxes, or 0 if they do not intersect."""
    if is_intersecting(bbox_i, bbox_j):
        # Extent of the overlap rectangle along each axis.
        overlap_width = min(bbox_i[2], bbox_j[2]) - max(bbox_i[0], bbox_j[0])
        overlap_height = min(bbox_i[3], bbox_j[3]) - max(bbox_i[1], bbox_j[1])
        overlap_area = overlap_width * overlap_height
        if overlap_area < 0:
            logger.debug("Warning: Negative intersection detected!")
            return 0
        return overlap_area
    return 0
74
-
75
-
76
def surrounding(bbox_i, bbox_j):
    """Return the minimal box [x_left, y_bottom, x_right, y_top] enclosing both boxes."""
    return [
        min(bbox_i[0], bbox_j[0]),
        min(bbox_i[1], bbox_j[1]),
        max(bbox_i[2], bbox_j[2]),
        max(bbox_i[3], bbox_j[3]),
    ]
84
-
85
-
86
def surrounding_list(bbox_list):
    """Return the minimal box enclosing every box in bbox_list.

    An empty list is not expected, but yields [0, 0, 0, 0] rather than failing.
    """
    if len(bbox_list) == 0:
        return [0, 0, 0, 0]
    x_left = min(box[0] for box in bbox_list)
    y_bottom = min(box[1] for box in bbox_list)
    x_right = max(box[2] for box in bbox_list)
    y_top = max(box[3] for box in bbox_list)
    return [x_left, y_bottom, x_right, y_top]
98
-
99
-
100
def vertical_overlap(bboxA, bboxB):
    """Return True if the vertical extents of the two boxes overlap or touch.

    bbox[1] is the lower bound, bbox[3] the upper bound (larger number).
    """
    b_below_a = bboxB[3] < bboxA[1]
    a_below_b = bboxA[3] < bboxB[1]
    return not (b_below_a or a_below_b)
108
-
109
-
110
def vertical_overlap_fraction(bboxA, bboxB):
    """Return the vertical overlap as a fraction of the smaller box height.

    bbox[1] is the lower bound, bbox[3] the upper bound (larger number).
    Boxes of height 0 are permitted. If one box's vertical extent lies
    completely within the other's, the result is 1.
    """
    smaller_height = min(bboxA[3] - bboxA[1], bboxB[3] - bboxB[1])

    # "upper" is the box whose top edge is higher (A wins ties).
    if bboxA[3] >= bboxB[3]:
        upper, lower = bboxA, bboxB
    else:
        upper, lower = bboxB, bboxA

    # The lower-topped box is vertically inside the other one.
    if upper[1] <= lower[1]:
        return 1

    shared = max(lower[3] - upper[1], 0)
    # The 0.001 floor avoids dividing by zero for zero-height boxes.
    return shared / max(smaller_height, 0.001)
134
-
135
-
136
- ## -------------------------------
137
- ## Cluster-and-cell relations
138
-
139
-
140
def compute_enclosed_cells(
    cluster_bbox, raw_cells, min_cell_intersection_with_cluster=0.2
):
    """Find the raw cells that belong to a cluster box.

    A cell is selected when its overlap with cluster_bbox exceeds the given
    fraction of the cell's own (non-zero) area, or otherwise when the cell,
    shrunk by 3 units on every side, is contained in cluster_bbox.

    Returns a tuple (cell indices, overlap areas). The overlap list only
    receives an entry for cells selected through the overlap criterion, so
    the two lists are not necessarily aligned.
    """
    enclosed_ids = []
    enclosed_overlaps = []
    for cell_index, raw_cell in enumerate(raw_cells):
        box = raw_cell["bbox"]
        overlap = compute_intersection(box, cluster_bbox)
        min_overlap = area(box) * min_cell_intersection_with_cluster

        # Overlap covers more than the required fraction of the cell.
        if overlap > min_overlap and min_overlap > 0:
            enclosed_ids.append(cell_index)
            enclosed_overlaps.append(overlap)
            continue

        # Fallback: containment test with a 3-unit tolerance per side.
        shrunk_box = [box[0] + 3, box[1] + 3, box[2] - 3, box[3] - 3]
        if contains(cluster_bbox, shrunk_box):
            enclosed_ids.append(cell_index)
    return enclosed_ids, enclosed_overlaps
161
-
162
-
163
def find_clusters_around_cells(cell_count, clusters):
    """For every raw cell, list the indices of the clusters that contain it.

    The result has cell_count entries, in raw-cell order; entry i holds the
    positions (in `clusters`) of all clusters whose "cell_ids" include i.
    """
    membership = []
    for _ in range(cell_count):
        membership.append([])
    for cluster_index, cluster in enumerate(clusters):
        for cell_id in cluster["cell_ids"]:
            membership[cell_id].append(cluster_index)
    return membership
171
-
172
-
173
def find_cell_index(raw_ix, cell_array):
    """Return the index of the first cell in cell_array whose "rawcell_id" equals raw_ix.

    "cell_array" has the structure of an (annotation) cells array.
    Returns None when no cell carries that rawcell_id.
    """
    matches = (
        position
        for position, cell in enumerate(cell_array)
        if cell["rawcell_id"] == raw_ix
    )
    return next(matches, None)
180
-
181
-
182
def find_cell_indices(cluster, cell_array):
    """Return the positions in cell_array of the cells belonging to a cluster.

    "cluster" must be a dict with a "cell_ids" list of rawcell ids;
    "cell_array" must be a list of dicts with a "rawcell_id" key.
    The result is ordered by ascending rawcell id; ids without a matching
    cell contribute nothing, and cells sharing a rawcell id are all reported
    in cell_array order.
    """
    # Index cell_array once instead of rescanning it for every id.
    positions_by_raw_id = {}
    for position, cell in enumerate(cell_array):
        positions_by_raw_id.setdefault(cell["rawcell_id"], []).append(position)

    result = []
    for raw_ix in sorted(cluster["cell_ids"]):
        result.extend(positions_by_raw_id.get(raw_ix, ()))
    return result
194
-
195
-
196
def find_first_cell_index(cluster, cell_array):
    """Locate the cell carrying the lowest rawcell id of a cluster.

    "cluster" must be a dict with key "cell_ids"; it can also be a line.
    "cell_array" has the structure of a cells array in an annotation.
    Returns a list: empty when the cluster has no cells (e.g. a picture
    without text cells) or the cell is not found, otherwise a single index.
    """
    cell_ids = cluster["cell_ids"]
    if len(cell_ids) == 0:
        return []

    lowest_raw_id = min(cell_ids)
    for position, cell in enumerate(cell_array):
        if cell["rawcell_id"] == lowest_raw_id:
            # One hit is enough; there should be only one anyway.
            return [position]

    logger.debug(
        " Warning: Raw cell " + str(lowest_raw_id) + " not found in annotation cells"
    )
    return []
214
-
215
-
216
- ## -------------------------------
217
- ## Cluster labels and text
218
-
219
-
220
def relabel_cluster(cluster, cl_ix, new_label, target_pred):
    """Set a new label on a cluster and on its cells inside target_pred.

    "cluster" must have the structure as in a clusters array in a prediction,
    "cl_ix" is its index in target_pred["clusters"],
    "new_label" is the intended new label,
    "target_pred" is the entire current target prediction (modified in place).
    Returns new_label, so the caller can update its own label variable easily.
    """
    target_pred["clusters"][cl_ix]["type"] = new_label
    target_cells = target_pred["cells"]
    for position in find_cell_indices(cluster, target_cells):
        target_cells[position]["label"] = new_label
    return new_label
232
-
233
-
234
def find_cluster_text(cluster, raw_cells):
    """Return the text of a cluster, with one blank between the cell contents.

    "cluster" must be a dict with "cell_ids"; it can also be a line.
    "raw_cells" must have the format of item["raw"]["cells"].
    Cells are concatenated in ascending cell-id order and trailing whitespace
    is stripped. In formulas this may give more blanks than the original had.
    """
    pieces = [raw_cells[raw_ix]["text"] for raw_ix in sorted(cluster["cell_ids"])]
    return " ".join(pieces).rstrip()
244
-
245
-
246
def find_cluster_text_without_blanks(cluster, raw_cells):
    """Return the text of a cluster, without blanks between the cell contents.

    "cluster" must be a dict with "cell_ids"; it can also be a line.
    "raw_cells" must have the format of item["raw"]["cells"].
    Cells are concatenated in ascending cell-id order and trailing whitespace
    is stripped. Interesting in formula analysis.
    """
    pieces = [raw_cells[raw_ix]["text"] for raw_ix in sorted(cluster["cell_ids"])]
    return "".join(pieces).rstrip()
255
-
256
-
257
- ## -------------------------------
258
- ## Clusters and lines
259
- ## (Most line-oriented functions are only needed in TextAnalysisGivenClusters,
260
- ## but this one also in FormulaAnalysis)
261
-
262
-
263
def build_cluster_from_lines(lines, label, id):
    """Merge a list of lines into one cluster in standard format.

    "lines" must be a non-empty list of dicts with "cell_ids" and "bbox";
    every line after the first must also carry "confidence". There is no
    condition that they are really geometrically lines.
    "label" and "id" become the "type" and "id" of the returned cluster.

    The cluster's cell ids are the concatenation of all lines' cell ids, its
    bbox surrounds all line bboxes, and its confidence is the mean confidence
    of the lines after the first one (the first line is not part of the
    average). With a single line the confidence stays 0 instead of raising
    ZeroDivisionError.

    Raises IndexError if "lines" is empty.
    """
    # Deep copy, so that neither "lines" nor its dicts are changed outside.
    local_lines = copy.deepcopy(lines)
    first_line = local_lines.pop(0)
    cluster = {
        "id": id,
        "type": label,
        "cell_ids": first_line["cell_ids"],
        "bbox": first_line["bbox"],
        "confidence": 0,
        "created_by": "merged_cells",
    }
    confidence_sum = 0
    for line in local_lines:
        cluster["cell_ids"] = cluster["cell_ids"] + line["cell_ids"]
        cluster["bbox"] = surrounding(cluster["bbox"], line["bbox"])
        confidence_sum += line["confidence"]
    if local_lines:
        # Only divide when there is something to average over.
        cluster["confidence"] = confidence_sum / len(local_lines)
    return cluster
290
-
291
-
292
- ## -------------------------------
293
- ## Reading order
294
-
295
-
296
def produce_reading_order(clusters, cluster_sort_type, cell_sort_type, sort_ids):
    """Sort the cells inside each cluster, and the clusters themselves.

    In:
      clusters: list as in predictions (dicts with "id" and "cell_ids").
      cluster_sort_type: string, currently only "raw_cell_ids" is supported.
      cell_sort_type: string, currently only "raw_cell_ids" is supported.
      sort_ids: Boolean, whether the cluster ids should be adapted to their
        new position.
    Out: Another clusters list, sorted according to the type. The cluster
      dicts themselves are shared with the input and updated in place.

    With cluster_sort_type "raw_cell_ids", clusters are ordered by their
    first cell id, and clusters without cells are placed at the end. With
    an unknown cluster_sort_type the clusters keep their input order (the
    result used to be undefined in that case, raising UnboundLocalError).
    """
    logger.debug("---- Start cluster sorting ------")

    if cell_sort_type == "raw_cell_ids":
        for cl in clusters:
            cl["cell_ids"] = sorted(cl["cell_ids"])
    else:
        logger.debug(
            "Unknown cell_sort_type `%s`, no cell sorting will happen.",
            cell_sort_type,
        )

    if cluster_sort_type == "raw_cell_ids":
        clusters_with_cells = [cl for cl in clusters if cl["cell_ids"] != []]
        clusters_without_cells = [cl for cl in clusters if cl["cell_ids"] == []]
        logger.debug(
            "Clusters with cells: " + str([cl["id"] for cl in clusters_with_cells])
        )
        logger.debug(
            " Their first cell ids: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells])
        )
        logger.debug(
            "Clusters without cells: "
            + str([cl["id"] for cl in clusters_without_cells])
        )
        clusters_with_cells_sorted = sorted(
            clusters_with_cells, key=lambda cluster: cluster["cell_ids"][0]
        )
        logger.debug(
            " First cell ids after sorting: "
            + str([cl["cell_ids"][0] for cl in clusters_with_cells_sorted])
        )
        sorted_clusters = clusters_with_cells_sorted + clusters_without_cells
    else:
        logger.debug(
            "Unknown cluster_sort_type: `%s`, no cluster sorting will happen.",
            cluster_sort_type,
        )
        # Keep the input order, as a new list like in the sorted case.
        sorted_clusters = list(clusters)

    if sort_ids:
        for i, cl in enumerate(sorted_clusters):
            cl["id"] = i
    return sorted_clusters
350
-
351
-
352
- ## -------------------------------
353
- ## Line Splitting
354
-
355
-
356
def sort_cells_horizontal(line_cell_ids, raw_cells):
    """Return the cell ids of a line, sorted by the left edge (x0) of their cells.

    "line_cell_ids" should be a non-empty list of (raw) cell ids;
    "raw_cells" has the structure of item["raw"]["cells"].
    The input list is left untouched; cells with equal x0 keep their order.
    """

    def left_edge(cell_id):
        return raw_cells[cell_id]["bbox"][0]

    ordered = list(line_cell_ids)
    ordered.sort(key=left_edge)
    return ordered
364
-
365
-
366
def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
    """Return deep copies of the clusters with their bboxes adapted via adapt_bbox.

    Clusters without cells are dropped, unless they are pictures.
    The input clusters are not modified.
    """
    adapted = []
    for position, cluster in enumerate(clusters):
        candidate = copy.deepcopy(cluster)
        logger.debug(
            "Treating cluster " + str(position) + ", type " + str(candidate["type"])
        )
        logger.debug(" with cells: " + str(candidate["cell_ids"]))

        has_no_cells = len(cluster["cell_ids"]) == 0
        if has_no_cells and cluster["type"] != DocItemLabel.PICTURE:
            logger.debug(" Empty non-picture, removed")
            continue  ## Skip this former cluster, now without cells.

        candidate["bbox"] = adapt_bbox(raw_cells, candidate, orphan_cell_indices)
        adapted.append(candidate)
    return adapted
381
-
382
-
383
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
    """Compute an adapted bounding box for one cluster.

    "raw_cells" is indexed by the ids in cluster["cell_ids"]; each raw cell
    has a "bbox". "cluster" is a dict with "type", "bbox" and "cell_ids".
    "orphan_cell_indices" lists raw cells that are in no cluster.

    Pictures get the box surrounding their initial bbox and their text cells
    (or the initial bbox when they have no cells). All other clusters get the
    box surrounding their initial bbox and their cells, which is then cut
    back to exclude enclosed cells that are neither part of the cluster nor
    orphans, as far as those cells lie entirely on one side of the cluster's
    own cells. The new bbox is returned; the cluster is not modified.
    """
    if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
        ## A text-like cluster. The bbox only needs to be around the text cells:
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" New bounding box:" + str(new_bbox))
    ## NOTE(review): the next statement is "if", not "elif", so a text-like
    ## cluster also runs through the "else" (table) branch below, which
    ## overwrites the new_bbox computed above. Confirm whether this is intended.
    if cluster["type"] == DocItemLabel.PICTURE:
        ## We only make the bbox completely comprise included text cells:
        logger.debug(" Picture")
        if len(cluster["cell_ids"]) != 0:
            min_bbox = surrounding_list(
                [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
            )
            logger.debug(" Minimum bbox: " + str(min_bbox))
            logger.debug(" Initial bbox: " + str(cluster["bbox"]))
            new_bbox = surrounding(min_bbox, cluster["bbox"])
            logger.debug(" New bbox (initial and text cells): " + str(new_bbox))
        else:
            logger.debug(" without text cells, no change.")
            new_bbox = cluster["bbox"]
    else:  ## A table
        ## At least we have to keep the included text cells, and we make the bbox completely comprise them
        min_bbox = surrounding_list(
            [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
        )
        logger.debug(" Minimum bbox: " + str(min_bbox))
        logger.debug(" Initial bbox: " + str(cluster["bbox"]))
        new_bbox = surrounding(min_bbox, cluster["bbox"])
        logger.debug(" Possibly increased bbox: " + str(new_bbox))

        ## Now we look which non-belonging cells are covered.
        ## (To decrease dependencies, we don't make use of which cells we actually removed.)
        ## We don't worry about orphan cells, those could still be added to the table.
        enclosed_cells = compute_enclosed_cells(
            new_bbox, raw_cells, min_cell_intersection_with_cluster=0.3
        )[0]
        additional_cells = set(enclosed_cells) - set(cluster["cell_ids"])
        logger.debug(
            " Additional cells enclosed by Table bbox: " + str(additional_cells)
        )
        spurious_cells = additional_cells - set(orphan_cell_indices)
        logger.debug(
            " Spurious cells enclosed by Table bbox (additional minus orphans): "
            + str(spurious_cells)
        )
        if len(spurious_cells) == 0:
            return new_bbox

        ## Else we want to keep as much as possible, e.g., grid lines, but not the spurious cells if we can.
        ## We initialize possible cuts with the current bbox.
        left_cut = new_bbox[0]
        right_cut = new_bbox[2]
        upper_cut = new_bbox[3]
        lower_cut = new_bbox[1]

        for cell_ix in spurious_cells:
            cell = raw_cells[cell_ix]
            # logger.debug(" Spurious cell bbox: " + str(cell["bbox"]))
            ## A spurious cell can only be cut away if it lies entirely on one
            ## side of the minimal box around the cluster's own cells:
            is_left = cell["bbox"][2] < min_bbox[0]
            is_right = cell["bbox"][0] > min_bbox[2]
            is_above = cell["bbox"][1] > min_bbox[3]
            is_below = cell["bbox"][3] < min_bbox[1]
            # logger.debug(" Left, right, above, below? " + str([is_left, is_right, is_above, is_below]))

            if is_left:
                if cell["bbox"][2] > left_cut:
                    ## We move the left cut to exclude this cell:
                    left_cut = cell["bbox"][2]
            if is_right:
                if cell["bbox"][0] < right_cut:
                    ## We move the right cut to exclude this cell:
                    right_cut = cell["bbox"][0]
            if is_above:
                if cell["bbox"][1] < upper_cut:
                    ## We move the upper cut to exclude this cell:
                    upper_cut = cell["bbox"][1]
            if is_below:
                if cell["bbox"][3] > lower_cut:
                    ## We move the lower cut to exclude this cell:
                    lower_cut = cell["bbox"][3]
            # logger.debug(" Current bbox: " + str([left_cut, lower_cut, right_cut, upper_cut]))

        new_bbox = [left_cut, lower_cut, right_cut, upper_cut]

    logger.debug(" Final bbox: " + str(new_bbox))
    return new_bbox
471
-
472
-
473
def remove_cluster_duplicates_by_conf(cluster_predictions, threshold=0.5):
    """Drop clusters that duplicate a cluster with higher confidence.

    A cluster counts as a duplicate of a strictly more confident cluster
    (with a different "id") when their IoU exceeds "threshold", or when it
    lies inside the more confident cluster's bbox with a tolerance of 3
    units per side.

    "cluster_predictions" is filtered in place (order preserved) and also
    returned.
    """
    duplicate_ids = set()
    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] == cluster_2["id"]:
                continue
            # Only the less confident cluster of a pair can be removed.
            if not cluster_1["confidence"] > cluster_2["confidence"]:
                continue
            box_1 = cluster_1["bbox"]
            box_2 = cluster_2["bbox"]
            if bb_iou(box_1, box_2) > threshold:
                duplicate_ids.add(cluster_2["id"])
            elif contains(
                box_1, [box_2[0] + 3, box_2[1] + 3, box_2[2] - 3, box_2[3] - 3]
            ):
                duplicate_ids.add(cluster_2["id"])

    if duplicate_ids:
        # Filter through slice assignment: the caller's list object is kept,
        # and nothing is removed from a list while it is being iterated.
        cluster_predictions[:] = [
            cluster
            for cluster in cluster_predictions
            if cluster["id"] not in duplicate_ids
        ]
    return cluster_predictions
502
-
503
-
504
- # Assign orphan cells by a low confidence prediction that is below the assigned confidence
505
def assign_orphans_with_low_conf_pred(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    """Assign orphan cells to suitable low-confidence cluster predictions.

    For every orphan cell, the low-confidence clusters are scanned in order;
    a cluster becomes the current candidate when its IoU with the cell and
    its confidence both exceed the best values seen so far (starting at 0.05
    each) and the cluster is less than three times as large as the cell.

    If a candidate is found, the cell id is appended to its "cell_ids", the
    cluster is tagged with created_by = "orph_low_conf" and added to
    "cluster_predictions" (once, even when several orphans pick the same
    cluster), and the cell is removed from "orphan_cell_indices".

    Both lists are modified in place and returned as a tuple.
    """
    # Iterate over a snapshot: assigned orphans are removed from the original
    # list inside the loop, which would otherwise skip the following orphan.
    for orph_id in list(orphan_cell_indices):
        cell_bbox = raw_cells[orph_id]["bbox"]
        cell_area = (cell_bbox[3] - cell_bbox[1]) * (cell_bbox[2] - cell_bbox[0])

        cluster_chosen = None
        best_iou = 0.05
        best_confidence = 0.05

        for cluster in cluster_predictions_low:
            cluster_bbox = cluster["bbox"]
            calc_iou = bb_iou(cluster_bbox, cell_bbox)
            cluster_area = (cluster_bbox[3] - cluster_bbox[1]) * (
                cluster_bbox[2] - cluster_bbox[0]
            )
            if (
                best_iou < calc_iou
                and cluster["confidence"] > best_confidence
                and cell_area * 3 > cluster_area
            ):
                cluster_chosen = cluster
                best_iou = calc_iou
                best_confidence = cluster["confidence"]

        if cluster_chosen is None:
            continue

        # Assign the PDF cell id and tag the cluster for tracking.
        cluster_chosen["cell_ids"].append(orph_id)
        cluster_chosen["created_by"] = "orph_low_conf"
        # Do not add the same cluster object a second time.
        if not any(existing is cluster_chosen for existing in cluster_predictions):
            cluster_predictions.append(cluster_chosen)
        orphan_cell_indices.remove(orph_id)
    return cluster_predictions, orphan_cell_indices
538
-
539
-
540
def remove_ambigous_pdf_cell_by_conf(cluster_predictions, raw_cells, amb_cell_idxs):
    """Resolve cells that are assigned to more than one cluster.

    For every ambiguous cell, the clusters containing it are scanned in
    order; a cluster becomes the chosen one when both its confidence and its
    IoU with the cell exceed the best values seen so far (starting at 0).
    The cell id is then removed from every other cluster that contains it.
    If no cluster qualifies, the cell is removed from all of them.

    Previously the bookkeeping list held the cell id instead of the cluster
    ids, so the cell was removed from the chosen cluster as well; and the
    ambiguous-cell list was modified while being iterated, which skipped
    every second cell.

    "cluster_predictions" and "amb_cell_idxs" are modified in place and
    returned as a tuple; every processed cell is removed from
    "amb_cell_idxs".
    """
    # Iterate over a snapshot, as processed ids are removed from the list.
    for amb_cell_id in list(amb_cell_idxs):
        highest_conf = 0
        highest_bbox_iou = 0
        cluster_chosen = None

        # Clusters in question: those currently claiming the cell.
        claiming_clusters = [
            cluster
            for cluster in cluster_predictions
            if amb_cell_id in cluster["cell_ids"]
        ]

        for cluster in claiming_clusters:
            bbox_iou_val = bb_iou(cluster["bbox"], raw_cells[amb_cell_id]["bbox"])
            if (
                cluster["confidence"] > highest_conf
                and bbox_iou_val > highest_bbox_iou
            ):
                cluster_chosen = cluster
                highest_conf = cluster["confidence"]
                highest_bbox_iou = bbox_iou_val

        # Remove the cell from all claiming clusters except the chosen one.
        for cluster in claiming_clusters:
            if cluster is cluster_chosen:
                continue
            cluster["cell_ids"][:] = [
                cell_id for cell_id in cluster["cell_ids"] if cell_id != amb_cell_id
            ]
        amb_cell_idxs.remove(amb_cell_id)

    return cluster_predictions, amb_cell_idxs
574
-
575
-
576
def ranges(nums):
    """Group numbers into runs of consecutive values.

    Returns a list of (first, last) tuples, one per run, in ascending order.
    Duplicates are ignored; a new run starts wherever the step to the next
    value is larger than 1. An empty input gives an empty list.
    Used to find consecutive numbers within pdf cells, in order to remove
    line numbers for review manuscripts.
    """
    ordered = sorted(set(nums))
    runs = []
    if not ordered:
        return runs

    run_start = previous = ordered[0]
    for value in ordered[1:]:
        if previous + 1 < value:
            # Gap found: close the current run and open a new one.
            runs.append((run_start, previous))
            run_start = value
        previous = value
    runs.append((run_start, previous))
    return runs
583
-
584
-
585
def set_orphan_as_text(
    cluster_predictions, cluster_predictions_low, raw_cells, orphan_cell_indices
):
    """Turn orphan cells into single-cell clusters.

    Each orphan cell with non-blank text that is not inside a picture
    cluster becomes a new cluster (created_by "orphan_default", confidence
    -1) appended to "cluster_predictions". Nothing is created at all when
    the orphans look like manuscript line numbers, i.e. when the numeric
    orphans form more than one run of consecutive numbers and one run spans
    more than 75.

    The new cluster's type is DocItemLabel.TEXT, unless the last entry of
    "cluster_predictions_low" matches the cell; then that cluster's type is
    used.

    Returns (cluster_predictions, orphan_cell_indices); the orphan list is
    returned unchanged.
    """
    picture_clusters = []
    next_id = -1
    for cluster in cluster_predictions:
        if cluster["type"] == DocItemLabel.PICTURE:
            picture_clusters.append(cluster)
        if cluster["id"] > next_id:
            next_id = cluster["id"]
    next_id += 1

    # Collect the orphan cells whose text is an integer number.
    orphan_texts = [raw_cells[orph_id]["text"] for orph_id in orphan_cell_indices]
    numeric_orphans = []
    for text in orphan_texts:
        if not text.isnumeric():
            continue
        try:
            numeric_orphans.append(int(text))
        except ValueError:  # ignore the cell
            pass

    # With more than 2 numeric orphans, check whether they form consecutive
    # series of numbers (using the ranges function).
    line_numbers_detected = False
    if len(numeric_orphans) > 2:
        runs = ranges(numeric_orphans)
        if len(runs) > 1:
            for run_start, run_end in runs:
                # More than 75 (half the total line number of a review
                # manuscript page): line numbers on the page, to be ignored.
                if run_start != run_end and len(range(run_start, run_end)) > 75:
                    line_numbers_detected = True

    for orph_id in orphan_cell_indices:
        orph_cell = raw_cells[orph_id]
        text = orph_cell["text"]
        if not text or text.isspace():
            continue

        cell_box = orph_cell["bbox"]
        # Do not assign orphan cells if they are inside a figure.
        inside_picture = any(
            contains(fig["bbox"], cell_box) for fig in picture_clusters
        )
        if inside_picture or line_numbers_detected:
            continue

        # Get the class from the low confidence detections, else text.
        # The type is reset for every low-confidence cluster, so only the
        # last one in the list decides.
        class_type = DocItemLabel.TEXT
        for cluster in cluster_predictions_low:
            intersection = compute_intersection(cell_box, cluster["bbox"])
            shrunk_cell_box = [
                cell_box[0] + 3,
                cell_box[1] + 3,
                cell_box[2] - 3,
                cell_box[3] - 3,
            ]
            matches = (
                (
                    cluster["confidence"] > 0.1
                    and bb_iou(cluster["bbox"], cell_box) > 0.4
                )
                or contains(cluster["bbox"], shrunk_cell_box)
                or intersection > area(cell_box) * 0.2
            )
            class_type = cluster["type"] if matches else DocItemLabel.TEXT

        cluster_predictions.append(
            {
                "id": next_id,
                "bbox": cell_box,
                "type": class_type,
                "cell_ids": [orph_id],
                "confidence": -1,
                "created_by": "orphan_default",
            }
        )
        next_id += 1
    return cluster_predictions, orphan_cell_indices
678
-
679
-
680
def merge_cells(cluster_predictions):
    """Merge orphan-derived clusters that touch or nearly touch each other.

    Clusters with created_by == "orphan_default" are the nodes of a graph;
    two of them are connected when their bboxes, each grown by 2 units on
    every side, intersect. Every connected group of two or more clusters is
    replaced by one merged TEXT cluster (see build_cluster_from_lines) that
    is appended to the list with a fresh id above the current maximum.

    "cluster_predictions" is modified in place and returned.
    Cluster ids are assumed to be unique.
    """
    orphan_clusters = [
        cluster
        for cluster in cluster_predictions
        if cluster["created_by"] == "orphan_default"
    ]

    G = nx.Graph()
    for cluster in orphan_clusters:
        G.add_node(cluster["id"])

    # Grow every box once, instead of deep-copying it for each pair.
    padded_boxes = [
        [
            cluster["bbox"][0] - 2,
            cluster["bbox"][1] - 2,
            cluster["bbox"][2] + 2,
            cluster["bbox"][3] + 2,
        ]
        for cluster in orphan_clusters
    ]

    # The intersection test is symmetric, so each unordered pair is tested once.
    for first, cluster_1 in enumerate(orphan_clusters):
        for second in range(first + 1, len(orphan_clusters)):
            cluster_2 = orphan_clusters[second]
            if cluster_1["id"] != cluster_2["id"] and is_intersecting(
                padded_boxes[first], padded_boxes[second]
            ):
                G.add_edge(cluster_1["id"], cluster_2["id"])

    component = sorted(map(sorted, nx.k_edge_components(G, k=1)))

    max_id = -1
    for cluster in cluster_predictions:
        if cluster["id"] > max_id:
            max_id = cluster["id"]

    orphans_by_id = {cluster["id"]: cluster for cluster in orphan_clusters}
    for nodes in component:
        if len(nodes) < 2:
            continue
        max_id += 1
        lines = [orphans_by_id[node] for node in nodes]
        merged_ids = set(nodes)
        # Drop the merged members through slice assignment, so that no list
        # is modified while it is being iterated.
        cluster_predictions[:] = [
            cluster
            for cluster in cluster_predictions
            if cluster["id"] not in merged_ids
        ]
        cluster_predictions.append(
            build_cluster_from_lines(lines, DocItemLabel.TEXT, max_id)
        )
    return cluster_predictions
727
-
728
-
729
def clean_up_clusters(
    cluster_predictions,
    raw_cells,
    merge_cells=False,
    img_table=False,
    one_cell_table=False,
):
    """Remove redundant clusters, depending on the enabled clean-up modes.

    All ordered pairs (cluster_1, cluster_2) with different ids are checked:

    * merge_cells: if cluster_2 lies inside cluster_1 (tolerance 3 units per
      side), its cell ids are added to cluster_1 and cluster_2 is removed.
    * img_table (only evaluated when merge_cells is off): cluster_1 is
      removed if it overlaps cluster_2 with IoU > 0.5, or lies inside
      cluster_2 grown by 3 units per side, provided that either cluster_1 is
      TEXT and cluster_2 is a PICTURE, or cluster_2 is a TABLE.
    * one_cell_table: TABLE clusters with fewer than two cells are removed.

    The three flags are expected to be booleans. "raw_cells" is not used.
    "cluster_predictions" is filtered in place (order preserved) and returned.
    """
    ids_to_delete = set()

    for cluster_1 in cluster_predictions:
        for cluster_2 in cluster_predictions:
            if cluster_1["id"] == cluster_2["id"]:
                continue
            box_1 = cluster_1["bbox"]
            box_2 = cluster_2["bbox"]

            # remove any artifacts created by merging clusters
            if merge_cells:
                if contains(
                    box_1, [box_2[0] + 3, box_2[1] + 3, box_2[2] - 3, box_2[3] - 3]
                ):
                    cluster_1["cell_ids"] = (
                        cluster_1["cell_ids"] + cluster_2["cell_ids"]
                    )
                    ids_to_delete.add(cluster_2["id"])
            # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
            elif img_table:
                # NOTE(review): the parentheses spell out how Python already
                # grouped the original unparenthesized "A and B or C". As a
                # result, a cluster_1 of ANY type is checked against tables.
                # Confirm whether "TEXT and (PICTURE or TABLE)" was intended.
                if (
                    cluster_1["type"] == DocItemLabel.TEXT
                    and cluster_2["type"] == DocItemLabel.PICTURE
                ) or cluster_2["type"] == DocItemLabel.TABLE:
                    if bb_iou(box_1, box_2) > 0.5:
                        ids_to_delete.add(cluster_1["id"])
                    elif contains(
                        [box_2[0] - 3, box_2[1] - 3, box_2[2] + 3, box_2[3] + 3],
                        box_1,
                    ):
                        ids_to_delete.add(cluster_1["id"])

            # remove tables that have one pdf cell
            if one_cell_table:
                if (
                    cluster_1["type"] == DocItemLabel.TABLE
                    and len(cluster_1["cell_ids"]) < 2
                ):
                    ids_to_delete.add(cluster_1["id"])

    if ids_to_delete:
        # Filter through slice assignment: the caller's list object is kept,
        # and nothing is removed from a list while it is being iterated.
        cluster_predictions[:] = [
            cluster
            for cluster in cluster_predictions
            if cluster["id"] not in ids_to_delete
        ]
    return cluster_predictions
790
-
791
-
792
def assigning_cell_ids_to_clusters(clusters, raw_cells, threshold):
    """Set "cell_ids" on every cluster to the raw cells enclosed by its bbox.

    "threshold" is passed to compute_enclosed_cells as the minimum fraction
    of a cell that must intersect the cluster. The cell ids are ids of the
    raw cells; they are often, but not always, the same as the "id" or the
    index of the "cells" list in a prediction.
    The clusters are updated in place and the same list is returned.
    """
    for cluster in clusters:
        enclosed = compute_enclosed_cells(
            cluster["bbox"], raw_cells, min_cell_intersection_with_cluster=threshold
        )
        # Only the cell indices are needed, not the overlap areas.
        enclosed_ids, _ = enclosed
        cluster["cell_ids"] = enclosed_ids
    return clusters
801
-
802
-
803
- # Creates a map of cell_id->cluster_id
804
def cell_id_state_map(clusters, cell_count):
    """Map every cell to its clusters, and classify unassigned and multiply assigned cells.

    Returns a tuple of three lists:
      - per cell, the indices of the clusters containing it,
      - the cells assigned to no cluster (orphans),
      - the cells assigned to more than one cluster (ambiguous).
    """
    clusters_around_cells = find_clusters_around_cells(cell_count, clusters)

    orphan_cell_indices = []
    ambiguous_cell_indices = []
    for ix in range(cell_count):
        cluster_total = len(clusters_around_cells[ix])
        if cluster_total == 0:
            orphan_cell_indices.append(ix)
        elif cluster_total > 1:
            ambiguous_cell_indices.append(ix)
    return clusters_around_cells, orphan_cell_indices, ambiguous_cell_indices