docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,666 @@
1
+ import bisect
2
+ import logging
3
+ import sys
4
+ from collections import defaultdict
5
+ from typing import Dict, List, Set, Tuple
6
+
7
+ from docling_core.types.doc import DocItemLabel, Size
8
+ from rtree import index
9
+
10
+ from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
11
+
12
+ _log = logging.getLogger(__name__)
13
+
14
+
15
class UnionFind:
    """Disjoint-set (union-find) structure for grouping hashable elements.

    Uses union by rank together with path compression.
    """

    def __init__(self, elements):
        """Create one singleton set per element.

        Args:
            elements: Iterable of hashable elements. It is consumed exactly
                once, so one-shot iterators and generators are accepted.
        """
        self.parent = {elem: elem for elem in elements}
        # Derive ranks from ``parent`` instead of iterating ``elements`` again;
        # a second pass over an exhausted iterator would leave ``rank`` empty.
        self.rank = dict.fromkeys(self.parent, 0)

    def find(self, x):
        """Return the representative (root) of the set containing ``x``.

        Raises:
            KeyError: If ``x`` was not among the initial elements.
        """
        # First pass: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass: path compression, re-point every visited node at the root.
        while self.parent[x] != root:
            self.parent[x], x = root, self.parent[x]

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (no-op if already merged)."""
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return

        # Union by rank: attach the shallower tree below the deeper one.
        if self.rank[root_x] > self.rank[root_y]:
            self.parent[root_y] = root_x
        elif self.rank[root_x] < self.rank[root_y]:
            self.parent[root_x] = root_y
        else:
            self.parent[root_y] = root_x
            self.rank[root_x] += 1

    def get_groups(self) -> Dict[int, List[int]]:
        """Returns groups as {root: [elements]}."""
        groups = defaultdict(list)
        for elem in self.parent:
            groups[self.find(elem)].append(elem)
        return groups
46
+
47
+
48
class SpatialClusterIndex:
    """Spatial lookup over clusters using an R-tree plus per-axis interval lists.

    Three structures are kept in sync: a 2D R-tree keyed by cluster id, one
    ``IntervalTree`` per axis, and a plain ``id -> Cluster`` mapping.
    """

    def __init__(self, clusters: List[Cluster]):
        """Build the index and register every cluster in ``clusters``."""
        p = index.Property()
        p.dimension = 2
        self.spatial_index = index.Index(properties=p)
        self.x_intervals = IntervalTree()
        self.y_intervals = IntervalTree()
        self.clusters_by_id: Dict[int, Cluster] = {}

        for cluster in clusters:
            self.add_cluster(cluster)

    def add_cluster(self, cluster: Cluster):
        """Register ``cluster`` (by its id and current bbox) in all structures."""
        bbox = cluster.bbox
        self.spatial_index.insert(cluster.id, bbox.as_tuple())
        self.x_intervals.insert(bbox.l, bbox.r, cluster.id)
        self.y_intervals.insert(bbox.t, bbox.b, cluster.id)
        self.clusters_by_id[cluster.id] = cluster

    def remove_cluster(self, cluster: Cluster):
        """Remove ``cluster`` from all structures.

        Raises:
            KeyError: If the cluster id is not registered.
        """
        self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
        # The per-axis interval lists must be purged as well; otherwise
        # ``find_candidates`` keeps returning the id of a removed cluster.
        self._drop_intervals(self.x_intervals, cluster.id)
        self._drop_intervals(self.y_intervals, cluster.id)
        del self.clusters_by_id[cluster.id]

    @staticmethod
    def _drop_intervals(tree, cluster_id: int) -> None:
        """Drop every interval tagged with ``cluster_id``; sort order is preserved."""
        tree.intervals = [iv for iv in tree.intervals if iv.id != cluster_id]

    def find_candidates(self, bbox: BoundingBox) -> Set[int]:
        """Find potential overlapping cluster IDs using all indexes.

        The result is the union of the R-tree hits and of every cluster whose
        x-extent contains ``bbox.l`` or ``bbox.r``, or whose y-extent contains
        ``bbox.t`` or ``bbox.b``. It is a candidate set only; callers are
        expected to confirm real overlap, e.g. with ``check_overlap``.
        """
        spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
        x_candidates = self.x_intervals.find_containing(
            bbox.l
        ) | self.x_intervals.find_containing(bbox.r)
        y_candidates = self.y_intervals.find_containing(
            bbox.t
        ) | self.y_intervals.find_containing(bbox.b)
        return spatial.union(x_candidates).union(y_candidates)

    def check_overlap(
        self,
        bbox1: BoundingBox,
        bbox2: BoundingBox,
        overlap_threshold: float,
        containment_threshold: float,
    ) -> bool:
        """Check if two bboxes overlap sufficiently.

        Returns True when the intersection-over-union exceeds
        ``overlap_threshold`` or when the intersection covers more than
        ``containment_threshold`` of either box. Boxes with non-positive area
        and disjoint boxes never overlap sufficiently.
        """
        area1, area2 = bbox1.area(), bbox2.area()
        if area1 <= 0 or area2 <= 0:
            return False

        overlap_area = bbox1.intersection_area_with(bbox2)
        if overlap_area <= 0:
            return False

        iou = overlap_area / (area1 + area2 - overlap_area)
        containment1 = overlap_area / area1
        containment2 = overlap_area / area2

        return (
            iou > overlap_threshold
            or containment1 > containment_threshold
            or containment2 > containment_threshold
        )
109
+
110
+
111
class Interval:
    """Closed 1D interval ``[min_val, max_val]`` tagged with an id.

    Instances order by ``min_val`` only.
    """

    def __init__(self, min_val: float, max_val: float, id: int):
        self.min_val = min_val
        self.max_val = max_val
        self.id = id

    def __lt__(self, other):
        # Comparing against a bare number lets ``bisect`` locate a query point
        # directly inside a list of intervals.
        if isinstance(other, Interval):
            return self.min_val < other.min_val
        return self.min_val < other


class IntervalTree:
    """Memory-efficient 1D interval index for point-containment queries.

    Despite the name, the storage is a flat list kept sorted by ``min_val``.
    """

    def __init__(self):
        self.intervals: List[Interval] = []  # Sorted by min_val

    def insert(self, min_val: float, max_val: float, id: int):
        """Insert the interval ``[min_val, max_val]`` tagged with ``id``."""
        bisect.insort(self.intervals, Interval(min_val, max_val, id))

    def find_containing(self, point: float) -> Set[int]:
        """Find all intervals containing the point.

        Returns:
            The ids of every stored interval with
            ``min_val <= point <= max_val``.
        """
        # ``pos`` is the first index whose interval starts at or after ``point``.
        pos = bisect.bisect_left(self.intervals, point)

        # Intervals starting before the point: the list is ordered by
        # ``min_val`` only, so ``max_val`` is unordered and every such interval
        # has to be examined. Stopping at the first miss would skip long
        # intervals that start earlier.
        result = {
            interval.id
            for interval in self.intervals[:pos]
            if point <= interval.max_val
        }

        # Intervals starting at/after the point: only those starting exactly at
        # the point can contain it, and they are contiguous from ``pos``.
        for interval in self.intervals[pos:]:
            if interval.min_val > point:
                break
            if point <= interval.max_val:
                result.add(interval.id)

        return result
156
+
157
+
158
class LayoutPostprocessor:
    """Postprocesses layout predictions by cleaning up clusters and mapping cells."""

    # Cluster type-specific parameters for overlap resolution
    OVERLAP_PARAMS = {
        "regular": {"area_threshold": 1.3, "conf_threshold": 0.05},
        "picture": {"area_threshold": 2.0, "conf_threshold": 0.3},
        "wrapper": {"area_threshold": 2.0, "conf_threshold": 0.2},
    }

    # Labels whose clusters may hold other clusters as children.
    WRAPPER_TYPES = {
        DocItemLabel.FORM,
        DocItemLabel.KEY_VALUE_REGION,
        DocItemLabel.TABLE,
        DocItemLabel.DOCUMENT_INDEX,
    }
    SPECIAL_TYPES = WRAPPER_TYPES.union({DocItemLabel.PICTURE})

    # Minimum confidence a cluster needs, per label, to be kept.
    CONFIDENCE_THRESHOLDS = {
        DocItemLabel.CAPTION: 0.5,
        DocItemLabel.FOOTNOTE: 0.5,
        DocItemLabel.FORMULA: 0.5,
        DocItemLabel.LIST_ITEM: 0.5,
        DocItemLabel.PAGE_FOOTER: 0.5,
        DocItemLabel.PAGE_HEADER: 0.5,
        DocItemLabel.PICTURE: 0.5,
        DocItemLabel.SECTION_HEADER: 0.45,
        DocItemLabel.TABLE: 0.5,
        DocItemLabel.TEXT: 0.5,  # 0.45,
        DocItemLabel.TITLE: 0.45,
        DocItemLabel.CODE: 0.45,
        DocItemLabel.CHECKBOX_SELECTED: 0.45,
        DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
        DocItemLabel.FORM: 0.45,
        DocItemLabel.KEY_VALUE_REGION: 0.45,
        DocItemLabel.DOCUMENT_INDEX: 0.45,
    }

    # Labels rewritten to another label during regular-cluster processing.
    LABEL_REMAPPING = {
        # DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

    def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
        """Initialize processor with cells, clusters and spatial indices.

        Args:
            cells: Text cells of the page.
            clusters: Predicted layout clusters; they are split into regular
                and special (``SPECIAL_TYPES``) clusters by label.
            page_size: Page dimensions, used to detect full-page pictures.
        """
        self.cells = cells
        self.page_size = page_size
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
        ]
        self.special_clusters = [c for c in clusters if c.label in self.SPECIAL_TYPES]

        # Build spatial indices once
        # NOTE(review): nothing in this class refreshes these indices after
        # bboxes are adjusted or orphan clusters are added; confirm that
        # querying the initial geometry is intended.
        self.regular_index = SpatialClusterIndex(self.regular_clusters)
        self.picture_index = SpatialClusterIndex(
            [c for c in self.special_clusters if c.label == DocItemLabel.PICTURE]
        )
        self.wrapper_index = SpatialClusterIndex(
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )

    def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
        """Main processing pipeline.

        Returns:
            The final clusters (regular clusters not absorbed as children,
            plus special clusters), sorted by ``_sort_clusters(mode="id")``,
            together with the unchanged cell list.
        """
        # Order matters: special processing reads the refined regular clusters.
        self.regular_clusters = self._process_regular_clusters()
        self.special_clusters = self._process_special_clusters()

        # Remove regular clusters that are included in wrappers
        contained_ids = {
            child.id
            for wrapper in self.special_clusters
            if wrapper.label in self.SPECIAL_TYPES
            for child in wrapper.children
        }
        self.regular_clusters = [
            c for c in self.regular_clusters if c.id not in contained_ids
        ]

        # Combine and sort final clusters
        final_clusters = self._sort_clusters(
            self.regular_clusters + self.special_clusters, mode="id"
        )
        for cluster in final_clusters:
            cluster.cells = self._sort_cells(cluster.cells)
            # Also sort cells in children if any
            for child in cluster.children:
                child.cells = self._sort_cells(child.cells)

        return final_clusters, self.cells

    def _process_regular_clusters(self) -> List[Cluster]:
        """Process regular clusters with iterative refinement.

        Steps: confidence filtering, label remapping, cell assignment, removal
        of empty clusters, creation of single-cell TEXT clusters for orphaned
        cells, then up to three rounds of bbox adjustment and overlap removal.
        """
        clusters = [
            c
            for c in self.regular_clusters
            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
        ]

        # Apply label remapping
        for cluster in clusters:
            if cluster.label in self.LABEL_REMAPPING:
                cluster.label = self.LABEL_REMAPPING[cluster.label]

        # Initial cell assignment
        clusters = self._assign_cells_to_clusters(clusters)

        # Remove clusters with no cells
        clusters = [cluster for cluster in clusters if cluster.cells]

        # Handle orphaned cells
        unassigned = self._find_unassigned_cells(clusters)
        if unassigned:
            # New ids must be unique across every cluster known to the
            # processor. Looking only at the surviving regular clusters could
            # reuse the id of a special cluster or of a filtered-out regular
            # cluster.
            known_clusters = clusters + self.regular_clusters + self.special_clusters
            next_id = max((c.id for c in known_clusters), default=0) + 1
            orphan_clusters = []
            for i, cell in enumerate(unassigned):
                # OCR cells carry their own confidence; other cells count as certain.
                conf = 1.0
                if isinstance(cell, OcrCell):
                    conf = cell.confidence

                orphan_clusters.append(
                    Cluster(
                        id=next_id + i,
                        label=DocItemLabel.TEXT,
                        bbox=cell.bbox,
                        confidence=conf,
                        cells=[cell],
                    )
                )
            clusters.extend(orphan_clusters)

        # Iterative refinement: stop early once a round removes nothing.
        prev_count = len(clusters) + 1
        for _ in range(3):  # Maximum 3 iterations
            if prev_count == len(clusters):
                break
            prev_count = len(clusters)
            clusters = self._adjust_cluster_bboxes(clusters)
            clusters = self._remove_overlapping_clusters(clusters, "regular")

        return clusters

    def _process_special_clusters(self) -> List[Cluster]:
        """Filter special clusters, attach contained regular clusters, dedupe.

        Reads ``self.regular_clusters`` as it stands at call time to find the
        children of each special cluster.

        Returns:
            Picture clusters followed by wrapper clusters, each list with
            overlapping duplicates removed.
        """
        special_clusters = [
            c
            for c in self.special_clusters
            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
        ]

        special_clusters = self._handle_cross_type_overlaps(special_clusters)

        # Calculate page area from known page size
        page_area = self.page_size.width * self.page_size.height
        if page_area > 0:
            # Filter out full-page pictures
            special_clusters = [
                cluster
                for cluster in special_clusters
                if not (
                    cluster.label == DocItemLabel.PICTURE
                    and cluster.bbox.area() / page_area > 0.90
                )
            ]

        for special in special_clusters:
            # A regular cluster becomes a child when more than 80% of its own
            # area lies inside the special cluster.
            contained = []
            for cluster in self.regular_clusters:
                overlap = cluster.bbox.intersection_area_with(special.bbox)
                if overlap > 0:
                    containment = overlap / cluster.bbox.area()
                    if containment > 0.8:
                        contained.append(cluster)

            if contained:
                # Sort contained clusters by minimum cell ID:
                contained = self._sort_clusters(contained, mode="id")
                special.children = contained

                # Adjust bbox only for Form and Key-Value-Region, not Table or Picture
                if special.label in [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]:
                    special.bbox = BoundingBox(
                        l=min(c.bbox.l for c in contained),
                        t=min(c.bbox.t for c in contained),
                        r=max(c.bbox.r for c in contained),
                        b=max(c.bbox.b for c in contained),
                    )

                # Collect all cells from children
                all_cells = []
                for child in contained:
                    all_cells.extend(child.cells)
                special.cells = self._deduplicate_cells(all_cells)
                special.cells = self._sort_cells(special.cells)

        picture_clusters = [
            c for c in special_clusters if c.label == DocItemLabel.PICTURE
        ]
        picture_clusters = self._remove_overlapping_clusters(
            picture_clusters, "picture"
        )

        wrapper_clusters = [
            c for c in special_clusters if c.label in self.WRAPPER_TYPES
        ]
        wrapper_clusters = self._remove_overlapping_clusters(
            wrapper_clusters, "wrapper"
        )

        return picture_clusters + wrapper_clusters

    def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
        """Handle overlaps between regular and wrapper clusters before child assignment.

        A wrapper is dropped when more than 90% of its area is covered by a
        regular cluster labelled TABLE and the wrapper's confidence exceeds
        that cluster's confidence by less than 0.1.

        NOTE(review): the check applies to every label in ``WRAPPER_TYPES``,
        although it was apparently written with KEY_VALUE_REGION in mind.
        TABLE itself is in ``SPECIAL_TYPES``, so the regular list built in
        ``__init__`` holds no TABLE clusters; confirm whether this comparison
        should look at the special clusters instead.
        """
        wrappers_to_remove = set()

        for wrapper in special_clusters:
            if wrapper.label not in self.WRAPPER_TYPES:
                continue

            wrapper_area = wrapper.bbox.area()
            if wrapper_area <= 0:
                # A degenerate wrapper has no meaningful overlap ratio; skip it
                # rather than dividing by zero.
                continue

            for regular in self.regular_clusters:
                if regular.label == DocItemLabel.TABLE:
                    # Calculate overlap
                    overlap = regular.bbox.intersection_area_with(wrapper.bbox)
                    overlap_ratio = overlap / wrapper_area

                    conf_diff = wrapper.confidence - regular.confidence

                    # If wrapper is mostly overlapping with a TABLE, remove the wrapper
                    if overlap_ratio > 0.9 and conf_diff < 0.1:
                        wrappers_to_remove.add(wrapper.id)
                        break

        # Filter out the identified wrappers
        special_clusters = [
            cluster
            for cluster in special_clusters
            if cluster.id not in wrappers_to_remove
        ]

        return special_clusters

    def _should_prefer_cluster(
        self, candidate: Cluster, other: Cluster, params: dict
    ) -> bool:
        """Determine if candidate cluster should be preferred over other cluster based on rules.

        Args:
            candidate: Cluster being evaluated.
            other: Competing cluster from the same overlap group.
            params: Mapping with ``area_threshold`` and ``conf_threshold``.

        Returns:
            True if candidate should be preferred, False if not.
        """
        # Rule 1: LIST_ITEM vs TEXT
        if (
            candidate.label == DocItemLabel.LIST_ITEM
            and other.label == DocItemLabel.TEXT
        ):
            # Check if areas are similar (within 20% of each other)
            area_ratio = candidate.bbox.area() / other.bbox.area()
            area_similarity = abs(1 - area_ratio) < 0.2
            if area_similarity:
                return True

        # Rule 2: CODE vs others
        if candidate.label == DocItemLabel.CODE:
            # Calculate how much of the other cluster is contained within the CODE cluster
            overlap = other.bbox.intersection_area_with(candidate.bbox)
            containment = overlap / other.bbox.area()
            if containment > 0.8:  # other is 80% contained within CODE
                return True

        # If no label-based rules matched, fall back to area/confidence thresholds
        area_ratio = candidate.bbox.area() / other.bbox.area()
        conf_diff = other.confidence - candidate.confidence

        if (
            area_ratio <= params["area_threshold"]
            and conf_diff > params["conf_threshold"]
        ):
            return False

        return True  # Default to keeping candidate if no rules triggered rejection

    def _select_best_cluster_from_group(
        self,
        group_clusters: List[Cluster],
        params: dict,
    ) -> Cluster:
        """Select best cluster from a group of overlapping clusters based on all rules.

        A candidate qualifies when ``_should_prefer_cluster`` holds against
        every other member. Among qualifying candidates the larger one wins
        unless it is less confident by more than ``conf_threshold``. Falls
        back to the first member when nobody qualifies.
        """
        current_best = None

        for candidate in group_clusters:
            should_select = True

            for other in group_clusters:
                if other == candidate:
                    continue

                if not self._should_prefer_cluster(candidate, other, params):
                    should_select = False
                    break

            if should_select:
                if current_best is None:
                    current_best = candidate
                else:
                    # If both clusters pass rules, prefer the larger one unless confidence differs significantly
                    if (
                        candidate.bbox.area() > current_best.bbox.area()
                        and current_best.confidence - candidate.confidence
                        <= params["conf_threshold"]
                    ):
                        current_best = candidate

        return current_best if current_best else group_clusters[0]

    def _remove_overlapping_clusters(
        self,
        clusters: List[Cluster],
        cluster_type: str,
        overlap_threshold: float = 0.8,
        containment_threshold: float = 0.8,
    ) -> List[Cluster]:
        """Collapse each group of mutually overlapping clusters into one.

        Args:
            clusters: Clusters of a single kind.
            cluster_type: ``"regular"``, ``"picture"``, or anything else for
                wrappers; selects the spatial index. Must be a key of
                ``OVERLAP_PARAMS``.
            overlap_threshold: IoU above which two clusters are grouped.
            containment_threshold: Containment ratio above which two clusters
                are grouped.

        Returns:
            One cluster per overlap group. The survivor receives the
            deduplicated, sorted cells of all members of its group.
        """
        if not clusters:
            return []

        spatial_index = (
            self.regular_index
            if cluster_type == "regular"
            else self.picture_index if cluster_type == "picture" else self.wrapper_index
        )

        # Map of currently valid clusters
        valid_clusters = {c.id: c for c in clusters}
        uf = UnionFind(valid_clusters.keys())
        params = self.OVERLAP_PARAMS[cluster_type]

        for cluster in clusters:
            candidates = spatial_index.find_candidates(cluster.bbox)
            candidates &= valid_clusters.keys()  # Only keep existing candidates
            candidates.discard(cluster.id)

            for other_id in candidates:
                if spatial_index.check_overlap(
                    cluster.bbox,
                    valid_clusters[other_id].bbox,
                    overlap_threshold,
                    containment_threshold,
                ):
                    uf.union(cluster.id, other_id)

        result = []
        for group in uf.get_groups().values():
            if len(group) == 1:
                result.append(valid_clusters[group[0]])
                continue

            group_clusters = [valid_clusters[cid] for cid in group]
            best = self._select_best_cluster_from_group(group_clusters, params)

            # Simple cell merging - no special cases
            for cluster in group_clusters:
                if cluster != best:
                    best.cells.extend(cluster.cells)

            best.cells = self._deduplicate_cells(best.cells)
            best.cells = self._sort_cells(best.cells)
            result.append(best)

        return result

    def _select_best_cluster(
        self,
        clusters: List[Cluster],
        area_threshold: float,
        conf_threshold: float,
    ) -> Cluster:
        """Iteratively select best cluster based on area and confidence thresholds.

        Applies only the area/confidence rule, without the label-based rules
        of ``_should_prefer_cluster``. Falls back to the first cluster when no
        candidate qualifies.
        """
        current_best = None
        for candidate in clusters:
            should_select = True
            for other in clusters:
                if other == candidate:
                    continue

                area_ratio = candidate.bbox.area() / other.bbox.area()
                conf_diff = other.confidence - candidate.confidence

                if area_ratio <= area_threshold and conf_diff > conf_threshold:
                    should_select = False
                    break

            if should_select:
                if current_best is None or (
                    candidate.bbox.area() > current_best.bbox.area()
                    and current_best.confidence - candidate.confidence <= conf_threshold
                ):
                    current_best = candidate

        return current_best if current_best else clusters[0]

    def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
        """Ensure each cell appears only once, maintaining order of first appearance."""
        seen_ids = set()
        unique_cells = []
        for cell in cells:
            if cell.id not in seen_ids:
                seen_ids.add(cell.id)
                unique_cells.append(cell)
        return unique_cells

    def _assign_cells_to_clusters(
        self, clusters: List[Cluster], min_overlap: float = 0.2
    ) -> List[Cluster]:
        """Assign cells to best overlapping cluster.

        Each non-blank cell with positive area goes to the cluster covering
        the largest share of the cell, provided that share exceeds
        ``min_overlap``. Existing cell lists on ``clusters`` are reset first.

        Returns:
            The same ``clusters`` list, mutated in place.
        """
        for cluster in clusters:
            cluster.cells = []

        for cell in self.cells:
            if not cell.text.strip():
                continue

            # The cell's area does not depend on the cluster, so test it once.
            cell_area = cell.bbox.area()
            if cell_area <= 0:
                continue

            best_overlap = min_overlap
            best_cluster = None

            for cluster in clusters:
                overlap = cell.bbox.intersection_area_with(cluster.bbox)
                overlap_ratio = overlap / cell_area

                if overlap_ratio > best_overlap:
                    best_overlap = overlap_ratio
                    best_cluster = cluster

            if best_cluster is not None:
                best_cluster.cells.append(cell)

        # Deduplicate cells in each cluster after assignment
        for cluster in clusters:
            cluster.cells = self._deduplicate_cells(cluster.cells)

        return clusters

    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
        """Find non-blank cells not assigned to any cluster."""
        assigned = {cell.id for cluster in clusters for cell in cluster.cells}
        return [
            cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
        ]

    def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
        """Adjust cluster bounding boxes to contain their cells.

        TABLE clusters only grow (union of the current bbox and the cells'
        bbox); every other cluster is set to the bbox of its cells. Clusters
        without cells are left untouched.
        """
        for cluster in clusters:
            if not cluster.cells:
                continue

            cells_bbox = BoundingBox(
                l=min(cell.bbox.l for cell in cluster.cells),
                t=min(cell.bbox.t for cell in cluster.cells),
                r=max(cell.bbox.r for cell in cluster.cells),
                b=max(cell.bbox.b for cell in cluster.cells),
            )

            if cluster.label == DocItemLabel.TABLE:
                # For tables, take union of current bbox and cells bbox
                cluster.bbox = BoundingBox(
                    l=min(cluster.bbox.l, cells_bbox.l),
                    t=min(cluster.bbox.t, cells_bbox.t),
                    r=max(cluster.bbox.r, cells_bbox.r),
                    b=max(cluster.bbox.b, cells_bbox.b),
                )
            else:
                cluster.bbox = cells_bbox

        return clusters

    def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
        """Sort cells in native reading order, i.e. by ascending cell id."""
        return sorted(cells, key=lambda c: (c.id))

    def _sort_clusters(
        self, clusters: List[Cluster], mode: str = "id"
    ) -> List[Cluster]:
        """Return ``clusters`` ordered according to ``mode``.

        Args:
            clusters: Clusters to order.
            mode: ``"id"`` orders by the smallest cell id of each cluster
                (clusters without cells last), ties broken by top then left;
                ``"tblr"`` orders top-to-bottom then left-to-right; ``"lrtb"``
                orders left-to-right then top-to-bottom. Any other value
                returns the input list unchanged.
        """
        if mode == "id":  # sort in the order the cells are printed in the PDF.
            return sorted(
                clusters,
                key=lambda cluster: (
                    (
                        min(cell.id for cell in cluster.cells)
                        if cluster.cells
                        else sys.maxsize
                    ),
                    cluster.bbox.t,
                    cluster.bbox.l,
                ),
            )
        elif mode == "tblr":  # Sort top-to-bottom, then left-to-right ("row first")
            return sorted(
                clusters, key=lambda cluster: (cluster.bbox.t, cluster.bbox.l)
            )
        elif mode == "lrtb":  # Sort left-to-right, then top-to-bottom ("column first")
            return sorted(
                clusters, key=lambda cluster: (cluster.bbox.l, cluster.bbox.t)
            )
        else:
            return clusters
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.11.0
3
+ Version: 2.13.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,8 +26,8 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
30
- Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.12.1,<3.0.0)
30
+ Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
31
31
  Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)