docling 2.12.0__py3-none-any.whl → 2.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/pubmed_backend.py +592 -0
- docling/backend/xml/uspto_backend.py +1888 -0
- docling/datamodel/base_models.py +21 -4
- docling/datamodel/document.py +88 -14
- docling/datamodel/pipeline_options.py +3 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +20 -3
- docling/models/ds_glm_model.py +34 -4
- docling/models/easyocr_model.py +2 -0
- docling/models/layout_model.py +134 -280
- docling/models/page_assemble_model.py +11 -1
- docling/models/table_structure_model.py +25 -29
- docling/pipeline/base_pipeline.py +3 -1
- docling/utils/glm_utils.py +11 -3
- docling/utils/layout_postprocessor.py +666 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/METADATA +2 -2
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/RECORD +21 -18
- docling/utils/layout_utils.py +0 -812
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/LICENSE +0 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/WHEEL +0 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/entry_points.txt +0 -0
docling/utils/layout_postprocessor.py
@@ -0,0 +1,666 @@
+import bisect
+import logging
+import sys
+from collections import defaultdict
+from typing import Dict, List, Set, Tuple
+
+from docling_core.types.doc import DocItemLabel, Size
+from rtree import index
+
+from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
+
+_log = logging.getLogger(__name__)
+
+
+class UnionFind:
+    """Efficient Union-Find data structure for grouping elements."""
+
+    def __init__(self, elements):
+        self.parent = {elem: elem for elem in elements}
+        self.rank = {elem: 0 for elem in elements}
+
+    def find(self, x):
+        if self.parent[x] != x:
+            self.parent[x] = self.find(self.parent[x])  # Path compression
+        return self.parent[x]
+
+    def union(self, x, y):
+        root_x, root_y = self.find(x), self.find(y)
+        if root_x == root_y:
+            return
+
+        if self.rank[root_x] > self.rank[root_y]:
+            self.parent[root_y] = root_x
+        elif self.rank[root_x] < self.rank[root_y]:
+            self.parent[root_x] = root_y
+        else:
+            self.parent[root_y] = root_x
+            self.rank[root_x] += 1
+
+    def get_groups(self) -> Dict[int, List[int]]:
+        """Returns groups as {root: [elements]}."""
+        groups = defaultdict(list)
+        for elem in self.parent:
+            groups[self.find(elem)].append(elem)
+        return groups
+
+
+class SpatialClusterIndex:
+    """Efficient spatial indexing for clusters using R-tree and interval trees."""
+
+    def __init__(self, clusters: List[Cluster]):
+        p = index.Property()
+        p.dimension = 2
+        self.spatial_index = index.Index(properties=p)
+        self.x_intervals = IntervalTree()
+        self.y_intervals = IntervalTree()
+        self.clusters_by_id: Dict[int, Cluster] = {}
+
+        for cluster in clusters:
+            self.add_cluster(cluster)
+
+    def add_cluster(self, cluster: Cluster):
+        bbox = cluster.bbox
+        self.spatial_index.insert(cluster.id, bbox.as_tuple())
+        self.x_intervals.insert(bbox.l, bbox.r, cluster.id)
+        self.y_intervals.insert(bbox.t, bbox.b, cluster.id)
+        self.clusters_by_id[cluster.id] = cluster
+
+    def remove_cluster(self, cluster: Cluster):
+        self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())
+        del self.clusters_by_id[cluster.id]
+
+    def find_candidates(self, bbox: BoundingBox) -> Set[int]:
+        """Find potential overlapping cluster IDs using all indexes."""
+        spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
+        x_candidates = self.x_intervals.find_containing(
+            bbox.l
+        ) | self.x_intervals.find_containing(bbox.r)
+        y_candidates = self.y_intervals.find_containing(
+            bbox.t
+        ) | self.y_intervals.find_containing(bbox.b)
+        return spatial.union(x_candidates).union(y_candidates)
+
+    def check_overlap(
+        self,
+        bbox1: BoundingBox,
+        bbox2: BoundingBox,
+        overlap_threshold: float,
+        containment_threshold: float,
+    ) -> bool:
+        """Check if two bboxes overlap sufficiently."""
+        area1, area2 = bbox1.area(), bbox2.area()
+        if area1 <= 0 or area2 <= 0:
+            return False
+
+        overlap_area = bbox1.intersection_area_with(bbox2)
+        if overlap_area <= 0:
+            return False
+
+        iou = overlap_area / (area1 + area2 - overlap_area)
+        containment1 = overlap_area / area1
+        containment2 = overlap_area / area2
+
+        return (
+            iou > overlap_threshold
+            or containment1 > containment_threshold
+            or containment2 > containment_threshold
+        )
+
+
+class Interval:
+    """Helper class for sortable intervals."""
+
+    def __init__(self, min_val: float, max_val: float, id: int):
+        self.min_val = min_val
+        self.max_val = max_val
+        self.id = id
+
+    def __lt__(self, other):
+        if isinstance(other, Interval):
+            return self.min_val < other.min_val
+        return self.min_val < other
+
+
+class IntervalTree:
+    """Memory-efficient interval tree for 1D overlap queries."""
+
+    def __init__(self):
+        self.intervals: List[Interval] = []  # Sorted by min_val
+
+    def insert(self, min_val: float, max_val: float, id: int):
+        interval = Interval(min_val, max_val, id)
+        bisect.insort(self.intervals, interval)
+
+    def find_containing(self, point: float) -> Set[int]:
+        """Find all intervals containing the point."""
+        pos = bisect.bisect_left(self.intervals, point)
+        result = set()
+
+        # Check intervals starting before point
+        for interval in reversed(self.intervals[:pos]):
+            if interval.min_val <= point <= interval.max_val:
+                result.add(interval.id)
+            else:
+                break
+
+        # Check intervals starting at/after point
+        for interval in self.intervals[pos:]:
+            if point <= interval.max_val:
+                if interval.min_val <= point:
+                    result.add(interval.id)
+            else:
+                break
+
+        return result
+
+
+class LayoutPostprocessor:
+    """Postprocesses layout predictions by cleaning up clusters and mapping cells."""
+
+    # Cluster type-specific parameters for overlap resolution
+    OVERLAP_PARAMS = {
+        "regular": {"area_threshold": 1.3, "conf_threshold": 0.05},
+        "picture": {"area_threshold": 2.0, "conf_threshold": 0.3},
+        "wrapper": {"area_threshold": 2.0, "conf_threshold": 0.2},
+    }
+
+    WRAPPER_TYPES = {
+        DocItemLabel.FORM,
+        DocItemLabel.KEY_VALUE_REGION,
+        DocItemLabel.TABLE,
+        DocItemLabel.DOCUMENT_INDEX,
+    }
+    SPECIAL_TYPES = WRAPPER_TYPES.union({DocItemLabel.PICTURE})
+
+    CONFIDENCE_THRESHOLDS = {
+        DocItemLabel.CAPTION: 0.5,
+        DocItemLabel.FOOTNOTE: 0.5,
+        DocItemLabel.FORMULA: 0.5,
+        DocItemLabel.LIST_ITEM: 0.5,
+        DocItemLabel.PAGE_FOOTER: 0.5,
+        DocItemLabel.PAGE_HEADER: 0.5,
+        DocItemLabel.PICTURE: 0.5,
+        DocItemLabel.SECTION_HEADER: 0.45,
+        DocItemLabel.TABLE: 0.5,
+        DocItemLabel.TEXT: 0.5,  # 0.45,
+        DocItemLabel.TITLE: 0.45,
+        DocItemLabel.CODE: 0.45,
+        DocItemLabel.CHECKBOX_SELECTED: 0.45,
+        DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
+        DocItemLabel.FORM: 0.45,
+        DocItemLabel.KEY_VALUE_REGION: 0.45,
+        DocItemLabel.DOCUMENT_INDEX: 0.45,
+    }
+
+    LABEL_REMAPPING = {
+        # DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
+        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
+    }
+
+    def __init__(self, cells: List[Cell], clusters: List[Cluster], page_size: Size):
+        """Initialize processor with cells and clusters."""
+        """Initialize processor with cells and spatial indices."""
+        self.cells = cells
+        self.page_size = page_size
+        self.regular_clusters = [
+            c for c in clusters if c.label not in self.SPECIAL_TYPES
+        ]
+        self.special_clusters = [c for c in clusters if c.label in self.SPECIAL_TYPES]
+
+        # Build spatial indices once
+        self.regular_index = SpatialClusterIndex(self.regular_clusters)
+        self.picture_index = SpatialClusterIndex(
+            [c for c in self.special_clusters if c.label == DocItemLabel.PICTURE]
+        )
+        self.wrapper_index = SpatialClusterIndex(
+            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
+        )
+
+    def postprocess(self) -> Tuple[List[Cluster], List[Cell]]:
+        """Main processing pipeline."""
+        self.regular_clusters = self._process_regular_clusters()
+        self.special_clusters = self._process_special_clusters()
+
+        # Remove regular clusters that are included in wrappers
+        contained_ids = {
+            child.id
+            for wrapper in self.special_clusters
+            if wrapper.label in self.SPECIAL_TYPES
+            for child in wrapper.children
+        }
+        self.regular_clusters = [
+            c for c in self.regular_clusters if c.id not in contained_ids
+        ]
+
+        # Combine and sort final clusters
+        final_clusters = self._sort_clusters(
+            self.regular_clusters + self.special_clusters, mode="id"
+        )
+        for cluster in final_clusters:
+            cluster.cells = self._sort_cells(cluster.cells)
+            # Also sort cells in children if any
+            for child in cluster.children:
+                child.cells = self._sort_cells(child.cells)
+
+        return final_clusters, self.cells
+
+    def _process_regular_clusters(self) -> List[Cluster]:
+        """Process regular clusters with iterative refinement."""
+        clusters = [
+            c
+            for c in self.regular_clusters
+            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
+        ]
+
+        # Apply label remapping
+        for cluster in clusters:
+            if cluster.label in self.LABEL_REMAPPING:
+                cluster.label = self.LABEL_REMAPPING[cluster.label]
+
+        # Initial cell assignment
+        clusters = self._assign_cells_to_clusters(clusters)
+
+        # Remove clusters with no cells
+        clusters = [cluster for cluster in clusters if cluster.cells]
+
+        # Handle orphaned cells
+        unassigned = self._find_unassigned_cells(clusters)
+        if unassigned:
+            next_id = max((c.id for c in clusters), default=0) + 1
+            orphan_clusters = []
+            for i, cell in enumerate(unassigned):
+                conf = 1.0
+                if isinstance(cell, OcrCell):
+                    conf = cell.confidence
+
+                orphan_clusters.append(
+                    Cluster(
+                        id=next_id + i,
+                        label=DocItemLabel.TEXT,
+                        bbox=cell.bbox,
+                        confidence=conf,
+                        cells=[cell],
+                    )
+                )
+            clusters.extend(orphan_clusters)
+
+        # Iterative refinement
+        prev_count = len(clusters) + 1
+        for _ in range(3):  # Maximum 3 iterations
+            if prev_count == len(clusters):
+                break
+            prev_count = len(clusters)
+            clusters = self._adjust_cluster_bboxes(clusters)
+            clusters = self._remove_overlapping_clusters(clusters, "regular")
+
+        return clusters
+
+    def _process_special_clusters(self) -> List[Cluster]:
+        special_clusters = [
+            c
+            for c in self.special_clusters
+            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
+        ]
+
+        special_clusters = self._handle_cross_type_overlaps(special_clusters)
+
+        # Calculate page area from known page size
+        page_area = self.page_size.width * self.page_size.height
+        if page_area > 0:
+            # Filter out full-page pictures
+            special_clusters = [
+                cluster
+                for cluster in special_clusters
+                if not (
+                    cluster.label == DocItemLabel.PICTURE
+                    and cluster.bbox.area() / page_area > 0.90
+                )
+            ]
+
+        for special in special_clusters:
+            contained = []
+            for cluster in self.regular_clusters:
+                overlap = cluster.bbox.intersection_area_with(special.bbox)
+                if overlap > 0:
+                    containment = overlap / cluster.bbox.area()
+                    if containment > 0.8:
+                        contained.append(cluster)
+
+            if contained:
+                # Sort contained clusters by minimum cell ID:
+                contained = self._sort_clusters(contained, mode="id")
+                special.children = contained
+
+                # Adjust bbox only for Form and Key-Value-Region, not Table or Picture
+                if special.label in [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]:
+                    special.bbox = BoundingBox(
+                        l=min(c.bbox.l for c in contained),
+                        t=min(c.bbox.t for c in contained),
+                        r=max(c.bbox.r for c in contained),
+                        b=max(c.bbox.b for c in contained),
+                    )
+
+                # Collect all cells from children
+                all_cells = []
+                for child in contained:
+                    all_cells.extend(child.cells)
+                special.cells = self._deduplicate_cells(all_cells)
+                special.cells = self._sort_cells(special.cells)
+
+        picture_clusters = [
+            c for c in special_clusters if c.label == DocItemLabel.PICTURE
+        ]
+        picture_clusters = self._remove_overlapping_clusters(
+            picture_clusters, "picture"
+        )
+
+        wrapper_clusters = [
+            c for c in special_clusters if c.label in self.WRAPPER_TYPES
+        ]
+        wrapper_clusters = self._remove_overlapping_clusters(
+            wrapper_clusters, "wrapper"
+        )
+
+        return picture_clusters + wrapper_clusters
+
+    def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
+        """Handle overlaps between regular and wrapper clusters before child assignment.
+
+        In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
+        should be removed.
+        """
+        wrappers_to_remove = set()
+
+        for wrapper in special_clusters:
+            if wrapper.label not in self.WRAPPER_TYPES:
+                continue  # only treat KEY_VALUE_REGION for now.
+
+            for regular in self.regular_clusters:
+                if regular.label == DocItemLabel.TABLE:
+                    # Calculate overlap
+                    overlap = regular.bbox.intersection_area_with(wrapper.bbox)
+                    wrapper_area = wrapper.bbox.area()
+                    overlap_ratio = overlap / wrapper_area
+
+                    conf_diff = wrapper.confidence - regular.confidence
+
+                    # If wrapper is mostly overlapping with a TABLE, remove the wrapper
+                    if (
+                        overlap_ratio > 0.9 and conf_diff < 0.1
+                    ):  # self.OVERLAP_PARAMS["wrapper"]["conf_threshold"]):  # 80% overlap threshold
+                        wrappers_to_remove.add(wrapper.id)
+                        break
+
+        # Filter out the identified wrappers
+        special_clusters = [
+            cluster
+            for cluster in special_clusters
+            if cluster.id not in wrappers_to_remove
+        ]
+
+        return special_clusters
+
+    def _should_prefer_cluster(
+        self, candidate: Cluster, other: Cluster, params: dict
+    ) -> bool:
+        """Determine if candidate cluster should be preferred over other cluster based on rules.
+        Returns True if candidate should be preferred, False if not."""
+
+        # Rule 1: LIST_ITEM vs TEXT
+        if (
+            candidate.label == DocItemLabel.LIST_ITEM
+            and other.label == DocItemLabel.TEXT
+        ):
+            # Check if areas are similar (within 20% of each other)
+            area_ratio = candidate.bbox.area() / other.bbox.area()
+            area_similarity = abs(1 - area_ratio) < 0.2
+            if area_similarity:
+                return True
+
+        # Rule 2: CODE vs others
+        if candidate.label == DocItemLabel.CODE:
+            # Calculate how much of the other cluster is contained within the CODE cluster
+            overlap = other.bbox.intersection_area_with(candidate.bbox)
+            containment = overlap / other.bbox.area()
+            if containment > 0.8:  # other is 80% contained within CODE
+                return True
+
+        # If no label-based rules matched, fall back to area/confidence thresholds
+        area_ratio = candidate.bbox.area() / other.bbox.area()
+        conf_diff = other.confidence - candidate.confidence
+
+        if (
+            area_ratio <= params["area_threshold"]
+            and conf_diff > params["conf_threshold"]
+        ):
+            return False
+
+        return True  # Default to keeping candidate if no rules triggered rejection
+
+    def _select_best_cluster_from_group(
+        self,
+        group_clusters: List[Cluster],
+        params: dict,
+    ) -> Cluster:
+        """Select best cluster from a group of overlapping clusters based on all rules."""
+        current_best = None
+
+        for candidate in group_clusters:
+            should_select = True
+
+            for other in group_clusters:
+                if other == candidate:
+                    continue
+
+                if not self._should_prefer_cluster(candidate, other, params):
+                    should_select = False
+                    break
+
+            if should_select:
+                if current_best is None:
+                    current_best = candidate
+                else:
+                    # If both clusters pass rules, prefer the larger one unless confidence differs significantly
+                    if (
+                        candidate.bbox.area() > current_best.bbox.area()
+                        and current_best.confidence - candidate.confidence
+                        <= params["conf_threshold"]
+                    ):
+                        current_best = candidate
+
+        return current_best if current_best else group_clusters[0]
+
+    def _remove_overlapping_clusters(
+        self,
+        clusters: List[Cluster],
+        cluster_type: str,
+        overlap_threshold: float = 0.8,
+        containment_threshold: float = 0.8,
+    ) -> List[Cluster]:
+        if not clusters:
+            return []
+
+        spatial_index = (
+            self.regular_index
+            if cluster_type == "regular"
+            else self.picture_index if cluster_type == "picture" else self.wrapper_index
+        )
+
+        # Map of currently valid clusters
+        valid_clusters = {c.id: c for c in clusters}
+        uf = UnionFind(valid_clusters.keys())
+        params = self.OVERLAP_PARAMS[cluster_type]
+
+        for cluster in clusters:
+            candidates = spatial_index.find_candidates(cluster.bbox)
+            candidates &= valid_clusters.keys()  # Only keep existing candidates
+            candidates.discard(cluster.id)
+
+            for other_id in candidates:
+                if spatial_index.check_overlap(
+                    cluster.bbox,
+                    valid_clusters[other_id].bbox,
+                    overlap_threshold,
+                    containment_threshold,
+                ):
+                    uf.union(cluster.id, other_id)
+
+        result = []
+        for group in uf.get_groups().values():
+            if len(group) == 1:
+                result.append(valid_clusters[group[0]])
+                continue
+
+            group_clusters = [valid_clusters[cid] for cid in group]
+            best = self._select_best_cluster_from_group(group_clusters, params)
+
+            # Simple cell merging - no special cases
+            for cluster in group_clusters:
+                if cluster != best:
+                    best.cells.extend(cluster.cells)
+
+            best.cells = self._deduplicate_cells(best.cells)
+            best.cells = self._sort_cells(best.cells)
+            result.append(best)
+
+        return result
+
+    def _select_best_cluster(
+        self,
+        clusters: List[Cluster],
+        area_threshold: float,
+        conf_threshold: float,
+    ) -> Cluster:
+        """Iteratively select best cluster based on area and confidence thresholds."""
+        current_best = None
+        for candidate in clusters:
+            should_select = True
+            for other in clusters:
+                if other == candidate:
+                    continue
+
+                area_ratio = candidate.bbox.area() / other.bbox.area()
+                conf_diff = other.confidence - candidate.confidence
+
+                if area_ratio <= area_threshold and conf_diff > conf_threshold:
+                    should_select = False
+                    break
+
+            if should_select:
+                if current_best is None or (
+                    candidate.bbox.area() > current_best.bbox.area()
+                    and current_best.confidence - candidate.confidence <= conf_threshold
+                ):
+                    current_best = candidate
+
+        return current_best if current_best else clusters[0]
+
+    def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
+        """Ensure each cell appears only once, maintaining order of first appearance."""
+        seen_ids = set()
+        unique_cells = []
+        for cell in cells:
+            if cell.id not in seen_ids:
+                seen_ids.add(cell.id)
+                unique_cells.append(cell)
+        return unique_cells
+
+    def _assign_cells_to_clusters(
+        self, clusters: List[Cluster], min_overlap: float = 0.2
+    ) -> List[Cluster]:
+        """Assign cells to best overlapping cluster."""
+        for cluster in clusters:
+            cluster.cells = []
+
+        for cell in self.cells:
+            if not cell.text.strip():
+                continue
+
+            best_overlap = min_overlap
+            best_cluster = None
+
+            for cluster in clusters:
+                if cell.bbox.area() <= 0:
+                    continue
+
+                overlap = cell.bbox.intersection_area_with(cluster.bbox)
+                overlap_ratio = overlap / cell.bbox.area()
+
+                if overlap_ratio > best_overlap:
+                    best_overlap = overlap_ratio
+                    best_cluster = cluster
+
+            if best_cluster is not None:
+                best_cluster.cells.append(cell)
+
+        # Deduplicate cells in each cluster after assignment
+        for cluster in clusters:
+            cluster.cells = self._deduplicate_cells(cluster.cells)
+
+        return clusters
+
+    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
+        """Find cells not assigned to any cluster."""
+        assigned = {cell.id for cluster in clusters for cell in cluster.cells}
+        return [
+            cell for cell in self.cells if cell.id not in assigned and cell.text.strip()
+        ]
+
+    def _adjust_cluster_bboxes(self, clusters: List[Cluster]) -> List[Cluster]:
+        """Adjust cluster bounding boxes to contain their cells."""
+        for cluster in clusters:
+            if not cluster.cells:
+                continue
+
+            cells_bbox = BoundingBox(
+                l=min(cell.bbox.l for cell in cluster.cells),
+                t=min(cell.bbox.t for cell in cluster.cells),
+                r=max(cell.bbox.r for cell in cluster.cells),
+                b=max(cell.bbox.b for cell in cluster.cells),
+            )
+
+            if cluster.label == DocItemLabel.TABLE:
+                # For tables, take union of current bbox and cells bbox
+                cluster.bbox = BoundingBox(
+                    l=min(cluster.bbox.l, cells_bbox.l),
+                    t=min(cluster.bbox.t, cells_bbox.t),
+                    r=max(cluster.bbox.r, cells_bbox.r),
+                    b=max(cluster.bbox.b, cells_bbox.b),
+                )
+            else:
+                cluster.bbox = cells_bbox
+
+        return clusters
+
+    def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
+        """Sort cells in native reading order."""
+        return sorted(cells, key=lambda c: (c.id))
+
+    def _sort_clusters(
+        self, clusters: List[Cluster], mode: str = "id"
+    ) -> List[Cluster]:
+        """Sort clusters in reading order (top-to-bottom, left-to-right)."""
+        if mode == "id":  # sort in the order the cells are printed in the PDF.
+            return sorted(
+                clusters,
+                key=lambda cluster: (
+                    (
+                        min(cell.id for cell in cluster.cells)
+                        if cluster.cells
+                        else sys.maxsize
+                    ),
+                    cluster.bbox.t,
+                    cluster.bbox.l,
+                ),
+            )
+        elif mode == "tblr":  # Sort top-to-bottom, then left-to-right ("row first")
+            return sorted(
+                clusters, key=lambda cluster: (cluster.bbox.t, cluster.bbox.l)
+            )
+        elif mode == "lrtb":  # Sort left-to-right, then top-to-bottom ("column first")
+            return sorted(
+                clusters, key=lambda cluster: (cluster.bbox.l, cluster.bbox.t)
+            )
+        else:
+            return clusters
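For orientation, here is a minimal usage sketch of the new module (not part of the diff). It assumes that cells, raw_clusters, and page_size have already been produced by the layout prediction step for a single page; those variable names are illustrative only.

from docling.utils.layout_postprocessor import LayoutPostprocessor

# Assumed inputs (illustrative names): cells: List[Cell], raw_clusters: List[Cluster],
# and page_size: Size, as produced by the layout model for one page.
processor = LayoutPostprocessor(cells=cells, clusters=raw_clusters, page_size=page_size)
final_clusters, final_cells = processor.postprocess()
# final_clusters are filtered, merged, and sorted in reading order, with cells
# assigned to each cluster (and to the children of wrapper clusters).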
{docling-2.12.0.dist-info → docling-2.14.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.12.0
+Version: 2.14.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.
+Requires-Dist: docling-core[chunking] (>=2.12.1,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
 Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)