docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,683 @@
1
+ import bisect
2
+ import logging
3
+ import sys
4
+ from collections import defaultdict
5
+
6
+ from docling_core.types.doc import DocItemLabel, Size
7
+ from docling_core.types.doc.page import TextCell
8
+ from rtree import index
9
+
10
+ from docling.datamodel.base_models import BoundingBox, Cluster, Page
11
+ from docling.datamodel.pipeline_options import LayoutOptions
12
+
13
+ _log = logging.getLogger(__name__)
14
+
15
+
16
class UnionFind:
    """Disjoint-set forest with path compression and union by rank.

    Every element passed to the constructor starts out as its own
    single-member set. ``union`` merges sets, ``find`` returns the
    representative of a set, and ``get_groups`` lists all sets.
    """

    def __init__(self, elements):
        """Create one singleton set per entry of *elements*.

        *elements* is iterated twice, so it must be a re-iterable
        collection of hashable items (not a one-shot iterator).
        """
        self.parent = {item: item for item in elements}
        self.rank = {item: 0 for item in elements}

    def find(self, x):
        """Return the representative (root) of the set containing *x*.

        Raises ``KeyError`` if *x* was not among the constructor elements.
        """
        # Pass 1: climb the parent links until the root is reached.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Pass 2: path compression, re-point every visited node at the root.
        node = x
        while node != root:
            following = self.parent[node]
            self.parent[node] = root
            node = following

        return root

    def union(self, x, y):
        """Merge the sets containing *x* and *y* (no-op if already merged)."""
        root_a, root_b = self.find(x), self.find(y)
        if root_a == root_b:
            return

        rank_a, rank_b = self.rank[root_a], self.rank[root_b]
        if rank_a < rank_b:
            # The shallower tree is attached below the deeper one.
            self.parent[root_a] = root_b
            return

        self.parent[root_b] = root_a
        if rank_a == rank_b:
            # Equal ranks: the surviving root's tree grew by one level.
            self.rank[root_a] += 1

    def get_groups(self) -> dict[int, list[int]]:
        """Return all sets as ``{root: [members]}`` in element insertion order."""
        members_by_root = defaultdict(list)
        for member in self.parent:
            members_by_root[self.find(member)].append(member)
        return members_by_root
47
+
48
+
49
class SpatialClusterIndex:
    """Spatial lookup structure for clusters.

    Three indexes are kept in sync, all keyed by ``cluster.id``:

    * ``spatial_index``: a 2-D R-tree over the cluster bounding boxes,
    * ``x_intervals``: an ``IntervalTree`` over ``(bbox.l, bbox.r)``,
    * ``y_intervals``: an ``IntervalTree`` over ``(bbox.t, bbox.b)``.

    ``clusters_by_id`` maps each id back to its ``Cluster`` object.

    NOTE(review): the y intervals are inserted as ``(t, b)``; this presumes
    ``t <= b`` (top-left coordinate origin), which should be confirmed against
    the callers.
    """

    def __init__(self, clusters: list[Cluster]):
        """Build the indexes and register every cluster in *clusters*."""
        p = index.Property()
        p.dimension = 2
        self.spatial_index = index.Index(properties=p)
        self.x_intervals = IntervalTree()
        self.y_intervals = IntervalTree()
        self.clusters_by_id: dict[int, Cluster] = {}

        for cluster in clusters:
            self.add_cluster(cluster)

    def add_cluster(self, cluster: Cluster):
        """Register *cluster* in the R-tree, both interval trees and the id map."""
        bbox = cluster.bbox
        self.spatial_index.insert(cluster.id, bbox.as_tuple())
        self.x_intervals.insert(bbox.l, bbox.r, cluster.id)
        self.y_intervals.insert(bbox.t, bbox.b, cluster.id)
        self.clusters_by_id[cluster.id] = cluster

    def remove_cluster(self, cluster: Cluster):
        """Remove *cluster* from every index.

        ``cluster.bbox`` must still be the box the cluster was inserted with,
        because the R-tree deletion is keyed on id and coordinates.

        Raises:
            KeyError: if the cluster id is not registered.
        """
        self.spatial_index.delete(cluster.id, cluster.bbox.as_tuple())

        # The interval trees must be purged as well; otherwise
        # ``find_candidates`` keeps returning the id of a removed cluster.
        # Filtering preserves the sorted order the trees rely on.
        for tree in (self.x_intervals, self.y_intervals):
            tree.intervals = [
                interval for interval in tree.intervals if interval.id != cluster.id
            ]

        del self.clusters_by_id[cluster.id]

    def find_candidates(self, bbox: BoundingBox) -> set[int]:
        """Return ids of clusters that may overlap *bbox*.

        The result is the union of the R-tree hits and of all clusters whose
        x-interval contains ``bbox.l`` or ``bbox.r`` or whose y-interval
        contains ``bbox.t`` or ``bbox.b``. It is a candidate set only; callers
        are expected to confirm each pair, e.g. with ``check_overlap``.
        """
        spatial = set(self.spatial_index.intersection(bbox.as_tuple()))
        x_candidates = self.x_intervals.find_containing(
            bbox.l
        ) | self.x_intervals.find_containing(bbox.r)
        y_candidates = self.y_intervals.find_containing(
            bbox.t
        ) | self.y_intervals.find_containing(bbox.b)
        return spatial.union(x_candidates).union(y_candidates)

    def check_overlap(
        self,
        bbox1: BoundingBox,
        bbox2: BoundingBox,
        overlap_threshold: float,
        containment_threshold: float,
    ) -> bool:
        """Return True if the two boxes overlap sufficiently.

        Boxes with non-positive area never overlap. Otherwise the pair counts
        as overlapping when the IoU exceeds *overlap_threshold*, or when either
        box's intersection-over-self with the other exceeds
        *containment_threshold*.
        """
        if bbox1.area() <= 0 or bbox2.area() <= 0:
            return False

        iou = bbox1.intersection_over_union(bbox2)
        containment1 = bbox1.intersection_over_self(bbox2)
        containment2 = bbox2.intersection_over_self(bbox1)

        return (
            iou > overlap_threshold
            or containment1 > containment_threshold
            or containment2 > containment_threshold
        )
105
+
106
+
107
+ class Interval:
108
+ """Helper class for sortable intervals."""
109
+
110
+ def __init__(self, min_val: float, max_val: float, id: int):
111
+ self.min_val = min_val
112
+ self.max_val = max_val
113
+ self.id = id
114
+
115
+ def __lt__(self, other):
116
+ if isinstance(other, Interval):
117
+ return self.min_val < other.min_val
118
+ return self.min_val < other
119
+
120
+
121
+ class IntervalTree:
122
+ """Memory-efficient interval tree for 1D overlap queries."""
123
+
124
+ def __init__(self):
125
+ self.intervals: list[Interval] = [] # Sorted by min_val
126
+
127
+ def insert(self, min_val: float, max_val: float, id: int):
128
+ interval = Interval(min_val, max_val, id)
129
+ bisect.insort(self.intervals, interval)
130
+
131
+ def find_containing(self, point: float) -> set[int]:
132
+ """Find all intervals containing the point."""
133
+ pos = bisect.bisect_left(self.intervals, point)
134
+ result = set()
135
+
136
+ # Check intervals starting before point
137
+ for interval in reversed(self.intervals[:pos]):
138
+ if interval.min_val <= point <= interval.max_val:
139
+ result.add(interval.id)
140
+ else:
141
+ break
142
+
143
+ # Check intervals starting at/after point
144
+ for interval in self.intervals[pos:]:
145
+ if point <= interval.max_val:
146
+ if interval.min_val <= point:
147
+ result.add(interval.id)
148
+ else:
149
+ break
150
+
151
+ return result
152
+
153
+
154
class LayoutPostprocessor:
    """Postprocesses layout predictions by cleaning up clusters and mapping cells.

    The processor is built for one page and one list of predicted clusters.
    Clusters are split into two families:

    * *special* clusters, whose label is in ``SPECIAL_TYPES`` (pictures and
      the wrapper types FORM, KEY_VALUE_REGION, TABLE, DOCUMENT_INDEX),
    * *regular* clusters, which is everything else.

    ``postprocess`` filters both families by confidence, assigns page text
    cells to regular clusters, resolves overlapping clusters, attaches regular
    clusters as children of the special clusters that contain them, and
    returns the surviving clusters together with the page cells.
    """

    # Thresholds used when choosing which cluster of an overlapping group to
    # keep (see ``_should_prefer_cluster``), one entry per cluster family.
    OVERLAP_PARAMS = {
        "regular": {"area_threshold": 1.3, "conf_threshold": 0.05},
        "picture": {"area_threshold": 2.0, "conf_threshold": 0.3},
        "wrapper": {"area_threshold": 2.0, "conf_threshold": 0.2},
    }

    # Labels of clusters that can hold other clusters as children.
    WRAPPER_TYPES = {
        DocItemLabel.FORM,
        DocItemLabel.KEY_VALUE_REGION,
        DocItemLabel.TABLE,
        DocItemLabel.DOCUMENT_INDEX,
    }
    SPECIAL_TYPES = WRAPPER_TYPES.union({DocItemLabel.PICTURE})

    # Minimum prediction confidence per label; clusters below it are dropped.
    # A label missing from this table raises KeyError during filtering.
    CONFIDENCE_THRESHOLDS = {
        DocItemLabel.CAPTION: 0.5,
        DocItemLabel.FOOTNOTE: 0.5,
        DocItemLabel.FORMULA: 0.5,
        DocItemLabel.LIST_ITEM: 0.5,
        DocItemLabel.PAGE_FOOTER: 0.5,
        DocItemLabel.PAGE_HEADER: 0.5,
        DocItemLabel.PICTURE: 0.5,
        DocItemLabel.SECTION_HEADER: 0.45,
        DocItemLabel.TABLE: 0.5,
        DocItemLabel.TEXT: 0.5,
        DocItemLabel.TITLE: 0.45,
        DocItemLabel.CODE: 0.45,
        DocItemLabel.CHECKBOX_SELECTED: 0.45,
        DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
        DocItemLabel.FORM: 0.45,
        DocItemLabel.KEY_VALUE_REGION: 0.45,
        DocItemLabel.DOCUMENT_INDEX: 0.45,
    }

    # Labels rewritten on regular clusters after confidence filtering.
    LABEL_REMAPPING = {
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

    def __init__(
        self, page: Page, clusters: list[Cluster], options: LayoutOptions
    ) -> None:
        """Initialize processor with page and clusters.

        Args:
            page: Page whose ``cells`` and ``size`` are read; its
                ``parsed_page`` is updated by ``postprocess``.
            clusters: Predicted layout clusters for that page.
            options: Layout options; this class reads
                ``skip_cell_assignment``, ``keep_empty_clusters`` and
                ``create_orphan_clusters``.
        """
        self.cells = page.cells
        self.page = page
        self.page_size = page.size
        self.all_clusters = clusters
        self.options = options
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
        ]
        self.special_clusters = [c for c in clusters if c.label in self.SPECIAL_TYPES]

        # Spatial indices are built once, from the clusters as predicted.
        # They are not updated when bounding boxes change or clusters are
        # added later on (see the note in ``_remove_overlapping_clusters``).
        self.regular_index = SpatialClusterIndex(self.regular_clusters)
        self.picture_index = SpatialClusterIndex(
            [c for c in self.special_clusters if c.label == DocItemLabel.PICTURE]
        )
        self.wrapper_index = SpatialClusterIndex(
            [c for c in self.special_clusters if c.label in self.WRAPPER_TYPES]
        )

    def postprocess(self) -> tuple[list[Cluster], list[TextCell]]:
        """Run the full pipeline and return ``(final_clusters, page_cells)``.

        Side effects: ``self.regular_clusters`` and ``self.special_clusters``
        are replaced by their processed versions, and the page's
        ``parsed_page.textline_cells`` / ``has_lines`` are overwritten from
        ``self.cells``.

        Raises:
            AssertionError: if ``page.parsed_page`` or ``page.size`` is None.
        """
        self.regular_clusters = self._process_regular_clusters()
        self.special_clusters = self._process_special_clusters()

        # Regular clusters that became children of a special cluster are
        # reported through their parent only, not at top level.
        contained_ids = {
            child.id
            for wrapper in self.special_clusters
            if wrapper.label in self.SPECIAL_TYPES
            for child in wrapper.children
        }
        self.regular_clusters = [
            c for c in self.regular_clusters if c.id not in contained_ids
        ]

        # Combine and sort final clusters
        final_clusters = self._sort_clusters(
            self.regular_clusters + self.special_clusters, mode="id"
        )

        # Cells are only ordered when cell assignment actually took place.
        if not self.options.skip_cell_assignment:
            for cluster in final_clusters:
                cluster.cells = self._sort_cells(cluster.cells)
                # Also sort cells in children if any
                for child in cluster.children:
                    child.cells = self._sort_cells(child.cells)

        assert self.page.parsed_page is not None
        self.page.parsed_page.textline_cells = self.cells
        self.page.parsed_page.has_lines = len(self.cells) > 0

        return final_clusters, self.cells

    def _process_regular_clusters(self) -> list[Cluster]:
        """Filter, populate and de-overlap the regular clusters.

        Steps: drop low-confidence clusters, apply ``LABEL_REMAPPING``, then
        (unless ``skip_cell_assignment``) assign cells, optionally drop empty
        clusters (FORMULA is always kept) and optionally wrap every unassigned
        cell in its own TEXT cluster. Finally bounding boxes are tightened and
        overlaps removed, repeated up to three times or until the cluster
        count stops changing.
        """
        clusters = [
            c
            for c in self.regular_clusters
            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
        ]

        # Apply label remapping
        for cluster in clusters:
            if cluster.label in self.LABEL_REMAPPING:
                cluster.label = self.LABEL_REMAPPING[cluster.label]

        if not self.options.skip_cell_assignment:
            # Initial cell assignment
            clusters = self._assign_cells_to_clusters(clusters)

            # Remove clusters with no cells (if keep_empty_clusters is False),
            # but always keep clusters with label DocItemLabel.FORMULA
            if not self.options.keep_empty_clusters:
                clusters = [
                    cluster
                    for cluster in clusters
                    if cluster.cells or cluster.label == DocItemLabel.FORMULA
                ]

            # Handle orphaned cells: each one becomes a single-cell TEXT
            # cluster with a fresh id above every id seen at construction.
            unassigned = self._find_unassigned_cells(clusters)
            if unassigned and self.options.create_orphan_clusters:
                next_id = max((c.id for c in self.all_clusters), default=0) + 1
                orphan_clusters = [
                    Cluster(
                        id=next_id + offset,
                        label=DocItemLabel.TEXT,
                        bbox=cell.to_bounding_box(),
                        confidence=cell.confidence,
                        cells=[cell],
                    )
                    for offset, cell in enumerate(unassigned)
                ]
                clusters.extend(orphan_clusters)

        # Iterative refinement; prev_count starts off by one so that the
        # first iteration always runs.
        prev_count = len(clusters) + 1
        for _ in range(3):  # Maximum 3 iterations
            if prev_count == len(clusters):
                break
            prev_count = len(clusters)
            clusters = self._adjust_cluster_bboxes(clusters)
            clusters = self._remove_overlapping_clusters(clusters, "regular")

        return clusters

    def _process_special_clusters(self) -> list[Cluster]:
        """Filter special clusters, attach children and de-overlap them.

        Reads ``self.regular_clusters``, so it must run after
        ``_process_regular_clusters`` has stored its result there.

        Returns:
            The surviving picture clusters followed by the surviving wrapper
            clusters.

        Raises:
            AssertionError: if the page size is unknown.
        """
        special_clusters = [
            c
            for c in self.special_clusters
            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
        ]

        special_clusters = self._handle_cross_type_overlaps(special_clusters)

        # Calculate page area from known page size
        assert self.page_size is not None
        page_area = self.page_size.width * self.page_size.height
        if page_area > 0:
            # Filter out full-page pictures (more than 90% of the page area)
            special_clusters = [
                cluster
                for cluster in special_clusters
                if not (
                    cluster.label == DocItemLabel.PICTURE
                    and cluster.bbox.area() / page_area > 0.90
                )
            ]

        for special in special_clusters:
            # A regular cluster is a child when more than 80% of its own area
            # lies inside the special cluster.
            contained = [
                cluster
                for cluster in self.regular_clusters
                if cluster.bbox.intersection_over_self(special.bbox) > 0.8
            ]

            if contained:
                # Sort contained clusters by minimum cell ID:
                contained = self._sort_clusters(contained, mode="id")
                special.children = contained

                # Adjust bbox only for Form and Key-Value-Region, not Table or Picture
                if special.label in [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]:
                    special.bbox = BoundingBox(
                        l=min(c.bbox.l for c in contained),
                        t=min(c.bbox.t for c in contained),
                        r=max(c.bbox.r for c in contained),
                        b=max(c.bbox.b for c in contained),
                    )

                # Conditionally collect cells from children
                if not self.options.skip_cell_assignment:
                    all_cells = []
                    for child in contained:
                        all_cells.extend(child.cells)
                    special.cells = self._deduplicate_cells(all_cells)
                    special.cells = self._sort_cells(special.cells)
                else:
                    special.cells = []

        picture_clusters = [
            c for c in special_clusters if c.label == DocItemLabel.PICTURE
        ]
        picture_clusters = self._remove_overlapping_clusters(
            picture_clusters, "picture"
        )

        wrapper_clusters = [
            c for c in special_clusters if c.label in self.WRAPPER_TYPES
        ]
        wrapper_clusters = self._remove_overlapping_clusters(
            wrapper_clusters, "wrapper"
        )

        return picture_clusters + wrapper_clusters

    def _handle_cross_type_overlaps(
        self, special_clusters: list[Cluster]
    ) -> list[Cluster]:
        """Handle overlaps between regular and wrapper clusters before child assignment.

        The stated intent is that wrapper proposals (e.g. KEY_VALUE_REGION)
        that are almost identical to a TABLE should be removed. A wrapper is
        dropped when more than 90% of its area lies inside a TABLE found in
        ``self.regular_clusters`` and its confidence is less than 0.1 above
        the table's.

        NOTE(review): as written this never removes anything.
        ``self.regular_clusters`` only holds clusters whose label is outside
        ``SPECIAL_TYPES``, and TABLE is a member of ``SPECIAL_TYPES``, so the
        ``regular.label == DocItemLabel.TABLE`` test cannot succeed. Comparing
        against the TABLE clusters in ``special_clusters`` looks like the
        intended behaviour, but the outer loop also visits TABLE wrappers, so
        that change would need to skip tables and self-comparison. Left
        unchanged here because it alters the pipeline's output.
        """
        wrappers_to_remove = set()

        for wrapper in special_clusters:
            if wrapper.label not in self.WRAPPER_TYPES:
                continue  # pictures are not considered here

            for regular in self.regular_clusters:
                if regular.label == DocItemLabel.TABLE:
                    # Share of the wrapper's own area covered by the table
                    overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)

                    conf_diff = wrapper.confidence - regular.confidence

                    # If wrapper is mostly overlapping with a TABLE, remove the wrapper
                    if overlap_ratio > 0.9 and conf_diff < 0.1:
                        wrappers_to_remove.add(wrapper.id)
                        break

        # Filter out the identified wrappers
        special_clusters = [
            cluster
            for cluster in special_clusters
            if cluster.id not in wrappers_to_remove
        ]

        return special_clusters

    def _should_prefer_cluster(
        self, candidate: Cluster, other: Cluster, params: dict
    ) -> bool:
        """Determine if candidate cluster should be preferred over other cluster based on rules.

        Args:
            candidate: Cluster being evaluated.
            other: Overlapping cluster it is compared against.
            params: One entry of ``OVERLAP_PARAMS`` (``area_threshold`` and
                ``conf_threshold``).

        Returns:
            True if candidate should be preferred, False if not.

        Raises:
            ZeroDivisionError: if ``other.bbox`` has zero area and the area
                ratio is evaluated.
        """
        # Rule 1: LIST_ITEM vs TEXT
        if (
            candidate.label == DocItemLabel.LIST_ITEM
            and other.label == DocItemLabel.TEXT
        ):
            # Check if areas are similar (within 20% of each other)
            area_ratio = candidate.bbox.area() / other.bbox.area()
            area_similarity = abs(1 - area_ratio) < 0.2
            if area_similarity:
                return True

        # Rule 2: CODE vs others
        if candidate.label == DocItemLabel.CODE:
            # Calculate how much of the other cluster is contained within the CODE cluster
            containment = other.bbox.intersection_over_self(candidate.bbox)
            if containment > 0.8:  # other is 80% contained within CODE
                return True

        # If no label-based rules matched, fall back to area/confidence thresholds:
        # the candidate loses when it is not sufficiently larger than the
        # other cluster and the other cluster is clearly more confident.
        area_ratio = candidate.bbox.area() / other.bbox.area()
        conf_diff = other.confidence - candidate.confidence

        if (
            area_ratio <= params["area_threshold"]
            and conf_diff > params["conf_threshold"]
        ):
            return False

        return True  # Default to keeping candidate if no rules triggered rejection

    def _select_best_cluster_from_group(
        self,
        group_clusters: list[Cluster],
        params: dict,
    ) -> Cluster:
        """Select best cluster from a group of overlapping clusters based on all rules.

        A candidate qualifies when ``_should_prefer_cluster`` prefers it over
        every other member of the group. Among qualifying candidates a later
        one replaces the current choice only if it is larger and not less
        confident by more than ``params["conf_threshold"]``. If nobody
        qualifies, the first cluster of the group is returned.
        """
        current_best = None

        for candidate in group_clusters:
            # Members are told apart by object identity: comparing clusters
            # with ``==`` would do a field-by-field model comparison for every
            # pair, and the group members are distinct objects with unique ids.
            should_select = all(
                self._should_prefer_cluster(candidate, other, params)
                for other in group_clusters
                if other is not candidate
            )
            if not should_select:
                continue

            if current_best is None:
                current_best = candidate
            elif (
                # If both clusters pass rules, prefer the larger one unless
                # confidence differs significantly
                candidate.bbox.area() > current_best.bbox.area()
                and current_best.confidence - candidate.confidence
                <= params["conf_threshold"]
            ):
                current_best = candidate

        return current_best if current_best is not None else group_clusters[0]

    def _remove_overlapping_clusters(
        self,
        clusters: list[Cluster],
        cluster_type: str,
        overlap_threshold: float = 0.8,
        containment_threshold: float = 0.8,
    ) -> list[Cluster]:
        """Collapse groups of mutually overlapping clusters into one cluster each.

        Overlapping pairs (per ``SpatialClusterIndex.check_overlap``) are
        joined transitively with a union-find. For every group the best
        cluster is kept and receives the de-duplicated, sorted cells of the
        whole group.

        Args:
            clusters: Clusters of a single family.
            cluster_type: ``"regular"``, ``"picture"`` or anything else for
                wrappers; selects the spatial index. Must be a key of
                ``OVERLAP_PARAMS``.
            overlap_threshold: IoU above which two clusters overlap.
            containment_threshold: Intersection-over-self above which two
                clusters overlap.

        NOTE(review): candidate lookup uses the indices built in ``__init__``.
        Those hold the bounding boxes as originally predicted and do not
        contain orphan clusters, while this method is called after
        ``_adjust_cluster_bboxes`` / the FORM and KEY_VALUE_REGION bbox update
        and after orphan clusters were appended. Pairs that overlap only with
        their current boxes, and orphan/orphan pairs, can therefore be missed.
        Building a fresh ``SpatialClusterIndex(clusters)`` here would close
        that gap but changes which clusters get merged; confirm before
        changing.
        """
        if not clusters:
            return []

        spatial_index = (
            self.regular_index
            if cluster_type == "regular"
            else self.picture_index
            if cluster_type == "picture"
            else self.wrapper_index
        )

        # Map of currently valid clusters
        valid_clusters = {c.id: c for c in clusters}
        uf = UnionFind(valid_clusters.keys())
        params = self.OVERLAP_PARAMS[cluster_type]

        for cluster in clusters:
            candidates = spatial_index.find_candidates(cluster.bbox)
            candidates &= valid_clusters.keys()  # Only keep existing candidates
            candidates.discard(cluster.id)

            for other_id in candidates:
                if spatial_index.check_overlap(
                    cluster.bbox,
                    valid_clusters[other_id].bbox,
                    overlap_threshold,
                    containment_threshold,
                ):
                    uf.union(cluster.id, other_id)

        result = []
        for group in uf.get_groups().values():
            if len(group) == 1:
                result.append(valid_clusters[group[0]])
                continue

            group_clusters = [valid_clusters[cid] for cid in group]
            best = self._select_best_cluster_from_group(group_clusters, params)

            # Simple cell merging - no special cases. Identity check: ``best``
            # is one of the group's own objects.
            for cluster in group_clusters:
                if cluster is not best:
                    best.cells.extend(cluster.cells)

            best.cells = self._deduplicate_cells(best.cells)
            best.cells = self._sort_cells(best.cells)
            result.append(best)

        return result

    def _select_best_cluster(
        self,
        clusters: list[Cluster],
        area_threshold: float,
        conf_threshold: float,
    ) -> Cluster:
        """Iteratively select best cluster based on area and confidence thresholds.

        Same selection as ``_select_best_cluster_from_group`` but without the
        label-based rules. Nothing in this class calls it.

        Raises:
            ZeroDivisionError: if a compared cluster has a zero-area bbox.
            IndexError: if *clusters* is empty.
        """
        current_best = None
        for candidate in clusters:
            should_select = True
            for other in clusters:
                if other == candidate:
                    continue

                area_ratio = candidate.bbox.area() / other.bbox.area()
                conf_diff = other.confidence - candidate.confidence

                if area_ratio <= area_threshold and conf_diff > conf_threshold:
                    should_select = False
                    break

            if should_select:
                if current_best is None or (
                    candidate.bbox.area() > current_best.bbox.area()
                    and current_best.confidence - candidate.confidence <= conf_threshold
                ):
                    current_best = candidate

        return current_best if current_best else clusters[0]

    def _deduplicate_cells(self, cells: list[TextCell]) -> list[TextCell]:
        """Ensure each cell appears only once, maintaining order of first appearance.

        Cells are identified by their ``index`` attribute.
        """
        seen_ids = set()
        unique_cells = []
        for cell in cells:
            if cell.index not in seen_ids:
                seen_ids.add(cell.index)
                unique_cells.append(cell)
        return unique_cells

    def _assign_cells_to_clusters(
        self, clusters: list[Cluster], min_overlap: float = 0.2
    ) -> list[Cluster]:
        """Assign cells to best overlapping cluster.

        Every cluster's ``cells`` list is reset first. A page cell with
        non-blank text and positive area goes to the cluster covering the
        largest share of the cell's own area, provided that share is strictly
        greater than *min_overlap*; on ties the earlier cluster wins.
        """
        for cluster in clusters:
            cluster.cells = []

        for cell in self.cells:
            if not cell.text.strip():
                continue

            # The cell's box does not depend on the cluster, so it is derived
            # once per cell rather than once per (cell, cluster) pair.
            cell_bbox = cell.rect.to_bounding_box()
            if cell_bbox.area() <= 0:
                continue

            best_overlap = min_overlap
            best_cluster = None

            for cluster in clusters:
                overlap_ratio = cell_bbox.intersection_over_self(cluster.bbox)
                if overlap_ratio > best_overlap:
                    best_overlap = overlap_ratio
                    best_cluster = cluster

            if best_cluster is not None:
                best_cluster.cells.append(cell)

        # Deduplicate cells in each cluster after assignment
        for cluster in clusters:
            cluster.cells = self._deduplicate_cells(cluster.cells)

        return clusters

    def _find_unassigned_cells(self, clusters: list[Cluster]) -> list[TextCell]:
        """Find cells with non-blank text that are not assigned to any cluster."""
        assigned = {cell.index for cluster in clusters for cell in cluster.cells}
        return [
            cell
            for cell in self.cells
            if cell.index not in assigned and cell.text.strip()
        ]

    def _adjust_cluster_bboxes(self, clusters: list[Cluster]) -> list[Cluster]:
        """Adjust cluster bounding boxes to contain their cells.

        Clusters without cells are left untouched. TABLE clusters grow to the
        union of their current box and the cells' box; every other cluster's
        box is replaced by the cells' box. The clusters are modified in place
        and the same list is returned.
        """
        for cluster in clusters:
            if not cluster.cells:
                continue

            # Convert each cell rectangle once instead of once per coordinate.
            cell_boxes = [cell.rect.to_bounding_box() for cell in cluster.cells]
            cells_bbox = BoundingBox(
                l=min(box.l for box in cell_boxes),
                t=min(box.t for box in cell_boxes),
                r=max(box.r for box in cell_boxes),
                b=max(box.b for box in cell_boxes),
            )

            if cluster.label == DocItemLabel.TABLE:
                # For tables, take union of current bbox and cells bbox
                cluster.bbox = BoundingBox(
                    l=min(cluster.bbox.l, cells_bbox.l),
                    t=min(cluster.bbox.t, cells_bbox.t),
                    r=max(cluster.bbox.r, cells_bbox.r),
                    b=max(cluster.bbox.b, cells_bbox.b),
                )
            else:
                cluster.bbox = cells_bbox

        return clusters

    def _sort_cells(self, cells: list[TextCell]) -> list[TextCell]:
        """Return the cells sorted by their ``index`` (native order)."""
        return sorted(cells, key=lambda c: (c.index))

    def _sort_clusters(
        self, clusters: list[Cluster], mode: str = "id"
    ) -> list[Cluster]:
        """Return the clusters sorted according to *mode*.

        Modes:
            ``"id"``: by the smallest cell index in the cluster (clusters
                without cells go last), then by ``bbox.t``, then ``bbox.l``.
            ``"tblr"``: by ``bbox.t``, then ``bbox.l`` ("row first").
            ``"lrtb"``: by ``bbox.l``, then ``bbox.t`` ("column first").

        Any other mode returns the input list unchanged.
        """
        if mode == "id":  # sort in the order the cells are printed in the PDF.
            return sorted(
                clusters,
                key=lambda cluster: (
                    (
                        min(cell.index for cell in cluster.cells)
                        if cluster.cells
                        else sys.maxsize
                    ),
                    cluster.bbox.t,
                    cluster.bbox.l,
                ),
            )
        elif mode == "tblr":  # Sort top-to-bottom, then left-to-right ("row first")
            return sorted(
                clusters, key=lambda cluster: (cluster.bbox.t, cluster.bbox.l)
            )
        elif mode == "lrtb":  # Sort left-to-right, then top-to-bottom ("column first")
            return sorted(
                clusters, key=lambda cluster: (cluster.bbox.l, cluster.bbox.t)
            )
        else:
            return clusters
docling/utils/locks.py ADDED
@@ -0,0 +1,3 @@
1
+ import threading
2
+
3
+ pypdfium2_lock = threading.Lock()