docling 2.25.1__py3-none-any.whl → 2.26.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/cli/main.py +2 -2
- docling/datamodel/pipeline_options.py +1 -1
- docling/models/code_formula_model.py +84 -5
- docling/models/document_picture_classifier.py +1 -1
- docling/models/table_structure_model.py +1 -1
- docling/utils/layout_postprocessor.py +2 -1
- {docling-2.25.1.dist-info → docling-2.26.0.dist-info}/METADATA +2 -2
- {docling-2.25.1.dist-info → docling-2.26.0.dist-info}/RECORD +11 -11
- {docling-2.25.1.dist-info → docling-2.26.0.dist-info}/LICENSE +0 -0
- {docling-2.25.1.dist-info → docling-2.26.0.dist-info}/WHEEL +0 -0
- {docling-2.25.1.dist-info → docling-2.26.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -210,7 +210,7 @@ def convert(
|
|
210
210
|
table_mode: Annotated[
|
211
211
|
TableFormerMode,
|
212
212
|
typer.Option(..., help="The mode to use in the table structure model."),
|
213
|
-
] = TableFormerMode.
|
213
|
+
] = TableFormerMode.ACCURATE,
|
214
214
|
enrich_code: Annotated[
|
215
215
|
bool,
|
216
216
|
typer.Option(..., help="Enable the code enrichment model in the pipeline."),
|
@@ -245,7 +245,7 @@ def convert(
|
|
245
245
|
typer.Option(
|
246
246
|
...,
|
247
247
|
"--abort-on-error/--no-abort-on-error",
|
248
|
-
help="If enabled, the
|
248
|
+
help="If enabled, the processing will be aborted when the first error is encountered.",
|
249
249
|
),
|
250
250
|
] = False,
|
251
251
|
output: Annotated[
|
@@ -99,7 +99,7 @@ class TableStructureOptions(BaseModel):
|
|
99
99
|
# are merged across table columns.
|
100
100
|
# False: Let table structure model define the text cells, ignore PDF cells.
|
101
101
|
)
|
102
|
-
mode: TableFormerMode = TableFormerMode.
|
102
|
+
mode: TableFormerMode = TableFormerMode.ACCURATE
|
103
103
|
|
104
104
|
|
105
105
|
class OcrOptions(BaseModel):
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import re
|
2
|
+
from collections import Counter
|
2
3
|
from pathlib import Path
|
3
4
|
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
4
5
|
|
@@ -11,7 +12,7 @@ from docling_core.types.doc import (
|
|
11
12
|
TextItem,
|
12
13
|
)
|
13
14
|
from docling_core.types.doc.labels import CodeLanguageLabel
|
14
|
-
from PIL import Image
|
15
|
+
from PIL import Image, ImageOps
|
15
16
|
from pydantic import BaseModel
|
16
17
|
|
17
18
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
@@ -65,7 +66,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
65
66
|
_model_repo_folder = "ds4sd--CodeFormula"
|
66
67
|
elements_batch_size = 5
|
67
68
|
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
|
68
|
-
expansion_factor = 0.
|
69
|
+
expansion_factor = 0.18
|
69
70
|
|
70
71
|
def __init__(
|
71
72
|
self,
|
@@ -124,7 +125,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
124
125
|
repo_id="ds4sd/CodeFormula",
|
125
126
|
force_download=force,
|
126
127
|
local_dir=local_dir,
|
127
|
-
revision="v1.0.
|
128
|
+
revision="v1.0.2",
|
128
129
|
)
|
129
130
|
|
130
131
|
return Path(download_path)
|
@@ -175,7 +176,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
175
176
|
- The second element is the extracted language if a match is found;
|
176
177
|
otherwise, `None`.
|
177
178
|
"""
|
178
|
-
pattern = r"^<_([
|
179
|
+
pattern = r"^<_([^_>]+)_>\s(.*)"
|
179
180
|
match = re.match(pattern, input_string, flags=re.DOTALL)
|
180
181
|
if match:
|
181
182
|
language = str(match.group(1)) # the captured programming language
|
@@ -206,6 +207,82 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
206
207
|
except ValueError:
|
207
208
|
return CodeLanguageLabel.UNKNOWN
|
208
209
|
|
210
|
+
def _get_most_frequent_edge_color(self, pil_img: Image.Image):
|
211
|
+
"""
|
212
|
+
Compute the most frequent color along the outer edges of a PIL image.
|
213
|
+
|
214
|
+
Parameters
|
215
|
+
----------
|
216
|
+
pil_img : Image.Image
|
217
|
+
A PIL Image in any mode (L, RGB, RGBA, etc.).
|
218
|
+
|
219
|
+
Returns
|
220
|
+
-------
|
221
|
+
(int) or (tuple): The most common edge color as a scalar (for grayscale) or
|
222
|
+
tuple (for RGB/RGBA).
|
223
|
+
"""
|
224
|
+
# Convert to NumPy array for easy pixel access
|
225
|
+
img_np = np.array(pil_img)
|
226
|
+
|
227
|
+
if img_np.ndim == 2:
|
228
|
+
# Grayscale-like image: shape (H, W)
|
229
|
+
# Extract edges: top row, bottom row, left col, right col
|
230
|
+
top = img_np[0, :] # shape (W,)
|
231
|
+
bottom = img_np[-1, :] # shape (W,)
|
232
|
+
left = img_np[:, 0] # shape (H,)
|
233
|
+
right = img_np[:, -1] # shape (H,)
|
234
|
+
|
235
|
+
# Concatenate all edges
|
236
|
+
edges = np.concatenate([top, bottom, left, right])
|
237
|
+
|
238
|
+
# Count frequencies
|
239
|
+
freq = Counter(edges.tolist())
|
240
|
+
most_common_value, _ = freq.most_common(1)[0]
|
241
|
+
return int(most_common_value) # single channel color
|
242
|
+
|
243
|
+
else:
|
244
|
+
# Color image: shape (H, W, C)
|
245
|
+
top = img_np[0, :, :] # shape (W, C)
|
246
|
+
bottom = img_np[-1, :, :] # shape (W, C)
|
247
|
+
left = img_np[:, 0, :] # shape (H, C)
|
248
|
+
right = img_np[:, -1, :] # shape (H, C)
|
249
|
+
|
250
|
+
# Concatenate edges along first axis
|
251
|
+
edges = np.concatenate([top, bottom, left, right], axis=0)
|
252
|
+
|
253
|
+
# Convert each color to a tuple for counting
|
254
|
+
edges_as_tuples = [tuple(pixel) for pixel in edges]
|
255
|
+
freq = Counter(edges_as_tuples)
|
256
|
+
most_common_value, _ = freq.most_common(1)[0]
|
257
|
+
return most_common_value # e.g. (R, G, B) or (R, G, B, A)
|
258
|
+
|
259
|
+
def _pad_with_most_frequent_edge_color(
    self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
):
    """
    Add a border around an image, filled with its dominant edge color.

    Parameters
    ----------
    img : Union[Image.Image, np.ndarray]
        The image to pad. A NumPy array is first converted to a PIL image
        with ``Image.fromarray``; a PIL image is used as-is.
    padding : tuple
        Border widths (left, top, right, bottom) in pixels, passed straight
        through to ``ImageOps.expand``.

    Returns
    -------
    Image.Image: A new PIL image with the specified padding.
    """
    # Normalize the input to a PIL image.
    source = Image.fromarray(img) if isinstance(img, np.ndarray) else img

    # The fill color is whatever color dominates the image border, as
    # reported by the sibling helper.
    border_color = self._get_most_frequent_edge_color(source)

    return ImageOps.expand(source, border=padding, fill=border_color)
|
285
|
+
|
209
286
|
def __call__(
|
210
287
|
self,
|
211
288
|
doc: DoclingDocument,
|
@@ -238,7 +315,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
238
315
|
assert isinstance(el.item, TextItem)
|
239
316
|
elements.append(el.item)
|
240
317
|
labels.append(el.item.label)
|
241
|
-
images.append(
|
318
|
+
images.append(
|
319
|
+
self._pad_with_most_frequent_edge_color(el.image, (20, 10, 20, 10))
|
320
|
+
)
|
242
321
|
|
243
322
|
outputs = self.code_formula_model.predict(images, labels)
|
244
323
|
|
@@ -203,6 +203,7 @@ class LayoutPostprocessor:
|
|
203
203
|
"""Initialize processor with cells and spatial indices."""
|
204
204
|
self.cells = cells
|
205
205
|
self.page_size = page_size
|
206
|
+
self.all_clusters = clusters
|
206
207
|
self.regular_clusters = [
|
207
208
|
c for c in clusters if c.label not in self.SPECIAL_TYPES
|
208
209
|
]
|
@@ -267,7 +268,7 @@ class LayoutPostprocessor:
|
|
267
268
|
# Handle orphaned cells
|
268
269
|
unassigned = self._find_unassigned_cells(clusters)
|
269
270
|
if unassigned:
|
270
|
-
next_id = max((c.id for c in
|
271
|
+
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
271
272
|
orphan_clusters = []
|
272
273
|
for i, cell in enumerate(unassigned):
|
273
274
|
conf = 1.0
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.26.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -185,7 +185,7 @@ For individual model usage, please refer to the model licenses found in the orig
|
|
185
185
|
|
186
186
|
Docling has been brought to you by IBM.
|
187
187
|
|
188
|
-
[supported_formats]: https://ds4sd.github.io/docling/supported_formats/
|
188
|
+
[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
|
189
189
|
[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
|
190
190
|
[integrations]: https://ds4sd.github.io/docling/integrations/
|
191
191
|
|
@@ -19,21 +19,21 @@ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqaf
|
|
19
19
|
docling/backend/xml/uspto_backend.py,sha256=IGUNeF2xpLeaVrX6nKb-jXgtSYD2ozULsrDPcrI1IbQ,71040
|
20
20
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
21
21
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
-
docling/cli/main.py,sha256=
|
22
|
+
docling/cli/main.py,sha256=unokSvmqZqFE_yLUQGBIo7q9QjdFrrE8EqnHxnqpGtM,16863
|
23
23
|
docling/cli/models.py,sha256=DDnz-boX2MexPxC8OnOMPgSPG0iwseT3xkkCfgPrZis,3969
|
24
24
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
25
25
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
26
26
|
docling/datamodel/base_models.py,sha256=kMDT-rFhtJUFOOOry4wd2PzCMTLFixFklgSgmRDMS64,7201
|
27
27
|
docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
|
28
|
-
docling/datamodel/pipeline_options.py,sha256=
|
28
|
+
docling/datamodel/pipeline_options.py,sha256=L5ZmMZOkE0T2419uk_butX3ZoY8GhLJcmuGm2Gf1OHU,11991
|
29
29
|
docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
|
30
30
|
docling/document_converter.py,sha256=AeiSmKzWcnOkZm8O-KIBG72g3l4W2CAsq3yEbfC1tiE,13184
|
31
31
|
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
32
32
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
33
|
docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,2679
|
34
34
|
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
35
|
-
docling/models/code_formula_model.py,sha256=
|
36
|
-
docling/models/document_picture_classifier.py,sha256=
|
35
|
+
docling/models/code_formula_model.py,sha256=mOu5luYMzyrCCr8MRGOciNcSvULpQysDd_FXn96WPc8,11477
|
36
|
+
docling/models/document_picture_classifier.py,sha256=fz77RsTdlnA_yC47O-KUq2xVWMKX0_9jm_EGcHliw-E,6235
|
37
37
|
docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
|
38
38
|
docling/models/hf_vlm_model.py,sha256=NUtLEuG-kNGJeDHWmQKAAOZG4WF0a5hn-KXUUM1mHBQ,6820
|
39
39
|
docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
|
@@ -45,7 +45,7 @@ docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0
|
|
45
45
|
docling/models/picture_description_vlm_model.py,sha256=EvKn4zWgTsQnbMFEoDhU3Ox4Pu5DkPqd2QewsGoXULU,3641
|
46
46
|
docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
|
47
47
|
docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
|
48
|
-
docling/models/table_structure_model.py,sha256=
|
48
|
+
docling/models/table_structure_model.py,sha256=gEXHRtHlLFUsP_Gs2EPaBJL-3KlMHa5HLUwzr3kN4_Y,11225
|
49
49
|
docling/models/tesseract_ocr_cli_model.py,sha256=F5EhS4NDEmLkPq-a0P7o2LrzjmJgACzlYXTDvtD3NtY,9343
|
50
50
|
docling/models/tesseract_ocr_model.py,sha256=ikGu6QNknLG64c9yYIb0Ix6MGhBzOoa1ODbNc8MT5r8,8508
|
51
51
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -58,15 +58,15 @@ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
|
59
59
|
docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
|
60
60
|
docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
|
61
|
-
docling/utils/layout_postprocessor.py,sha256=
|
61
|
+
docling/utils/layout_postprocessor.py,sha256=kdIk5TpAEXvsQUvkdALBDnAbjc4I_j8s8w6GEvbu4f0,24304
|
62
62
|
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
63
63
|
docling/utils/model_downloader.py,sha256=sxAQvjiIu9m2Ur5Ot5C5SATmgWJAHi0xSjzxj8QXYJk,3213
|
64
64
|
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
65
65
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
66
66
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
67
67
|
docling/utils/visualization.py,sha256=cmbIroPQXPmJdFrNIfpC26WpijBwx05qmpu3QhiG1EI,2850
|
68
|
-
docling-2.
|
69
|
-
docling-2.
|
70
|
-
docling-2.
|
71
|
-
docling-2.
|
72
|
-
docling-2.
|
68
|
+
docling-2.26.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
69
|
+
docling-2.26.0.dist-info/METADATA,sha256=IPh-vv9mpl1sHnl4pkEsLGrdYeBlaJ-mfN28sn_zito,8803
|
70
|
+
docling-2.26.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
71
|
+
docling-2.26.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
72
|
+
docling-2.26.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|