docling 2.25.2__py3-none-any.whl → 2.26.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
docling/cli/main.py CHANGED
@@ -210,7 +210,7 @@ def convert(
     table_mode: Annotated[
         TableFormerMode,
         typer.Option(..., help="The mode to use in the table structure model."),
-    ] = TableFormerMode.FAST,
+    ] = TableFormerMode.ACCURATE,
     enrich_code: Annotated[
         bool,
         typer.Option(..., help="Enable the code enrichment model in the pipeline."),
@@ -245,7 +245,7 @@ def convert(
         typer.Option(
             ...,
             "--abort-on-error/--no-abort-on-error",
-            help="If enabled, the bitmap content will be processed using OCR.",
+            help="If enabled, the processing will be aborted when the first error is encountered.",
         ),
     ] = False,
     output: Annotated[
docling/datamodel/pipeline_options.py CHANGED
@@ -99,7 +99,7 @@ class TableStructureOptions(BaseModel):
         # are merged across table columns.
         # False: Let table structure model define the text cells, ignore PDF cells.
     )
-    mode: TableFormerMode = TableFormerMode.FAST
+    mode: TableFormerMode = TableFormerMode.ACCURATE
 
 
 class OcrOptions(BaseModel):
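
Note: the CLI and pipeline-options hunks above switch the default table structure mode from FAST to ACCURATE. Callers who prefer the faster mode can still select it explicitly; below is a minimal sketch using the standard docling converter API (the input path is a placeholder):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
    from docling.document_converter import DocumentConverter, PdfFormatOption

    # Opt back into the previous FAST default for table structure recognition.
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.mode = TableFormerMode.FAST

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
    )
    result = converter.convert("example.pdf")  # placeholder input path
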
docling/models/code_formula_model.py CHANGED
@@ -1,4 +1,5 @@
 import re
+from collections import Counter
 from pathlib import Path
 from typing import Iterable, List, Literal, Optional, Tuple, Union
 
@@ -11,7 +12,7 @@ from docling_core.types.doc import (
     TextItem,
 )
 from docling_core.types.doc.labels import CodeLanguageLabel
-from PIL import Image
+from PIL import Image, ImageOps
 from pydantic import BaseModel
 
 from docling.datamodel.base_models import ItemAndImageEnrichmentElement
@@ -65,7 +66,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
     _model_repo_folder = "ds4sd--CodeFormula"
     elements_batch_size = 5
     images_scale = 1.66  # = 120 dpi, aligned with training data resolution
-    expansion_factor = 0.03
+    expansion_factor = 0.18
 
     def __init__(
         self,
@@ -124,7 +125,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             repo_id="ds4sd/CodeFormula",
             force_download=force,
             local_dir=local_dir,
-            revision="v1.0.1",
+            revision="v1.0.2",
         )
 
         return Path(download_path)
@@ -175,7 +176,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         - The second element is the extracted language if a match is found;
           otherwise, `None`.
         """
-        pattern = r"^<_([^>]+)_>\s*(.*)"
+        pattern = r"^<_([^_>]+)_>\s(.*)"
         match = re.match(pattern, input_string, flags=re.DOTALL)
         if match:
             language = str(match.group(1))  # the captured programming language
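
Note: the tightened pattern above requires exactly one whitespace character after the language tag and no longer allows underscores inside it. A small, self-contained illustration of what the new pattern captures (the sample string is made up):

    import re

    pattern = r"^<_([^_>]+)_>\s(.*)"
    sample = "<_python_> print('hello')"  # hypothetical model output
    match = re.match(pattern, sample, flags=re.DOTALL)
    if match:
        language, code = match.group(1), match.group(2)
        # language == "python", code == "print('hello')"
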
@@ -206,6 +207,82 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
         except ValueError:
             return CodeLanguageLabel.UNKNOWN
 
+    def _get_most_frequent_edge_color(self, pil_img: Image.Image):
+        """
+        Compute the most frequent color along the outer edges of a PIL image.
+
+        Parameters
+        ----------
+        pil_img : Image.Image
+            A PIL Image in any mode (L, RGB, RGBA, etc.).
+
+        Returns
+        -------
+        (int) or (tuple): The most common edge color as a scalar (for grayscale) or
+            tuple (for RGB/RGBA).
+        """
+        # Convert to NumPy array for easy pixel access
+        img_np = np.array(pil_img)
+
+        if img_np.ndim == 2:
+            # Grayscale-like image: shape (H, W)
+            # Extract edges: top row, bottom row, left col, right col
+            top = img_np[0, :]  # shape (W,)
+            bottom = img_np[-1, :]  # shape (W,)
+            left = img_np[:, 0]  # shape (H,)
+            right = img_np[:, -1]  # shape (H,)
+
+            # Concatenate all edges
+            edges = np.concatenate([top, bottom, left, right])
+
+            # Count frequencies
+            freq = Counter(edges.tolist())
+            most_common_value, _ = freq.most_common(1)[0]
+            return int(most_common_value)  # single channel color
+
+        else:
+            # Color image: shape (H, W, C)
+            top = img_np[0, :, :]  # shape (W, C)
+            bottom = img_np[-1, :, :]  # shape (W, C)
+            left = img_np[:, 0, :]  # shape (H, C)
+            right = img_np[:, -1, :]  # shape (H, C)
+
+            # Concatenate edges along first axis
+            edges = np.concatenate([top, bottom, left, right], axis=0)
+
+            # Convert each color to a tuple for counting
+            edges_as_tuples = [tuple(pixel) for pixel in edges]
+            freq = Counter(edges_as_tuples)
+            most_common_value, _ = freq.most_common(1)[0]
+            return most_common_value  # e.g. (R, G, B) or (R, G, B, A)
+
+    def _pad_with_most_frequent_edge_color(
+        self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
+    ):
+        """
+        Pads an image (PIL or NumPy array) using the most frequent edge color.
+
+        Parameters
+        ----------
+        img : Union[Image.Image, np.ndarray]
+            The original image.
+        padding : tuple
+            Padding (left, top, right, bottom) in pixels.
+
+        Returns
+        -------
+        Image.Image: A new PIL image with the specified padding.
+        """
+        if isinstance(img, np.ndarray):
+            pil_img = Image.fromarray(img)
+        else:
+            pil_img = img
+
+        most_freq_color = self._get_most_frequent_edge_color(pil_img)
+
+        padded_img = ImageOps.expand(pil_img, border=padding, fill=most_freq_color)
+        return padded_img
+
     def __call__(
         self,
         doc: DoclingDocument,
@@ -238,7 +315,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
             assert isinstance(el.item, TextItem)
             elements.append(el.item)
             labels.append(el.item.label)
-            images.append(el.image)
+            images.append(
+                self._pad_with_most_frequent_edge_color(el.image, (20, 10, 20, 10))
+            )
 
         outputs = self.code_formula_model.predict(images, labels)
 
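Note: together with the larger expansion_factor, the new edge-color padding gives the CodeFormula model more visual context around each code or formula crop. To exercise this enrichment path end to end, a minimal sketch assuming the do_code_enrichment / do_formula_enrichment flags on PdfPipelineOptions (the input path is a placeholder):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    # Assumed flags: enable the code/formula enrichment models in the PDF pipeline.
    opts = PdfPipelineOptions()
    opts.do_code_enrichment = True
    opts.do_formula_enrichment = True

    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
    )
    doc = converter.convert("paper_with_code.pdf").document  # placeholder input path
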
docling/models/document_picture_classifier.py CHANGED
@@ -113,7 +113,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
             repo_id="ds4sd/DocumentFigureClassifier",
             force_download=force,
             local_dir=local_dir,
-            revision="v1.0.0",
+            revision="v1.0.1",
         )
 
         return Path(download_path)
docling/models/table_structure_model.py CHANGED
@@ -95,7 +95,7 @@ class TableStructureModel(BasePageModel):
             repo_id="ds4sd/docling-models",
             force_download=force,
             local_dir=local_dir,
-            revision="v2.1.0",
+            revision="v2.2.0",
         )
 
         return Path(download_path)
docling-2.25.2.dist-info/METADATA → docling-2.26.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.25.2
+Version: 2.26.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
docling-2.25.2.dist-info/RECORD → docling-2.26.0.dist-info/RECORD
@@ -19,21 +19,21 @@ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqaf
 docling/backend/xml/uspto_backend.py,sha256=IGUNeF2xpLeaVrX6nKb-jXgtSYD2ozULsrDPcrI1IbQ,71040
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
+docling/cli/main.py,sha256=unokSvmqZqFE_yLUQGBIo7q9QjdFrrE8EqnHxnqpGtM,16863
 docling/cli/models.py,sha256=DDnz-boX2MexPxC8OnOMPgSPG0iwseT3xkkCfgPrZis,3969
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=kMDT-rFhtJUFOOOry4wd2PzCMTLFixFklgSgmRDMS64,7201
 docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
-docling/datamodel/pipeline_options.py,sha256=YpWqCqkA44YUFPhiBg_LYcfOAXxNhv10vZKrkfLtJ_I,11987
+docling/datamodel/pipeline_options.py,sha256=L5ZmMZOkE0T2419uk_butX3ZoY8GhLJcmuGm2Gf1OHU,11991
 docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
 docling/document_converter.py,sha256=AeiSmKzWcnOkZm8O-KIBG72g3l4W2CAsq3yEbfC1tiE,13184
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
 docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,2679
 docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
-docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
-docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
+docling/models/code_formula_model.py,sha256=mOu5luYMzyrCCr8MRGOciNcSvULpQysDd_FXn96WPc8,11477
+docling/models/document_picture_classifier.py,sha256=fz77RsTdlnA_yC47O-KUq2xVWMKX0_9jm_EGcHliw-E,6235
 docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
 docling/models/hf_vlm_model.py,sha256=NUtLEuG-kNGJeDHWmQKAAOZG4WF0a5hn-KXUUM1mHBQ,6820
 docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
@@ -45,7 +45,7 @@ docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0
 docling/models/picture_description_vlm_model.py,sha256=EvKn4zWgTsQnbMFEoDhU3Ox4Pu5DkPqd2QewsGoXULU,3641
 docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
 docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
-docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
+docling/models/table_structure_model.py,sha256=gEXHRtHlLFUsP_Gs2EPaBJL-3KlMHa5HLUwzr3kN4_Y,11225
 docling/models/tesseract_ocr_cli_model.py,sha256=F5EhS4NDEmLkPq-a0P7o2LrzjmJgACzlYXTDvtD3NtY,9343
 docling/models/tesseract_ocr_model.py,sha256=ikGu6QNknLG64c9yYIb0Ix6MGhBzOoa1ODbNc8MT5r8,8508
 docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -65,8 +65,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
 docling/utils/visualization.py,sha256=cmbIroPQXPmJdFrNIfpC26WpijBwx05qmpu3QhiG1EI,2850
-docling-2.25.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
-docling-2.25.2.dist-info/METADATA,sha256=NsR1pyqk-Q5G5pHrpaLf6TCQEE-r-hGrEB9Hpqdgykk,8803
-docling-2.25.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-docling-2.25.2.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
-docling-2.25.2.dist-info/RECORD,,
+docling-2.26.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.26.0.dist-info/METADATA,sha256=IPh-vv9mpl1sHnl4pkEsLGrdYeBlaJ-mfN28sn_zito,8803
+docling-2.26.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+docling-2.26.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
+docling-2.26.0.dist-info/RECORD,,