docling 2.49.0__py3-none-any.whl → 2.51.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
30
30
  page_no: int,
31
31
  create_words: bool = True,
32
32
  create_textlines: bool = True,
33
+ keep_chars: bool = False,
34
+ keep_lines: bool = False,
35
+ keep_images: bool = True,
33
36
  ):
34
37
  self._ppage = page_obj
35
38
  self._dp_doc = dp_doc
36
39
  self._page_no = page_no
40
+
37
41
  self._create_words = create_words
38
42
  self._create_textlines = create_textlines
39
43
 
44
+ self._keep_chars = keep_chars
45
+ self._keep_lines = keep_lines
46
+ self._keep_images = keep_images
47
+
40
48
  self._dpage: Optional[SegmentedPdfPage] = None
41
49
  self._unloaded = False
42
50
  self.valid = (self._ppage is not None) and (self._dp_doc is not None)
@@ -47,8 +55,12 @@ class DoclingParseV4PageBackend(PdfPageBackend):
47
55
 
48
56
  seg_page = self._dp_doc.get_page(
49
57
  self._page_no + 1,
58
+ keep_chars=self._keep_chars,
59
+ keep_lines=self._keep_lines,
60
+ keep_bitmaps=self._keep_images,
50
61
  create_words=self._create_words,
51
62
  create_textlines=self._create_textlines,
63
+ enforce_same_font=True,
52
64
  )
53
65
 
54
66
  # In Docling, all TextCell instances are expected with top-left origin.
@@ -467,13 +467,14 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
467
467
 
468
468
  @contextmanager
469
469
  def _use_hyperlink(self, tag: Tag):
470
+ old_hyperlink: Union[AnyUrl, Path, None] = None
471
+ new_hyperlink: Union[AnyUrl, Path, None] = None
470
472
  this_href = tag.get("href")
471
473
  if this_href is None:
472
474
  yield None
473
475
  else:
474
476
  if isinstance(this_href, str) and this_href:
475
- old_hyperlink: Union[AnyUrl, Path, None] = self.hyperlink
476
- new_hyperlink: Union[AnyUrl, Path, None] = None
477
+ old_hyperlink = self.hyperlink
477
478
  if self.original_url is not None:
478
479
  this_href = urljoin(str(self.original_url), str(this_href))
479
480
  # ugly fix for relative links since pydantic does not support them.
@@ -237,7 +237,9 @@ class PdfBackend(str, Enum):
237
237
 
238
238
 
239
239
  # Define an enum for the ocr engines
240
- @deprecated("Use ocr_factory.registered_enum")
240
+ @deprecated(
241
+ "Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
242
+ )
241
243
  class OcrEngine(str, Enum):
242
244
  """Enum of valid OCR engines."""
243
245
 
@@ -283,10 +285,10 @@ class LayoutOptions(BaseModel):
283
285
  keep_empty_clusters: bool = (
284
286
  False # Whether to keep clusters that contain no text cells
285
287
  )
288
+ model_spec: LayoutModelConfig = DOCLING_LAYOUT_HERON
286
289
  skip_cell_assignment: bool = (
287
290
  False # Skip cell-to-cluster assignment for VLM-only processing
288
291
  )
289
- model_spec: LayoutModelConfig = DOCLING_LAYOUT_V2
290
292
 
291
293
 
292
294
  class AsrPipelineOptions(PipelineOptions):
@@ -91,7 +91,7 @@ class LayoutModel(BasePageModel):
91
91
  local_dir: Optional[Path] = None,
92
92
  force: bool = False,
93
93
  progress: bool = False,
94
- layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2,
94
+ layout_model_config: LayoutModelConfig = LayoutOptions().model_spec, # use default
95
95
  ) -> Path:
96
96
  return download_hf_model(
97
97
  repo_id=layout_model_config.repo_id,
@@ -122,8 +122,8 @@ class LayoutModel(BasePageModel):
122
122
  left_clusters = [c for c in clusters if c.label not in exclude_labels]
123
123
  right_clusters = [c for c in clusters if c.label in exclude_labels]
124
124
  # Create a deep copy of the original image for both sides
125
- left_image = copy.deepcopy(page.image)
126
- right_image = copy.deepcopy(page.image)
125
+ left_image = page.image.copy()
126
+ right_image = page.image.copy()
127
127
 
128
128
  # Draw clusters on both images
129
129
  draw_clusters(left_image, left_clusters, scale_x, scale_y)
@@ -90,7 +90,7 @@ class PagePreprocessingModel(BasePageModel):
90
90
 
91
91
  # DEBUG code:
92
92
  def draw_text_boxes(image, cells, show: bool = False):
93
- draw = ImageDraw.Draw(image)
93
+ draw = ImageDraw.Draw(image.copy())
94
94
  for c in cells:
95
95
  x0, y0, x1, y1 = (
96
96
  c.to_bounding_box().l,
@@ -94,7 +94,7 @@ class TableStructureModel(BasePageModel):
94
94
  ) -> Path:
95
95
  return download_hf_model(
96
96
  repo_id="ds4sd/docling-models",
97
- revision="v2.2.0",
97
+ revision="v2.3.0",
98
98
  local_dir=local_dir,
99
99
  force=force,
100
100
  progress=progress,
@@ -4,6 +4,7 @@ from typing import Optional
4
4
 
5
5
  from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2
6
6
  from docling.datamodel.pipeline_options import (
7
+ LayoutOptions,
7
8
  granite_picture_description,
8
9
  smolvlm_picture_description,
9
10
  )
@@ -47,7 +48,7 @@ def download_models(
47
48
  if with_layout:
48
49
  _log.info("Downloading layout model...")
49
50
  LayoutModel.download_models(
50
- local_dir=output_dir / DOCLING_LAYOUT_V2.model_repo_folder,
51
+ local_dir=output_dir / LayoutOptions().model_spec.model_repo_folder,
51
52
  force=force,
52
53
  progress=progress,
53
54
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.49.0
3
+ Version: 2.51.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,8 +27,8 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
- Requires-Dist: docling-parse<5.0.0,>=4.2.2
31
- Requires-Dist: docling-ibm-models<4,>=3.9.0
30
+ Requires-Dist: docling-parse<5.0.0,>=4.4.0
31
+ Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
34
34
  Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
@@ -101,17 +101,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
101
101
 
102
102
  ## Features
103
103
 
104
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
104
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
105
105
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
106
106
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
107
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
107
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
108
108
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
109
109
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
110
110
  * 🔍 Extensive OCR support for scanned PDFs and images
111
111
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
112
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
112
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
113
113
  * 💻 Simple and convenient CLI
114
114
 
115
+ ### What's new
116
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
117
+
115
118
  ### Coming soon
116
119
 
117
120
  * 📝 Metadata extraction, including title, authors, references & language
@@ -222,3 +225,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
222
225
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
223
226
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
224
227
  [integrations]: https://docling-project.github.io/docling/integrations/
228
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -9,8 +9,8 @@ docling/backend/asciidoc_backend.py,sha256=RDNLrPJHxROiM7-NQdZn3DdvAyiPAndbSWcZo
9
9
  docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
10
10
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
11
11
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
12
- docling/backend/docling_parse_v4_backend.py,sha256=MbCMxNGmoW4iuev9tX1Vt4jtIeak2kC9Uac3xQSRxeo,7509
13
- docling/backend/html_backend.py,sha256=MqtU9fA83lcjqb85lFTmGDedOH72WxTmwvj0ZzPur1I,42224
12
+ docling/backend/docling_parse_v4_backend.py,sha256=xCBbaaXjNNrOaod9tmBuCbe5mL_ipmTNG2XOxVbGG3w,7891
13
+ docling/backend/html_backend.py,sha256=7I3BQSmC7P47jpzXHt3OuPNhtVedJiZVEjjLykyx5pY,42245
14
14
  docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
15
15
  docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
16
16
  docling/backend/msexcel_backend.py,sha256=5JRbPwOjR1r45AMeIts1rj6InbOgLBf_CtAhvNPVmsQ,19157
@@ -40,7 +40,7 @@ docling/datamodel/base_models.py,sha256=vOt895z0GsFirHkkI3hM23e9oyUuz9RXfcGFtoIN
40
40
  docling/datamodel/document.py,sha256=ElY7G6FYJ6Bayyw433_tbnxyE47fnQRoBG_mygvOBrA,17370
41
41
  docling/datamodel/extraction.py,sha256=7dgvtK5SuvgfB8LHAwS1FwrW1kcMQJuJG0ol8uAQgoQ,1323
42
42
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
43
- docling/datamodel/pipeline_options.py,sha256=0J0xVOSfI3pqRMkXlzX_rtmVBgCTsR2QJz54xugP8sg,10963
43
+ docling/datamodel/pipeline_options.py,sha256=bwBZoQbjk--5vE7Vz7N6KEbew-b93ge0ez1-cDPlUnQ,11019
44
44
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
45
45
  docling/datamodel/pipeline_options_vlm_model.py,sha256=AcqqThSW74hwQ6x7pazzm57LnJiUqB7gQi5wFayGlbk,2628
46
46
  docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
@@ -52,16 +52,16 @@ docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDW
52
52
  docling/models/code_formula_model.py,sha256=XRugm4EwifLRc-TrAk-glKlktJP-nAPneKh2EOovkJU,11308
53
53
  docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
54
54
  docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
55
- docling/models/layout_model.py,sha256=Nfbo6keMB4vVjGoZdFMqD9CmZcWh-0bE3LkRjJTDJQ0,9146
55
+ docling/models/layout_model.py,sha256=2D7Ey2Mvtzyq9KARIFLaUZKzsR661h7Zggwn0IM9R3c,9154
56
56
  docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
57
57
  docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
58
- docling/models/page_preprocessing_model.py,sha256=rHNX1uP1ScTjVUlsxZ0eamK2uNUqI94WpnyrP10Pj6k,5277
58
+ docling/models/page_preprocessing_model.py,sha256=EmusNexws5ZmR93js_saVU0BedqZ_HIHQeY7lcf52tI,5284
59
59
  docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
60
60
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
61
61
  docling/models/picture_description_vlm_model.py,sha256=5BJvaF3PHuL9lCVYqPv9krh3h_7YwNSdKYw1EVEj13k,4156
62
62
  docling/models/rapid_ocr_model.py,sha256=7yZC7I1qoC9xC8xJIjTk2c8VFm89RfB6Vr7IDOnr5gs,7102
63
63
  docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
64
- docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
64
+ docling/models/table_structure_model.py,sha256=7vO8LisdoqCTsY8X8lsk9d-oD2hVjUtdaWlkMTQxEg0,12518
65
65
  docling/models/tesseract_ocr_cli_model.py,sha256=I3Gn28Y-LD8OfvyCElN9fLiNgpo2sT0uMkVt258253s,12881
66
66
  docling/models/tesseract_ocr_model.py,sha256=GdI5Cjfi87qcehVbM3wdKRvKkl_F9A4bwTUbjXZCJYA,10745
67
67
  docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
@@ -93,15 +93,15 @@ docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
93
93
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
94
94
  docling/utils/layout_postprocessor.py,sha256=sE9UR3Nv4iOk26uoIsN3bFioE7ScfAjj0orDBDneLXg,25166
95
95
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
96
- docling/utils/model_downloader.py,sha256=3vijCsAIVwWqehGBDRxRq7mJ3yRb9-zBsG00iqjqegU,4076
96
+ docling/utils/model_downloader.py,sha256=lAIyevIC6dyv1TS0ElRSAGNylB5n_V8pWs1PhxH8wAQ,4104
97
97
  docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
98
98
  docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,1842
99
99
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
100
100
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
101
101
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
102
- docling-2.49.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
103
- docling-2.49.0.dist-info/METADATA,sha256=Gn1u-LwLRMCqHamlyu1M4w9a8NvGfk-jfcCh0XjhsfQ,10731
104
- docling-2.49.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
105
- docling-2.49.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
106
- docling-2.49.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
107
- docling-2.49.0.dist-info/RECORD,,
102
+ docling-2.51.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
103
+ docling-2.51.0.dist-info/METADATA,sha256=oVomcZQUJIX-QnMmIFggzyjuhea7jbgVXus9bZZkYDU,10886
104
+ docling-2.51.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
105
+ docling-2.51.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
106
+ docling-2.51.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
107
+ docling-2.51.0.dist-info/RECORD,,