docling 2.50.0__py3-none-any.whl → 2.51.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -30,13 +30,21 @@ class DoclingParseV4PageBackend(PdfPageBackend):
30
30
  page_no: int,
31
31
  create_words: bool = True,
32
32
  create_textlines: bool = True,
33
+ keep_chars: bool = False,
34
+ keep_lines: bool = False,
35
+ keep_images: bool = True,
33
36
  ):
34
37
  self._ppage = page_obj
35
38
  self._dp_doc = dp_doc
36
39
  self._page_no = page_no
40
+
37
41
  self._create_words = create_words
38
42
  self._create_textlines = create_textlines
39
43
 
44
+ self._keep_chars = keep_chars
45
+ self._keep_lines = keep_lines
46
+ self._keep_images = keep_images
47
+
40
48
  self._dpage: Optional[SegmentedPdfPage] = None
41
49
  self._unloaded = False
42
50
  self.valid = (self._ppage is not None) and (self._dp_doc is not None)
@@ -47,8 +55,12 @@ class DoclingParseV4PageBackend(PdfPageBackend):
47
55
 
48
56
  seg_page = self._dp_doc.get_page(
49
57
  self._page_no + 1,
58
+ keep_chars=self._keep_chars,
59
+ keep_lines=self._keep_lines,
60
+ keep_bitmaps=self._keep_images,
50
61
  create_words=self._create_words,
51
62
  create_textlines=self._create_textlines,
63
+ enforce_same_font=True,
52
64
  )
53
65
 
54
66
  # In Docling, all TextCell instances are expected with top-left origin.
@@ -237,7 +237,9 @@ class PdfBackend(str, Enum):
237
237
 
238
238
 
239
239
  # Define an enum for the ocr engines
240
- @deprecated("Use ocr_factory.registered_enum")
240
+ @deprecated(
241
+ "Use get_ocr_factory().registered_kind to get a list of registered OCR engines."
242
+ )
241
243
  class OcrEngine(str, Enum):
242
244
  """Enum of valid OCR engines."""
243
245
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.50.0
3
+ Version: 2.51.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
27
27
  License-File: LICENSE
28
28
  Requires-Dist: pydantic<3.0.0,>=2.0.0
29
29
  Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
30
- Requires-Dist: docling-parse<5.0.0,>=4.2.2
30
+ Requires-Dist: docling-parse<5.0.0,>=4.4.0
31
31
  Requires-Dist: docling-ibm-models<4,>=3.9.1
32
32
  Requires-Dist: filetype<2.0.0,>=1.2.0
33
33
  Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
@@ -101,17 +101,20 @@ Docling simplifies document processing, parsing diverse formats — including ad
101
101
 
102
102
  ## Features
103
103
 
104
- * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
104
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
105
105
  * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
106
106
  * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
107
- * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
107
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
108
108
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
109
109
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
110
110
  * 🔍 Extensive OCR support for scanned PDFs and images
111
111
  * 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
112
- * 🎙️ Support for Audio with Automatic Speech Recognition (ASR) models
112
+ * 🎙️ Audio support with Automatic Speech Recognition (ASR) models
113
113
  * 💻 Simple and convenient CLI
114
114
 
115
+ ### What's new
116
+ * 📤 Structured [information extraction][extraction] \[🧪 beta\]
117
+
115
118
  ### Coming soon
116
119
 
117
120
  * 📝 Metadata extraction, including title, authors, references & language
@@ -222,3 +225,4 @@ The project was started by the AI for knowledge team at IBM Research Zurich.
222
225
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
223
226
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
224
227
  [integrations]: https://docling-project.github.io/docling/integrations/
228
+ [extraction]: https://docling-project.github.io/docling/examples/extraction/
@@ -9,7 +9,7 @@ docling/backend/asciidoc_backend.py,sha256=RDNLrPJHxROiM7-NQdZn3DdvAyiPAndbSWcZo
9
9
  docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
10
10
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
11
11
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
12
- docling/backend/docling_parse_v4_backend.py,sha256=MbCMxNGmoW4iuev9tX1Vt4jtIeak2kC9Uac3xQSRxeo,7509
12
+ docling/backend/docling_parse_v4_backend.py,sha256=xCBbaaXjNNrOaod9tmBuCbe5mL_ipmTNG2XOxVbGG3w,7891
13
13
  docling/backend/html_backend.py,sha256=7I3BQSmC7P47jpzXHt3OuPNhtVedJiZVEjjLykyx5pY,42245
14
14
  docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
15
15
  docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
@@ -40,7 +40,7 @@ docling/datamodel/base_models.py,sha256=vOt895z0GsFirHkkI3hM23e9oyUuz9RXfcGFtoIN
40
40
  docling/datamodel/document.py,sha256=ElY7G6FYJ6Bayyw433_tbnxyE47fnQRoBG_mygvOBrA,17370
41
41
  docling/datamodel/extraction.py,sha256=7dgvtK5SuvgfB8LHAwS1FwrW1kcMQJuJG0ol8uAQgoQ,1323
42
42
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
43
- docling/datamodel/pipeline_options.py,sha256=IkbBJGQjZ9nrxN9qN6L0KBLnf1F3BBg3tfCMWPmx0cQ,10966
43
+ docling/datamodel/pipeline_options.py,sha256=bwBZoQbjk--5vE7Vz7N6KEbew-b93ge0ez1-cDPlUnQ,11019
44
44
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
45
45
  docling/datamodel/pipeline_options_vlm_model.py,sha256=AcqqThSW74hwQ6x7pazzm57LnJiUqB7gQi5wFayGlbk,2628
46
46
  docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
@@ -99,9 +99,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
99
99
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
100
100
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
101
101
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
102
- docling-2.50.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
103
- docling-2.50.0.dist-info/METADATA,sha256=w6U8qf-fYMZi6EXxFXFLxs9WOSG3S0Ilblg-klEyK3Y,10731
104
- docling-2.50.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
105
- docling-2.50.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
106
- docling-2.50.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
107
- docling-2.50.0.dist-info/RECORD,,
102
+ docling-2.51.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
103
+ docling-2.51.0.dist-info/METADATA,sha256=oVomcZQUJIX-QnMmIFggzyjuhea7jbgVXus9bZZkYDU,10886
104
+ docling-2.51.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
105
+ docling-2.51.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
106
+ docling-2.51.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
107
+ docling-2.51.0.dist-info/RECORD,,