docling 2.4.0__py3-none-any.whl → 2.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,7 +29,7 @@ class DoclingParsePageBackend(PdfPageBackend):
29
29
  self._dpage = parsed_page["pages"][0]
30
30
  else:
31
31
  _log.info(
32
- f"An error occured when loading page {page_no} of document {document_hash}."
32
+ f"An error occurred when loading page {page_no} of document {document_hash}."
33
33
  )
34
34
 
35
35
  def is_valid(self) -> bool:
@@ -31,7 +31,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
31
31
  self._dpage = parsed_page["pages"][0]
32
32
  else:
33
33
  _log.info(
34
- f"An error occured when loading page {page_no} of document {document_hash}."
34
+ f"An error occurred when loading page {page_no} of document {document_hash}."
35
35
  )
36
36
 
37
37
  def is_valid(self) -> bool:
@@ -29,7 +29,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
29
29
  self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
30
30
  except PdfiumError as e:
31
31
  _log.info(
32
- f"An exception occured when loading page {page_no} of document {document_hash}.",
32
+ f"An exception occurred when loading page {page_no} of document {document_hash}.",
33
33
  exc_info=True,
34
34
  )
35
35
  self.valid = False
@@ -2,6 +2,7 @@ import logging
2
2
  from typing import Iterable
3
3
 
4
4
  import numpy
5
+ import torch
5
6
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
7
 
7
8
  from docling.datamodel.base_models import OcrCell, Page
@@ -30,8 +31,12 @@ class EasyOcrModel(BaseOcrModel):
30
31
  "Alternatively, Docling has support for other OCR engines. See the documentation."
31
32
  )
32
33
 
34
+ use_gpu = (
35
+ False if torch.backends.mps.is_available() else self.options.use_gpu
36
+ )
33
37
  self.reader = easyocr.Reader(
34
38
  lang_list=self.options.lang,
39
+ gpu=use_gpu,
35
40
  model_storage_directory=self.options.model_storage_directory,
36
41
  download_enabled=self.options.download_enabled,
37
42
  )
@@ -22,25 +22,37 @@ class TesseractOcrModel(BaseOcrModel):
22
22
  self.reader = None
23
23
 
24
24
  if self.enabled:
25
- setup_errmsg = (
25
+ install_errmsg = (
26
26
  "tesserocr is not correctly installed. "
27
27
  "Please install it via `pip install tesserocr` to use this OCR engine. "
28
- "Note that tesserocr might have to be manually compiled for working with"
28
+ "Note that tesserocr might have to be manually compiled for working with "
29
29
  "your Tesseract installation. The Docling documentation provides examples for it. "
30
- "Alternatively, Docling has support for other OCR engines. See the documentation."
30
+ "Alternatively, Docling has support for other OCR engines. See the documentation: "
31
+ "https://ds4sd.github.io/docling/installation/"
31
32
  )
33
+ missing_langs_errmsg = (
34
+ "tesserocr is not correctly configured. No language models have been detected. "
35
+ "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
36
+ "You can find more information how to setup other OCR engines in Docling "
37
+ "documentation: "
38
+ "https://ds4sd.github.io/docling/installation/"
39
+ )
40
+
32
41
  try:
33
42
  import tesserocr
34
43
  except ImportError:
35
- raise ImportError(setup_errmsg)
36
-
44
+ raise ImportError(install_errmsg)
37
45
  try:
38
46
  tesseract_version = tesserocr.tesseract_version()
39
- _log.debug("Initializing TesserOCR: %s", tesseract_version)
40
47
  except:
41
- raise ImportError(setup_errmsg)
48
+ raise ImportError(install_errmsg)
49
+
50
+ _, tesserocr_languages = tesserocr.get_languages()
51
+ if not tesserocr_languages:
52
+ raise ImportError(missing_langs_errmsg)
42
53
 
43
54
  # Initialize the tesseractAPI
55
+ _log.debug("Initializing TesserOCR: %s", tesseract_version)
44
56
  lang = "+".join(self.options.lang)
45
57
  if self.options.path is not None:
46
58
  self.reader = tesserocr.PyTessBaseAPI(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.4.0
3
+ Version: 2.4.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -53,6 +53,10 @@ Description-Content-Type: text/markdown
53
53
 
54
54
  # Docling
55
55
 
56
+ <p align="center">
57
+ <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
58
+ </p>
59
+
56
60
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
57
61
  [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
58
62
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
@@ -66,19 +70,22 @@ Description-Content-Type: text/markdown
66
70
 
67
71
  Docling parses documents and exports them to the desired format with ease and speed.
68
72
 
69
-
70
73
  ## Features
71
74
 
72
75
  * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
73
76
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
74
77
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
75
- * 📝 Metadata extraction, including title, authors, references & language
76
- * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
78
+ * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
77
79
  * 🔍 OCR support for scanned PDFs
78
80
  * 💻 Simple and convenient CLI
79
81
 
80
82
  Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
81
83
 
84
+ ### Coming soon
85
+
86
+ * ♾️ Equation & code extraction
87
+ * 📝 Metadata extraction, including title, authors, references & language
88
+ * 🦜🔗 Native LangChain extension
82
89
 
83
90
  ## Installation
84
91
 
@@ -104,16 +111,13 @@ result = converter.convert(source)
104
111
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
105
112
  ```
106
113
 
107
-
108
114
  Check out [Getting started](https://ds4sd.github.io/docling/).
109
115
  You will find lots of tuning options to leverage all the advanced capabilities.
110
116
 
111
-
112
117
  ## Get help and support
113
118
 
114
119
  Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
115
120
 
116
-
117
121
  ## Technical report
118
122
 
119
123
  For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
@@ -122,7 +126,6 @@ For more details on Docling's inner workings, check out the [Docling Technical R
122
126
 
123
127
  Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
124
128
 
125
-
126
129
  ## References
127
130
 
128
131
  If you use Docling in your projects, please consider citing the following:
@@ -142,6 +145,10 @@ If you use Docling in your projects, please consider citing the following:
142
145
 
143
146
  ## License
144
147
 
145
- The Docling codebase is under MIT license.
148
+ The Docling codebase is under MIT license.
146
149
  For individual model usage, please refer to the model licenses found in the original packages.
147
150
 
151
+ ## IBM ❤️ Open Source AI
152
+
153
+ Docling has been brought to you by IBM.
154
+
@@ -2,14 +2,14 @@ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=-or6kWVV7egQeyIuN-vI0Tr7Q1htalBZSlhgq_G2RdU,1678
4
4
  docling/backend/asciidoc_backend.py,sha256=kXZxOLk_LvLFVZwnJVVwjmvc3QWZ0iiG7VnwjgtC3hI,14051
5
- docling/backend/docling_parse_backend.py,sha256=TaIMli9vePd3fz9L6S4t75JPYZDpgYBLRGfWjbc9Hbk,7632
6
- docling/backend/docling_parse_v2_backend.py,sha256=QlVU8NgqKvVCa99E8oDa2Xvy__kq30C-myGY3o9Qoq4,8588
5
+ docling/backend/docling_parse_backend.py,sha256=csWy6ZGxDuZfNr0YTrUU40DXqelN_TJksWIYoXxZMjU,7633
6
+ docling/backend/docling_parse_v2_backend.py,sha256=gUr9_fwHbkj238oYQPJ9AxpjFL2jGvhjBlBQPblmSAg,8589
7
7
  docling/backend/html_backend.py,sha256=p3WlYta1f3e4osmvVR12KIUYLJimveTX8UwEkyPt7_g,15161
8
8
  docling/backend/md_backend.py,sha256=tmuSCghjor9PqKIiVieCuZ4_t5JEjZMy3cq7u3yTgyU,14032
9
9
  docling/backend/mspowerpoint_backend.py,sha256=J472AIH_IXvGg3D0FDmXhue1At_VSBD6n15c64Kxttw,15446
10
10
  docling/backend/msword_backend.py,sha256=FAUdP74QxGKo2xMZQ4WQGYwtpIBCTJ_FG17PBpRwhxI,17230
11
11
  docling/backend/pdf_backend.py,sha256=unnw7QiRE1VXg6Pj-eYrtnFGrp5SSYiI324OlFxyv6c,2050
12
- docling/backend/pypdfium2_backend.py,sha256=MJX6fQqwK3r967fyAAs-RA_YIkeQvhgsLkQAgaBTgaE,8995
12
+ docling/backend/pypdfium2_backend.py,sha256=B4bfv-dfzlWiKTfF8LN5fto_99YBu8A2c1_XIVwRUWI,8996
13
13
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  docling/cli/main.py,sha256=IOeIpGoK_5AeE_6LYTU_nfZjqpZ5xeGaTCB8Vfsama0,9334
15
15
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -22,13 +22,13 @@ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  docling/models/base_model.py,sha256=Yq_-FmUhqhE20vXYG3WiQXDRTIPjik1CyuEZ8iYTGAY,701
23
23
  docling/models/base_ocr_model.py,sha256=Ti0glL-_DVRfmP3MpywYVmkNf5RP6qhRg_UKzJuV1Dc,5663
24
24
  docling/models/ds_glm_model.py,sha256=2OpWW8MMzCIshrtP36gDSRPYOCjv1ex34FqxD2nYjP4,11986
25
- docling/models/easyocr_model.py,sha256=23hWq484qVS3nkch6nRRWowfQamN-McFZgfbHfp5Vuo,3818
25
+ docling/models/easyocr_model.py,sha256=zl-B3OR81r0AHt5zFP_A5TqJUSjIjo1-x4GsMZyIzVk,3983
26
26
  docling/models/layout_model.py,sha256=ZvbTSyxvXB5yLHNEti0Wv3trz0vwGuHySI5TCdApb0U,14011
27
27
  docling/models/page_assemble_model.py,sha256=kSGNiRKhmzkpFH7xCiT3rulMsgJmUXFa6Th_eB-cLEk,7103
28
28
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
29
29
  docling/models/table_structure_model.py,sha256=-ANSQpiN2avt3B9sbi7dHcoULUJbMBalAR5xxlrM7To,8421
30
30
  docling/models/tesseract_ocr_cli_model.py,sha256=ZflwQcD7YjhPqEB8bbgNgP14OBD4NNEJefUS8Lbr5X0,6511
31
- docling/models/tesseract_ocr_model.py,sha256=AccCgaYNzGryiJnkwR4sv2FeOdlSgO3uspdQOmo1sNY,5569
31
+ docling/models/tesseract_ocr_model.py,sha256=X9qlzwaTZLtSGXFIZuD7MO6EzFmHl1D-FjktUBko6us,6234
32
32
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
33
  docling/pipeline/base_pipeline.py,sha256=IF1XWYgUGbdB4-teLkmM4Hvg_UNEfPrGuhExMRTUsk8,7168
34
34
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
@@ -38,8 +38,8 @@ docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
38
38
  docling/utils/layout_utils.py,sha256=vlN0rc8i0ayRGn3WnaG-pdmqEL00KKGl2zez3Gj-hrk,32074
39
39
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
40
40
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
41
- docling-2.4.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
42
- docling-2.4.0.dist-info/METADATA,sha256=9o2Nd020wn0UeQ7d0ABRQt6UnVagPxTFson9bDzcbEA,6116
43
- docling-2.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
44
- docling-2.4.0.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
45
- docling-2.4.0.dist-info/RECORD,,
41
+ docling-2.4.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
42
+ docling-2.4.2.dist-info/METADATA,sha256=WCTdN8xVORJ5Nxt-1CQSmNEe4j5kNvgYu2BK25O0We4,6530
43
+ docling-2.4.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
44
+ docling-2.4.2.dist-info/entry_points.txt,sha256=VOSzV77znM52dz5ysaDuJ0ijl1cnfrh1ZPg8od5OcTs,48
45
+ docling-2.4.2.dist-info/RECORD,,