docling 2.4.0__tar.gz → 2.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {docling-2.4.0 → docling-2.4.1}/PKG-INFO +16 -9
  2. {docling-2.4.0 → docling-2.4.1}/README.md +15 -8
  3. {docling-2.4.0 → docling-2.4.1}/docling/backend/docling_parse_backend.py +1 -1
  4. {docling-2.4.0 → docling-2.4.1}/docling/backend/docling_parse_v2_backend.py +1 -1
  5. {docling-2.4.0 → docling-2.4.1}/docling/backend/pypdfium2_backend.py +1 -1
  6. {docling-2.4.0 → docling-2.4.1}/docling/models/tesseract_ocr_model.py +19 -7
  7. {docling-2.4.0 → docling-2.4.1}/pyproject.toml +1 -1
  8. {docling-2.4.0 → docling-2.4.1}/LICENSE +0 -0
  9. {docling-2.4.0 → docling-2.4.1}/docling/__init__.py +0 -0
  10. {docling-2.4.0 → docling-2.4.1}/docling/backend/__init__.py +0 -0
  11. {docling-2.4.0 → docling-2.4.1}/docling/backend/abstract_backend.py +0 -0
  12. {docling-2.4.0 → docling-2.4.1}/docling/backend/asciidoc_backend.py +0 -0
  13. {docling-2.4.0 → docling-2.4.1}/docling/backend/html_backend.py +0 -0
  14. {docling-2.4.0 → docling-2.4.1}/docling/backend/md_backend.py +0 -0
  15. {docling-2.4.0 → docling-2.4.1}/docling/backend/mspowerpoint_backend.py +0 -0
  16. {docling-2.4.0 → docling-2.4.1}/docling/backend/msword_backend.py +0 -0
  17. {docling-2.4.0 → docling-2.4.1}/docling/backend/pdf_backend.py +0 -0
  18. {docling-2.4.0 → docling-2.4.1}/docling/cli/__init__.py +0 -0
  19. {docling-2.4.0 → docling-2.4.1}/docling/cli/main.py +0 -0
  20. {docling-2.4.0 → docling-2.4.1}/docling/datamodel/__init__.py +0 -0
  21. {docling-2.4.0 → docling-2.4.1}/docling/datamodel/base_models.py +0 -0
  22. {docling-2.4.0 → docling-2.4.1}/docling/datamodel/document.py +0 -0
  23. {docling-2.4.0 → docling-2.4.1}/docling/datamodel/pipeline_options.py +0 -0
  24. {docling-2.4.0 → docling-2.4.1}/docling/datamodel/settings.py +0 -0
  25. {docling-2.4.0 → docling-2.4.1}/docling/document_converter.py +0 -0
  26. {docling-2.4.0 → docling-2.4.1}/docling/models/__init__.py +0 -0
  27. {docling-2.4.0 → docling-2.4.1}/docling/models/base_model.py +0 -0
  28. {docling-2.4.0 → docling-2.4.1}/docling/models/base_ocr_model.py +0 -0
  29. {docling-2.4.0 → docling-2.4.1}/docling/models/ds_glm_model.py +0 -0
  30. {docling-2.4.0 → docling-2.4.1}/docling/models/easyocr_model.py +0 -0
  31. {docling-2.4.0 → docling-2.4.1}/docling/models/layout_model.py +0 -0
  32. {docling-2.4.0 → docling-2.4.1}/docling/models/page_assemble_model.py +0 -0
  33. {docling-2.4.0 → docling-2.4.1}/docling/models/page_preprocessing_model.py +0 -0
  34. {docling-2.4.0 → docling-2.4.1}/docling/models/table_structure_model.py +0 -0
  35. {docling-2.4.0 → docling-2.4.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  36. {docling-2.4.0 → docling-2.4.1}/docling/pipeline/__init__.py +0 -0
  37. {docling-2.4.0 → docling-2.4.1}/docling/pipeline/base_pipeline.py +0 -0
  38. {docling-2.4.0 → docling-2.4.1}/docling/pipeline/simple_pipeline.py +0 -0
  39. {docling-2.4.0 → docling-2.4.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  40. {docling-2.4.0 → docling-2.4.1}/docling/utils/__init__.py +0 -0
  41. {docling-2.4.0 → docling-2.4.1}/docling/utils/export.py +0 -0
  42. {docling-2.4.0 → docling-2.4.1}/docling/utils/layout_utils.py +0 -0
  43. {docling-2.4.0 → docling-2.4.1}/docling/utils/profiling.py +0 -0
  44. {docling-2.4.0 → docling-2.4.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.4.0
3
+ Version: 2.4.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -53,6 +53,10 @@ Description-Content-Type: text/markdown
53
53
 
54
54
  # Docling
55
55
 
56
+ <p align="center">
57
+ <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
58
+ </p>
59
+
56
60
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
57
61
  [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
58
62
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
@@ -66,19 +70,22 @@ Description-Content-Type: text/markdown
66
70
 
67
71
  Docling parses documents and exports them to the desired format with ease and speed.
68
72
 
69
-
70
73
  ## Features
71
74
 
72
75
  * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
73
76
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
74
77
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
75
- * 📝 Metadata extraction, including title, authors, references & language
76
- * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
78
+ * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
77
79
  * 🔍 OCR support for scanned PDFs
78
80
  * 💻 Simple and convenient CLI
79
81
 
80
82
  Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
81
83
 
84
+ ### Coming soon
85
+
86
+ * ♾️ Equation & code extraction
87
+ * 📝 Metadata extraction, including title, authors, references & language
88
+ * 🦜🔗 Native LangChain extension
82
89
 
83
90
  ## Installation
84
91
 
@@ -104,16 +111,13 @@ result = converter.convert(source)
104
111
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
105
112
  ```
106
113
 
107
-
108
114
  Check out [Getting started](https://ds4sd.github.io/docling/).
109
115
  You will find lots of tuning options to leverage all the advanced capabilities.
110
116
 
111
-
112
117
  ## Get help and support
113
118
 
114
119
  Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
115
120
 
116
-
117
121
  ## Technical report
118
122
 
119
123
  For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
@@ -122,7 +126,6 @@ For more details on Docling's inner workings, check out the [Docling Technical R
122
126
 
123
127
  Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
124
128
 
125
-
126
129
  ## References
127
130
 
128
131
  If you use Docling in your projects, please consider citing the following:
@@ -142,6 +145,10 @@ If you use Docling in your projects, please consider citing the following:
142
145
 
143
146
  ## License
144
147
 
145
- The Docling codebase is under MIT license.
148
+ The Docling codebase is under MIT license.
146
149
  For individual model usage, please refer to the model licenses found in the original packages.
147
150
 
151
+ ## IBM ❤️ Open Source AI
152
+
153
+ Docling has been brought to you by IBM.
154
+
@@ -6,6 +6,10 @@
6
6
 
7
7
  # Docling
8
8
 
9
+ <p align="center">
10
+ <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
11
+ </p>
12
+
9
13
  [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
10
14
  [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
11
15
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
@@ -19,19 +23,22 @@
19
23
 
20
24
  Docling parses documents and exports them to the desired format with ease and speed.
21
25
 
22
-
23
26
  ## Features
24
27
 
25
28
  * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
26
29
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
27
30
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
28
- * 📝 Metadata extraction, including title, authors, references & language
29
- * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
31
+ * 🤖 Easy integration with LlamaIndex 🦙 & LangChain 🦜🔗 for powerful RAG / QA applications
30
32
  * 🔍 OCR support for scanned PDFs
31
33
  * 💻 Simple and convenient CLI
32
34
 
33
35
  Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
34
36
 
37
+ ### Coming soon
38
+
39
+ * ♾️ Equation & code extraction
40
+ * 📝 Metadata extraction, including title, authors, references & language
41
+ * 🦜🔗 Native LangChain extension
35
42
 
36
43
  ## Installation
37
44
 
@@ -57,16 +64,13 @@ result = converter.convert(source)
57
64
  print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
58
65
  ```
59
66
 
60
-
61
67
  Check out [Getting started](https://ds4sd.github.io/docling/).
62
68
  You will find lots of tuning options to leverage all the advanced capabilities.
63
69
 
64
-
65
70
  ## Get help and support
66
71
 
67
72
  Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
68
73
 
69
-
70
74
  ## Technical report
71
75
 
72
76
  For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
@@ -75,7 +79,6 @@ For more details on Docling's inner workings, check out the [Docling Technical R
75
79
 
76
80
  Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
77
81
 
78
-
79
82
  ## References
80
83
 
81
84
  If you use Docling in your projects, please consider citing the following:
@@ -95,5 +98,9 @@ If you use Docling in your projects, please consider citing the following:
95
98
 
96
99
  ## License
97
100
 
98
- The Docling codebase is under MIT license.
101
+ The Docling codebase is under MIT license.
99
102
  For individual model usage, please refer to the model licenses found in the original packages.
103
+
104
+ ## IBM ❤️ Open Source AI
105
+
106
+ Docling has been brought to you by IBM.
@@ -29,7 +29,7 @@ class DoclingParsePageBackend(PdfPageBackend):
29
29
  self._dpage = parsed_page["pages"][0]
30
30
  else:
31
31
  _log.info(
32
- f"An error occured when loading page {page_no} of document {document_hash}."
32
+ f"An error occurred when loading page {page_no} of document {document_hash}."
33
33
  )
34
34
 
35
35
  def is_valid(self) -> bool:
@@ -31,7 +31,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
31
31
  self._dpage = parsed_page["pages"][0]
32
32
  else:
33
33
  _log.info(
34
- f"An error occured when loading page {page_no} of document {document_hash}."
34
+ f"An error occurred when loading page {page_no} of document {document_hash}."
35
35
  )
36
36
 
37
37
  def is_valid(self) -> bool:
@@ -29,7 +29,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
29
29
  self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
30
30
  except PdfiumError as e:
31
31
  _log.info(
32
- f"An exception occured when loading page {page_no} of document {document_hash}.",
32
+ f"An exception occurred when loading page {page_no} of document {document_hash}.",
33
33
  exc_info=True,
34
34
  )
35
35
  self.valid = False
@@ -22,25 +22,37 @@ class TesseractOcrModel(BaseOcrModel):
22
22
  self.reader = None
23
23
 
24
24
  if self.enabled:
25
- setup_errmsg = (
25
+ install_errmsg = (
26
26
  "tesserocr is not correctly installed. "
27
27
  "Please install it via `pip install tesserocr` to use this OCR engine. "
28
- "Note that tesserocr might have to be manually compiled for working with"
28
+ "Note that tesserocr might have to be manually compiled for working with "
29
29
  "your Tesseract installation. The Docling documentation provides examples for it. "
30
- "Alternatively, Docling has support for other OCR engines. See the documentation."
30
+ "Alternatively, Docling has support for other OCR engines. See the documentation: "
31
+ "https://ds4sd.github.io/docling/installation/"
31
32
  )
33
+ missing_langs_errmsg = (
34
+ "tesserocr is not correctly configured. No language models have been detected. "
35
+ "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
36
+ "You can find more information how to setup other OCR engines in Docling "
37
+ "documentation: "
38
+ "https://ds4sd.github.io/docling/installation/"
39
+ )
40
+
32
41
  try:
33
42
  import tesserocr
34
43
  except ImportError:
35
- raise ImportError(setup_errmsg)
36
-
44
+ raise ImportError(install_errmsg)
37
45
  try:
38
46
  tesseract_version = tesserocr.tesseract_version()
39
- _log.debug("Initializing TesserOCR: %s", tesseract_version)
40
47
  except:
41
- raise ImportError(setup_errmsg)
48
+ raise ImportError(install_errmsg)
49
+
50
+ _, tesserocr_languages = tesserocr.get_languages()
51
+ if not tesserocr_languages:
52
+ raise ImportError(missing_langs_errmsg)
42
53
 
43
54
  # Initialize the tesseractAPI
55
+ _log.debug("Initializing TesserOCR: %s", tesseract_version)
44
56
  lang = "+".join(self.options.lang)
45
57
  if self.options.path is not None:
46
58
  self.reader = tesserocr.PyTessBaseAPI(
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.4.0" # DO NOT EDIT, updated automatically
3
+ version = "2.4.1" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes