docling 1.5.0__tar.gz → 2.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. {docling-1.5.0 → docling-2.8.1}/LICENSE +1 -1
  2. docling-2.8.1/PKG-INFO +179 -0
  3. docling-2.8.1/README.md +123 -0
  4. docling-2.8.1/docling/backend/abstract_backend.py +64 -0
  5. docling-2.8.1/docling/backend/asciidoc_backend.py +431 -0
  6. {docling-1.5.0 → docling-2.8.1}/docling/backend/docling_parse_backend.py +72 -32
  7. docling-2.8.1/docling/backend/docling_parse_v2_backend.py +248 -0
  8. docling-2.8.1/docling/backend/html_backend.py +441 -0
  9. docling-2.8.1/docling/backend/md_backend.py +346 -0
  10. docling-2.8.1/docling/backend/msexcel_backend.py +374 -0
  11. docling-2.8.1/docling/backend/mspowerpoint_backend.py +408 -0
  12. docling-2.8.1/docling/backend/msword_backend.py +525 -0
  13. docling-2.8.1/docling/backend/pdf_backend.py +78 -0
  14. {docling-1.5.0 → docling-2.8.1}/docling/backend/pypdfium2_backend.py +57 -16
  15. docling-2.8.1/docling/cli/main.py +355 -0
  16. docling-2.8.1/docling/datamodel/base_models.py +216 -0
  17. docling-2.8.1/docling/datamodel/document.py +546 -0
  18. docling-2.8.1/docling/datamodel/pipeline_options.py +139 -0
  19. {docling-1.5.0 → docling-2.8.1}/docling/datamodel/settings.py +19 -2
  20. docling-2.8.1/docling/document_converter.py +295 -0
  21. docling-2.8.1/docling/models/base_model.py +28 -0
  22. docling-2.8.1/docling/models/base_ocr_model.py +173 -0
  23. docling-2.8.1/docling/models/ds_glm_model.py +301 -0
  24. docling-2.8.1/docling/models/easyocr_model.py +98 -0
  25. {docling-1.5.0 → docling-2.8.1}/docling/models/layout_model.py +156 -114
  26. docling-2.8.1/docling/models/ocr_mac_model.py +118 -0
  27. docling-2.8.1/docling/models/page_assemble_model.py +174 -0
  28. docling-2.8.1/docling/models/page_preprocessing_model.py +79 -0
  29. docling-2.8.1/docling/models/rapid_ocr_model.py +147 -0
  30. docling-2.8.1/docling/models/table_structure_model.py +206 -0
  31. docling-2.8.1/docling/models/tesseract_ocr_cli_model.py +180 -0
  32. docling-2.8.1/docling/models/tesseract_ocr_model.py +150 -0
  33. docling-2.8.1/docling/pipeline/base_pipeline.py +189 -0
  34. docling-2.8.1/docling/pipeline/simple_pipeline.py +56 -0
  35. docling-2.8.1/docling/pipeline/standard_pdf_pipeline.py +220 -0
  36. docling-2.8.1/docling/utils/__init__.py +0 -0
  37. docling-2.8.1/docling/utils/export.py +146 -0
  38. {docling-1.5.0 → docling-2.8.1}/docling/utils/layout_utils.py +17 -11
  39. docling-2.8.1/docling/utils/profiling.py +62 -0
  40. docling-2.8.1/pyproject.toml +172 -0
  41. docling-1.5.0/PKG-INFO +0 -192
  42. docling-1.5.0/README.md +0 -153
  43. docling-1.5.0/docling/backend/abstract_backend.py +0 -55
  44. docling-1.5.0/docling/datamodel/base_models.py +0 -304
  45. docling-1.5.0/docling/datamodel/document.py +0 -363
  46. docling-1.5.0/docling/document_converter.py +0 -278
  47. docling-1.5.0/docling/models/ds_glm_model.py +0 -82
  48. docling-1.5.0/docling/models/easyocr_model.py +0 -77
  49. docling-1.5.0/docling/models/page_assemble_model.py +0 -148
  50. docling-1.5.0/docling/models/table_structure_model.py +0 -145
  51. docling-1.5.0/docling/pipeline/base_model_pipeline.py +0 -18
  52. docling-1.5.0/docling/pipeline/standard_model_pipeline.py +0 -40
  53. docling-1.5.0/pyproject.toml +0 -91
  54. {docling-1.5.0 → docling-2.8.1}/docling/__init__.py +0 -0
  55. {docling-1.5.0 → docling-2.8.1}/docling/backend/__init__.py +0 -0
  56. {docling-1.5.0/docling/datamodel → docling-2.8.1/docling/cli}/__init__.py +0 -0
  57. {docling-1.5.0/docling/models → docling-2.8.1/docling/datamodel}/__init__.py +0 -0
  58. {docling-1.5.0/docling/pipeline → docling-2.8.1/docling/models}/__init__.py +0 -0
  59. {docling-1.5.0/docling/utils → docling-2.8.1/docling/pipeline}/__init__.py +0 -0
  60. {docling-1.5.0 → docling-2.8.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) [year] [fullname]
3
+ Copyright (c) 2024 International Business Machines
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
docling-2.8.1/PKG-INFO ADDED
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.1
2
+ Name: docling
3
+ Version: 2.8.1
4
+ Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
+ Home-page: https://github.com/DS4SD/docling
6
+ License: MIT
7
+ Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
8
+ Author: Christoph Auer
9
+ Author-email: cau@zurich.ibm.com
10
+ Requires-Python: >=3.9,<4.0
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: MacOS :: MacOS X
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Provides-Extra: ocrmac
24
+ Provides-Extra: rapidocr
25
+ Provides-Extra: tesserocr
26
+ Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
+ Requires-Dist: certifi (>=2024.7.4)
28
+ Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
29
+ Requires-Dist: docling-core (>=2.5.1,<3.0.0)
30
+ Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
31
+ Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
32
+ Requires-Dist: easyocr (>=1.7,<2.0)
33
+ Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
+ Requires-Dist: huggingface_hub (>=0.23,<1)
35
+ Requires-Dist: lxml (>=4.0.0,<6.0.0)
36
+ Requires-Dist: marko (>=2.1.2,<3.0.0)
37
+ Requires-Dist: ocrmac (>=1.0.0,<2.0.0) ; (sys_platform == "darwin") and (extra == "ocrmac")
38
+ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (extra == "rapidocr")
39
+ Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
40
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
41
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
42
+ Requires-Dist: pydantic (>=2.0.0,<2.10)
43
+ Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
44
+ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
45
+ Requires-Dist: python-docx (>=1.1.2,<2.0.0)
46
+ Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
47
+ Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
48
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
49
+ Requires-Dist: rtree (>=1.3.0,<2.0.0)
50
+ Requires-Dist: scipy (>=1.6.0,<2.0.0)
51
+ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
52
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
53
+ Project-URL: Repository, https://github.com/DS4SD/docling
54
+ Description-Content-Type: text/markdown
55
+
56
+ <p align="center">
57
+ <a href="https://github.com/ds4sd/docling">
58
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
59
+ </a>
60
+ </p>
61
+
62
+ # 🦆 Docling
63
+
64
+ <p align="center">
65
+ <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
66
+ </p>
67
+
68
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
69
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
70
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
71
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
72
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
73
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
74
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
75
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
76
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
77
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
78
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
79
+
80
+ Docling parses documents and exports them to the desired format with ease and speed.
81
+
82
+ ## Features
83
+
84
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
85
+ * 📑 Advanced PDF document understanding including page layout, reading order & table structures
86
+ * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
87
+ * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
88
+ * 🔍 OCR support for scanned PDFs
89
+ * 💻 Simple and convenient CLI
90
+
91
+ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
92
+
93
+ ### Coming soon
94
+
95
+ * ♾️ Equation & code extraction
96
+ * 📝 Metadata extraction, including title, authors, references & language
97
+ * 🦜🔗 Native LangChain extension
98
+
99
+ ## Installation
100
+
101
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
102
+ ```bash
103
+ pip install docling
104
+ ```
105
+
106
+ Works on macOS, Linux, and Windows environments, on both x86_64 and arm64 architectures.
107
+
108
+ More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
109
+
110
+ ## Getting started
111
+
112
+ To convert individual documents, use `convert()`, for example:
113
+
114
+ ```python
115
+ from docling.document_converter import DocumentConverter
116
+
117
+ source = "https://arxiv.org/pdf/2408.09869" # document given as a local path or URL
118
+ converter = DocumentConverter()
119
+ result = converter.convert(source)
120
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
121
+ ```
122
+
123
+ More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
124
+ the docs.
125
+
126
+ ## Documentation
127
+
128
+ Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
129
+ installation, usage, concepts, recipes, extensions, and more.
130
+
131
+ ## Examples
132
+
133
+ Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
134
+ demonstrating how to address different application use cases with Docling.
135
+
136
+ ## Integrations
137
+
138
+ To further accelerate your AI application development, check out Docling's native
139
+ [integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
140
+ and tools.
141
+
142
+ ## Get help and support
143
+
144
+ Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
145
+
146
+ ## Technical report
147
+
148
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
149
+
150
+ ## Contributing
151
+
152
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
153
+
154
+ ## References
155
+
156
+ If you use Docling in your projects, please consider citing the following:
157
+
158
+ ```bib
159
+ @techreport{Docling,
160
+ author = {Deep Search Team},
161
+ month = {8},
162
+ title = {Docling Technical Report},
163
+ url = {https://arxiv.org/abs/2408.09869},
164
+ eprint = {2408.09869},
165
+ doi = {10.48550/arXiv.2408.09869},
166
+ version = {1.0.0},
167
+ year = {2024}
168
+ }
169
+ ```
170
+
171
+ ## License
172
+
173
+ The Docling codebase is under MIT license.
174
+ For individual model usage, please refer to the model licenses found in the original packages.
175
+
176
+ ## IBM ❤️ Open Source AI
177
+
178
+ Docling has been brought to you by IBM.
179
+
@@ -0,0 +1,123 @@
1
+ <p align="center">
2
+ <a href="https://github.com/ds4sd/docling">
3
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/docs/assets/docling_processing.png" width="100%"/>
4
+ </a>
5
+ </p>
6
+
7
+ # 🦆 Docling
8
+
9
+ <p align="center">
10
+ <a href="https://trendshift.io/repositories/12132" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12132" alt="DS4SD%2Fdocling | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
11
+ </p>
12
+
13
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
14
+ [![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
15
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
16
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
17
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
18
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
19
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
20
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
21
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
22
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
23
+ [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
24
+
25
+ Docling parses documents and exports them to the desired format with ease and speed.
26
+
27
+ ## Features
28
+
29
+ * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to Markdown and JSON
30
+ * 📑 Advanced PDF document understanding including page layout, reading order & table structures
31
+ * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
32
+ * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
33
+ * 🔍 OCR support for scanned PDFs
34
+ * 💻 Simple and convenient CLI
35
+
36
+ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty of examples and unlock the full power of Docling!
37
+
38
+ ### Coming soon
39
+
40
+ * ♾️ Equation & code extraction
41
+ * 📝 Metadata extraction, including title, authors, references & language
42
+ * 🦜🔗 Native LangChain extension
43
+
44
+ ## Installation
45
+
46
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
47
+ ```bash
48
+ pip install docling
49
+ ```
50
+
51
+ Works on macOS, Linux, and Windows environments, on both x86_64 and arm64 architectures.
52
+
53
+ More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs.
54
+
55
+ ## Getting started
56
+
57
+ To convert individual documents, use `convert()`, for example:
58
+
59
+ ```python
60
+ from docling.document_converter import DocumentConverter
61
+
62
+ source = "https://arxiv.org/pdf/2408.09869" # document given as a local path or URL
63
+ converter = DocumentConverter()
64
+ result = converter.convert(source)
65
+ print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]"
66
+ ```
67
+
68
+ More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in
69
+ the docs.
70
+
71
+ ## Documentation
72
+
73
+ Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on
74
+ installation, usage, concepts, recipes, extensions, and more.
75
+
76
+ ## Examples
77
+
78
+ Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/),
79
+ demonstrating how to address different application use cases with Docling.
80
+
81
+ ## Integrations
82
+
83
+ To further accelerate your AI application development, check out Docling's native
84
+ [integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks
85
+ and tools.
86
+
87
+ ## Get help and support
88
+
89
+ Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions).
90
+
91
+ ## Technical report
92
+
93
+ For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
94
+
95
+ ## Contributing
96
+
97
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
98
+
99
+ ## References
100
+
101
+ If you use Docling in your projects, please consider citing the following:
102
+
103
+ ```bib
104
+ @techreport{Docling,
105
+ author = {Deep Search Team},
106
+ month = {8},
107
+ title = {Docling Technical Report},
108
+ url = {https://arxiv.org/abs/2408.09869},
109
+ eprint = {2408.09869},
110
+ doi = {10.48550/arXiv.2408.09869},
111
+ version = {1.0.0},
112
+ year = {2024}
113
+ }
114
+ ```
115
+
116
+ ## License
117
+
118
+ The Docling codebase is under MIT license.
119
+ For individual model usage, please refer to the model licenses found in the original packages.
120
+
121
+ ## IBM ❤️ Open Source AI
122
+
123
+ Docling has been brought to you by IBM.
@@ -0,0 +1,64 @@
1
+ from abc import ABC, abstractmethod
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Set, Union
5
+
6
+ from docling_core.types.doc import DoclingDocument
7
+
8
+ if TYPE_CHECKING:
9
+ from docling.datamodel.base_models import InputFormat
10
+ from docling.datamodel.document import InputDocument
11
+
12
+
13
+ class AbstractDocumentBackend(ABC):
14
+ @abstractmethod
15
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
16
+ self.file = in_doc.file
17
+ self.path_or_stream = path_or_stream
18
+ self.document_hash = in_doc.document_hash
19
+ self.input_format = in_doc.format
20
+
21
+ @abstractmethod
22
+ def is_valid(self) -> bool:
23
+ pass
24
+
25
+ @classmethod
26
+ @abstractmethod
27
+ def supports_pagination(cls) -> bool:
28
+ pass
29
+
30
+ @abstractmethod
31
+ def unload(self):
32
+ if isinstance(self.path_or_stream, BytesIO):
33
+ self.path_or_stream.close()
34
+
35
+ self.path_or_stream = None
36
+
37
+ @classmethod
38
+ @abstractmethod
39
+ def supported_formats(cls) -> Set["InputFormat"]:
40
+ pass
41
+
42
+
43
+ class PaginatedDocumentBackend(AbstractDocumentBackend):
44
+ """PaginatedDocumentBackend.
45
+
46
+ A paginated document backend is a backend that reports the number of pages
47
+ of its document through page_count().
48
+ """
49
+
50
+ @abstractmethod
51
+ def page_count(self) -> int:
52
+ pass
53
+
54
+
55
+ class DeclarativeDocumentBackend(AbstractDocumentBackend):
56
+ """DeclarativeDocumentBackend.
57
+
58
+ A declarative document backend is a backend that can transform to DoclingDocument
59
+ straight without a recognition pipeline.
60
+ """
61
+
62
+ @abstractmethod
63
+ def convert(self) -> DoclingDocument:
64
+ pass