docling 1.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/utils/utils.py ADDED
@@ -0,0 +1,41 @@
1
+ import hashlib
2
+ from io import BytesIO
3
+ from itertools import islice
4
+ from pathlib import Path
5
+ from typing import List, Union
6
+
7
+
8
def chunkify(iterator, chunk_size):
    """Yield successive lists of up to ``chunk_size`` items from an iterable.

    Args:
        iterator: Any iterable (list, tuple, range, generator, ...). It is
            consumed lazily, one chunk at a time.
        chunk_size: Maximum number of items per chunk. Expected to be >= 1;
            smaller values make ``islice`` raise ``ValueError`` once the
            first chunk is requested.

    Yields:
        Lists holding ``chunk_size`` consecutive items; the final list may be
        shorter when the input length is not a multiple of ``chunk_size``.
    """
    # Always work on ONE shared iterator. Without this, a re-iterable input
    # that is not a list (tuple, range, ...) would hand a fresh iterator to
    # both the ``for`` loop and every ``islice`` call, so each chunk would
    # restart from the beginning of the input.
    iterator = iter(iterator)
    for first in iterator:  # pulling the first item detects exhaustion
        yield [first, *islice(iterator, chunk_size - 1)]
14
+
15
+
16
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
    """Return the SHA-256 hex digest of a file's binary content.

    Args:
        path_or_stream: Either a ``Path``, which is opened in binary mode and
            closed again afterwards, or a ``BytesIO`` stream, which is read
            from its current position to the end and is not rewound.

    Returns:
        The hexadecimal SHA-256 digest of the bytes that were read.

    Raises:
        TypeError: If ``path_or_stream`` is neither a ``Path`` nor a
            ``BytesIO``. Previously such input silently produced the digest
            of empty data.
    """
    block_size = 65536  # read in 64 KiB blocks so large files are not loaded at once
    hasher = hashlib.sha256()

    def _hash_buf(binary_stream):
        # An empty read result signals end of stream.
        while chunk := binary_stream.read(block_size):
            hasher.update(chunk)

    if isinstance(path_or_stream, Path):
        with path_or_stream.open("rb") as afile:
            _hash_buf(afile)
    elif isinstance(path_or_stream, BytesIO):
        _hash_buf(path_or_stream)
    else:
        raise TypeError(
            "create_file_hash expects a pathlib.Path or io.BytesIO, "
            f"got {type(path_or_stream).__name__}"
        )

    return hasher.hexdigest()
35
+
36
+
37
def create_hash(string: str):
    """Return the SHA-256 hex digest of ``string`` encoded as UTF-8."""
    encoded = string.encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) [year] [fullname]
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,192 @@
1
+ Metadata-Version: 2.1
2
+ Name: docling
3
+ Version: 1.6.2
4
+ Summary: Docling PDF conversion package
5
+ Home-page: https://github.com/DS4SD/docling
6
+ License: MIT
7
+ Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
8
+ Author: Christoph Auer
9
+ Author-email: cau@zurich.ibm.com
10
+ Requires-Python: >=3.10,<4.0
11
+ Classifier: Development Status :: 5 - Production/Stable
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: MacOS :: MacOS X
16
+ Classifier: Operating System :: POSIX :: Linux
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Dist: certifi (>=2024.7.4)
23
+ Requires-Dist: deepsearch-glm (>=0.19.0,<1)
24
+ Requires-Dist: docling-core (>=1.1.2,<2.0.0)
25
+ Requires-Dist: docling-ibm-models (>=1.1.2,<2.0.0)
26
+ Requires-Dist: docling-parse (>=0.2.0,<0.3.0)
27
+ Requires-Dist: easyocr (>=1.7,<2.0)
28
+ Requires-Dist: filetype (>=1.2.0,<2.0.0)
29
+ Requires-Dist: huggingface_hub (>=0.23,<1)
30
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
31
+ Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
32
+ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
33
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
34
+ Requires-Dist: rtree (>=1.3.0,<2.0.0)
35
+ Requires-Dist: scipy (>=1.14.1,<2.0.0)
36
+ Project-URL: Repository, https://github.com/DS4SD/docling
37
+ Description-Content-Type: text/markdown
38
+
39
+ <p align="center">
40
+ <a href="https://github.com/ds4sd/docling">
41
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
42
+ </a>
43
+ </p>
44
+
45
+ # Docling
46
+
47
+ [![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
48
+ [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
49
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
50
+ [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
51
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
52
+ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
53
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
54
+ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
55
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
56
+
57
+ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
58
+
59
+ ## Features
60
+ * ⚡ Converts any PDF document to JSON or Markdown format, stable and lightning fast
61
+ * 📑 Understands detailed page layout, reading order and recovers table structures
62
+ * 📝 Extracts metadata from the document, such as title, authors, references and language
63
+ * 🔍 Optionally applies OCR (use with scanned PDFs)
64
+
65
+ ## Installation
66
+
67
+ To use Docling, simply install `docling` from your package manager, e.g. pip:
68
+ ```bash
69
+ pip install docling
70
+ ```
71
+
72
+ > [!NOTE]
73
+ > Works on macOS and Linux environments. Windows platforms are currently not tested.
74
+
75
+ ### Development setup
76
+
77
+ To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
78
+ ```bash
79
+ poetry install --all-extras
80
+ ```
81
+
82
+ ## Usage
83
+
84
+ ### Convert a single document
85
+
86
+ To convert individual PDF documents, use `convert_single()`, for example:
87
+ ```python
88
+ from docling.document_converter import DocumentConverter
89
+
90
+ source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
91
+ converter = DocumentConverter()
92
+ doc = converter.convert_single(source)
93
+ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
94
+ ```
95
+
96
+ ### Convert a batch of documents
97
+
98
+ For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
99
+
100
+ From a local repo clone, you can run it with:
101
+
102
+ ```
103
+ python examples/batch_convert.py
104
+ ```
105
+ The output of the above command will be written to `./scratch`.
106
+
107
+ ### Adjust pipeline features
108
+
109
+ The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
110
+ one can adjust the conversion pipeline and features.
111
+
112
+
113
+ #### Control pipeline options
114
+
115
+ You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
116
+ ```python
117
+ doc_converter = DocumentConverter(
118
+ artifacts_path=artifacts_path,
119
+ pipeline_options=PipelineOptions(
120
+ do_table_structure=False, # controls if table structure is recovered
121
+ do_ocr=True, # controls if OCR is applied (ignores programmatic content)
122
+ ),
123
+ )
124
+ ```
125
+
126
+ #### Control table extraction options
127
+
128
+ You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
129
+ This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
130
+
131
+
132
+ ```python
133
+ pipeline_options = PipelineOptions(do_table_structure=True)
134
+ pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
135
+
136
+ doc_converter = DocumentConverter(
137
+ artifacts_path=artifacts_path,
138
+ pipeline_options=pipeline_options,
139
+ )
140
+ ```
141
+
142
+ ### Impose limits on the document size
143
+
144
+ You can limit the file size and the number of pages that Docling is allowed to process per document:
145
+ ```python
146
+ conv_input = DocumentConversionInput.from_paths(
147
+ paths=[Path("./test/data/2206.01062.pdf")],
148
+ limits=DocumentLimits(max_num_pages=100, max_file_size=20971520)
149
+ )
150
+ ```
151
+
152
+ ### Convert from binary PDF streams
153
+
154
+ You can convert PDFs from a binary stream instead of from the filesystem as follows:
155
+ ```python
156
+ buf = BytesIO(your_binary_stream)
157
+ docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
158
+ conv_input = DocumentConversionInput.from_streams(docs)
159
+ converted_docs = doc_converter.convert(conv_input)
160
+ ```
161
+ ### Limit resource usage
162
+
163
+ You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default is 4 CPU threads.
164
+
165
+
166
+ ## Contributing
167
+
168
+ Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
169
+
170
+
171
+ ## References
172
+
173
+ If you use Docling in your projects, please consider citing the following:
174
+
175
+ ```bib
176
+ @techreport{Docling,
177
+ author = {Deep Search Team},
178
+ month = {8},
179
+ title = {{Docling Technical Report}},
180
+ url={https://arxiv.org/abs/2408.09869},
181
+ eprint={2408.09869},
182
+ doi = "10.48550/arXiv.2408.09869",
183
+ version = {1.0.0},
184
+ year = {2024}
185
+ }
186
+ ```
187
+
188
+ ## License
189
+
190
+ The Docling codebase is under MIT license.
191
+ For individual model usage, please refer to the model licenses found in the original packages.
192
+
@@ -0,0 +1,27 @@
1
+ docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ docling/backend/abstract_backend.py,sha256=ZfEHaBPGM1cmqrhaEoU3MHhnHU11NhOnhtFEIbVMYDo,1221
4
+ docling/backend/docling_parse_backend.py,sha256=TN7Ln3Lkc8k0v6HzxA2iUGc8f2iqMw0I-3eryLQkpdw,6924
5
+ docling/backend/pypdfium2_backend.py,sha256=xUiIYgd7i22YDx4-W2hfPUaQFszW0gcT6pavG5qZ8LE,8062
6
+ docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ docling/datamodel/base_models.py,sha256=5VHit5h7OleKnbhvy-sWDxQLizEdNrGUBrypyzwHyAE,8604
8
+ docling/datamodel/document.py,sha256=Dgi9pSwXCgIoR26MKiRDiVMyMaFKdvGSKq2Fm5Lef9M,13173
9
+ docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
10
+ docling/document_converter.py,sha256=UFSELvUSWsr8s0VByu4lNuzu7bn7zZauJTL3FTSLSBg,10371
11
+ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ docling/models/base_ocr_model.py,sha256=Ipl82a3AV2OsgMQSMEMpnWJ6MXcmyIQzmp52PmTaB0g,4465
13
+ docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
14
+ docling/models/easyocr_model.py,sha256=ABIqALvtNNrDQ47fXaZ0lDFhOwKsYGUUlAPnIsFZgZA,2232
15
+ docling/models/layout_model.py,sha256=ZFmaLXlRWUfsT1pJCiYVxhQFrBBsiz6Aw0m9GM3UvVM,11249
16
+ docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
17
+ docling/models/table_structure_model.py,sha256=5jzTlpM-GdCSq4l0vD1W6aSPTJXeTcXEnNuPxnw-DlA,5437
18
+ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
+ docling/pipeline/base_model_pipeline.py,sha256=AC5NTR0xLy5JIZqsTINkKEHeCPqpyvJpuE_bcnZhyvI,529
20
+ docling/pipeline/standard_model_pipeline.py,sha256=UTjyaEXvz9htYZz-IMTkn11cZwNjgvo_Fl2dfBVnRQs,1442
21
+ docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
23
+ docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
24
+ docling-1.6.2.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
25
+ docling-1.6.2.dist-info/METADATA,sha256=JF_IcxQ0hSuqcEhHU8qf5UlB3udVHRmRWiozPkLAcP4,7229
26
+ docling-1.6.2.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
27
+ docling-1.6.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 1.9.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any