docling 1.16.0__tar.gz → 1.16.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.16.0 → docling-1.16.1}/PKG-INFO +24 -2
- {docling-1.16.0 → docling-1.16.1}/README.md +22 -0
- {docling-1.16.0 → docling-1.16.1}/pyproject.toml +2 -2
- {docling-1.16.0 → docling-1.16.1}/LICENSE +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/__init__.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/backend/__init__.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/backend/abstract_backend.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/backend/docling_parse_backend.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/cli/__init__.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/cli/main.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/datamodel/__init__.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/datamodel/base_models.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/datamodel/document.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/datamodel/pipeline_options.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/datamodel/settings.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/document_converter.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/models/__init__.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/models/base_ocr_model.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/models/ds_glm_model.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/models/easyocr_model.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/models/layout_model.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/models/page_assemble_model.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/models/table_structure_model.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/pipeline/__init__.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/utils/__init__.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/utils/export.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/utils/layout_utils.py +0 -0
- {docling-1.16.0 → docling-1.16.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.16.0
|
3
|
+
Version: 1.16.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -22,7 +22,7 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
22
|
Requires-Dist: certifi (>=2024.7.4)
|
23
23
|
Requires-Dist: deepsearch-glm (>=0.21.1,<0.22.0)
|
24
24
|
Requires-Dist: docling-core (>=1.6.2,<2.0.0)
|
25
|
-
Requires-Dist: docling-ibm-models (>=1.
|
25
|
+
Requires-Dist: docling-ibm-models (>=1.3.1,<2.0.0)
|
26
26
|
Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
|
27
27
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
28
28
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
@@ -251,6 +251,28 @@ results = doc_converter.convert(conv_input)
|
|
251
251
|
|
252
252
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
253
253
|
|
254
|
+
### Chunking
|
255
|
+
|
256
|
+
You can perform a hierarchy-aware chunking of a Docling document as follows:
|
257
|
+
|
258
|
+
```python
|
259
|
+
from docling.document_converter import DocumentConverter
|
260
|
+
from docling_core.transforms.chunker import HierarchicalChunker
|
261
|
+
|
262
|
+
doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2206.01062").output
|
263
|
+
chunks = list(HierarchicalChunker().chunk(doc))
|
264
|
+
# > [
|
265
|
+
# > ChunkWithMetadata(
|
266
|
+
# > path='$.main-text[0]',
|
267
|
+
# > text='DocLayNet: A Large Human-Annotated Dataset [...]',
|
268
|
+
# > page=1,
|
269
|
+
# > bbox=[107.30, 672.38, 505.19, 709.08]
|
270
|
+
# > ),
|
271
|
+
# > [...]
|
272
|
+
# > ]
|
273
|
+
```
|
274
|
+
|
275
|
+
|
254
276
|
## Technical report
|
255
277
|
|
256
278
|
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
@@ -207,6 +207,28 @@ results = doc_converter.convert(conv_input)
|
|
207
207
|
|
208
208
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
209
209
|
|
210
|
+
### Chunking
|
211
|
+
|
212
|
+
You can perform a hierarchy-aware chunking of a Docling document as follows:
|
213
|
+
|
214
|
+
```python
|
215
|
+
from docling.document_converter import DocumentConverter
|
216
|
+
from docling_core.transforms.chunker import HierarchicalChunker
|
217
|
+
|
218
|
+
doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2206.01062").output
|
219
|
+
chunks = list(HierarchicalChunker().chunk(doc))
|
220
|
+
# > [
|
221
|
+
# > ChunkWithMetadata(
|
222
|
+
# > path='$.main-text[0]',
|
223
|
+
# > text='DocLayNet: A Large Human-Annotated Dataset [...]',
|
224
|
+
# > page=1,
|
225
|
+
# > bbox=[107.30, 672.38, 505.19, 709.08]
|
226
|
+
# > ),
|
227
|
+
# > [...]
|
228
|
+
# > ]
|
229
|
+
```
|
230
|
+
|
231
|
+
|
210
232
|
## Technical report
|
211
233
|
|
212
234
|
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.16.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "1.16.1" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -38,7 +38,7 @@ torchvision = [
|
|
38
38
|
python = "^3.10"
|
39
39
|
pydantic = "^2.0.0"
|
40
40
|
docling-core = "^1.6.2"
|
41
|
-
docling-ibm-models = "^1.
|
41
|
+
docling-ibm-models = "^1.3.1"
|
42
42
|
deepsearch-glm = "^0.21.1"
|
43
43
|
filetype = "^1.2.0"
|
44
44
|
pypdfium2 = "^4.30.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|