docling 1.8.5__tar.gz → 1.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.8.5 → docling-1.10.0}/LICENSE +1 -1
- {docling-1.8.5 → docling-1.10.0}/PKG-INFO +15 -8
- {docling-1.8.5 → docling-1.10.0}/README.md +10 -4
- {docling-1.8.5 → docling-1.10.0}/docling/datamodel/base_models.py +9 -0
- {docling-1.8.5 → docling-1.10.0}/docling/models/ds_glm_model.py +1 -1
- docling-1.10.0/docling/utils/export.py +193 -0
- {docling-1.8.5 → docling-1.10.0}/pyproject.toml +9 -4
- {docling-1.8.5 → docling-1.10.0}/docling/__init__.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/backend/__init__.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/backend/abstract_backend.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/datamodel/__init__.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/datamodel/document.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/datamodel/settings.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/document_converter.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/models/__init__.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/models/base_ocr_model.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/models/easyocr_model.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/models/layout_model.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/models/page_assemble_model.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/models/table_structure_model.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/pipeline/__init__.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/pipeline/standard_model_pipeline.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/utils/__init__.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/utils/layout_utils.py +0 -0
- {docling-1.8.5 → docling-1.10.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.8.5
|
3
|
+
Version: 1.10.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -20,13 +20,14 @@ Classifier: Programming Language :: Python :: 3.11
|
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
22
|
Requires-Dist: certifi (>=2024.7.4)
|
23
|
-
Requires-Dist: deepsearch-glm (>=0.
|
24
|
-
Requires-Dist: docling-core (>=1.1.
|
23
|
+
Requires-Dist: deepsearch-glm (>=0.21.0,<0.22.0)
|
24
|
+
Requires-Dist: docling-core (>=1.1.3,<2.0.0)
|
25
25
|
Requires-Dist: docling-ibm-models (>=1.1.3,<2.0.0)
|
26
|
-
Requires-Dist: docling-parse (>=1.
|
26
|
+
Requires-Dist: docling-parse (>=1.2.0,<2.0.0)
|
27
27
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
28
28
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
29
29
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
30
|
+
Requires-Dist: pyarrow (>=16.1.0,<17.0.0)
|
30
31
|
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
31
32
|
Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
|
32
33
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
@@ -62,6 +63,8 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
|
|
62
63
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
63
64
|
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
64
65
|
|
66
|
+
For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
|
67
|
+
|
65
68
|
## Installation
|
66
69
|
|
67
70
|
To use Docling, simply install `docling` from your package manager, e.g. pip:
|
@@ -180,6 +183,10 @@ results = doc_converter.convert(conv_input)
|
|
180
183
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
181
184
|
|
182
185
|
|
186
|
+
## Technical report
|
187
|
+
|
188
|
+
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
189
|
+
|
183
190
|
## Contributing
|
184
191
|
|
185
192
|
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
@@ -193,10 +200,10 @@ If you use Docling in your projects, please consider citing the following:
|
|
193
200
|
@techreport{Docling,
|
194
201
|
author = {Deep Search Team},
|
195
202
|
month = {8},
|
196
|
-
title = {
|
197
|
-
url={https://arxiv.org/abs/2408.09869},
|
198
|
-
eprint={2408.09869},
|
199
|
-
doi =
|
203
|
+
title = {Docling Technical Report},
|
204
|
+
url = {https://arxiv.org/abs/2408.09869},
|
205
|
+
eprint = {2408.09869},
|
206
|
+
doi = {10.48550/arXiv.2408.09869},
|
200
207
|
version = {1.0.0},
|
201
208
|
year = {2024}
|
202
209
|
}
|
@@ -24,6 +24,8 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy, self-co
|
|
24
24
|
* 📝 Extracts metadata from the document, such as title, authors, references and language
|
25
25
|
* 🔍 Optionally applies OCR (use with scanned PDFs)
|
26
26
|
|
27
|
+
For RAG, check out [Quackling](https://github.com/DS4SD/quackling) to get the most out of your docs, be it using LlamaIndex, LangChain or your pipeline.
|
28
|
+
|
27
29
|
## Installation
|
28
30
|
|
29
31
|
To use Docling, simply install `docling` from your package manager, e.g. pip:
|
@@ -142,6 +144,10 @@ results = doc_converter.convert(conv_input)
|
|
142
144
|
You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads.
|
143
145
|
|
144
146
|
|
147
|
+
## Technical report
|
148
|
+
|
149
|
+
For more details on Docling's inner workings, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).
|
150
|
+
|
145
151
|
## Contributing
|
146
152
|
|
147
153
|
Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details.
|
@@ -155,10 +161,10 @@ If you use Docling in your projects, please consider citing the following:
|
|
155
161
|
@techreport{Docling,
|
156
162
|
author = {Deep Search Team},
|
157
163
|
month = {8},
|
158
|
-
title = {
|
159
|
-
url={https://arxiv.org/abs/2408.09869},
|
160
|
-
eprint={2408.09869},
|
161
|
-
doi =
|
164
|
+
title = {Docling Technical Report},
|
165
|
+
url = {https://arxiv.org/abs/2408.09869},
|
166
|
+
eprint = {2408.09869},
|
167
|
+
doi = {10.48550/arXiv.2408.09869},
|
162
168
|
version = {1.0.0},
|
163
169
|
year = {2024}
|
164
170
|
}
|
@@ -71,6 +71,15 @@ class BoundingBox(BaseModel):
|
|
71
71
|
|
72
72
|
return out_bbox
|
73
73
|
|
74
|
+
def normalized(self, page_size: PageSize) -> "BoundingBox":
|
75
|
+
out_bbox = copy.deepcopy(self)
|
76
|
+
out_bbox.l /= page_size.width
|
77
|
+
out_bbox.r /= page_size.width
|
78
|
+
out_bbox.t /= page_size.height
|
79
|
+
out_bbox.b /= page_size.height
|
80
|
+
|
81
|
+
return out_bbox
|
82
|
+
|
74
83
|
def as_tuple(self):
|
75
84
|
if self.coord_origin == CoordOrigin.TOPLEFT:
|
76
85
|
return (self.l, self.t, self.r, self.b)
|
@@ -2,7 +2,7 @@ import copy
|
|
2
2
|
import random
|
3
3
|
|
4
4
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
5
|
-
from deepsearch_glm.utils.
|
5
|
+
from deepsearch_glm.utils.doc_utils import to_legacy_document_format
|
6
6
|
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
|
7
7
|
from docling_core.types import BaseText
|
8
8
|
from docling_core.types import Document as DsDocument
|
@@ -0,0 +1,193 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Any, Dict, Iterable, List, Tuple
|
3
|
+
|
4
|
+
from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
|
5
|
+
|
6
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
|
7
|
+
from docling.datamodel.document import ConvertedDocument, Page
|
8
|
+
|
9
|
+
_log = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
def _export_table_to_html(table: Table):
|
13
|
+
|
14
|
+
# TODO: this is flagged as internal, because we will move it
|
15
|
+
# to the docling-core package.
|
16
|
+
|
17
|
+
def _get_tablecell_span(cell: TableCell, ix):
|
18
|
+
span = set([s[ix] for s in cell.spans])
|
19
|
+
if len(span) == 0:
|
20
|
+
return 1, None, None
|
21
|
+
return len(span), min(span), max(span)
|
22
|
+
|
23
|
+
body = ""
|
24
|
+
nrows = table.num_rows
|
25
|
+
ncols = table.num_cols
|
26
|
+
|
27
|
+
for i in range(nrows):
|
28
|
+
body += "<tr>"
|
29
|
+
for j in range(ncols):
|
30
|
+
cell: TableCell = table.data[i][j]
|
31
|
+
|
32
|
+
rowspan, rowstart, rowend = _get_tablecell_span(cell, 0)
|
33
|
+
colspan, colstart, colend = _get_tablecell_span(cell, 1)
|
34
|
+
|
35
|
+
if rowstart is not None and rowstart != i:
|
36
|
+
continue
|
37
|
+
if colstart is not None and colstart != j:
|
38
|
+
continue
|
39
|
+
|
40
|
+
if rowstart is None:
|
41
|
+
rowstart = i
|
42
|
+
if colstart is None:
|
43
|
+
colstart = j
|
44
|
+
|
45
|
+
content = cell.text.strip()
|
46
|
+
label = cell.obj_type
|
47
|
+
label_class = "body"
|
48
|
+
celltag = "td"
|
49
|
+
if label in ["row_header", "row_multi_header", "row_title"]:
|
50
|
+
label_class = "header"
|
51
|
+
elif label in ["col_header", "col_multi_header"]:
|
52
|
+
label_class = "header"
|
53
|
+
celltag = "th"
|
54
|
+
|
55
|
+
opening_tag = f"{celltag}"
|
56
|
+
if rowspan > 1:
|
57
|
+
opening_tag += f' rowspan="{rowspan}"'
|
58
|
+
if colspan > 1:
|
59
|
+
opening_tag += f' colspan="{colspan}"'
|
60
|
+
|
61
|
+
body += f"<{opening_tag}>{content}</{celltag}>"
|
62
|
+
body += "</tr>"
|
63
|
+
body = f"<table>{body}</table>"
|
64
|
+
|
65
|
+
return body
|
66
|
+
|
67
|
+
|
68
|
+
def generate_multimodal_pages(
|
69
|
+
doc_result: ConvertedDocument,
|
70
|
+
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
71
|
+
|
72
|
+
label_to_doclaynet = {
|
73
|
+
"title": "title",
|
74
|
+
"table-of-contents": "document_index",
|
75
|
+
"subtitle-level-1": "section_header",
|
76
|
+
"checkbox-selected": "checkbox_selected",
|
77
|
+
"checkbox-unselected": "checkbox_unselected",
|
78
|
+
"caption": "caption",
|
79
|
+
"page-header": "page_header",
|
80
|
+
"page-footer": "page_footer",
|
81
|
+
"footnote": "footnote",
|
82
|
+
"table": "table",
|
83
|
+
"formula": "formula",
|
84
|
+
"list-item": "list_item",
|
85
|
+
"code": "code",
|
86
|
+
"figure": "picture",
|
87
|
+
"picture": "picture",
|
88
|
+
"reference": "text",
|
89
|
+
"paragraph": "text",
|
90
|
+
"text": "text",
|
91
|
+
}
|
92
|
+
|
93
|
+
content_text = ""
|
94
|
+
page_no = 0
|
95
|
+
start_ix = 0
|
96
|
+
end_ix = 0
|
97
|
+
doc_items = []
|
98
|
+
|
99
|
+
doc = doc_result.output
|
100
|
+
|
101
|
+
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
|
102
|
+
segments = []
|
103
|
+
|
104
|
+
for ix, item in doc_items:
|
105
|
+
item_type = item.obj_type
|
106
|
+
label = label_to_doclaynet.get(item_type, None)
|
107
|
+
|
108
|
+
if label is None:
|
109
|
+
continue
|
110
|
+
|
111
|
+
bbox = BoundingBox.from_tuple(
|
112
|
+
item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
|
113
|
+
)
|
114
|
+
new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
|
115
|
+
page_size=page.size
|
116
|
+
)
|
117
|
+
|
118
|
+
new_segment = {
|
119
|
+
"index_in_doc": ix,
|
120
|
+
"label": label,
|
121
|
+
"text": item.text if item.text is not None else "",
|
122
|
+
"bbox": new_bbox.as_tuple(),
|
123
|
+
"data": [],
|
124
|
+
}
|
125
|
+
|
126
|
+
if isinstance(item, Table):
|
127
|
+
table_html = _export_table_to_html(item)
|
128
|
+
new_segment["data"].append(
|
129
|
+
{
|
130
|
+
"html_seq": table_html,
|
131
|
+
"otsl_seq": "",
|
132
|
+
}
|
133
|
+
)
|
134
|
+
|
135
|
+
segments.append(new_segment)
|
136
|
+
|
137
|
+
return segments
|
138
|
+
|
139
|
+
def _process_page_cells(page: Page):
|
140
|
+
cells = []
|
141
|
+
for cell in page.cells:
|
142
|
+
new_bbox = cell.bbox.to_top_left_origin(
|
143
|
+
page_height=page.size.height
|
144
|
+
).normalized(page_size=page.size)
|
145
|
+
is_ocr = isinstance(cell, OcrCell)
|
146
|
+
ocr_confidence = cell.confidence if is_ocr else 1.0
|
147
|
+
cells.append(
|
148
|
+
{
|
149
|
+
"text": cell.text,
|
150
|
+
"bbox": new_bbox.as_tuple(),
|
151
|
+
"ocr": is_ocr,
|
152
|
+
"ocr_confidence": ocr_confidence,
|
153
|
+
}
|
154
|
+
)
|
155
|
+
return cells
|
156
|
+
|
157
|
+
def _process_page():
|
158
|
+
page_ix = page_no - 1
|
159
|
+
page = doc_result.pages[page_ix]
|
160
|
+
|
161
|
+
page_cells = _process_page_cells(page=page)
|
162
|
+
page_segments = _process_page_segments(doc_items=doc_items, page=page)
|
163
|
+
content_md = doc.export_to_markdown(
|
164
|
+
main_text_start=start_ix, main_text_stop=end_ix
|
165
|
+
)
|
166
|
+
|
167
|
+
return content_text, content_md, page_cells, page_segments, page
|
168
|
+
|
169
|
+
for ix, orig_item in enumerate(doc.main_text):
|
170
|
+
|
171
|
+
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
|
172
|
+
if item is None or item.prov is None or len(item.prov) == 0:
|
173
|
+
_log.debug(f"Skipping item {orig_item}")
|
174
|
+
continue
|
175
|
+
|
176
|
+
item_page = item.prov[0].page
|
177
|
+
|
178
|
+
# Page is complete
|
179
|
+
if page_no > 0 and item_page > page_no:
|
180
|
+
yield _process_page()
|
181
|
+
|
182
|
+
start_ix = ix
|
183
|
+
doc_items = []
|
184
|
+
content_text = ""
|
185
|
+
|
186
|
+
page_no = item_page
|
187
|
+
end_ix = ix
|
188
|
+
doc_items.append((ix, item))
|
189
|
+
if item.text is not None and item.text != "":
|
190
|
+
content_text += item.text + " "
|
191
|
+
|
192
|
+
if len(doc_items) > 0:
|
193
|
+
yield _process_page()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.8.5"
|
3
|
+
version = "1.10.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -23,19 +23,20 @@ packages = [{include = "docling"}]
|
|
23
23
|
[tool.poetry.dependencies]
|
24
24
|
python = "^3.10"
|
25
25
|
pydantic = "^2.0.0"
|
26
|
-
docling-core = "^1.1.
|
26
|
+
docling-core = "^1.1.3"
|
27
27
|
docling-ibm-models = "^1.1.3"
|
28
|
-
deepsearch-glm = "^0.
|
28
|
+
deepsearch-glm = "^0.21.0"
|
29
29
|
filetype = "^1.2.0"
|
30
30
|
pypdfium2 = "^4.30.0"
|
31
31
|
pydantic-settings = "^2.3.0"
|
32
32
|
huggingface_hub = ">=0.23,<1"
|
33
33
|
requests = "^2.32.3"
|
34
34
|
easyocr = "^1.7"
|
35
|
-
docling-parse = "^1.
|
35
|
+
docling-parse = "^1.2.0"
|
36
36
|
certifi = ">=2024.7.4"
|
37
37
|
rtree = "^1.3.0"
|
38
38
|
scipy = "^1.14.1"
|
39
|
+
pyarrow = "^16.1.0"
|
39
40
|
|
40
41
|
[tool.poetry.group.dev.dependencies]
|
41
42
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
@@ -51,6 +52,10 @@ types-requests = "^2.31.0.2"
|
|
51
52
|
flake8-pyproject = "^1.2.3"
|
52
53
|
pylint = "^2.17.5"
|
53
54
|
|
55
|
+
|
56
|
+
[tool.poetry.group.examples.dependencies]
|
57
|
+
datasets = "^2.21.0"
|
58
|
+
|
54
59
|
[build-system]
|
55
60
|
requires = ["poetry-core"]
|
56
61
|
build-backend = "poetry.core.masonry.api"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|