docling 2.13.0__tar.gz → 2.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.13.0 → docling-2.15.0}/PKG-INFO +2 -2
- {docling-2.13.0 → docling-2.15.0}/docling/backend/html_backend.py +2 -2
- {docling-2.13.0 → docling-2.15.0}/docling/backend/mspowerpoint_backend.py +15 -11
- docling-2.15.0/docling/backend/xml/pubmed_backend.py +592 -0
- {docling-2.13.0 → docling-2.15.0}/docling/cli/main.py +14 -2
- {docling-2.13.0 → docling-2.15.0}/docling/datamodel/base_models.py +3 -0
- {docling-2.13.0 → docling-2.15.0}/docling/datamodel/document.py +19 -4
- {docling-2.13.0 → docling-2.15.0}/docling/document_converter.py +13 -3
- {docling-2.13.0 → docling-2.15.0}/docling/models/base_ocr_model.py +14 -1
- {docling-2.13.0 → docling-2.15.0}/docling/models/layout_model.py +18 -25
- {docling-2.13.0 → docling-2.15.0}/docling/models/table_structure_model.py +20 -0
- {docling-2.13.0 → docling-2.15.0}/pyproject.toml +2 -2
- {docling-2.13.0 → docling-2.15.0}/LICENSE +0 -0
- {docling-2.13.0 → docling-2.15.0}/README.md +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/md_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/chunking/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/cli/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/datamodel/settings.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/exceptions.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/base_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/ds_glm_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/easyocr_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/py.typed +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/utils/__init__.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/utils/export.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/utils/glm_utils.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/utils/profiling.py +0 -0
- {docling-2.13.0 → docling-2.15.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.13.0
|
3
|
+
Version: 2.15.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -26,7 +26,7 @@ Provides-Extra: tesserocr
|
|
26
26
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
27
27
|
Requires-Dist: certifi (>=2024.7.4)
|
28
28
|
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
29
|
-
Requires-Dist: docling-core[chunking] (>=2.
|
29
|
+
Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
|
30
30
|
Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
|
31
31
|
Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
|
32
32
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
@@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
37
37
|
|
38
38
|
try:
|
39
39
|
if isinstance(self.path_or_stream, BytesIO):
|
40
|
-
text_stream = self.path_or_stream.getvalue()
|
40
|
+
text_stream = self.path_or_stream.getvalue()
|
41
41
|
self.soup = BeautifulSoup(text_stream, "html.parser")
|
42
42
|
if isinstance(self.path_or_stream, Path):
|
43
|
-
with open(self.path_or_stream, "
|
43
|
+
with open(self.path_or_stream, "rb") as f:
|
44
44
|
html_content = f.read()
|
45
45
|
self.soup = BeautifulSoup(html_content, "html.parser")
|
46
46
|
except Exception as e:
|
@@ -16,7 +16,7 @@ from docling_core.types.doc import (
|
|
16
16
|
TableCell,
|
17
17
|
TableData,
|
18
18
|
)
|
19
|
-
from PIL import Image
|
19
|
+
from PIL import Image, UnidentifiedImageError
|
20
20
|
from pptx import Presentation
|
21
21
|
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
|
22
22
|
|
@@ -120,6 +120,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
120
120
|
bullet_type = "None"
|
121
121
|
list_text = ""
|
122
122
|
list_label = GroupLabel.LIST
|
123
|
+
doc_label = DocItemLabel.LIST_ITEM
|
123
124
|
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
124
125
|
|
125
126
|
# Identify if shape contains lists
|
@@ -276,16 +277,19 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|
276
277
|
im_dpi, _ = image.dpi
|
277
278
|
|
278
279
|
# Open it with PIL
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
280
|
+
try:
|
281
|
+
pil_image = Image.open(BytesIO(image_bytes))
|
282
|
+
|
283
|
+
# shape has picture
|
284
|
+
prov = self.generate_prov(shape, slide_ind, "")
|
285
|
+
doc.add_picture(
|
286
|
+
parent=parent_slide,
|
287
|
+
image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
|
288
|
+
caption=None,
|
289
|
+
prov=prov,
|
290
|
+
)
|
291
|
+
except (UnidentifiedImageError, OSError) as e:
|
292
|
+
_log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
|
289
293
|
return
|
290
294
|
|
291
295
|
def handle_tables(self, shape, parent_slide, slide_ind, doc):
|
@@ -0,0 +1,592 @@
|
|
1
|
+
import logging
|
2
|
+
from io import BytesIO
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Any, Set, Union
|
5
|
+
|
6
|
+
import lxml
|
7
|
+
from bs4 import BeautifulSoup
|
8
|
+
from docling_core.types.doc import (
|
9
|
+
DocItemLabel,
|
10
|
+
DoclingDocument,
|
11
|
+
DocumentOrigin,
|
12
|
+
GroupLabel,
|
13
|
+
TableCell,
|
14
|
+
TableData,
|
15
|
+
)
|
16
|
+
from lxml import etree
|
17
|
+
from typing_extensions import TypedDict, override
|
18
|
+
|
19
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
20
|
+
from docling.datamodel.base_models import InputFormat
|
21
|
+
from docling.datamodel.document import InputDocument
|
22
|
+
|
23
|
+
_log = logging.getLogger(__name__)
|
24
|
+
|
25
|
+
|
26
|
+
class Paragraph(TypedDict):
    """One body paragraph together with the section titles enclosing it."""

    text: str
    # Titles are collected walking upwards from the paragraph, so the
    # innermost section title comes first.
    headers: list[str]


class Author(TypedDict):
    """An article author and the affiliations referenced from the entry."""

    name: str
    affiliation_names: list[str]


class Table(TypedDict):
    """A table wrapper: label, caption text and serialized table markup."""

    label: str
    caption: str
    # Serialized ``<table>`` element; empty when the wrapper has no table.
    content: str


class FigureCaption(TypedDict):
    """Label and caption text of a figure."""

    label: str
    caption: str


class Reference(TypedDict):
    """A bibliography entry reduced to the fields used for rendering."""

    # All author names of the entry joined into a single string.
    author_names: str
    title: str
    journal: str
    year: str


class XMLComponents(TypedDict):
    """Everything extracted from the XML tree before the document is built."""

    title: str
    authors: list[Author]
    abstract: str
    paragraphs: list[Paragraph]
    tables: list[Table]
    figure_captions: list[FigureCaption]
    references: list[Reference]
|
62
|
+
|
63
|
+
|
64
|
+
class PubMedDocumentBackend(DeclarativeDocumentBackend):
|
65
|
+
"""
|
66
|
+
The code from this document backend has been developed by modifying parts of the PubMed Parser library (version 0.5.0, released on 12.08.2024):
|
67
|
+
Achakulvisut et al., (2020).
|
68
|
+
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML Dataset XML Dataset.
|
69
|
+
Journal of Open Source Software, 5(46), 1979,
|
70
|
+
https://doi.org/10.21105/joss.01979
|
71
|
+
"""
|
72
|
+
|
73
|
+
@override
|
74
|
+
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Parse the XML source and record whether it declares a JATS doctype.

    Args:
        in_doc: The input document this backend belongs to.
        path_or_stream: File path or in-memory stream holding the XML.

    Raises:
        RuntimeError: If the source cannot be parsed as XML.
    """
    super().__init__(in_doc, path_or_stream)
    self.path_or_stream = path_or_stream

    # Initialize parents for the document hierarchy: maps a heading text
    # (or a fixed key such as "Title") to the item content is attached to.
    self.parents: dict = {}

    self.valid = False
    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.seek(0)
        self.tree: lxml.etree._ElementTree = etree.parse(self.path_or_stream)
    except Exception as exc:
        raise RuntimeError(
            f"Could not initialize PubMed backend for file with hash {self.document_hash}."
        ) from exc

    # public_id is None when the document has no DOCTYPE public identifier.
    # Such a file is well-formed XML that simply is not JATS, so it must end
    # up as "not valid" rather than as an initialization error.
    public_id = self.tree.docinfo.public_id or ""
    self.valid = "/NLM//DTD JATS" in public_id
|
92
|
+
|
93
|
+
@override
|
94
|
+
def is_valid(self) -> bool:
    """Return the validity flag stored on the instance."""
    return self.valid
|
96
|
+
|
97
|
+
@classmethod
|
98
|
+
@override
|
99
|
+
def supports_pagination(cls) -> bool:
    """Return ``False``: this backend does not expose pages."""
    return False
|
101
|
+
|
102
|
+
@override
|
103
|
+
def unload(self):
    """Close an in-memory stream, if any, and drop the source reference."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()
    self.path_or_stream = None
|
107
|
+
|
108
|
+
@classmethod
|
109
|
+
@override
|
110
|
+
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats handled by this backend."""
    return {InputFormat.XML_PUBMED}
|
112
|
+
|
113
|
+
@override
|
114
|
+
def convert(self) -> DoclingDocument:
    """Build a ``DoclingDocument`` from the parsed XML tree.

    An empty document is created first, then the XML is parsed into its
    components and those components are added to the document.
    """
    origin = DocumentOrigin(
        filename=self.file.name or "file",
        mimetype="application/xml",
        binary_hash=self.document_hash,
    )
    empty_doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

    _log.debug("Trying to convert PubMed XML document...")

    components: XMLComponents = self._parse()
    return self._populate_document(empty_doc, components)
|
131
|
+
|
132
|
+
def _parse_title(self) -> str:
    """Return the article title as plain text.

    The text fragments of the first ``title-group/article-title`` element
    are stripped of newlines and joined with single spaces.

    Returns:
        The title, or an empty string when the document has no
        ``<article-title>`` element.
    """
    title_nodes = self.tree.xpath(".//title-group/article-title")
    if not title_nodes:
        # Indexing an empty result would raise IndexError and abort the
        # whole conversion for a document that merely lacks a title.
        return ""
    return " ".join(t.replace("\n", "") for t in title_nodes[0].itertext())
|
140
|
+
|
141
|
+
def _parse_authors(self) -> list[Author]:
    """Extract the authors and the affiliations each of them references.

    Returns:
        One entry per ``contrib`` element of type ``author``. ``name`` is
        "<surname> <given-names>" built from whichever parts are present;
        when neither is available the text of a ``<collab>`` child is used,
        and the name is empty if that is missing too.
    """
    # Map affiliation ids to their display text in a single pass.
    affiliations_by_id: dict[str, str] = {}
    for affiliation_node in self.tree.xpath(".//aff[@id]"):
        affiliations_by_id[affiliation_node.attrib["id"]] = ": ".join(
            t for t in affiliation_node.itertext() if t != "\n"
        )

    authors: list[Author] = []
    for author_node in self.tree.xpath(
        './/contrib-group/contrib[@contrib-type="author"]'
    ):
        # Affiliation names: unresolved or missing references are ignored.
        referenced_ids = [
            xref.get("rid") for xref in author_node.xpath('xref[@ref-type="aff"]')
        ]
        affiliation_names = [
            affiliations_by_id[ref_id]
            for ref_id in referenced_ids
            if ref_id in affiliations_by_id
        ]

        # Name: tolerate entries lacking one or both name parts instead of
        # failing on a missing node or on concatenating None.
        name_parts = []
        for path in ("name/surname", "name/given-names"):
            found = author_node.xpath(path)
            if found and found[0].text:
                name_parts.append(found[0].text)
        name = " ".join(name_parts)
        if not name:
            # Group authors are given as <collab> rather than <name>.
            collab_nodes = author_node.xpath("collab")
            if collab_nodes:
                name = "".join(collab_nodes[0].itertext()).strip()

        authors.append({"name": name, "affiliation_names": affiliation_names})
    return authors
|
180
|
+
|
181
|
+
def _parse_abstract(self) -> str:
    """Concatenate the text of every ``<abstract>`` element, minus newlines."""
    return "".join(
        fragment.replace("\n", "")
        for abstract_node in self.tree.xpath(".//abstract")
        for fragment in abstract_node.itertext()
    )
|
188
|
+
|
189
|
+
def _parse_main_text(self) -> list[Paragraph]:
    """Collect the body paragraphs with the titles of their enclosing sections.

    Paragraphs whose element path contains ``/caption`` are skipped. For
    each remaining paragraph the ``title`` child of the parent, grandparent,
    and so on is gathered until a level without a title is reached, so the
    innermost title comes first in ``headers``.
    """
    collected: list[Paragraph] = []
    for p_node in self.tree.xpath("//body//p"):
        # Skip captions
        if "/caption" in p_node.getroottree().getpath(p_node):
            continue

        body_text = "".join(t.replace("\n", "") for t in p_node.itertext())

        # Walk upwards one level at a time: "../title", "../../title", ...
        section_titles: list[str] = []
        title_path = "../title"
        title_nodes = p_node.xpath(title_path)
        while len(title_nodes) > 0:
            section_titles.append(
                "".join(t.replace("\n", "") for t in title_nodes[0].itertext())
            )
            title_path = "../" + title_path
            title_nodes = p_node.xpath(title_path)

        collected.append({"text": body_text, "headers": section_titles})

    return collected
|
219
|
+
|
220
|
+
def _parse_tables(self) -> list[Table]:
    """Extract label, caption and serialized markup of every body table.

    Returns:
        One entry per ``table-wrap`` element below ``body``. Fields that
        cannot be found stay empty strings, never ``None``.
    """
    tables: list[Table] = []
    for table_node in self.tree.xpath(".//body//table-wrap"):
        table: Table = {"label": "", "caption": "", "content": ""}

        # Content: a direct <table>, else the one inside <alternatives>.
        content_nodes = table_node.xpath("table") or table_node.xpath(
            "alternatives/table"
        )
        if content_nodes:
            table["content"] = etree.tostring(content_nodes[0]).decode("utf-8")

        # Caption: prefer the caption paragraph, fall back to its title.
        caption_nodes = table_node.xpath("caption/p") or table_node.xpath(
            "caption/title"
        )
        if caption_nodes:
            table["caption"] = "".join(
                t.replace("\n", "") for t in caption_nodes[0].itertext()
            )

        # Label: gather all text so a label wrapped in inline markup (where
        # ``.text`` is None) still yields a string.
        label_nodes = table_node.xpath("label")
        if label_nodes:
            table["label"] = "".join(label_nodes[0].itertext())

        tables.append(table)
    return tables
|
253
|
+
|
254
|
+
def _parse_figure_captions(self) -> list[FigureCaption]:
    """Extract the label and caption text of every ``<fig>`` element.

    The caption is built from the children of the first ``<caption>``
    element: each child's text, with newlines removed, followed by a
    newline character.
    """
    results: list[FigureCaption] = []
    for fig in self.tree.xpath(".//fig"):
        entry: FigureCaption = {"caption": "", "label": ""}

        # Label
        label_nodes = fig.xpath("label")
        if label_nodes:
            entry["label"] = "".join(
                t.replace("\n", "") for t in label_nodes[0].itertext()
            )

        # Caption
        caption_nodes = fig.xpath("caption")
        if caption_nodes:
            entry["caption"] = "".join(
                "".join(t.replace("\n", "") for t in child.itertext()) + "\n"
                for child in caption_nodes[0]
            )

        results.append(entry)

    return results
|
288
|
+
|
289
|
+
def _parse_references(self) -> list[Reference]:
    """Extract author names, title, journal and year of each reference.

    A ``ref`` is considered only if it contains a ``mixed-citation``,
    ``element-citation`` or ``citation`` child (first match in that order)
    carrying a ``citation-type`` or ``publication-type`` attribute.

    Returns:
        One entry per accepted reference. Every field is a string; fields
        that are absent or have no text are empty.
    """

    def _first_text(node: Any, path: str) -> str:
        """Text of the first match of ``path`` below ``node``, or ""."""
        found = node.xpath(path)
        return (found[0].text or "") if found else ""

    references: list[Reference] = []
    for reference_node_abs in self.tree.xpath(".//ref-list/ref"):
        reference_node: Any = None
        for tag in ("mixed-citation", "element-citation", "citation"):
            matches = reference_node_abs.xpath(tag)
            if matches:
                reference_node = matches[0]
                break

        if reference_node is None:
            continue

        if not any(
            key in ("citation-type", "publication-type")
            for key in reference_node.attrib.keys()
        ):
            continue

        # Author names
        names: list[str] = []
        direct_names = reference_node.xpath("name")
        person_groups = reference_node.xpath("person-group")
        if direct_names:
            for name_node in direct_names:
                names.append(
                    " ".join(
                        part.text for part in name_node if part.text is not None
                    )
                )
        elif person_groups:
            for member in person_groups[0]:
                # Children such as <etal/> or <collab> carry no name parts;
                # skip them rather than failing on a missing element.
                parts = [
                    text
                    for text in (
                        _first_text(member, "given-names"),
                        _first_text(member, "surname"),
                    )
                    if text
                ]
                if parts:
                    names.append(" ".join(parts))

        title_nodes = reference_node.xpath("article-title")
        source_nodes = reference_node.xpath("source")
        year_nodes = reference_node.xpath("year")

        title = ""
        if title_nodes:
            title = " ".join(
                t.replace("\n", " ") for t in title_nodes[0].itertext()
            )

        if not (title_nodes or source_nodes or year_nodes):
            # Unstructured citation: use its leading text as the title. The
            # journal is read from <source>, so that is the element checked
            # here as well.
            title = (reference_node.text or "").strip()

        references.append(
            {
                "author_names": "; ".join(names),
                "title": title,
                "journal": (source_nodes[0].text or "") if source_nodes else "",
                "year": (year_nodes[0].text or "") if year_nodes else "",
            }
        )
    return references
|
357
|
+
|
358
|
+
def _parse(self) -> XMLComponents:
    """Run every section parser and bundle the results in one mapping."""
    title = self._parse_title()
    authors = self._parse_authors()
    abstract = self._parse_abstract()
    paragraphs = self._parse_main_text()
    tables = self._parse_tables()
    figure_captions = self._parse_figure_captions()
    references = self._parse_references()
    return {
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "paragraphs": paragraphs,
        "tables": tables,
        "figure_captions": figure_captions,
        "references": references,
    }
|
370
|
+
|
371
|
+
def _populate_document(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> DoclingDocument:
    """Add all parsed components to ``doc`` and return it.

    Title, authors, abstract and main text are always added; the tables
    and figures sections only when there is something to put in them;
    references are added last.
    """
    for add_section in (
        self._add_title,
        self._add_authors,
        self._add_abstract,
        self._add_main_text,
    ):
        add_section(doc, xml_components)

    optional_sections = (
        ("tables", self._add_tables),
        ("figure_captions", self._add_figure_captions),
    )
    for key, add_section in optional_sections:
        if xml_components[key]:
            add_section(doc, xml_components)

    self._add_references(doc, xml_components)
    return doc
|
387
|
+
|
388
|
+
def _add_figure_captions(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add a "Figures" heading with one captioned picture per figure."""
    figures_heading = doc.add_heading(parent=self.parents["Title"], text="Figures")
    self.parents["Figures"] = figures_heading

    for component in xml_components["figure_captions"]:
        # Caption text has the form "<label>: <caption>".
        caption_text = component["label"] + ": " + component["caption"].strip()
        caption_item = doc.add_text(label=DocItemLabel.CAPTION, text=caption_text)
        doc.add_picture(parent=self.parents["Figures"], caption=caption_item)
|
408
|
+
|
409
|
+
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add the title as a top-level item and remember it as ``parents["Title"]``."""
    title_item = doc.add_text(
        parent=None,
        text=xml_components["title"],
        label=DocItemLabel.TITLE,
    )
    self.parents["Title"] = title_item
|
416
|
+
|
417
|
+
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add one paragraph under the title listing authors and affiliations.

    Each author contributes two entries, the name and the comma-joined
    affiliation names; all entries are joined with "; ".
    """
    entries: list = [
        entry
        for author in xml_components["authors"]
        for entry in (author["name"], ", ".join(author["affiliation_names"]))
    ]

    doc.add_text(
        parent=self.parents["Title"],
        text="; ".join(entries),
        label=DocItemLabel.PARAGRAPH,
    )
|
430
|
+
|
431
|
+
def _add_abstract(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add an "Abstract" heading under the title, followed by the abstract text."""
    abstract_text: str = xml_components["abstract"]
    abstract_heading = doc.add_heading(parent=self.parents["Title"], text="Abstract")
    self.parents["Abstract"] = abstract_heading
    doc.add_text(
        parent=abstract_heading,
        text=abstract_text,
        label=DocItemLabel.TEXT,
    )
|
444
|
+
|
445
|
+
def _add_main_text(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add the body paragraphs below headings created from their headers.

    A heading is created the first time its text is encountered and is
    attached to the heading of the enclosing header when that one is known,
    otherwise to the title. Each paragraph is then attached to the heading
    of its innermost header.
    """
    # NOTE(review): paragraphs without any header are skipped entirely, so
    # their text does not reach the document - confirm this is intended.
    seen_headers: set = set()
    for paragraph in xml_components["paragraphs"]:
        headers = paragraph["headers"]
        if not headers:
            continue

        # Header: ``headers`` is innermost-first, so walk it outermost-first.
        outermost_first = headers[::-1]
        for depth, header in enumerate(outermost_first):
            if header in seen_headers:
                continue
            seen_headers.add(header)

            if depth >= 1 and outermost_first[depth - 1] in self.parents:
                heading_parent = self.parents[outermost_first[depth - 1]]
            else:
                heading_parent = self.parents["Title"]

            self.parents[header] = doc.add_heading(parent=heading_parent, text=header)

        # Paragraph text
        innermost = headers[0]
        if innermost in self.parents:
            text_parent = self.parents[innermost]
        else:
            text_parent = self.parents["Title"]

        doc.add_text(parent=text_parent, label=DocItemLabel.TEXT, text=paragraph["text"])
|
476
|
+
|
477
|
+
def _add_references(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add a "References" heading with a list holding one item per reference.

    Each item has the form "<authors>. <title>. <journal> (<year>)", with
    missing parts left out; references that yield no text are skipped.
    """
    references_heading = doc.add_heading(
        parent=self.parents["Title"], text="References"
    )
    self.parents["References"] = references_heading
    reference_list = doc.add_group(
        parent=references_heading, label=GroupLabel.LIST, name="list"
    )

    for reference in xml_components["references"]:
        pieces: list = []

        authors = reference["author_names"]
        if authors:
            pieces.append(authors + ". ")

        title = reference["title"]
        if title:
            # Terminate the title with a period unless it already has one.
            pieces.append(title if title.endswith(".") else title + ".")
            pieces.append(" ")

        journal = reference["journal"]
        if journal:
            pieces.append(journal)

        year = reference["year"]
        if year:
            pieces.append(" (" + year + ")")

        reference_text: str = "".join(pieces)
        if not reference_text:
            _log.debug(f"Skipping reference for: {str(self.file)}")
            continue

        doc.add_list_item(text=reference_text, enumerated=False, parent=reference_list)
|
511
|
+
|
512
|
+
def _add_tables(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add a "Tables" heading under the title and every convertible table.

    Conversion is best effort: a table that cannot be converted is skipped
    so that one unsupported table does not fail the whole document.
    """
    self.parents["Tables"] = doc.add_heading(
        parent=self.parents["Title"], text="Tables"
    )
    for table_xml_component in xml_components["tables"]:
        try:
            self._add_table(doc, table_xml_component)
        except Exception:
            # Keep the traceback in the debug log so the reason a table was
            # dropped can still be diagnosed.
            _log.debug(
                f"Skipping unsupported table for: {str(self.file)}", exc_info=True
            )
|
523
|
+
|
524
|
+
def _add_table(self, doc: DoclingDocument, table_xml_component: Table) -> None:
    """Convert one serialized table into a table item with its caption.

    Tables without table markup and tables containing a nested table are
    skipped. Cells are placed left to right into the first free column of
    their row, taking the cells spanned from earlier rows into account.

    Args:
        doc: Document receiving the table and its caption.
        table_xml_component: Label, caption and serialized table markup.
    """

    def _span(html_cell, attribute: str) -> int:
        """Read a span attribute; missing or malformed values count as 1."""
        try:
            return max(1, int(html_cell.get(attribute, 1)))
        except (TypeError, ValueError):
            return 1

    soup = BeautifulSoup(table_xml_component["content"], "html.parser")
    table_tag = soup.find("table")
    if table_tag is None:
        # E.g. a table wrapper that only holds an image of the table.
        _log.debug(f"Skipping table without content for: {str(self.file)}")
        return

    if table_tag.find("table"):
        _log.debug(f"Skipping nested table for: {str(self.file)}")
        return

    rows = table_tag.find_all("tr")
    num_rows = len(rows)

    # Occupied (row, col) positions are tracked in a set rather than a
    # fixed-size grid: a cell pushed to the right by a row span from above
    # can land beyond the widest row's own column count.
    occupied: set = set()
    table_cells: list = []
    num_cols = 0

    for row_idx, row in enumerate(rows):
        # For each row, find all the column cells (both <td> and <th>)
        html_cells = row.find_all(["td", "th"])

        # A row consisting only of <th> cells is a column-header row.
        col_header = all(html_cell.name == "th" for html_cell in html_cells)

        col_idx = 0
        for html_cell in html_cells:
            text = html_cell.text

            col_span = _span(html_cell, "colspan")
            # A row span may not reach past the last row of the table.
            row_span = min(_span(html_cell, "rowspan"), num_rows - row_idx)

            while (row_idx, col_idx) in occupied:
                col_idx += 1
            occupied.update(
                (row_idx + r, col_idx + c)
                for r in range(row_span)
                for c in range(col_span)
            )
            num_cols = max(num_cols, col_idx + col_span)

            table_cells.append(
                TableCell(
                    text=text,
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + row_span,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + col_span,
                    col_header=col_header,
                    row_header=((not col_header) and html_cell.name == "th"),
                )
            )

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=table_cells)

    # Label and caption may be missing; never concatenate None.
    caption_text = (
        (table_xml_component["label"] or "")
        + ": "
        + (table_xml_component["caption"] or "")
    )
    table_caption = doc.add_text(label=DocItemLabel.CAPTION, text=caption_text)
    doc.add_table(data=data, parent=self.parents["Tables"], caption=table_caption)
|
@@ -164,6 +164,11 @@ def convert(
|
|
164
164
|
to_formats: List[OutputFormat] = typer.Option(
|
165
165
|
None, "--to", help="Specify output formats. Defaults to Markdown."
|
166
166
|
),
|
167
|
+
headers: str = typer.Option(
|
168
|
+
None,
|
169
|
+
"--headers",
|
170
|
+
help="Specify http request headers used when fetching url input sources in the form of a JSON string",
|
171
|
+
),
|
167
172
|
image_export_mode: Annotated[
|
168
173
|
ImageRefMode,
|
169
174
|
typer.Option(
|
@@ -279,12 +284,19 @@ def convert(
|
|
279
284
|
if from_formats is None:
|
280
285
|
from_formats = [e for e in InputFormat]
|
281
286
|
|
287
|
+
parsed_headers: Optional[Dict[str, str]] = None
|
288
|
+
if headers is not None:
|
289
|
+
headers_t = TypeAdapter(Dict[str, str])
|
290
|
+
parsed_headers = headers_t.validate_json(headers)
|
291
|
+
|
282
292
|
with tempfile.TemporaryDirectory() as tempdir:
|
283
293
|
input_doc_paths: List[Path] = []
|
284
294
|
for src in input_sources:
|
285
295
|
try:
|
286
296
|
# check if we can fetch some remote url
|
287
|
-
source = resolve_source_to_path(
|
297
|
+
source = resolve_source_to_path(
|
298
|
+
source=src, headers=parsed_headers, workdir=Path(tempdir)
|
299
|
+
)
|
288
300
|
input_doc_paths.append(source)
|
289
301
|
except FileNotFoundError:
|
290
302
|
err_console.print(
|
@@ -390,7 +402,7 @@ def convert(
|
|
390
402
|
start_time = time.time()
|
391
403
|
|
392
404
|
conv_results = doc_converter.convert_all(
|
393
|
-
input_doc_paths, raises_on_error=abort_on_error
|
405
|
+
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
|
394
406
|
)
|
395
407
|
|
396
408
|
output.mkdir(parents=True, exist_ok=True)
|
@@ -33,6 +33,7 @@ class InputFormat(str, Enum):
|
|
33
33
|
DOCX = "docx"
|
34
34
|
PPTX = "pptx"
|
35
35
|
HTML = "html"
|
36
|
+
XML_PUBMED = "xml_pubmed"
|
36
37
|
IMAGE = "image"
|
37
38
|
PDF = "pdf"
|
38
39
|
ASCIIDOC = "asciidoc"
|
@@ -55,6 +56,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
55
56
|
InputFormat.PDF: ["pdf"],
|
56
57
|
InputFormat.MD: ["md"],
|
57
58
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
59
|
+
InputFormat.XML_PUBMED: ["xml", "nxml"],
|
58
60
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
59
61
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
60
62
|
InputFormat.XLSX: ["xlsx"],
|
@@ -72,6 +74,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
72
74
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
73
75
|
],
|
74
76
|
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
77
|
+
InputFormat.XML_PUBMED: ["application/xml"],
|
75
78
|
InputFormat.IMAGE: [
|
76
79
|
"image/png",
|
77
80
|
"image/jpeg",
|
@@ -227,13 +227,18 @@ class _DummyBackend(AbstractDocumentBackend):
|
|
227
227
|
class _DocumentConversionInput(BaseModel):
|
228
228
|
|
229
229
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
230
|
+
headers: Optional[Dict[str, str]] = None
|
230
231
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
231
232
|
|
232
233
|
def docs(
|
233
234
|
self, format_options: Dict[InputFormat, "FormatOption"]
|
234
235
|
) -> Iterable[InputDocument]:
|
235
236
|
for item in self.path_or_stream_iterator:
|
236
|
-
obj =
|
237
|
+
obj = (
|
238
|
+
resolve_source_to_stream(item, self.headers)
|
239
|
+
if isinstance(item, str)
|
240
|
+
else item
|
241
|
+
)
|
237
242
|
format = self._guess_format(obj)
|
238
243
|
backend: Type[AbstractDocumentBackend]
|
239
244
|
if format not in format_options.keys():
|
@@ -292,8 +297,7 @@ class _DocumentConversionInput(BaseModel):
|
|
292
297
|
mime = mime or "text/plain"
|
293
298
|
formats = MimeTypeToFormat.get(mime, [])
|
294
299
|
if formats:
|
295
|
-
|
296
|
-
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
|
300
|
+
if len(formats) == 1 and mime not in ("text/plain"):
|
297
301
|
return formats[0]
|
298
302
|
else: # ambiguity in formats
|
299
303
|
return _DocumentConversionInput._guess_from_content(
|
@@ -325,6 +329,12 @@ class _DocumentConversionInput(BaseModel):
|
|
325
329
|
):
|
326
330
|
input_format = InputFormat.XML_USPTO
|
327
331
|
|
332
|
+
if (
|
333
|
+
InputFormat.XML_PUBMED in formats
|
334
|
+
and "/NLM//DTD JATS" in xml_doctype
|
335
|
+
):
|
336
|
+
input_format = InputFormat.XML_PUBMED
|
337
|
+
|
328
338
|
elif mime == "text/plain":
|
329
339
|
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
330
340
|
input_format = InputFormat.XML_USPTO
|
@@ -340,7 +350,6 @@ class _DocumentConversionInput(BaseModel):
|
|
340
350
|
mime = FormatToMimeType[InputFormat.HTML][0]
|
341
351
|
elif ext in FormatToExtensions[InputFormat.MD]:
|
342
352
|
mime = FormatToMimeType[InputFormat.MD][0]
|
343
|
-
|
344
353
|
return mime
|
345
354
|
|
346
355
|
@staticmethod
|
@@ -370,4 +379,10 @@ class _DocumentConversionInput(BaseModel):
|
|
370
379
|
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
371
380
|
return "text/html"
|
372
381
|
|
382
|
+
p = re.compile(
|
383
|
+
r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
|
384
|
+
)
|
385
|
+
if p.search(content_str):
|
386
|
+
return "application/xml"
|
387
|
+
|
373
388
|
return None
|
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
15
15
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
16
16
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
17
17
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
18
|
+
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
18
19
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
19
20
|
from docling.datamodel.base_models import (
|
20
21
|
ConversionStatus,
|
@@ -88,6 +89,11 @@ class PatentUsptoFormatOption(FormatOption):
|
|
88
89
|
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
89
90
|
|
90
91
|
|
92
|
+
class XMLPubMedFormatOption(FormatOption):
|
93
|
+
pipeline_cls: Type = SimplePipeline
|
94
|
+
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
|
95
|
+
|
96
|
+
|
91
97
|
class ImageFormatOption(FormatOption):
|
92
98
|
pipeline_cls: Type = StandardPdfPipeline
|
93
99
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
@@ -121,6 +127,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
121
127
|
InputFormat.XML_USPTO: FormatOption(
|
122
128
|
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
123
129
|
),
|
130
|
+
InputFormat.XML_PUBMED: FormatOption(
|
131
|
+
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
|
132
|
+
),
|
124
133
|
InputFormat.IMAGE: FormatOption(
|
125
134
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
126
135
|
),
|
@@ -167,16 +176,17 @@ class DocumentConverter:
|
|
167
176
|
def convert(
|
168
177
|
self,
|
169
178
|
source: Union[Path, str, DocumentStream], # TODO review naming
|
179
|
+
headers: Optional[Dict[str, str]] = None,
|
170
180
|
raises_on_error: bool = True,
|
171
181
|
max_num_pages: int = sys.maxsize,
|
172
182
|
max_file_size: int = sys.maxsize,
|
173
183
|
) -> ConversionResult:
|
174
|
-
|
175
184
|
all_res = self.convert_all(
|
176
185
|
source=[source],
|
177
186
|
raises_on_error=raises_on_error,
|
178
187
|
max_num_pages=max_num_pages,
|
179
188
|
max_file_size=max_file_size,
|
189
|
+
headers=headers,
|
180
190
|
)
|
181
191
|
return next(all_res)
|
182
192
|
|
@@ -184,6 +194,7 @@ class DocumentConverter:
|
|
184
194
|
def convert_all(
|
185
195
|
self,
|
186
196
|
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
197
|
+
headers: Optional[Dict[str, str]] = None,
|
187
198
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
188
199
|
max_num_pages: int = sys.maxsize,
|
189
200
|
max_file_size: int = sys.maxsize,
|
@@ -193,8 +204,7 @@ class DocumentConverter:
|
|
193
204
|
max_file_size=max_file_size,
|
194
205
|
)
|
195
206
|
conv_input = _DocumentConversionInput(
|
196
|
-
path_or_stream_iterator=source,
|
197
|
-
limits=limits,
|
207
|
+
path_or_stream_iterator=source, limits=limits, headers=headers
|
198
208
|
)
|
199
209
|
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
200
210
|
|
@@ -138,18 +138,31 @@ class BaseOcrModel(BasePageModel):
|
|
138
138
|
|
139
139
|
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
140
140
|
image = copy.deepcopy(page.image)
|
141
|
+
scale_x = image.width / page.size.width
|
142
|
+
scale_y = image.height / page.size.height
|
143
|
+
|
141
144
|
draw = ImageDraw.Draw(image, "RGBA")
|
142
145
|
|
143
146
|
# Draw OCR rectangles as yellow filled rect
|
144
147
|
for rect in ocr_rects:
|
145
148
|
x0, y0, x1, y1 = rect.as_tuple()
|
149
|
+
y0 *= scale_y
|
150
|
+
y1 *= scale_y
|
151
|
+
x0 *= scale_x
|
152
|
+
x1 *= scale_x
|
153
|
+
|
146
154
|
shade_color = (255, 255, 0, 40) # transparent yellow
|
147
155
|
draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)
|
148
156
|
|
149
157
|
# Draw OCR and programmatic cells
|
150
158
|
for tc in page.cells:
|
151
159
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
152
|
-
|
160
|
+
y0 *= scale_y
|
161
|
+
y1 *= scale_y
|
162
|
+
x0 *= scale_x
|
163
|
+
x1 *= scale_x
|
164
|
+
|
165
|
+
color = "gray"
|
153
166
|
if isinstance(tc, OcrCell):
|
154
167
|
color = "magenta"
|
155
168
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
@@ -67,29 +67,9 @@ class LayoutModel(BasePageModel):
|
|
67
67
|
- Right: Clusters including FORM, KEY_VALUE_REGION, and PICTURE.
|
68
68
|
Includes label names and confidence scores for each cluster.
|
69
69
|
"""
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
DocItemLabel.LIST_ITEM: (153, 153, 255), # Light Purple
|
74
|
-
DocItemLabel.FORMULA: (192, 192, 192), # Gray
|
75
|
-
DocItemLabel.TABLE: (255, 204, 204), # Light Pink
|
76
|
-
DocItemLabel.PICTURE: (255, 204, 164), # Light Beige
|
77
|
-
DocItemLabel.SECTION_HEADER: (255, 153, 153), # Light Red
|
78
|
-
DocItemLabel.PAGE_HEADER: (204, 255, 204), # Light Green
|
79
|
-
DocItemLabel.PAGE_FOOTER: (
|
80
|
-
204,
|
81
|
-
255,
|
82
|
-
204,
|
83
|
-
), # Light Green (same as Page-Header)
|
84
|
-
DocItemLabel.TITLE: (255, 153, 153), # Light Red (same as Section-Header)
|
85
|
-
DocItemLabel.FOOTNOTE: (200, 200, 255), # Light Blue
|
86
|
-
DocItemLabel.DOCUMENT_INDEX: (220, 220, 220), # Light Gray
|
87
|
-
DocItemLabel.CODE: (125, 125, 125), # Gray
|
88
|
-
DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193), # Pale Green
|
89
|
-
DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193), # Light Pink
|
90
|
-
DocItemLabel.FORM: (200, 255, 255), # Light Cyan
|
91
|
-
DocItemLabel.KEY_VALUE_REGION: (183, 65, 14), # Rusty orange
|
92
|
-
}
|
70
|
+
scale_x = page.image.width / page.size.width
|
71
|
+
scale_y = page.image.height / page.size.height
|
72
|
+
|
93
73
|
# Filter clusters for left and right images
|
94
74
|
exclude_labels = {
|
95
75
|
DocItemLabel.FORM,
|
@@ -118,6 +98,11 @@ class LayoutModel(BasePageModel):
|
|
118
98
|
cell_color = (0, 0, 0, 40) # Transparent black for cells
|
119
99
|
for tc in c.cells:
|
120
100
|
cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
|
101
|
+
cx0 *= scale_x
|
102
|
+
cx1 *= scale_x
|
103
|
+
cy0 *= scale_y
|
104
|
+
cy1 *= scale_y
|
105
|
+
|
121
106
|
draw.rectangle(
|
122
107
|
[(cx0, cy0), (cx1, cy1)],
|
123
108
|
outline=None,
|
@@ -125,8 +110,16 @@ class LayoutModel(BasePageModel):
|
|
125
110
|
)
|
126
111
|
# Draw cluster rectangle
|
127
112
|
x0, y0, x1, y1 = c.bbox.as_tuple()
|
128
|
-
|
129
|
-
|
113
|
+
x0 *= scale_x
|
114
|
+
x1 *= scale_x
|
115
|
+
y0 *= scale_y
|
116
|
+
y1 *= scale_y
|
117
|
+
|
118
|
+
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
|
119
|
+
cluster_outline_color = (
|
120
|
+
*list(DocItemLabel.get_color(c.label)),
|
121
|
+
255,
|
122
|
+
)
|
130
123
|
draw.rectangle(
|
131
124
|
[(x0, y0), (x1, y1)],
|
132
125
|
outline=cluster_outline_color,
|
@@ -66,23 +66,43 @@ class TableStructureModel(BasePageModel):
|
|
66
66
|
show: bool = False,
|
67
67
|
):
|
68
68
|
assert page._backend is not None
|
69
|
+
assert page.size is not None
|
69
70
|
|
70
71
|
image = (
|
71
72
|
page._backend.get_page_image()
|
72
73
|
) # make new image to avoid drawing on the saved ones
|
74
|
+
|
75
|
+
scale_x = image.width / page.size.width
|
76
|
+
scale_y = image.height / page.size.height
|
77
|
+
|
73
78
|
draw = ImageDraw.Draw(image)
|
74
79
|
|
75
80
|
for table_element in tbl_list:
|
76
81
|
x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
|
82
|
+
y0 *= scale_y
|
83
|
+
y1 *= scale_y
|
84
|
+
x0 *= scale_x
|
85
|
+
x1 *= scale_x
|
86
|
+
|
77
87
|
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
78
88
|
|
79
89
|
for cell in table_element.cluster.cells:
|
80
90
|
x0, y0, x1, y1 = cell.bbox.as_tuple()
|
91
|
+
x0 *= scale_x
|
92
|
+
x1 *= scale_x
|
93
|
+
y0 *= scale_y
|
94
|
+
y1 *= scale_y
|
95
|
+
|
81
96
|
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
82
97
|
|
83
98
|
for tc in table_element.table_cells:
|
84
99
|
if tc.bbox is not None:
|
85
100
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
101
|
+
x0 *= scale_x
|
102
|
+
x1 *= scale_x
|
103
|
+
y0 *= scale_y
|
104
|
+
y1 *= scale_y
|
105
|
+
|
86
106
|
if tc.column_header:
|
87
107
|
width = 3
|
88
108
|
else:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "2.13.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "2.15.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -25,7 +25,7 @@ packages = [{include = "docling"}]
|
|
25
25
|
# actual dependencies:
|
26
26
|
######################
|
27
27
|
python = "^3.9"
|
28
|
-
docling-core = { version = "^2.
|
28
|
+
docling-core = { version = "^2.13.1", extras = ["chunking"] }
|
29
29
|
pydantic = "^2.0.0"
|
30
30
|
docling-ibm-models = "^3.1.0"
|
31
31
|
deepsearch-glm = "^1.0.0"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|