docling 2.12.0__py3-none-any.whl → 2.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/pubmed_backend.py +592 -0
- docling/backend/xml/uspto_backend.py +1888 -0
- docling/datamodel/base_models.py +21 -4
- docling/datamodel/document.py +88 -14
- docling/datamodel/pipeline_options.py +3 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +20 -3
- docling/models/ds_glm_model.py +34 -4
- docling/models/easyocr_model.py +2 -0
- docling/models/layout_model.py +134 -280
- docling/models/page_assemble_model.py +11 -1
- docling/models/table_structure_model.py +25 -29
- docling/pipeline/base_pipeline.py +3 -1
- docling/utils/glm_utils.py +11 -3
- docling/utils/layout_postprocessor.py +666 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/METADATA +2 -2
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/RECORD +21 -18
- docling/utils/layout_utils.py +0 -812
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/LICENSE +0 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/WHEEL +0 -0
- {docling-2.12.0.dist-info → docling-2.14.0.dist-info}/entry_points.txt +0 -0
docling/datamodel/base_models.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from enum import Enum
|
1
|
+
from enum import Enum
|
2
2
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
3
3
|
|
4
4
|
from docling_core.types.doc import (
|
@@ -28,14 +28,18 @@ class ConversionStatus(str, Enum):
|
|
28
28
|
|
29
29
|
|
30
30
|
class InputFormat(str, Enum):
|
31
|
+
"""A document format supported by document backend parsers."""
|
32
|
+
|
31
33
|
DOCX = "docx"
|
32
34
|
PPTX = "pptx"
|
33
35
|
HTML = "html"
|
36
|
+
XML_PUBMED = "xml_pubmed"
|
34
37
|
IMAGE = "image"
|
35
38
|
PDF = "pdf"
|
36
39
|
ASCIIDOC = "asciidoc"
|
37
40
|
MD = "md"
|
38
41
|
XLSX = "xlsx"
|
42
|
+
XML_USPTO = "xml_uspto"
|
39
43
|
|
40
44
|
|
41
45
|
class OutputFormat(str, Enum):
|
@@ -52,9 +56,11 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
52
56
|
InputFormat.PDF: ["pdf"],
|
53
57
|
InputFormat.MD: ["md"],
|
54
58
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
59
|
+
InputFormat.XML_PUBMED: ["xml", "nxml"],
|
55
60
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
56
61
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
57
62
|
InputFormat.XLSX: ["xlsx"],
|
63
|
+
InputFormat.XML_USPTO: ["xml", "txt"],
|
58
64
|
}
|
59
65
|
|
60
66
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
@@ -68,6 +74,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
68
74
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
69
75
|
],
|
70
76
|
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
77
|
+
InputFormat.XML_PUBMED: ["application/xml"],
|
71
78
|
InputFormat.IMAGE: [
|
72
79
|
"image/png",
|
73
80
|
"image/jpeg",
|
@@ -81,10 +88,13 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
81
88
|
InputFormat.XLSX: [
|
82
89
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
83
90
|
],
|
91
|
+
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
84
92
|
}
|
85
93
|
|
86
|
-
MimeTypeToFormat = {
|
87
|
-
mime: fmt for fmt
|
94
|
+
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
95
|
+
mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
|
96
|
+
for value in FormatToMimeType.values()
|
97
|
+
for mime in value
|
88
98
|
}
|
89
99
|
|
90
100
|
|
@@ -122,6 +132,7 @@ class Cluster(BaseModel):
|
|
122
132
|
bbox: BoundingBox
|
123
133
|
confidence: float = 1.0
|
124
134
|
cells: List[Cell] = []
|
135
|
+
children: List["Cluster"] = [] # Add child cluster support
|
125
136
|
|
126
137
|
|
127
138
|
class BasePageElement(BaseModel):
|
@@ -136,6 +147,12 @@ class LayoutPrediction(BaseModel):
|
|
136
147
|
clusters: List[Cluster] = []
|
137
148
|
|
138
149
|
|
150
|
+
class ContainerElement(
|
151
|
+
BasePageElement
|
152
|
+
): # Used for Form and Key-Value-Regions, only for typing.
|
153
|
+
pass
|
154
|
+
|
155
|
+
|
139
156
|
class Table(BasePageElement):
|
140
157
|
otsl_seq: List[str]
|
141
158
|
num_rows: int = 0
|
@@ -175,7 +192,7 @@ class PagePredictions(BaseModel):
|
|
175
192
|
equations_prediction: Optional[EquationPrediction] = None
|
176
193
|
|
177
194
|
|
178
|
-
PageElement = Union[TextElement, Table, FigureElement]
|
195
|
+
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
179
196
|
|
180
197
|
|
181
198
|
class AssembledUnit(BaseModel):
|
docling/datamodel/document.py
CHANGED
@@ -3,7 +3,17 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import
|
6
|
+
from typing import (
|
7
|
+
TYPE_CHECKING,
|
8
|
+
Dict,
|
9
|
+
Iterable,
|
10
|
+
List,
|
11
|
+
Literal,
|
12
|
+
Optional,
|
13
|
+
Set,
|
14
|
+
Type,
|
15
|
+
Union,
|
16
|
+
)
|
7
17
|
|
8
18
|
import filetype
|
9
19
|
from docling_core.types.doc import (
|
@@ -63,7 +73,7 @@ _log = logging.getLogger(__name__)
|
|
63
73
|
|
64
74
|
layout_label_to_ds_type = {
|
65
75
|
DocItemLabel.TITLE: "title",
|
66
|
-
DocItemLabel.DOCUMENT_INDEX: "table
|
76
|
+
DocItemLabel.DOCUMENT_INDEX: "table",
|
67
77
|
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
68
78
|
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
69
79
|
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
@@ -78,6 +88,8 @@ layout_label_to_ds_type = {
|
|
78
88
|
DocItemLabel.PICTURE: "figure",
|
79
89
|
DocItemLabel.TEXT: "paragraph",
|
80
90
|
DocItemLabel.PARAGRAPH: "paragraph",
|
91
|
+
DocItemLabel.FORM: DocItemLabel.FORM.value,
|
92
|
+
DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
|
81
93
|
}
|
82
94
|
|
83
95
|
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
@@ -235,7 +247,7 @@ class _DocumentConversionInput(BaseModel):
|
|
235
247
|
if isinstance(obj, Path):
|
236
248
|
yield InputDocument(
|
237
249
|
path_or_stream=obj,
|
238
|
-
format=format,
|
250
|
+
format=format, # type: ignore[arg-type]
|
239
251
|
filename=obj.name,
|
240
252
|
limits=self.limits,
|
241
253
|
backend=backend,
|
@@ -243,7 +255,7 @@ class _DocumentConversionInput(BaseModel):
|
|
243
255
|
elif isinstance(obj, DocumentStream):
|
244
256
|
yield InputDocument(
|
245
257
|
path_or_stream=obj.stream,
|
246
|
-
format=format,
|
258
|
+
format=format, # type: ignore[arg-type]
|
247
259
|
filename=obj.name,
|
248
260
|
limits=self.limits,
|
249
261
|
backend=backend,
|
@@ -251,15 +263,15 @@ class _DocumentConversionInput(BaseModel):
|
|
251
263
|
else:
|
252
264
|
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
253
265
|
|
254
|
-
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
266
|
+
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
|
255
267
|
content = b"" # empty binary blob
|
256
|
-
|
268
|
+
formats: list[InputFormat] = []
|
257
269
|
|
258
270
|
if isinstance(obj, Path):
|
259
271
|
mime = filetype.guess_mime(str(obj))
|
260
272
|
if mime is None:
|
261
273
|
ext = obj.suffix[1:]
|
262
|
-
mime =
|
274
|
+
mime = _DocumentConversionInput._mime_from_extension(ext)
|
263
275
|
if mime is None: # must guess from
|
264
276
|
with obj.open("rb") as f:
|
265
277
|
content = f.read(1024) # Read first 1KB
|
@@ -274,15 +286,58 @@ class _DocumentConversionInput(BaseModel):
|
|
274
286
|
if ("." in obj.name and not obj.name.startswith("."))
|
275
287
|
else ""
|
276
288
|
)
|
277
|
-
mime =
|
289
|
+
mime = _DocumentConversionInput._mime_from_extension(ext)
|
278
290
|
|
279
|
-
mime = mime or
|
291
|
+
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
280
292
|
mime = mime or "text/plain"
|
293
|
+
formats = MimeTypeToFormat.get(mime, [])
|
294
|
+
if formats:
|
295
|
+
if len(formats) == 1 and mime not in ("text/plain"):
|
296
|
+
return formats[0]
|
297
|
+
else: # ambiguity in formats
|
298
|
+
return _DocumentConversionInput._guess_from_content(
|
299
|
+
content, mime, formats
|
300
|
+
)
|
301
|
+
else:
|
302
|
+
return None
|
303
|
+
|
304
|
+
@staticmethod
|
305
|
+
def _guess_from_content(
|
306
|
+
content: bytes, mime: str, formats: list[InputFormat]
|
307
|
+
) -> Optional[InputFormat]:
|
308
|
+
"""Guess the input format of a document by checking part of its content."""
|
309
|
+
input_format: Optional[InputFormat] = None
|
310
|
+
content_str = content.decode("utf-8")
|
311
|
+
|
312
|
+
if mime == "application/xml":
|
313
|
+
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
314
|
+
if match_doctype:
|
315
|
+
xml_doctype = match_doctype.group()
|
316
|
+
if InputFormat.XML_USPTO in formats and any(
|
317
|
+
item in xml_doctype
|
318
|
+
for item in (
|
319
|
+
"us-patent-application-v4",
|
320
|
+
"us-patent-grant-v4",
|
321
|
+
"us-grant-025",
|
322
|
+
"patent-application-publication",
|
323
|
+
)
|
324
|
+
):
|
325
|
+
input_format = InputFormat.XML_USPTO
|
326
|
+
|
327
|
+
if (
|
328
|
+
InputFormat.XML_PUBMED in formats
|
329
|
+
and "/NLM//DTD JATS" in xml_doctype
|
330
|
+
):
|
331
|
+
input_format = InputFormat.XML_PUBMED
|
332
|
+
|
333
|
+
elif mime == "text/plain":
|
334
|
+
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
335
|
+
input_format = InputFormat.XML_USPTO
|
281
336
|
|
282
|
-
|
283
|
-
return format
|
337
|
+
return input_format
|
284
338
|
|
285
|
-
|
339
|
+
@staticmethod
|
340
|
+
def _mime_from_extension(ext):
|
286
341
|
mime = None
|
287
342
|
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
288
343
|
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
@@ -290,10 +345,21 @@ class _DocumentConversionInput(BaseModel):
|
|
290
345
|
mime = FormatToMimeType[InputFormat.HTML][0]
|
291
346
|
elif ext in FormatToExtensions[InputFormat.MD]:
|
292
347
|
mime = FormatToMimeType[InputFormat.MD][0]
|
293
|
-
|
294
348
|
return mime
|
295
349
|
|
296
|
-
|
350
|
+
@staticmethod
|
351
|
+
def _detect_html_xhtml(
|
352
|
+
content: bytes,
|
353
|
+
) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
|
354
|
+
"""Guess the mime type of an XHTML, HTML, or XML file from its content.
|
355
|
+
|
356
|
+
Args:
|
357
|
+
content: A short piece of a document from its beginning.
|
358
|
+
|
359
|
+
Returns:
|
360
|
+
The mime type of an XHTML, HTML, or XML file, or None if the content does
|
361
|
+
not match any of these formats.
|
362
|
+
"""
|
297
363
|
content_str = content.decode("ascii", errors="ignore").lower()
|
298
364
|
# Remove XML comments
|
299
365
|
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
@@ -302,8 +368,16 @@ class _DocumentConversionInput(BaseModel):
|
|
302
368
|
if re.match(r"<\?xml", content_str):
|
303
369
|
if "xhtml" in content_str[:1000]:
|
304
370
|
return "application/xhtml+xml"
|
371
|
+
else:
|
372
|
+
return "application/xml"
|
305
373
|
|
306
374
|
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
307
375
|
return "text/html"
|
308
376
|
|
377
|
+
p = re.compile(
|
378
|
+
r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
|
379
|
+
)
|
380
|
+
if p.search(content_str):
|
381
|
+
return "application/xml"
|
382
|
+
|
309
383
|
return None
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -139,7 +139,10 @@ class EasyOcrOptions(OcrOptions):
|
|
139
139
|
|
140
140
|
use_gpu: Optional[bool] = None
|
141
141
|
|
142
|
+
confidence_threshold: float = 0.65
|
143
|
+
|
142
144
|
model_storage_directory: Optional[str] = None
|
145
|
+
recog_network: Optional[str] = "standard"
|
143
146
|
download_enabled: bool = True
|
144
147
|
|
145
148
|
model_config = ConfigDict(
|
docling/datamodel/settings.py
CHANGED
docling/document_converter.py
CHANGED
@@ -15,6 +15,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
15
15
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
16
16
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
17
17
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
18
|
+
from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
|
19
|
+
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
18
20
|
from docling.datamodel.base_models import (
|
19
21
|
ConversionStatus,
|
20
22
|
DoclingComponentType,
|
@@ -82,12 +84,22 @@ class HTMLFormatOption(FormatOption):
|
|
82
84
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
83
85
|
|
84
86
|
|
85
|
-
class
|
87
|
+
class PatentUsptoFormatOption(FormatOption):
|
88
|
+
pipeline_cls: Type = SimplePipeline
|
89
|
+
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
90
|
+
|
91
|
+
|
92
|
+
class XMLPubMedFormatOption(FormatOption):
|
93
|
+
pipeline_cls: Type = SimplePipeline
|
94
|
+
backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
|
95
|
+
|
96
|
+
|
97
|
+
class ImageFormatOption(FormatOption):
|
86
98
|
pipeline_cls: Type = StandardPdfPipeline
|
87
99
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
88
100
|
|
89
101
|
|
90
|
-
class
|
102
|
+
class PdfFormatOption(FormatOption):
|
91
103
|
pipeline_cls: Type = StandardPdfPipeline
|
92
104
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
93
105
|
|
@@ -112,6 +124,12 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
112
124
|
InputFormat.HTML: FormatOption(
|
113
125
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
114
126
|
),
|
127
|
+
InputFormat.XML_USPTO: FormatOption(
|
128
|
+
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
129
|
+
),
|
130
|
+
InputFormat.XML_PUBMED: FormatOption(
|
131
|
+
pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
|
132
|
+
),
|
115
133
|
InputFormat.IMAGE: FormatOption(
|
116
134
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
117
135
|
),
|
@@ -162,7 +180,6 @@ class DocumentConverter:
|
|
162
180
|
max_num_pages: int = sys.maxsize,
|
163
181
|
max_file_size: int = sys.maxsize,
|
164
182
|
) -> ConversionResult:
|
165
|
-
|
166
183
|
all_res = self.convert_all(
|
167
184
|
source=[source],
|
168
185
|
raises_on_error=raises_on_error,
|
docling/models/ds_glm_model.py
CHANGED
@@ -22,9 +22,15 @@ from docling_core.types.legacy_doc.document import (
|
|
22
22
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
23
23
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
24
24
|
from PIL import ImageDraw
|
25
|
-
from pydantic import BaseModel, ConfigDict
|
26
|
-
|
27
|
-
from docling.datamodel.base_models import
|
25
|
+
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
26
|
+
|
27
|
+
from docling.datamodel.base_models import (
|
28
|
+
Cluster,
|
29
|
+
ContainerElement,
|
30
|
+
FigureElement,
|
31
|
+
Table,
|
32
|
+
TextElement,
|
33
|
+
)
|
28
34
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
29
35
|
from docling.datamodel.settings import settings
|
30
36
|
from docling.utils.glm_utils import to_docling_document
|
@@ -204,7 +210,31 @@ class GlmModel:
|
|
204
210
|
)
|
205
211
|
],
|
206
212
|
obj_type=layout_label_to_ds_type.get(element.label),
|
207
|
-
|
213
|
+
payload={
|
214
|
+
"children": TypeAdapter(List[Cluster]).dump_python(
|
215
|
+
element.cluster.children
|
216
|
+
)
|
217
|
+
}, # hack to channel child clusters through GLM
|
218
|
+
)
|
219
|
+
)
|
220
|
+
elif isinstance(element, ContainerElement):
|
221
|
+
main_text.append(
|
222
|
+
BaseText(
|
223
|
+
text="",
|
224
|
+
payload={
|
225
|
+
"children": TypeAdapter(List[Cluster]).dump_python(
|
226
|
+
element.cluster.children
|
227
|
+
)
|
228
|
+
}, # hack to channel child clusters through GLM
|
229
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
230
|
+
name=element.label,
|
231
|
+
prov=[
|
232
|
+
Prov(
|
233
|
+
bbox=target_bbox,
|
234
|
+
page=element.page_no + 1,
|
235
|
+
span=[0, 0],
|
236
|
+
)
|
237
|
+
],
|
208
238
|
)
|
209
239
|
)
|
210
240
|
|
docling/models/easyocr_model.py
CHANGED
@@ -66,6 +66,7 @@ class EasyOcrModel(BaseOcrModel):
|
|
66
66
|
lang_list=self.options.lang,
|
67
67
|
gpu=use_gpu,
|
68
68
|
model_storage_directory=self.options.model_storage_directory,
|
69
|
+
recog_network=self.options.recog_network,
|
69
70
|
download_enabled=self.options.download_enabled,
|
70
71
|
verbose=False,
|
71
72
|
)
|
@@ -117,6 +118,7 @@ class EasyOcrModel(BaseOcrModel):
|
|
117
118
|
),
|
118
119
|
)
|
119
120
|
for ix, line in enumerate(result)
|
121
|
+
if line[2] >= self.options.confidence_threshold
|
120
122
|
]
|
121
123
|
all_ocr_cells.extend(cells)
|
122
124
|
|