docling-2.12.0-py3-none-any.whl → docling-2.13.0-py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/uspto_backend.py +1888 -0
- docling/datamodel/base_models.py +18 -4
- docling/datamodel/document.py +77 -13
- docling/datamodel/pipeline_options.py +3 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +11 -2
- docling/models/ds_glm_model.py +34 -4
- docling/models/easyocr_model.py +2 -0
- docling/models/layout_model.py +134 -280
- docling/models/page_assemble_model.py +11 -1
- docling/models/table_structure_model.py +25 -29
- docling/pipeline/base_pipeline.py +3 -1
- docling/utils/glm_utils.py +11 -3
- docling/utils/layout_postprocessor.py +666 -0
- {docling-2.12.0.dist-info → docling-2.13.0.dist-info}/METADATA +2 -2
- {docling-2.12.0.dist-info → docling-2.13.0.dist-info}/RECORD +20 -18
- docling/utils/layout_utils.py +0 -812
- {docling-2.12.0.dist-info → docling-2.13.0.dist-info}/LICENSE +0 -0
- {docling-2.12.0.dist-info → docling-2.13.0.dist-info}/WHEEL +0 -0
- {docling-2.12.0.dist-info → docling-2.13.0.dist-info}/entry_points.txt +0 -0
docling/datamodel/base_models.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from enum import Enum
|
1
|
+
from enum import Enum
|
2
2
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
3
3
|
|
4
4
|
from docling_core.types.doc import (
|
@@ -28,6 +28,8 @@ class ConversionStatus(str, Enum):
|
|
28
28
|
|
29
29
|
|
30
30
|
class InputFormat(str, Enum):
|
31
|
+
"""A document format supported by document backend parsers."""
|
32
|
+
|
31
33
|
DOCX = "docx"
|
32
34
|
PPTX = "pptx"
|
33
35
|
HTML = "html"
|
@@ -36,6 +38,7 @@ class InputFormat(str, Enum):
|
|
36
38
|
ASCIIDOC = "asciidoc"
|
37
39
|
MD = "md"
|
38
40
|
XLSX = "xlsx"
|
41
|
+
XML_USPTO = "xml_uspto"
|
39
42
|
|
40
43
|
|
41
44
|
class OutputFormat(str, Enum):
|
@@ -55,6 +58,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
55
58
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
56
59
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
57
60
|
InputFormat.XLSX: ["xlsx"],
|
61
|
+
InputFormat.XML_USPTO: ["xml", "txt"],
|
58
62
|
}
|
59
63
|
|
60
64
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
@@ -81,10 +85,13 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
81
85
|
InputFormat.XLSX: [
|
82
86
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
83
87
|
],
|
88
|
+
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
84
89
|
}
|
85
90
|
|
86
|
-
MimeTypeToFormat = {
|
87
|
-
mime: fmt for fmt
|
91
|
+
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
92
|
+
mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
|
93
|
+
for value in FormatToMimeType.values()
|
94
|
+
for mime in value
|
88
95
|
}
|
89
96
|
|
90
97
|
|
@@ -122,6 +129,7 @@ class Cluster(BaseModel):
|
|
122
129
|
bbox: BoundingBox
|
123
130
|
confidence: float = 1.0
|
124
131
|
cells: List[Cell] = []
|
132
|
+
children: List["Cluster"] = [] # Add child cluster support
|
125
133
|
|
126
134
|
|
127
135
|
class BasePageElement(BaseModel):
|
@@ -136,6 +144,12 @@ class LayoutPrediction(BaseModel):
|
|
136
144
|
clusters: List[Cluster] = []
|
137
145
|
|
138
146
|
|
147
|
+
class ContainerElement(
|
148
|
+
BasePageElement
|
149
|
+
): # Used for Form and Key-Value-Regions, only for typing.
|
150
|
+
pass
|
151
|
+
|
152
|
+
|
139
153
|
class Table(BasePageElement):
|
140
154
|
otsl_seq: List[str]
|
141
155
|
num_rows: int = 0
|
@@ -175,7 +189,7 @@ class PagePredictions(BaseModel):
|
|
175
189
|
equations_prediction: Optional[EquationPrediction] = None
|
176
190
|
|
177
191
|
|
178
|
-
PageElement = Union[TextElement, Table, FigureElement]
|
192
|
+
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
179
193
|
|
180
194
|
|
181
195
|
class AssembledUnit(BaseModel):
|
docling/datamodel/document.py
CHANGED
@@ -3,7 +3,17 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import
|
6
|
+
from typing import (
|
7
|
+
TYPE_CHECKING,
|
8
|
+
Dict,
|
9
|
+
Iterable,
|
10
|
+
List,
|
11
|
+
Literal,
|
12
|
+
Optional,
|
13
|
+
Set,
|
14
|
+
Type,
|
15
|
+
Union,
|
16
|
+
)
|
7
17
|
|
8
18
|
import filetype
|
9
19
|
from docling_core.types.doc import (
|
@@ -63,7 +73,7 @@ _log = logging.getLogger(__name__)
|
|
63
73
|
|
64
74
|
layout_label_to_ds_type = {
|
65
75
|
DocItemLabel.TITLE: "title",
|
66
|
-
DocItemLabel.DOCUMENT_INDEX: "table
|
76
|
+
DocItemLabel.DOCUMENT_INDEX: "table",
|
67
77
|
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
68
78
|
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
69
79
|
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
@@ -78,6 +88,8 @@ layout_label_to_ds_type = {
|
|
78
88
|
DocItemLabel.PICTURE: "figure",
|
79
89
|
DocItemLabel.TEXT: "paragraph",
|
80
90
|
DocItemLabel.PARAGRAPH: "paragraph",
|
91
|
+
DocItemLabel.FORM: DocItemLabel.FORM.value,
|
92
|
+
DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
|
81
93
|
}
|
82
94
|
|
83
95
|
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
@@ -235,7 +247,7 @@ class _DocumentConversionInput(BaseModel):
|
|
235
247
|
if isinstance(obj, Path):
|
236
248
|
yield InputDocument(
|
237
249
|
path_or_stream=obj,
|
238
|
-
format=format,
|
250
|
+
format=format, # type: ignore[arg-type]
|
239
251
|
filename=obj.name,
|
240
252
|
limits=self.limits,
|
241
253
|
backend=backend,
|
@@ -243,7 +255,7 @@ class _DocumentConversionInput(BaseModel):
|
|
243
255
|
elif isinstance(obj, DocumentStream):
|
244
256
|
yield InputDocument(
|
245
257
|
path_or_stream=obj.stream,
|
246
|
-
format=format,
|
258
|
+
format=format, # type: ignore[arg-type]
|
247
259
|
filename=obj.name,
|
248
260
|
limits=self.limits,
|
249
261
|
backend=backend,
|
@@ -251,15 +263,15 @@ class _DocumentConversionInput(BaseModel):
|
|
251
263
|
else:
|
252
264
|
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
253
265
|
|
254
|
-
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
266
|
+
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
|
255
267
|
content = b"" # empty binary blob
|
256
|
-
|
268
|
+
formats: list[InputFormat] = []
|
257
269
|
|
258
270
|
if isinstance(obj, Path):
|
259
271
|
mime = filetype.guess_mime(str(obj))
|
260
272
|
if mime is None:
|
261
273
|
ext = obj.suffix[1:]
|
262
|
-
mime =
|
274
|
+
mime = _DocumentConversionInput._mime_from_extension(ext)
|
263
275
|
if mime is None: # must guess from
|
264
276
|
with obj.open("rb") as f:
|
265
277
|
content = f.read(1024) # Read first 1KB
|
@@ -274,15 +286,53 @@ class _DocumentConversionInput(BaseModel):
|
|
274
286
|
if ("." in obj.name and not obj.name.startswith("."))
|
275
287
|
else ""
|
276
288
|
)
|
277
|
-
mime =
|
289
|
+
mime = _DocumentConversionInput._mime_from_extension(ext)
|
278
290
|
|
279
|
-
mime = mime or
|
291
|
+
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
280
292
|
mime = mime or "text/plain"
|
293
|
+
formats = MimeTypeToFormat.get(mime, [])
|
294
|
+
if formats:
|
295
|
+
# TODO: remove application/xml case after adding another XML parse
|
296
|
+
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
|
297
|
+
return formats[0]
|
298
|
+
else: # ambiguity in formats
|
299
|
+
return _DocumentConversionInput._guess_from_content(
|
300
|
+
content, mime, formats
|
301
|
+
)
|
302
|
+
else:
|
303
|
+
return None
|
304
|
+
|
305
|
+
@staticmethod
|
306
|
+
def _guess_from_content(
|
307
|
+
content: bytes, mime: str, formats: list[InputFormat]
|
308
|
+
) -> Optional[InputFormat]:
|
309
|
+
"""Guess the input format of a document by checking part of its content."""
|
310
|
+
input_format: Optional[InputFormat] = None
|
311
|
+
content_str = content.decode("utf-8")
|
312
|
+
|
313
|
+
if mime == "application/xml":
|
314
|
+
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
315
|
+
if match_doctype:
|
316
|
+
xml_doctype = match_doctype.group()
|
317
|
+
if InputFormat.XML_USPTO in formats and any(
|
318
|
+
item in xml_doctype
|
319
|
+
for item in (
|
320
|
+
"us-patent-application-v4",
|
321
|
+
"us-patent-grant-v4",
|
322
|
+
"us-grant-025",
|
323
|
+
"patent-application-publication",
|
324
|
+
)
|
325
|
+
):
|
326
|
+
input_format = InputFormat.XML_USPTO
|
327
|
+
|
328
|
+
elif mime == "text/plain":
|
329
|
+
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
330
|
+
input_format = InputFormat.XML_USPTO
|
281
331
|
|
282
|
-
|
283
|
-
return format
|
332
|
+
return input_format
|
284
333
|
|
285
|
-
|
334
|
+
@staticmethod
|
335
|
+
def _mime_from_extension(ext):
|
286
336
|
mime = None
|
287
337
|
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
288
338
|
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
@@ -293,7 +343,19 @@ class _DocumentConversionInput(BaseModel):
|
|
293
343
|
|
294
344
|
return mime
|
295
345
|
|
296
|
-
|
346
|
+
@staticmethod
|
347
|
+
def _detect_html_xhtml(
|
348
|
+
content: bytes,
|
349
|
+
) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
|
350
|
+
"""Guess the mime type of an XHTML, HTML, or XML file from its content.
|
351
|
+
|
352
|
+
Args:
|
353
|
+
content: A short piece of a document from its beginning.
|
354
|
+
|
355
|
+
Returns:
|
356
|
+
The mime type of an XHTML, HTML, or XML file, or None if the content does
|
357
|
+
not match any of these formats.
|
358
|
+
"""
|
297
359
|
content_str = content.decode("ascii", errors="ignore").lower()
|
298
360
|
# Remove XML comments
|
299
361
|
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
@@ -302,6 +364,8 @@ class _DocumentConversionInput(BaseModel):
|
|
302
364
|
if re.match(r"<\?xml", content_str):
|
303
365
|
if "xhtml" in content_str[:1000]:
|
304
366
|
return "application/xhtml+xml"
|
367
|
+
else:
|
368
|
+
return "application/xml"
|
305
369
|
|
306
370
|
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
307
371
|
return "text/html"
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -139,7 +139,10 @@ class EasyOcrOptions(OcrOptions):
|
|
139
139
|
|
140
140
|
use_gpu: Optional[bool] = None
|
141
141
|
|
142
|
+
confidence_threshold: float = 0.65
|
143
|
+
|
142
144
|
model_storage_directory: Optional[str] = None
|
145
|
+
recog_network: Optional[str] = "standard"
|
143
146
|
download_enabled: bool = True
|
144
147
|
|
145
148
|
model_config = ConfigDict(
|
docling/datamodel/settings.py
CHANGED
docling/document_converter.py
CHANGED
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
15
15
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
16
16
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
17
17
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
18
|
+
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
18
19
|
from docling.datamodel.base_models import (
|
19
20
|
ConversionStatus,
|
20
21
|
DoclingComponentType,
|
@@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
|
|
82
83
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
83
84
|
|
84
85
|
|
85
|
-
class
|
86
|
+
class PatentUsptoFormatOption(FormatOption):
|
87
|
+
pipeline_cls: Type = SimplePipeline
|
88
|
+
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
89
|
+
|
90
|
+
|
91
|
+
class ImageFormatOption(FormatOption):
|
86
92
|
pipeline_cls: Type = StandardPdfPipeline
|
87
93
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
88
94
|
|
89
95
|
|
90
|
-
class
|
96
|
+
class PdfFormatOption(FormatOption):
|
91
97
|
pipeline_cls: Type = StandardPdfPipeline
|
92
98
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
93
99
|
|
@@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
112
118
|
InputFormat.HTML: FormatOption(
|
113
119
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
114
120
|
),
|
121
|
+
InputFormat.XML_USPTO: FormatOption(
|
122
|
+
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
123
|
+
),
|
115
124
|
InputFormat.IMAGE: FormatOption(
|
116
125
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
117
126
|
),
|
docling/models/ds_glm_model.py
CHANGED
@@ -22,9 +22,15 @@ from docling_core.types.legacy_doc.document import (
|
|
22
22
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
23
23
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
24
24
|
from PIL import ImageDraw
|
25
|
-
from pydantic import BaseModel, ConfigDict
|
26
|
-
|
27
|
-
from docling.datamodel.base_models import
|
25
|
+
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
26
|
+
|
27
|
+
from docling.datamodel.base_models import (
|
28
|
+
Cluster,
|
29
|
+
ContainerElement,
|
30
|
+
FigureElement,
|
31
|
+
Table,
|
32
|
+
TextElement,
|
33
|
+
)
|
28
34
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
29
35
|
from docling.datamodel.settings import settings
|
30
36
|
from docling.utils.glm_utils import to_docling_document
|
@@ -204,7 +210,31 @@ class GlmModel:
|
|
204
210
|
)
|
205
211
|
],
|
206
212
|
obj_type=layout_label_to_ds_type.get(element.label),
|
207
|
-
|
213
|
+
payload={
|
214
|
+
"children": TypeAdapter(List[Cluster]).dump_python(
|
215
|
+
element.cluster.children
|
216
|
+
)
|
217
|
+
}, # hack to channel child clusters through GLM
|
218
|
+
)
|
219
|
+
)
|
220
|
+
elif isinstance(element, ContainerElement):
|
221
|
+
main_text.append(
|
222
|
+
BaseText(
|
223
|
+
text="",
|
224
|
+
payload={
|
225
|
+
"children": TypeAdapter(List[Cluster]).dump_python(
|
226
|
+
element.cluster.children
|
227
|
+
)
|
228
|
+
}, # hack to channel child clusters through GLM
|
229
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
230
|
+
name=element.label,
|
231
|
+
prov=[
|
232
|
+
Prov(
|
233
|
+
bbox=target_bbox,
|
234
|
+
page=element.page_no + 1,
|
235
|
+
span=[0, 0],
|
236
|
+
)
|
237
|
+
],
|
208
238
|
)
|
209
239
|
)
|
210
240
|
|
docling/models/easyocr_model.py
CHANGED
@@ -66,6 +66,7 @@ class EasyOcrModel(BaseOcrModel):
|
|
66
66
|
lang_list=self.options.lang,
|
67
67
|
gpu=use_gpu,
|
68
68
|
model_storage_directory=self.options.model_storage_directory,
|
69
|
+
recog_network=self.options.recog_network,
|
69
70
|
download_enabled=self.options.download_enabled,
|
70
71
|
verbose=False,
|
71
72
|
)
|
@@ -117,6 +118,7 @@ class EasyOcrModel(BaseOcrModel):
|
|
117
118
|
),
|
118
119
|
)
|
119
120
|
for ix, line in enumerate(result)
|
121
|
+
if line[2] >= self.options.confidence_threshold
|
120
122
|
]
|
121
123
|
all_ocr_cells.extend(cells)
|
122
124
|
|