docling 2.12.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- from enum import Enum, auto
1
+ from enum import Enum
2
2
  from typing import TYPE_CHECKING, Dict, List, Optional, Union
3
3
 
4
4
  from docling_core.types.doc import (
@@ -28,6 +28,8 @@ class ConversionStatus(str, Enum):
28
28
 
29
29
 
30
30
  class InputFormat(str, Enum):
31
+ """A document format supported by document backend parsers."""
32
+
31
33
  DOCX = "docx"
32
34
  PPTX = "pptx"
33
35
  HTML = "html"
@@ -36,6 +38,7 @@ class InputFormat(str, Enum):
36
38
  ASCIIDOC = "asciidoc"
37
39
  MD = "md"
38
40
  XLSX = "xlsx"
41
+ XML_USPTO = "xml_uspto"
39
42
 
40
43
 
41
44
  class OutputFormat(str, Enum):
@@ -55,6 +58,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
55
58
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
56
59
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
57
60
  InputFormat.XLSX: ["xlsx"],
61
+ InputFormat.XML_USPTO: ["xml", "txt"],
58
62
  }
59
63
 
60
64
  FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -81,10 +85,13 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
81
85
  InputFormat.XLSX: [
82
86
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
83
87
  ],
88
+ InputFormat.XML_USPTO: ["application/xml", "text/plain"],
84
89
  }
85
90
 
86
- MimeTypeToFormat = {
87
- mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
91
+ MimeTypeToFormat: dict[str, list[InputFormat]] = {
92
+ mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
93
+ for value in FormatToMimeType.values()
94
+ for mime in value
88
95
  }
89
96
 
90
97
 
@@ -122,6 +129,7 @@ class Cluster(BaseModel):
122
129
  bbox: BoundingBox
123
130
  confidence: float = 1.0
124
131
  cells: List[Cell] = []
132
+ children: List["Cluster"] = [] # Add child cluster support
125
133
 
126
134
 
127
135
  class BasePageElement(BaseModel):
@@ -136,6 +144,12 @@ class LayoutPrediction(BaseModel):
136
144
  clusters: List[Cluster] = []
137
145
 
138
146
 
147
+ class ContainerElement(
148
+ BasePageElement
149
+ ): # Used for Form and Key-Value-Regions, only for typing.
150
+ pass
151
+
152
+
139
153
  class Table(BasePageElement):
140
154
  otsl_seq: List[str]
141
155
  num_rows: int = 0
@@ -175,7 +189,7 @@ class PagePredictions(BaseModel):
175
189
  equations_prediction: Optional[EquationPrediction] = None
176
190
 
177
191
 
178
- PageElement = Union[TextElement, Table, FigureElement]
192
+ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
179
193
 
180
194
 
181
195
  class AssembledUnit(BaseModel):
@@ -3,7 +3,17 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
6
+ from typing import (
7
+ TYPE_CHECKING,
8
+ Dict,
9
+ Iterable,
10
+ List,
11
+ Literal,
12
+ Optional,
13
+ Set,
14
+ Type,
15
+ Union,
16
+ )
7
17
 
8
18
  import filetype
9
19
  from docling_core.types.doc import (
@@ -63,7 +73,7 @@ _log = logging.getLogger(__name__)
63
73
 
64
74
  layout_label_to_ds_type = {
65
75
  DocItemLabel.TITLE: "title",
66
- DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
76
+ DocItemLabel.DOCUMENT_INDEX: "table",
67
77
  DocItemLabel.SECTION_HEADER: "subtitle-level-1",
68
78
  DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
69
79
  DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
@@ -78,6 +88,8 @@ layout_label_to_ds_type = {
78
88
  DocItemLabel.PICTURE: "figure",
79
89
  DocItemLabel.TEXT: "paragraph",
80
90
  DocItemLabel.PARAGRAPH: "paragraph",
91
+ DocItemLabel.FORM: DocItemLabel.FORM.value,
92
+ DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
81
93
  }
82
94
 
83
95
  _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
@@ -235,7 +247,7 @@ class _DocumentConversionInput(BaseModel):
235
247
  if isinstance(obj, Path):
236
248
  yield InputDocument(
237
249
  path_or_stream=obj,
238
- format=format,
250
+ format=format, # type: ignore[arg-type]
239
251
  filename=obj.name,
240
252
  limits=self.limits,
241
253
  backend=backend,
@@ -243,7 +255,7 @@ class _DocumentConversionInput(BaseModel):
243
255
  elif isinstance(obj, DocumentStream):
244
256
  yield InputDocument(
245
257
  path_or_stream=obj.stream,
246
- format=format,
258
+ format=format, # type: ignore[arg-type]
247
259
  filename=obj.name,
248
260
  limits=self.limits,
249
261
  backend=backend,
@@ -251,15 +263,15 @@ class _DocumentConversionInput(BaseModel):
251
263
  else:
252
264
  raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
253
265
 
254
- def _guess_format(self, obj: Union[Path, DocumentStream]):
266
+ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
255
267
  content = b"" # empty binary blob
256
- format = None
268
+ formats: list[InputFormat] = []
257
269
 
258
270
  if isinstance(obj, Path):
259
271
  mime = filetype.guess_mime(str(obj))
260
272
  if mime is None:
261
273
  ext = obj.suffix[1:]
262
- mime = self._mime_from_extension(ext)
274
+ mime = _DocumentConversionInput._mime_from_extension(ext)
263
275
  if mime is None: # must guess from
264
276
  with obj.open("rb") as f:
265
277
  content = f.read(1024) # Read first 1KB
@@ -274,15 +286,53 @@ class _DocumentConversionInput(BaseModel):
274
286
  if ("." in obj.name and not obj.name.startswith("."))
275
287
  else ""
276
288
  )
277
- mime = self._mime_from_extension(ext)
289
+ mime = _DocumentConversionInput._mime_from_extension(ext)
278
290
 
279
- mime = mime or self._detect_html_xhtml(content)
291
+ mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
280
292
  mime = mime or "text/plain"
293
+ formats = MimeTypeToFormat.get(mime, [])
294
+ if formats:
295
+ # TODO: remove application/xml case after adding another XML parse
296
+ if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
297
+ return formats[0]
298
+ else: # ambiguity in formats
299
+ return _DocumentConversionInput._guess_from_content(
300
+ content, mime, formats
301
+ )
302
+ else:
303
+ return None
304
+
305
+ @staticmethod
306
+ def _guess_from_content(
307
+ content: bytes, mime: str, formats: list[InputFormat]
308
+ ) -> Optional[InputFormat]:
309
+ """Guess the input format of a document by checking part of its content."""
310
+ input_format: Optional[InputFormat] = None
311
+ content_str = content.decode("utf-8")
312
+
313
+ if mime == "application/xml":
314
+ match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
315
+ if match_doctype:
316
+ xml_doctype = match_doctype.group()
317
+ if InputFormat.XML_USPTO in formats and any(
318
+ item in xml_doctype
319
+ for item in (
320
+ "us-patent-application-v4",
321
+ "us-patent-grant-v4",
322
+ "us-grant-025",
323
+ "patent-application-publication",
324
+ )
325
+ ):
326
+ input_format = InputFormat.XML_USPTO
327
+
328
+ elif mime == "text/plain":
329
+ if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
330
+ input_format = InputFormat.XML_USPTO
281
331
 
282
- format = MimeTypeToFormat.get(mime)
283
- return format
332
+ return input_format
284
333
 
285
- def _mime_from_extension(self, ext):
334
+ @staticmethod
335
+ def _mime_from_extension(ext):
286
336
  mime = None
287
337
  if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
288
338
  mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
@@ -293,7 +343,19 @@ class _DocumentConversionInput(BaseModel):
293
343
 
294
344
  return mime
295
345
 
296
- def _detect_html_xhtml(self, content):
346
+ @staticmethod
347
+ def _detect_html_xhtml(
348
+ content: bytes,
349
+ ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
350
+ """Guess the mime type of an XHTML, HTML, or XML file from its content.
351
+
352
+ Args:
353
+ content: A short piece of a document from its beginning.
354
+
355
+ Returns:
356
+ The mime type of an XHTML, HTML, or XML file, or None if the content does
357
+ not match any of these formats.
358
+ """
297
359
  content_str = content.decode("ascii", errors="ignore").lower()
298
360
  # Remove XML comments
299
361
  content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
@@ -302,6 +364,8 @@ class _DocumentConversionInput(BaseModel):
302
364
  if re.match(r"<\?xml", content_str):
303
365
  if "xhtml" in content_str[:1000]:
304
366
  return "application/xhtml+xml"
367
+ else:
368
+ return "application/xml"
305
369
 
306
370
  if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
307
371
  return "text/html"
@@ -139,7 +139,10 @@ class EasyOcrOptions(OcrOptions):
139
139
 
140
140
  use_gpu: Optional[bool] = None
141
141
 
142
+ confidence_threshold: float = 0.65
143
+
142
144
  model_storage_directory: Optional[str] = None
145
+ recog_network: Optional[str] = "standard"
143
146
  download_enabled: bool = True
144
147
 
145
148
  model_config = ConfigDict(
@@ -31,6 +31,7 @@ class DebugSettings(BaseModel):
31
31
  visualize_cells: bool = False
32
32
  visualize_ocr: bool = False
33
33
  visualize_layout: bool = False
34
+ visualize_raw_layout: bool = False
34
35
  visualize_tables: bool = False
35
36
 
36
37
  profile_pipeline_timings: bool = False
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
17
17
  from docling.backend.msword_backend import MsWordDocumentBackend
18
+ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
18
19
  from docling.datamodel.base_models import (
19
20
  ConversionStatus,
20
21
  DoclingComponentType,
@@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
82
83
  backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
83
84
 
84
85
 
85
- class PdfFormatOption(FormatOption):
86
+ class PatentUsptoFormatOption(FormatOption):
87
+ pipeline_cls: Type = SimplePipeline
88
+ backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
89
+
90
+
91
+ class ImageFormatOption(FormatOption):
86
92
  pipeline_cls: Type = StandardPdfPipeline
87
93
  backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
88
94
 
89
95
 
90
- class ImageFormatOption(FormatOption):
96
+ class PdfFormatOption(FormatOption):
91
97
  pipeline_cls: Type = StandardPdfPipeline
92
98
  backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
93
99
 
@@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
112
118
  InputFormat.HTML: FormatOption(
113
119
  pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
114
120
  ),
121
+ InputFormat.XML_USPTO: FormatOption(
122
+ pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
123
+ ),
115
124
  InputFormat.IMAGE: FormatOption(
116
125
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
117
126
  ),
@@ -22,9 +22,15 @@ from docling_core.types.legacy_doc.document import (
22
22
  from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
23
23
  from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
24
24
  from PIL import ImageDraw
25
- from pydantic import BaseModel, ConfigDict
26
-
27
- from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
25
+ from pydantic import BaseModel, ConfigDict, TypeAdapter
26
+
27
+ from docling.datamodel.base_models import (
28
+ Cluster,
29
+ ContainerElement,
30
+ FigureElement,
31
+ Table,
32
+ TextElement,
33
+ )
28
34
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
29
35
  from docling.datamodel.settings import settings
30
36
  from docling.utils.glm_utils import to_docling_document
@@ -204,7 +210,31 @@ class GlmModel:
204
210
  )
205
211
  ],
206
212
  obj_type=layout_label_to_ds_type.get(element.label),
207
- # data=[[]],
213
+ payload={
214
+ "children": TypeAdapter(List[Cluster]).dump_python(
215
+ element.cluster.children
216
+ )
217
+ }, # hack to channel child clusters through GLM
218
+ )
219
+ )
220
+ elif isinstance(element, ContainerElement):
221
+ main_text.append(
222
+ BaseText(
223
+ text="",
224
+ payload={
225
+ "children": TypeAdapter(List[Cluster]).dump_python(
226
+ element.cluster.children
227
+ )
228
+ }, # hack to channel child clusters through GLM
229
+ obj_type=layout_label_to_ds_type.get(element.label),
230
+ name=element.label,
231
+ prov=[
232
+ Prov(
233
+ bbox=target_bbox,
234
+ page=element.page_no + 1,
235
+ span=[0, 0],
236
+ )
237
+ ],
208
238
  )
209
239
  )
210
240
 
@@ -66,6 +66,7 @@ class EasyOcrModel(BaseOcrModel):
66
66
  lang_list=self.options.lang,
67
67
  gpu=use_gpu,
68
68
  model_storage_directory=self.options.model_storage_directory,
69
+ recog_network=self.options.recog_network,
69
70
  download_enabled=self.options.download_enabled,
70
71
  verbose=False,
71
72
  )
@@ -117,6 +118,7 @@ class EasyOcrModel(BaseOcrModel):
117
118
  ),
118
119
  )
119
120
  for ix, line in enumerate(result)
121
+ if line[2] >= self.options.confidence_threshold
120
122
  ]
121
123
  all_ocr_cells.extend(cells)
122
124