docling 2.12.0__py3-none-any.whl → 2.14.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- from enum import Enum, auto
1
+ from enum import Enum
2
2
  from typing import TYPE_CHECKING, Dict, List, Optional, Union
3
3
 
4
4
  from docling_core.types.doc import (
@@ -28,14 +28,18 @@ class ConversionStatus(str, Enum):
28
28
 
29
29
 
30
30
  class InputFormat(str, Enum):
31
+ """A document format supported by document backend parsers."""
32
+
31
33
  DOCX = "docx"
32
34
  PPTX = "pptx"
33
35
  HTML = "html"
36
+ XML_PUBMED = "xml_pubmed"
34
37
  IMAGE = "image"
35
38
  PDF = "pdf"
36
39
  ASCIIDOC = "asciidoc"
37
40
  MD = "md"
38
41
  XLSX = "xlsx"
42
+ XML_USPTO = "xml_uspto"
39
43
 
40
44
 
41
45
  class OutputFormat(str, Enum):
@@ -52,9 +56,11 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
52
56
  InputFormat.PDF: ["pdf"],
53
57
  InputFormat.MD: ["md"],
54
58
  InputFormat.HTML: ["html", "htm", "xhtml"],
59
+ InputFormat.XML_PUBMED: ["xml", "nxml"],
55
60
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
56
61
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
57
62
  InputFormat.XLSX: ["xlsx"],
63
+ InputFormat.XML_USPTO: ["xml", "txt"],
58
64
  }
59
65
 
60
66
  FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -68,6 +74,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
68
74
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
69
75
  ],
70
76
  InputFormat.HTML: ["text/html", "application/xhtml+xml"],
77
+ InputFormat.XML_PUBMED: ["application/xml"],
71
78
  InputFormat.IMAGE: [
72
79
  "image/png",
73
80
  "image/jpeg",
@@ -81,10 +88,13 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
81
88
  InputFormat.XLSX: [
82
89
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
83
90
  ],
91
+ InputFormat.XML_USPTO: ["application/xml", "text/plain"],
84
92
  }
85
93
 
86
- MimeTypeToFormat = {
87
- mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
94
+ MimeTypeToFormat: dict[str, list[InputFormat]] = {
95
+ mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
96
+ for value in FormatToMimeType.values()
97
+ for mime in value
88
98
  }
89
99
 
90
100
 
@@ -122,6 +132,7 @@ class Cluster(BaseModel):
122
132
  bbox: BoundingBox
123
133
  confidence: float = 1.0
124
134
  cells: List[Cell] = []
135
+ children: List["Cluster"] = [] # Add child cluster support
125
136
 
126
137
 
127
138
  class BasePageElement(BaseModel):
@@ -136,6 +147,12 @@ class LayoutPrediction(BaseModel):
136
147
  clusters: List[Cluster] = []
137
148
 
138
149
 
150
+ class ContainerElement(
151
+ BasePageElement
152
+ ): # Used for Form and Key-Value-Regions, only for typing.
153
+ pass
154
+
155
+
139
156
  class Table(BasePageElement):
140
157
  otsl_seq: List[str]
141
158
  num_rows: int = 0
@@ -175,7 +192,7 @@ class PagePredictions(BaseModel):
175
192
  equations_prediction: Optional[EquationPrediction] = None
176
193
 
177
194
 
178
- PageElement = Union[TextElement, Table, FigureElement]
195
+ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
179
196
 
180
197
 
181
198
  class AssembledUnit(BaseModel):
@@ -3,7 +3,17 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
6
+ from typing import (
7
+ TYPE_CHECKING,
8
+ Dict,
9
+ Iterable,
10
+ List,
11
+ Literal,
12
+ Optional,
13
+ Set,
14
+ Type,
15
+ Union,
16
+ )
7
17
 
8
18
  import filetype
9
19
  from docling_core.types.doc import (
@@ -63,7 +73,7 @@ _log = logging.getLogger(__name__)
63
73
 
64
74
  layout_label_to_ds_type = {
65
75
  DocItemLabel.TITLE: "title",
66
- DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
76
+ DocItemLabel.DOCUMENT_INDEX: "table",
67
77
  DocItemLabel.SECTION_HEADER: "subtitle-level-1",
68
78
  DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
69
79
  DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
@@ -78,6 +88,8 @@ layout_label_to_ds_type = {
78
88
  DocItemLabel.PICTURE: "figure",
79
89
  DocItemLabel.TEXT: "paragraph",
80
90
  DocItemLabel.PARAGRAPH: "paragraph",
91
+ DocItemLabel.FORM: DocItemLabel.FORM.value,
92
+ DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
81
93
  }
82
94
 
83
95
  _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
@@ -235,7 +247,7 @@ class _DocumentConversionInput(BaseModel):
235
247
  if isinstance(obj, Path):
236
248
  yield InputDocument(
237
249
  path_or_stream=obj,
238
- format=format,
250
+ format=format, # type: ignore[arg-type]
239
251
  filename=obj.name,
240
252
  limits=self.limits,
241
253
  backend=backend,
@@ -243,7 +255,7 @@ class _DocumentConversionInput(BaseModel):
243
255
  elif isinstance(obj, DocumentStream):
244
256
  yield InputDocument(
245
257
  path_or_stream=obj.stream,
246
- format=format,
258
+ format=format, # type: ignore[arg-type]
247
259
  filename=obj.name,
248
260
  limits=self.limits,
249
261
  backend=backend,
@@ -251,15 +263,15 @@ class _DocumentConversionInput(BaseModel):
251
263
  else:
252
264
  raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
253
265
 
254
- def _guess_format(self, obj: Union[Path, DocumentStream]):
266
+ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
255
267
  content = b"" # empty binary blob
256
- format = None
268
+ formats: list[InputFormat] = []
257
269
 
258
270
  if isinstance(obj, Path):
259
271
  mime = filetype.guess_mime(str(obj))
260
272
  if mime is None:
261
273
  ext = obj.suffix[1:]
262
- mime = self._mime_from_extension(ext)
274
+ mime = _DocumentConversionInput._mime_from_extension(ext)
263
275
  if mime is None: # must guess from
264
276
  with obj.open("rb") as f:
265
277
  content = f.read(1024) # Read first 1KB
@@ -274,15 +286,58 @@ class _DocumentConversionInput(BaseModel):
274
286
  if ("." in obj.name and not obj.name.startswith("."))
275
287
  else ""
276
288
  )
277
- mime = self._mime_from_extension(ext)
289
+ mime = _DocumentConversionInput._mime_from_extension(ext)
278
290
 
279
- mime = mime or self._detect_html_xhtml(content)
291
+ mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
280
292
  mime = mime or "text/plain"
293
+ formats = MimeTypeToFormat.get(mime, [])
294
+ if formats:
295
+ if len(formats) == 1 and mime not in ("text/plain"):
296
+ return formats[0]
297
+ else: # ambiguity in formats
298
+ return _DocumentConversionInput._guess_from_content(
299
+ content, mime, formats
300
+ )
301
+ else:
302
+ return None
303
+
304
+ @staticmethod
305
+ def _guess_from_content(
306
+ content: bytes, mime: str, formats: list[InputFormat]
307
+ ) -> Optional[InputFormat]:
308
+ """Guess the input format of a document by checking part of its content."""
309
+ input_format: Optional[InputFormat] = None
310
+ content_str = content.decode("utf-8")
311
+
312
+ if mime == "application/xml":
313
+ match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
314
+ if match_doctype:
315
+ xml_doctype = match_doctype.group()
316
+ if InputFormat.XML_USPTO in formats and any(
317
+ item in xml_doctype
318
+ for item in (
319
+ "us-patent-application-v4",
320
+ "us-patent-grant-v4",
321
+ "us-grant-025",
322
+ "patent-application-publication",
323
+ )
324
+ ):
325
+ input_format = InputFormat.XML_USPTO
326
+
327
+ if (
328
+ InputFormat.XML_PUBMED in formats
329
+ and "/NLM//DTD JATS" in xml_doctype
330
+ ):
331
+ input_format = InputFormat.XML_PUBMED
332
+
333
+ elif mime == "text/plain":
334
+ if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
335
+ input_format = InputFormat.XML_USPTO
281
336
 
282
- format = MimeTypeToFormat.get(mime)
283
- return format
337
+ return input_format
284
338
 
285
- def _mime_from_extension(self, ext):
339
+ @staticmethod
340
+ def _mime_from_extension(ext):
286
341
  mime = None
287
342
  if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
288
343
  mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
@@ -290,10 +345,21 @@ class _DocumentConversionInput(BaseModel):
290
345
  mime = FormatToMimeType[InputFormat.HTML][0]
291
346
  elif ext in FormatToExtensions[InputFormat.MD]:
292
347
  mime = FormatToMimeType[InputFormat.MD][0]
293
-
294
348
  return mime
295
349
 
296
- def _detect_html_xhtml(self, content):
350
+ @staticmethod
351
+ def _detect_html_xhtml(
352
+ content: bytes,
353
+ ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
354
+ """Guess the mime type of an XHTML, HTML, or XML file from its content.
355
+
356
+ Args:
357
+ content: A short piece of a document from its beginning.
358
+
359
+ Returns:
360
+ The mime type of an XHTML, HTML, or XML file, or None if the content does
361
+ not match any of these formats.
362
+ """
297
363
  content_str = content.decode("ascii", errors="ignore").lower()
298
364
  # Remove XML comments
299
365
  content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
@@ -302,8 +368,16 @@ class _DocumentConversionInput(BaseModel):
302
368
  if re.match(r"<\?xml", content_str):
303
369
  if "xhtml" in content_str[:1000]:
304
370
  return "application/xhtml+xml"
371
+ else:
372
+ return "application/xml"
305
373
 
306
374
  if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
307
375
  return "text/html"
308
376
 
377
+ p = re.compile(
378
+ r"<!doctype\s+(?P<root>[a-zA-Z_:][a-zA-Z0-9_:.-]*)\s+.*>\s*<(?P=root)\b"
379
+ )
380
+ if p.search(content_str):
381
+ return "application/xml"
382
+
309
383
  return None
@@ -139,7 +139,10 @@ class EasyOcrOptions(OcrOptions):
139
139
 
140
140
  use_gpu: Optional[bool] = None
141
141
 
142
+ confidence_threshold: float = 0.65
143
+
142
144
  model_storage_directory: Optional[str] = None
145
+ recog_network: Optional[str] = "standard"
143
146
  download_enabled: bool = True
144
147
 
145
148
  model_config = ConfigDict(
@@ -31,6 +31,7 @@ class DebugSettings(BaseModel):
31
31
  visualize_cells: bool = False
32
32
  visualize_ocr: bool = False
33
33
  visualize_layout: bool = False
34
+ visualize_raw_layout: bool = False
34
35
  visualize_tables: bool = False
35
36
 
36
37
  profile_pipeline_timings: bool = False
@@ -15,6 +15,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
17
17
  from docling.backend.msword_backend import MsWordDocumentBackend
18
+ from docling.backend.xml.pubmed_backend import PubMedDocumentBackend
19
+ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
18
20
  from docling.datamodel.base_models import (
19
21
  ConversionStatus,
20
22
  DoclingComponentType,
@@ -82,12 +84,22 @@ class HTMLFormatOption(FormatOption):
82
84
  backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
83
85
 
84
86
 
85
- class PdfFormatOption(FormatOption):
87
+ class PatentUsptoFormatOption(FormatOption):
88
+ pipeline_cls: Type = SimplePipeline
89
+ backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
90
+
91
+
92
+ class XMLPubMedFormatOption(FormatOption):
93
+ pipeline_cls: Type = SimplePipeline
94
+ backend: Type[AbstractDocumentBackend] = PubMedDocumentBackend
95
+
96
+
97
+ class ImageFormatOption(FormatOption):
86
98
  pipeline_cls: Type = StandardPdfPipeline
87
99
  backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
88
100
 
89
101
 
90
- class ImageFormatOption(FormatOption):
102
+ class PdfFormatOption(FormatOption):
91
103
  pipeline_cls: Type = StandardPdfPipeline
92
104
  backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
93
105
 
@@ -112,6 +124,12 @@ def _get_default_option(format: InputFormat) -> FormatOption:
112
124
  InputFormat.HTML: FormatOption(
113
125
  pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
114
126
  ),
127
+ InputFormat.XML_USPTO: FormatOption(
128
+ pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
129
+ ),
130
+ InputFormat.XML_PUBMED: FormatOption(
131
+ pipeline_cls=SimplePipeline, backend=PubMedDocumentBackend
132
+ ),
115
133
  InputFormat.IMAGE: FormatOption(
116
134
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
117
135
  ),
@@ -162,7 +180,6 @@ class DocumentConverter:
162
180
  max_num_pages: int = sys.maxsize,
163
181
  max_file_size: int = sys.maxsize,
164
182
  ) -> ConversionResult:
165
-
166
183
  all_res = self.convert_all(
167
184
  source=[source],
168
185
  raises_on_error=raises_on_error,
@@ -22,9 +22,15 @@ from docling_core.types.legacy_doc.document import (
22
22
  from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
23
23
  from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
24
24
  from PIL import ImageDraw
25
- from pydantic import BaseModel, ConfigDict
26
-
27
- from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
25
+ from pydantic import BaseModel, ConfigDict, TypeAdapter
26
+
27
+ from docling.datamodel.base_models import (
28
+ Cluster,
29
+ ContainerElement,
30
+ FigureElement,
31
+ Table,
32
+ TextElement,
33
+ )
28
34
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
29
35
  from docling.datamodel.settings import settings
30
36
  from docling.utils.glm_utils import to_docling_document
@@ -204,7 +210,31 @@ class GlmModel:
204
210
  )
205
211
  ],
206
212
  obj_type=layout_label_to_ds_type.get(element.label),
207
- # data=[[]],
213
+ payload={
214
+ "children": TypeAdapter(List[Cluster]).dump_python(
215
+ element.cluster.children
216
+ )
217
+ }, # hack to channel child clusters through GLM
218
+ )
219
+ )
220
+ elif isinstance(element, ContainerElement):
221
+ main_text.append(
222
+ BaseText(
223
+ text="",
224
+ payload={
225
+ "children": TypeAdapter(List[Cluster]).dump_python(
226
+ element.cluster.children
227
+ )
228
+ }, # hack to channel child clusters through GLM
229
+ obj_type=layout_label_to_ds_type.get(element.label),
230
+ name=element.label,
231
+ prov=[
232
+ Prov(
233
+ bbox=target_bbox,
234
+ page=element.page_no + 1,
235
+ span=[0, 0],
236
+ )
237
+ ],
208
238
  )
209
239
  )
210
240
 
@@ -66,6 +66,7 @@ class EasyOcrModel(BaseOcrModel):
66
66
  lang_list=self.options.lang,
67
67
  gpu=use_gpu,
68
68
  model_storage_directory=self.options.model_storage_directory,
69
+ recog_network=self.options.recog_network,
69
70
  download_enabled=self.options.download_enabled,
70
71
  verbose=False,
71
72
  )
@@ -117,6 +118,7 @@ class EasyOcrModel(BaseOcrModel):
117
118
  ),
118
119
  )
119
120
  for ix, line in enumerate(result)
121
+ if line[2] >= self.options.confidence_threshold
120
122
  ]
121
123
  all_ocr_cells.extend(cells)
122
124