docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
@@ -26,6 +26,8 @@ from docling.datamodel.base_models import (
26
26
  )
27
27
  from docling.datamodel.document import ConversionResult
28
28
  from docling.datamodel.pipeline_options import (
29
+ AcceleratorDevice,
30
+ AcceleratorOptions,
29
31
  EasyOcrOptions,
30
32
  OcrEngine,
31
33
  OcrMacOptions,
@@ -257,6 +259,10 @@ def convert(
257
259
  help="The timeout for processing each document, in seconds.",
258
260
  ),
259
261
  ] = None,
262
+ num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
263
+ device: Annotated[
264
+ AcceleratorDevice, typer.Option(..., help="Accelerator device")
265
+ ] = AcceleratorDevice.AUTO,
260
266
  ):
261
267
  if verbose == 0:
262
268
  logging.basicConfig(level=logging.WARNING)
@@ -336,7 +342,9 @@ def convert(
336
342
  if ocr_lang_list is not None:
337
343
  ocr_options.lang = ocr_lang_list
338
344
 
345
+ accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
339
346
  pipeline_options = PdfPipelineOptions(
347
+ accelerator_options=accelerator_options,
340
348
  do_ocr=ocr,
341
349
  ocr_options=ocr_options,
342
350
  do_table_structure=True,
@@ -1,4 +1,4 @@
1
- from enum import Enum, auto
1
+ from enum import Enum
2
2
  from typing import TYPE_CHECKING, Dict, List, Optional, Union
3
3
 
4
4
  from docling_core.types.doc import (
@@ -28,6 +28,8 @@ class ConversionStatus(str, Enum):
28
28
 
29
29
 
30
30
  class InputFormat(str, Enum):
31
+ """A document format supported by document backend parsers."""
32
+
31
33
  DOCX = "docx"
32
34
  PPTX = "pptx"
33
35
  HTML = "html"
@@ -36,6 +38,7 @@ class InputFormat(str, Enum):
36
38
  ASCIIDOC = "asciidoc"
37
39
  MD = "md"
38
40
  XLSX = "xlsx"
41
+ XML_USPTO = "xml_uspto"
39
42
 
40
43
 
41
44
  class OutputFormat(str, Enum):
@@ -55,6 +58,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
55
58
  InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
56
59
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
57
60
  InputFormat.XLSX: ["xlsx"],
61
+ InputFormat.XML_USPTO: ["xml", "txt"],
58
62
  }
59
63
 
60
64
  FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -81,10 +85,13 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
81
85
  InputFormat.XLSX: [
82
86
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
83
87
  ],
88
+ InputFormat.XML_USPTO: ["application/xml", "text/plain"],
84
89
  }
85
90
 
86
- MimeTypeToFormat = {
87
- mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
91
+ MimeTypeToFormat: dict[str, list[InputFormat]] = {
92
+ mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
93
+ for value in FormatToMimeType.values()
94
+ for mime in value
88
95
  }
89
96
 
90
97
 
@@ -122,6 +129,7 @@ class Cluster(BaseModel):
122
129
  bbox: BoundingBox
123
130
  confidence: float = 1.0
124
131
  cells: List[Cell] = []
132
+ children: List["Cluster"] = [] # Add child cluster support
125
133
 
126
134
 
127
135
  class BasePageElement(BaseModel):
@@ -136,6 +144,12 @@ class LayoutPrediction(BaseModel):
136
144
  clusters: List[Cluster] = []
137
145
 
138
146
 
147
+ class ContainerElement(
148
+ BasePageElement
149
+ ): # Used for Form and Key-Value-Regions, only for typing.
150
+ pass
151
+
152
+
139
153
  class Table(BasePageElement):
140
154
  otsl_seq: List[str]
141
155
  num_rows: int = 0
@@ -175,7 +189,7 @@ class PagePredictions(BaseModel):
175
189
  equations_prediction: Optional[EquationPrediction] = None
176
190
 
177
191
 
178
- PageElement = Union[TextElement, Table, FigureElement]
192
+ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
179
193
 
180
194
 
181
195
  class AssembledUnit(BaseModel):
@@ -3,7 +3,17 @@ import re
3
3
  from enum import Enum
4
4
  from io import BytesIO
5
5
  from pathlib import Path, PurePath
6
- from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
6
+ from typing import (
7
+ TYPE_CHECKING,
8
+ Dict,
9
+ Iterable,
10
+ List,
11
+ Literal,
12
+ Optional,
13
+ Set,
14
+ Type,
15
+ Union,
16
+ )
7
17
 
8
18
  import filetype
9
19
  from docling_core.types.doc import (
@@ -63,7 +73,7 @@ _log = logging.getLogger(__name__)
63
73
 
64
74
  layout_label_to_ds_type = {
65
75
  DocItemLabel.TITLE: "title",
66
- DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
76
+ DocItemLabel.DOCUMENT_INDEX: "table",
67
77
  DocItemLabel.SECTION_HEADER: "subtitle-level-1",
68
78
  DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
69
79
  DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
@@ -78,6 +88,8 @@ layout_label_to_ds_type = {
78
88
  DocItemLabel.PICTURE: "figure",
79
89
  DocItemLabel.TEXT: "paragraph",
80
90
  DocItemLabel.PARAGRAPH: "paragraph",
91
+ DocItemLabel.FORM: DocItemLabel.FORM.value,
92
+ DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
81
93
  }
82
94
 
83
95
  _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
@@ -235,7 +247,7 @@ class _DocumentConversionInput(BaseModel):
235
247
  if isinstance(obj, Path):
236
248
  yield InputDocument(
237
249
  path_or_stream=obj,
238
- format=format,
250
+ format=format, # type: ignore[arg-type]
239
251
  filename=obj.name,
240
252
  limits=self.limits,
241
253
  backend=backend,
@@ -243,7 +255,7 @@ class _DocumentConversionInput(BaseModel):
243
255
  elif isinstance(obj, DocumentStream):
244
256
  yield InputDocument(
245
257
  path_or_stream=obj.stream,
246
- format=format,
258
+ format=format, # type: ignore[arg-type]
247
259
  filename=obj.name,
248
260
  limits=self.limits,
249
261
  backend=backend,
@@ -251,15 +263,15 @@ class _DocumentConversionInput(BaseModel):
251
263
  else:
252
264
  raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
253
265
 
254
- def _guess_format(self, obj: Union[Path, DocumentStream]):
266
+ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
255
267
  content = b"" # empty binary blob
256
- format = None
268
+ formats: list[InputFormat] = []
257
269
 
258
270
  if isinstance(obj, Path):
259
271
  mime = filetype.guess_mime(str(obj))
260
272
  if mime is None:
261
273
  ext = obj.suffix[1:]
262
- mime = self._mime_from_extension(ext)
274
+ mime = _DocumentConversionInput._mime_from_extension(ext)
263
275
  if mime is None: # must guess from the file content
264
276
  with obj.open("rb") as f:
265
277
  content = f.read(1024) # Read first 1KB
@@ -274,15 +286,53 @@ class _DocumentConversionInput(BaseModel):
274
286
  if ("." in obj.name and not obj.name.startswith("."))
275
287
  else ""
276
288
  )
277
- mime = self._mime_from_extension(ext)
289
+ mime = _DocumentConversionInput._mime_from_extension(ext)
278
290
 
279
- mime = mime or self._detect_html_xhtml(content)
291
+ mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
280
292
  mime = mime or "text/plain"
293
+ formats = MimeTypeToFormat.get(mime, [])
294
+ if formats:
295
+ # TODO: remove application/xml case after adding another XML parser
296
+ if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
297
+ return formats[0]
298
+ else: # ambiguity in formats
299
+ return _DocumentConversionInput._guess_from_content(
300
+ content, mime, formats
301
+ )
302
+ else:
303
+ return None
304
+
305
+ @staticmethod
306
+ def _guess_from_content(
307
+ content: bytes, mime: str, formats: list[InputFormat]
308
+ ) -> Optional[InputFormat]:
309
+ """Guess the input format of a document by checking part of its content."""
310
+ input_format: Optional[InputFormat] = None
311
+ content_str = content.decode("utf-8")
312
+
313
+ if mime == "application/xml":
314
+ match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
315
+ if match_doctype:
316
+ xml_doctype = match_doctype.group()
317
+ if InputFormat.XML_USPTO in formats and any(
318
+ item in xml_doctype
319
+ for item in (
320
+ "us-patent-application-v4",
321
+ "us-patent-grant-v4",
322
+ "us-grant-025",
323
+ "patent-application-publication",
324
+ )
325
+ ):
326
+ input_format = InputFormat.XML_USPTO
327
+
328
+ elif mime == "text/plain":
329
+ if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
330
+ input_format = InputFormat.XML_USPTO
281
331
 
282
- format = MimeTypeToFormat.get(mime)
283
- return format
332
+ return input_format
284
333
 
285
- def _mime_from_extension(self, ext):
334
+ @staticmethod
335
+ def _mime_from_extension(ext):
286
336
  mime = None
287
337
  if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
288
338
  mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
@@ -293,7 +343,19 @@ class _DocumentConversionInput(BaseModel):
293
343
 
294
344
  return mime
295
345
 
296
- def _detect_html_xhtml(self, content):
346
+ @staticmethod
347
+ def _detect_html_xhtml(
348
+ content: bytes,
349
+ ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
350
+ """Guess the mime type of an XHTML, HTML, or XML file from its content.
351
+
352
+ Args:
353
+ content: A short piece of a document from its beginning.
354
+
355
+ Returns:
356
+ The mime type of an XHTML, HTML, or XML file, or None if the content does
357
+ not match any of these formats.
358
+ """
297
359
  content_str = content.decode("ascii", errors="ignore").lower()
298
360
  # Remove XML comments
299
361
  content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
@@ -302,6 +364,8 @@ class _DocumentConversionInput(BaseModel):
302
364
  if re.match(r"<\?xml", content_str):
303
365
  if "xhtml" in content_str[:1000]:
304
366
  return "application/xhtml+xml"
367
+ else:
368
+ return "application/xml"
305
369
 
306
370
  if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
307
371
  return "text/html"
@@ -1,8 +1,66 @@
1
+ import logging
2
+ import os
3
+ import warnings
1
4
  from enum import Enum
2
5
  from pathlib import Path
3
- from typing import List, Literal, Optional, Union
6
+ from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
4
7
 
5
- from pydantic import BaseModel, ConfigDict, Field
8
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
9
+ from pydantic_settings import (
10
+ BaseSettings,
11
+ PydanticBaseSettingsSource,
12
+ SettingsConfigDict,
13
+ )
14
+ from typing_extensions import deprecated
15
+
16
+ _log = logging.getLogger(__name__)
17
+
18
+
19
+ class AcceleratorDevice(str, Enum):
20
+ """Devices to run model inference"""
21
+
22
+ AUTO = "auto"
23
+ CPU = "cpu"
24
+ CUDA = "cuda"
25
+ MPS = "mps"
26
+
27
+
28
+ class AcceleratorOptions(BaseSettings):
29
+ model_config = SettingsConfigDict(
30
+ env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
31
+ )
32
+
33
+ num_threads: int = 4
34
+ device: AcceleratorDevice = AcceleratorDevice.AUTO
35
+
36
+ @model_validator(mode="before")
37
+ @classmethod
38
+ def check_alternative_envvars(cls, data: Any) -> Any:
39
+ r"""
40
+ Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
41
+ The alternative envvar is used only if it is valid and the regular envvar is not set.
42
+
43
+ Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
44
+ the same functionality. In case the alias envvar is set and the user tries to override the
45
+ parameter in settings initialization, Pydantic treats the parameter provided in __init__()
46
+ as an extra input instead of simply overwriting the envvar value for that parameter.
47
+ """
48
+ if isinstance(data, dict):
49
+ input_num_threads = data.get("num_threads")
50
+
51
+ # Check whether to set num_threads from the alternative envvar
52
+ if input_num_threads is None:
53
+ docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
54
+ omp_num_threads = os.getenv("OMP_NUM_THREADS")
55
+ if docling_num_threads is None and omp_num_threads is not None:
56
+ try:
57
+ data["num_threads"] = int(omp_num_threads)
58
+ except ValueError:
59
+ _log.error(
60
+ "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
61
+ omp_num_threads,
62
+ )
63
+ return data
6
64
 
7
65
 
8
66
  class TableFormerMode(str, Enum):
@@ -78,9 +136,14 @@ class EasyOcrOptions(OcrOptions):
78
136
 
79
137
  kind: Literal["easyocr"] = "easyocr"
80
138
  lang: List[str] = ["fr", "de", "es", "en"]
81
- use_gpu: bool = True # same default as easyocr.Reader
139
+
140
+ use_gpu: Optional[bool] = None
141
+
142
+ confidence_threshold: float = 0.65
143
+
82
144
  model_storage_directory: Optional[str] = None
83
- download_enabled: bool = True # same default as easyocr.Reader
145
+ recog_network: Optional[str] = "standard"
146
+ download_enabled: bool = True
84
147
 
85
148
  model_config = ConfigDict(
86
149
  extra="forbid",
@@ -153,6 +216,7 @@ class PipelineOptions(BaseModel):
153
216
  True # This default will be set to False on a future version of docling
154
217
  )
155
218
  document_timeout: Optional[float] = None
219
+ accelerator_options: AcceleratorOptions = AcceleratorOptions()
156
220
 
157
221
 
158
222
  class PdfPipelineOptions(PipelineOptions):
@@ -31,6 +31,7 @@ class DebugSettings(BaseModel):
31
31
  visualize_cells: bool = False
32
32
  visualize_ocr: bool = False
33
33
  visualize_layout: bool = False
34
+ visualize_raw_layout: bool = False
34
35
  visualize_tables: bool = False
35
36
 
36
37
  profile_pipeline_timings: bool = False
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
16
16
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
17
17
  from docling.backend.msword_backend import MsWordDocumentBackend
18
+ from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
18
19
  from docling.datamodel.base_models import (
19
20
  ConversionStatus,
20
21
  DoclingComponentType,
@@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
82
83
  backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
83
84
 
84
85
 
85
- class PdfFormatOption(FormatOption):
86
+ class PatentUsptoFormatOption(FormatOption):
87
+ pipeline_cls: Type = SimplePipeline
88
+ backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
89
+
90
+
91
+ class ImageFormatOption(FormatOption):
86
92
  pipeline_cls: Type = StandardPdfPipeline
87
93
  backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
88
94
 
89
95
 
90
- class ImageFormatOption(FormatOption):
96
+ class PdfFormatOption(FormatOption):
91
97
  pipeline_cls: Type = StandardPdfPipeline
92
98
  backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
93
99
 
@@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
112
118
  InputFormat.HTML: FormatOption(
113
119
  pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
114
120
  ),
121
+ InputFormat.XML_USPTO: FormatOption(
122
+ pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
123
+ ),
115
124
  InputFormat.IMAGE: FormatOption(
116
125
  pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
117
126
  ),
@@ -22,9 +22,15 @@ from docling_core.types.legacy_doc.document import (
22
22
  from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
23
23
  from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
24
24
  from PIL import ImageDraw
25
- from pydantic import BaseModel, ConfigDict
26
-
27
- from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
25
+ from pydantic import BaseModel, ConfigDict, TypeAdapter
26
+
27
+ from docling.datamodel.base_models import (
28
+ Cluster,
29
+ ContainerElement,
30
+ FigureElement,
31
+ Table,
32
+ TextElement,
33
+ )
28
34
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
29
35
  from docling.datamodel.settings import settings
30
36
  from docling.utils.glm_utils import to_docling_document
@@ -204,7 +210,31 @@ class GlmModel:
204
210
  )
205
211
  ],
206
212
  obj_type=layout_label_to_ds_type.get(element.label),
207
- # data=[[]],
213
+ payload={
214
+ "children": TypeAdapter(List[Cluster]).dump_python(
215
+ element.cluster.children
216
+ )
217
+ }, # hack to channel child clusters through GLM
218
+ )
219
+ )
220
+ elif isinstance(element, ContainerElement):
221
+ main_text.append(
222
+ BaseText(
223
+ text="",
224
+ payload={
225
+ "children": TypeAdapter(List[Cluster]).dump_python(
226
+ element.cluster.children
227
+ )
228
+ }, # hack to channel child clusters through GLM
229
+ obj_type=layout_label_to_ds_type.get(element.label),
230
+ name=element.label,
231
+ prov=[
232
+ Prov(
233
+ bbox=target_bbox,
234
+ page=element.page_no + 1,
235
+ span=[0, 0],
236
+ )
237
+ ],
208
238
  )
209
239
  )
210
240
 
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import warnings
2
3
  from typing import Iterable
3
4
 
4
5
  import numpy
@@ -7,16 +8,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
7
8
 
8
9
  from docling.datamodel.base_models import Cell, OcrCell, Page
9
10
  from docling.datamodel.document import ConversionResult
10
- from docling.datamodel.pipeline_options import EasyOcrOptions
11
+ from docling.datamodel.pipeline_options import (
12
+ AcceleratorDevice,
13
+ AcceleratorOptions,
14
+ EasyOcrOptions,
15
+ )
11
16
  from docling.datamodel.settings import settings
12
17
  from docling.models.base_ocr_model import BaseOcrModel
18
+ from docling.utils.accelerator_utils import decide_device
13
19
  from docling.utils.profiling import TimeRecorder
14
20
 
15
21
  _log = logging.getLogger(__name__)
16
22
 
17
23
 
18
24
  class EasyOcrModel(BaseOcrModel):
19
- def __init__(self, enabled: bool, options: EasyOcrOptions):
25
+ def __init__(
26
+ self,
27
+ enabled: bool,
28
+ options: EasyOcrOptions,
29
+ accelerator_options: AcceleratorOptions,
30
+ ):
20
31
  super().__init__(enabled=enabled, options=options)
21
32
  self.options: EasyOcrOptions
22
33
 
@@ -31,11 +42,33 @@ class EasyOcrModel(BaseOcrModel):
31
42
  "Alternatively, Docling has support for other OCR engines. See the documentation."
32
43
  )
33
44
 
45
+ if self.options.use_gpu is None:
46
+ device = decide_device(accelerator_options.device)
47
+ # Enable easyocr GPU if running on CUDA, MPS
48
+ use_gpu = any(
49
+ [
50
+ device.startswith(x)
51
+ for x in [
52
+ AcceleratorDevice.CUDA.value,
53
+ AcceleratorDevice.MPS.value,
54
+ ]
55
+ ]
56
+ )
57
+ else:
58
+ warnings.warn(
59
+ "Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
60
+ "When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
61
+ "to run EasyOCR. Otherwise, EasyOCR runs in CPU."
62
+ )
63
+ use_gpu = self.options.use_gpu
64
+
34
65
  self.reader = easyocr.Reader(
35
66
  lang_list=self.options.lang,
36
- gpu=self.options.use_gpu,
67
+ gpu=use_gpu,
37
68
  model_storage_directory=self.options.model_storage_directory,
69
+ recog_network=self.options.recog_network,
38
70
  download_enabled=self.options.download_enabled,
71
+ verbose=False,
39
72
  )
40
73
 
41
74
  def __call__(
@@ -85,6 +118,7 @@ class EasyOcrModel(BaseOcrModel):
85
118
  ),
86
119
  )
87
120
  for ix, line in enumerate(result)
121
+ if line[2] >= self.options.confidence_threshold
88
122
  ]
89
123
  all_ocr_cells.extend(cells)
90
124