docling 2.9.0__tar.gz → 2.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {docling-2.9.0 → docling-2.11.0}/PKG-INFO +4 -4
  2. {docling-2.9.0 → docling-2.11.0}/docling/backend/docling_parse_backend.py +1 -1
  3. {docling-2.9.0 → docling-2.11.0}/docling/backend/docling_parse_v2_backend.py +7 -5
  4. {docling-2.9.0 → docling-2.11.0}/docling/cli/main.py +17 -21
  5. {docling-2.9.0 → docling-2.11.0}/docling/datamodel/base_models.py +12 -12
  6. docling-2.11.0/docling/datamodel/document.py +309 -0
  7. {docling-2.9.0 → docling-2.11.0}/docling/datamodel/pipeline_options.py +27 -2
  8. {docling-2.9.0 → docling-2.11.0}/docling/document_converter.py +5 -5
  9. {docling-2.9.0 → docling-2.11.0}/docling/models/ds_glm_model.py +4 -7
  10. {docling-2.9.0 → docling-2.11.0}/docling/models/rapid_ocr_model.py +18 -17
  11. {docling-2.9.0 → docling-2.11.0}/docling/pipeline/base_pipeline.py +17 -3
  12. {docling-2.9.0 → docling-2.11.0}/docling/pipeline/standard_pdf_pipeline.py +2 -0
  13. docling-2.11.0/docling/utils/glm_utils.py +336 -0
  14. {docling-2.9.0 → docling-2.11.0}/pyproject.toml +4 -4
  15. docling-2.9.0/docling/datamodel/document.py +0 -560
  16. {docling-2.9.0 → docling-2.11.0}/LICENSE +0 -0
  17. {docling-2.9.0 → docling-2.11.0}/README.md +0 -0
  18. {docling-2.9.0 → docling-2.11.0}/docling/__init__.py +0 -0
  19. {docling-2.9.0 → docling-2.11.0}/docling/backend/__init__.py +0 -0
  20. {docling-2.9.0 → docling-2.11.0}/docling/backend/abstract_backend.py +0 -0
  21. {docling-2.9.0 → docling-2.11.0}/docling/backend/asciidoc_backend.py +0 -0
  22. {docling-2.9.0 → docling-2.11.0}/docling/backend/html_backend.py +0 -0
  23. {docling-2.9.0 → docling-2.11.0}/docling/backend/md_backend.py +0 -0
  24. {docling-2.9.0 → docling-2.11.0}/docling/backend/msexcel_backend.py +0 -0
  25. {docling-2.9.0 → docling-2.11.0}/docling/backend/mspowerpoint_backend.py +0 -0
  26. {docling-2.9.0 → docling-2.11.0}/docling/backend/msword_backend.py +0 -0
  27. {docling-2.9.0 → docling-2.11.0}/docling/backend/pdf_backend.py +0 -0
  28. {docling-2.9.0 → docling-2.11.0}/docling/backend/pypdfium2_backend.py +0 -0
  29. {docling-2.9.0 → docling-2.11.0}/docling/chunking/__init__.py +0 -0
  30. {docling-2.9.0 → docling-2.11.0}/docling/cli/__init__.py +0 -0
  31. {docling-2.9.0 → docling-2.11.0}/docling/datamodel/__init__.py +0 -0
  32. {docling-2.9.0 → docling-2.11.0}/docling/datamodel/settings.py +0 -0
  33. {docling-2.9.0 → docling-2.11.0}/docling/exceptions.py +0 -0
  34. {docling-2.9.0 → docling-2.11.0}/docling/models/__init__.py +0 -0
  35. {docling-2.9.0 → docling-2.11.0}/docling/models/base_model.py +0 -0
  36. {docling-2.9.0 → docling-2.11.0}/docling/models/base_ocr_model.py +0 -0
  37. {docling-2.9.0 → docling-2.11.0}/docling/models/easyocr_model.py +0 -0
  38. {docling-2.9.0 → docling-2.11.0}/docling/models/layout_model.py +0 -0
  39. {docling-2.9.0 → docling-2.11.0}/docling/models/ocr_mac_model.py +0 -0
  40. {docling-2.9.0 → docling-2.11.0}/docling/models/page_assemble_model.py +0 -0
  41. {docling-2.9.0 → docling-2.11.0}/docling/models/page_preprocessing_model.py +0 -0
  42. {docling-2.9.0 → docling-2.11.0}/docling/models/table_structure_model.py +0 -0
  43. {docling-2.9.0 → docling-2.11.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  44. {docling-2.9.0 → docling-2.11.0}/docling/models/tesseract_ocr_model.py +0 -0
  45. {docling-2.9.0 → docling-2.11.0}/docling/pipeline/__init__.py +0 -0
  46. {docling-2.9.0 → docling-2.11.0}/docling/pipeline/simple_pipeline.py +0 -0
  47. {docling-2.9.0 → docling-2.11.0}/docling/py.typed +0 -0
  48. {docling-2.9.0 → docling-2.11.0}/docling/utils/__init__.py +0 -0
  49. {docling-2.9.0 → docling-2.11.0}/docling/utils/export.py +0 -0
  50. {docling-2.9.0 → docling-2.11.0}/docling/utils/layout_utils.py +0 -0
  51. {docling-2.9.0 → docling-2.11.0}/docling/utils/profiling.py +0 -0
  52. {docling-2.9.0 → docling-2.11.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.9.0
3
+ Version: 2.11.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,10 +25,10 @@ Provides-Extra: rapidocr
25
25
  Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
- Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
29
- Requires-Dist: docling-core[chunking] (>=2.8.0,<3.0.0)
28
+ Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.9.0,<3.0.0)
30
30
  Requires-Dist: docling-ibm-models (>=2.0.6,<3.0.0)
31
- Requires-Dist: docling-parse (>=2.0.5,<3.0.0)
31
+ Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
34
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -6,7 +6,7 @@ from typing import Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
- from docling_parse.docling_parse import pdf_parser_v1
9
+ from docling_parse.pdf_parsers import pdf_parser_v1
10
10
  from PIL import Image, ImageDraw
11
11
  from pypdfium2 import PdfPage
12
12
 
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
- from docling_parse.docling_parse import pdf_parser_v2
9
+ from docling_parse.pdf_parsers import pdf_parser_v2
10
10
  from PIL import Image, ImageDraw
11
11
  from pypdfium2 import PdfPage
12
12
 
@@ -210,12 +210,14 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
210
210
  self.parser = pdf_parser_v2("fatal")
211
211
 
212
212
  success = False
213
- if isinstance(path_or_stream, BytesIO):
213
+ if isinstance(self.path_or_stream, BytesIO):
214
214
  success = self.parser.load_document_from_bytesio(
215
- self.document_hash, path_or_stream
215
+ self.document_hash, self.path_or_stream
216
+ )
217
+ elif isinstance(self.path_or_stream, Path):
218
+ success = self.parser.load_document(
219
+ self.document_hash, str(self.path_or_stream)
216
220
  )
217
- elif isinstance(path_or_stream, Path):
218
- success = self.parser.load_document(self.document_hash, str(path_or_stream))
219
221
 
220
222
  if not success:
221
223
  raise RuntimeError(
@@ -27,8 +27,10 @@ from docling.datamodel.base_models import (
27
27
  from docling.datamodel.document import ConversionResult
28
28
  from docling.datamodel.pipeline_options import (
29
29
  EasyOcrOptions,
30
+ OcrEngine,
30
31
  OcrMacOptions,
31
32
  OcrOptions,
33
+ PdfBackend,
32
34
  PdfPipelineOptions,
33
35
  RapidOcrOptions,
34
36
  TableFormerMode,
@@ -68,22 +70,6 @@ def version_callback(value: bool):
68
70
  raise typer.Exit()
69
71
 
70
72
 
71
- # Define an enum for the backend options
72
- class PdfBackend(str, Enum):
73
- PYPDFIUM2 = "pypdfium2"
74
- DLPARSE_V1 = "dlparse_v1"
75
- DLPARSE_V2 = "dlparse_v2"
76
-
77
-
78
- # Define an enum for the ocr engines
79
- class OcrEngine(str, Enum):
80
- EASYOCR = "easyocr"
81
- TESSERACT_CLI = "tesseract_cli"
82
- TESSERACT = "tesseract"
83
- OCRMAC = "ocrmac"
84
- RAPIDOCR = "rapidocr"
85
-
86
-
87
73
  def export_documents(
88
74
  conv_results: Iterable[ConversionResult],
89
75
  output_dir: Path,
@@ -208,7 +194,7 @@ def convert(
208
194
  ] = None,
209
195
  pdf_backend: Annotated[
210
196
  PdfBackend, typer.Option(..., help="The PDF backend to use.")
211
- ] = PdfBackend.DLPARSE_V1,
197
+ ] = PdfBackend.DLPARSE_V2,
212
198
  table_mode: Annotated[
213
199
  TableFormerMode,
214
200
  typer.Option(..., help="The mode to use in the table structure model."),
@@ -264,6 +250,13 @@ def convert(
264
250
  help="Show version information.",
265
251
  ),
266
252
  ] = None,
253
+ document_timeout: Annotated[
254
+ Optional[float],
255
+ typer.Option(
256
+ ...,
257
+ help="The timeout for processing each document, in seconds.",
258
+ ),
259
+ ] = None,
267
260
  ):
268
261
  if verbose == 0:
269
262
  logging.basicConfig(level=logging.WARNING)
@@ -347,6 +340,7 @@ def convert(
347
340
  do_ocr=ocr,
348
341
  ocr_options=ocr_options,
349
342
  do_table_structure=True,
343
+ document_timeout=document_timeout,
350
344
  )
351
345
  pipeline_options.table_structure_options.do_cell_matching = (
352
346
  True # do_cell_matching
@@ -372,11 +366,13 @@ def convert(
372
366
  else:
373
367
  raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
374
368
 
369
+ pdf_format_option = PdfFormatOption(
370
+ pipeline_options=pipeline_options,
371
+ backend=backend, # pdf_backend
372
+ )
375
373
  format_options: Dict[InputFormat, FormatOption] = {
376
- InputFormat.PDF: PdfFormatOption(
377
- pipeline_options=pipeline_options,
378
- backend=backend, # pdf_backend
379
- )
374
+ InputFormat.PDF: pdf_format_option,
375
+ InputFormat.IMAGE: pdf_format_option,
380
376
  }
381
377
  doc_converter = DocumentConverter(
382
378
  allowed_formats=from_formats,
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
 
20
20
 
21
21
  class ConversionStatus(str, Enum):
22
- PENDING = auto()
23
- STARTED = auto()
24
- FAILURE = auto()
25
- SUCCESS = auto()
26
- PARTIAL_SUCCESS = auto()
27
- SKIPPED = auto()
22
+ PENDING = "pending"
23
+ STARTED = "started"
24
+ FAILURE = "failure"
25
+ SUCCESS = "success"
26
+ PARTIAL_SUCCESS = "partial_success"
27
+ SKIPPED = "skipped"
28
28
 
29
29
 
30
30
  class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {
89
89
 
90
90
 
91
91
  class DocInputType(str, Enum):
92
- PATH = auto()
93
- STREAM = auto()
92
+ PATH = "path"
93
+ STREAM = "stream"
94
94
 
95
95
 
96
96
  class DoclingComponentType(str, Enum):
97
- DOCUMENT_BACKEND = auto()
98
- MODEL = auto()
99
- DOC_ASSEMBLER = auto()
100
- USER_INPUT = auto()
97
+ DOCUMENT_BACKEND = "document_backend"
98
+ MODEL = "model"
99
+ DOC_ASSEMBLER = "doc_assembler"
100
+ USER_INPUT = "user_input"
101
101
 
102
102
 
103
103
  class ErrorItem(BaseModel):
@@ -0,0 +1,309 @@
1
+ import logging
2
+ import re
3
+ from enum import Enum
4
+ from io import BytesIO
5
+ from pathlib import Path, PurePath
6
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
7
+
8
+ import filetype
9
+ from docling_core.types.doc import (
10
+ DocItem,
11
+ DocItemLabel,
12
+ DoclingDocument,
13
+ PictureItem,
14
+ SectionHeaderItem,
15
+ TableItem,
16
+ TextItem,
17
+ )
18
+ from docling_core.types.doc.document import ListItem
19
+ from docling_core.types.legacy_doc.base import (
20
+ BaseText,
21
+ Figure,
22
+ GlmTableCell,
23
+ PageDimensions,
24
+ PageReference,
25
+ Prov,
26
+ Ref,
27
+ )
28
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
29
+ from docling_core.types.legacy_doc.base import TableCell
30
+ from docling_core.types.legacy_doc.document import (
31
+ CCSDocumentDescription as DsDocumentDescription,
32
+ )
33
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
34
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
35
+ from docling_core.utils.file import resolve_source_to_stream
36
+ from docling_core.utils.legacy import docling_document_to_legacy
37
+ from pydantic import BaseModel
38
+ from typing_extensions import deprecated
39
+
40
+ from docling.backend.abstract_backend import (
41
+ AbstractDocumentBackend,
42
+ PaginatedDocumentBackend,
43
+ )
44
+ from docling.datamodel.base_models import (
45
+ AssembledUnit,
46
+ ConversionStatus,
47
+ DocumentStream,
48
+ ErrorItem,
49
+ FormatToExtensions,
50
+ FormatToMimeType,
51
+ InputFormat,
52
+ MimeTypeToFormat,
53
+ Page,
54
+ )
55
+ from docling.datamodel.settings import DocumentLimits
56
+ from docling.utils.profiling import ProfilingItem
57
+ from docling.utils.utils import create_file_hash, create_hash
58
+
59
+ if TYPE_CHECKING:
60
+ from docling.document_converter import FormatOption
61
+
62
+ _log = logging.getLogger(__name__)
63
+
64
+ layout_label_to_ds_type = {
65
+ DocItemLabel.TITLE: "title",
66
+ DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
67
+ DocItemLabel.SECTION_HEADER: "subtitle-level-1",
68
+ DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
69
+ DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
70
+ DocItemLabel.CAPTION: "caption",
71
+ DocItemLabel.PAGE_HEADER: "page-header",
72
+ DocItemLabel.PAGE_FOOTER: "page-footer",
73
+ DocItemLabel.FOOTNOTE: "footnote",
74
+ DocItemLabel.TABLE: "table",
75
+ DocItemLabel.FORMULA: "equation",
76
+ DocItemLabel.LIST_ITEM: "paragraph",
77
+ DocItemLabel.CODE: "paragraph",
78
+ DocItemLabel.PICTURE: "figure",
79
+ DocItemLabel.TEXT: "paragraph",
80
+ DocItemLabel.PARAGRAPH: "paragraph",
81
+ }
82
+
83
+ _EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
84
+
85
+
86
+ class InputDocument(BaseModel):
87
+ file: PurePath
88
+ document_hash: str # = None
89
+ valid: bool = True
90
+ limits: DocumentLimits = DocumentLimits()
91
+ format: InputFormat # = None
92
+
93
+ filesize: Optional[int] = None
94
+ page_count: int = 0
95
+
96
+ _backend: AbstractDocumentBackend # Internal PDF backend used
97
+
98
+ def __init__(
99
+ self,
100
+ path_or_stream: Union[BytesIO, Path],
101
+ format: InputFormat,
102
+ backend: Type[AbstractDocumentBackend],
103
+ filename: Optional[str] = None,
104
+ limits: Optional[DocumentLimits] = None,
105
+ ):
106
+ super().__init__(
107
+ file="", document_hash="", format=InputFormat.PDF
108
+ ) # initialize with dummy values
109
+
110
+ self.limits = limits or DocumentLimits()
111
+ self.format = format
112
+
113
+ try:
114
+ if isinstance(path_or_stream, Path):
115
+ self.file = path_or_stream
116
+ self.filesize = path_or_stream.stat().st_size
117
+ if self.filesize > self.limits.max_file_size:
118
+ self.valid = False
119
+ else:
120
+ self.document_hash = create_file_hash(path_or_stream)
121
+ self._init_doc(backend, path_or_stream)
122
+
123
+ elif isinstance(path_or_stream, BytesIO):
124
+ assert (
125
+ filename is not None
126
+ ), "Can't construct InputDocument from stream without providing filename arg."
127
+ self.file = PurePath(filename)
128
+ self.filesize = path_or_stream.getbuffer().nbytes
129
+
130
+ if self.filesize > self.limits.max_file_size:
131
+ self.valid = False
132
+ else:
133
+ self.document_hash = create_file_hash(path_or_stream)
134
+ self._init_doc(backend, path_or_stream)
135
+ else:
136
+ raise RuntimeError(
137
+ f"Unexpected type path_or_stream: {type(path_or_stream)}"
138
+ )
139
+
140
+ # For paginated backends, check if the maximum page count is exceeded.
141
+ if self.valid and self._backend.is_valid():
142
+ if self._backend.supports_pagination() and isinstance(
143
+ self._backend, PaginatedDocumentBackend
144
+ ):
145
+ self.page_count = self._backend.page_count()
146
+ if not self.page_count <= self.limits.max_num_pages:
147
+ self.valid = False
148
+
149
+ except (FileNotFoundError, OSError) as e:
150
+ self.valid = False
151
+ _log.exception(
152
+ f"File {self.file.name} not found or cannot be opened.", exc_info=e
153
+ )
154
+ # raise
155
+ except RuntimeError as e:
156
+ self.valid = False
157
+ _log.exception(
158
+ f"An unexpected error occurred while opening the document {self.file.name}",
159
+ exc_info=e,
160
+ )
161
+ # raise
162
+
163
+ def _init_doc(
164
+ self,
165
+ backend: Type[AbstractDocumentBackend],
166
+ path_or_stream: Union[BytesIO, Path],
167
+ ) -> None:
168
+ self._backend = backend(self, path_or_stream=path_or_stream)
169
+ if not self._backend.is_valid():
170
+ self.valid = False
171
+
172
+
173
+ class DocumentFormat(str, Enum):
174
+ V2 = "v2"
175
+ V1 = "v1"
176
+
177
+
178
+ class ConversionResult(BaseModel):
179
+ input: InputDocument
180
+
181
+ status: ConversionStatus = ConversionStatus.PENDING # failure, success
182
+ errors: List[ErrorItem] = [] # structure to keep errors
183
+
184
+ pages: List[Page] = []
185
+ assembled: AssembledUnit = AssembledUnit()
186
+ timings: Dict[str, ProfilingItem] = {}
187
+
188
+ document: DoclingDocument = _EMPTY_DOCLING_DOC
189
+
190
+ @property
191
+ @deprecated("Use document instead.")
192
+ def legacy_document(self):
193
+ return docling_document_to_legacy(self.document)
194
+
195
+
196
+ class _DummyBackend(AbstractDocumentBackend):
197
+ def __init__(self, *args, **kwargs):
198
+ super().__init__(*args, **kwargs)
199
+
200
+ def is_valid(self) -> bool:
201
+ return False
202
+
203
+ @classmethod
204
+ def supported_formats(cls) -> Set[InputFormat]:
205
+ return set()
206
+
207
+ @classmethod
208
+ def supports_pagination(cls) -> bool:
209
+ return False
210
+
211
+ def unload(self):
212
+ return super().unload()
213
+
214
+
215
+ class _DocumentConversionInput(BaseModel):
216
+
217
+ path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
218
+ limits: Optional[DocumentLimits] = DocumentLimits()
219
+
220
+ def docs(
221
+ self, format_options: Dict[InputFormat, "FormatOption"]
222
+ ) -> Iterable[InputDocument]:
223
+ for item in self.path_or_stream_iterator:
224
+ obj = resolve_source_to_stream(item) if isinstance(item, str) else item
225
+ format = self._guess_format(obj)
226
+ backend: Type[AbstractDocumentBackend]
227
+ if format not in format_options.keys():
228
+ _log.error(
229
+ f"Input document {obj.name} does not match any allowed format."
230
+ )
231
+ backend = _DummyBackend
232
+ else:
233
+ backend = format_options[format].backend
234
+
235
+ if isinstance(obj, Path):
236
+ yield InputDocument(
237
+ path_or_stream=obj,
238
+ format=format,
239
+ filename=obj.name,
240
+ limits=self.limits,
241
+ backend=backend,
242
+ )
243
+ elif isinstance(obj, DocumentStream):
244
+ yield InputDocument(
245
+ path_or_stream=obj.stream,
246
+ format=format,
247
+ filename=obj.name,
248
+ limits=self.limits,
249
+ backend=backend,
250
+ )
251
+ else:
252
+ raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
253
+
254
+ def _guess_format(self, obj: Union[Path, DocumentStream]):
255
+ content = b"" # empty binary blob
256
+ format = None
257
+
258
+ if isinstance(obj, Path):
259
+ mime = filetype.guess_mime(str(obj))
260
+ if mime is None:
261
+ ext = obj.suffix[1:]
262
+ mime = self._mime_from_extension(ext)
263
+ if mime is None: # must guess from
264
+ with obj.open("rb") as f:
265
+ content = f.read(1024) # Read first 1KB
266
+
267
+ elif isinstance(obj, DocumentStream):
268
+ content = obj.stream.read(8192)
269
+ obj.stream.seek(0)
270
+ mime = filetype.guess_mime(content)
271
+ if mime is None:
272
+ ext = (
273
+ obj.name.rsplit(".", 1)[-1]
274
+ if ("." in obj.name and not obj.name.startswith("."))
275
+ else ""
276
+ )
277
+ mime = self._mime_from_extension(ext)
278
+
279
+ mime = mime or self._detect_html_xhtml(content)
280
+ mime = mime or "text/plain"
281
+
282
+ format = MimeTypeToFormat.get(mime)
283
+ return format
284
+
285
+ def _mime_from_extension(self, ext):
286
+ mime = None
287
+ if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
288
+ mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
289
+ elif ext in FormatToExtensions[InputFormat.HTML]:
290
+ mime = FormatToMimeType[InputFormat.HTML][0]
291
+ elif ext in FormatToExtensions[InputFormat.MD]:
292
+ mime = FormatToMimeType[InputFormat.MD][0]
293
+
294
+ return mime
295
+
296
+ def _detect_html_xhtml(self, content):
297
+ content_str = content.decode("ascii", errors="ignore").lower()
298
+ # Remove XML comments
299
+ content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
300
+ content_str = content_str.lstrip()
301
+
302
+ if re.match(r"<\?xml", content_str):
303
+ if "xhtml" in content_str[:1000]:
304
+ return "application/xhtml+xml"
305
+
306
+ if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
307
+ return "text/html"
308
+
309
+ return None
@@ -126,12 +126,33 @@ class OcrMacOptions(OcrOptions):
126
126
  )
127
127
 
128
128
 
129
+ # Define an enum for the backend options
130
+ class PdfBackend(str, Enum):
131
+ """Enum of valid PDF backends."""
132
+
133
+ PYPDFIUM2 = "pypdfium2"
134
+ DLPARSE_V1 = "dlparse_v1"
135
+ DLPARSE_V2 = "dlparse_v2"
136
+
137
+
138
+ # Define an enum for the ocr engines
139
+ class OcrEngine(str, Enum):
140
+ """Enum of valid OCR engines."""
141
+
142
+ EASYOCR = "easyocr"
143
+ TESSERACT_CLI = "tesseract_cli"
144
+ TESSERACT = "tesseract"
145
+ OCRMAC = "ocrmac"
146
+ RAPIDOCR = "rapidocr"
147
+
148
+
129
149
  class PipelineOptions(BaseModel):
130
150
  """Base pipeline options."""
131
151
 
132
152
  create_legacy_output: bool = (
133
- True # This defautl will be set to False on a future version of docling
153
+ True # This default will be set to False on a future version of docling
134
154
  )
155
+ document_timeout: Optional[float] = None
135
156
 
136
157
 
137
158
  class PdfPipelineOptions(PipelineOptions):
@@ -143,7 +164,11 @@ class PdfPipelineOptions(PipelineOptions):
143
164
 
144
165
  table_structure_options: TableStructureOptions = TableStructureOptions()
145
166
  ocr_options: Union[
146
- EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
167
+ EasyOcrOptions,
168
+ TesseractCliOcrOptions,
169
+ TesseractOcrOptions,
170
+ OcrMacOptions,
171
+ RapidOcrOptions,
147
172
  ] = Field(EasyOcrOptions(), discriminator="kind")
148
173
 
149
174
  images_scale: float = 1.0
@@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
9
9
 
10
10
  from docling.backend.abstract_backend import AbstractDocumentBackend
11
11
  from docling.backend.asciidoc_backend import AsciiDocBackend
12
- from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
12
+ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
13
13
  from docling.backend.html_backend import HTMLDocumentBackend
14
14
  from docling.backend.md_backend import MarkdownDocumentBackend
15
15
  from docling.backend.msexcel_backend import MsExcelDocumentBackend
@@ -84,12 +84,12 @@ class HTMLFormatOption(FormatOption):
84
84
 
85
85
  class PdfFormatOption(FormatOption):
86
86
  pipeline_cls: Type = StandardPdfPipeline
87
- backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
87
+ backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
88
88
 
89
89
 
90
90
  class ImageFormatOption(FormatOption):
91
91
  pipeline_cls: Type = StandardPdfPipeline
92
- backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
92
+ backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
93
93
 
94
94
 
95
95
  def _get_default_option(format: InputFormat) -> FormatOption:
@@ -113,10 +113,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
113
113
  pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
114
114
  ),
115
115
  InputFormat.IMAGE: FormatOption(
116
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
116
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
117
117
  ),
118
118
  InputFormat.PDF: FormatOption(
119
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
119
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
120
120
  ),
121
121
  }
122
122
  if (options := format_to_default_options.get(format)) is not None:
@@ -3,9 +3,7 @@ import random
3
3
  from pathlib import Path
4
4
  from typing import List, Union
5
5
 
6
- from deepsearch_glm.nlp_utils import init_nlp_model
7
- from deepsearch_glm.utils.doc_utils import to_docling_document
8
- from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
6
+ from deepsearch_glm.andromeda_nlp import nlp_model
9
7
  from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
10
8
  from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
11
9
  from docling_core.types.legacy_doc.base import (
@@ -29,6 +27,7 @@ from pydantic import BaseModel, ConfigDict
29
27
  from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
30
28
  from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
31
29
  from docling.datamodel.settings import settings
30
+ from docling.utils.glm_utils import to_docling_document
32
31
  from docling.utils.profiling import ProfilingScope, TimeRecorder
33
32
  from docling.utils.utils import create_hash
34
33
 
@@ -43,9 +42,7 @@ class GlmModel:
43
42
  def __init__(self, options: GlmOptions):
44
43
  self.options = options
45
44
 
46
- if self.options.model_names != "":
47
- load_pretrained_nlp_models()
48
- self.model = init_nlp_model(model_names=self.options.model_names)
45
+ self.model = nlp_model(loglevel="error", text_ordering=True)
49
46
 
50
47
  def _to_legacy_document(self, conv_res) -> DsDocument:
51
48
  title = ""
@@ -232,7 +229,7 @@ class GlmModel:
232
229
  def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
233
230
  with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
234
231
  ds_doc = self._to_legacy_document(conv_res)
235
- ds_doc_dict = ds_doc.model_dump(by_alias=True)
232
+ ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
236
233
 
237
234
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
238
235
 
@@ -118,24 +118,25 @@ class RapidOcrModel(BaseOcrModel):
118
118
  del high_res_image
119
119
  del im
120
120
 
121
- cells = [
122
- OcrCell(
123
- id=ix,
124
- text=line[1],
125
- confidence=line[2],
126
- bbox=BoundingBox.from_tuple(
127
- coord=(
128
- (line[0][0][0] / self.scale) + ocr_rect.l,
129
- (line[0][0][1] / self.scale) + ocr_rect.t,
130
- (line[0][2][0] / self.scale) + ocr_rect.l,
131
- (line[0][2][1] / self.scale) + ocr_rect.t,
121
+ if result is not None:
122
+ cells = [
123
+ OcrCell(
124
+ id=ix,
125
+ text=line[1],
126
+ confidence=line[2],
127
+ bbox=BoundingBox.from_tuple(
128
+ coord=(
129
+ (line[0][0][0] / self.scale) + ocr_rect.l,
130
+ (line[0][0][1] / self.scale) + ocr_rect.t,
131
+ (line[0][2][0] / self.scale) + ocr_rect.l,
132
+ (line[0][2][1] / self.scale) + ocr_rect.t,
133
+ ),
134
+ origin=CoordOrigin.TOPLEFT,
132
135
  ),
133
- origin=CoordOrigin.TOPLEFT,
134
- ),
135
- )
136
- for ix, line in enumerate(result)
137
- ]
138
- all_ocr_cells.extend(cells)
136
+ )
137
+ for ix, line in enumerate(result)
138
+ ]
139
+ all_ocr_cells.extend(cells)
139
140
 
140
141
  # Post-process the cells
141
142
  page.cells = self.post_process_cells(all_ocr_cells, page.cells)