docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/datamodel/pipeline_options.py

@@ -1,12 +1,13 @@
-from enum import Enum, auto
+from enum import Enum
+from pathlib import Path
 from typing import List, Literal, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field
 
 
 class TableFormerMode(str, Enum):
-    FAST = auto()
-    ACCURATE = auto()
+    FAST = "fast"
+    ACCURATE = "accurate"
 
 
 class TableStructureOptions(BaseModel):
@@ -21,6 +22,9 @@ class TableStructureOptions(BaseModel):
 
 class OcrOptions(BaseModel):
     kind: str
+    bitmap_area_threshold: float = (
+        0.05  # percentage of the area for a bitmap to be processed with OCR
+    )
 
 
 class EasyOcrOptions(OcrOptions):
@@ -58,6 +62,13 @@ class TesseractOcrOptions(OcrOptions):
 
 
 class PipelineOptions(BaseModel):
+    create_legacy_output: bool = (
+        True  # This default will be set to False in a future version of docling
+    )
+
+
+class PdfPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
 
@@ -65,3 +76,8 @@ class PipelineOptions(BaseModel):
     ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
         Field(EasyOcrOptions(), discriminator="kind")
     )
+
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+    generate_table_images: bool = False
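
The net effect for pipeline_options.py: TableFormerMode becomes a plain string enum, OCR gains a bitmap_area_threshold, and the PDF-specific knobs (artifacts path, table structure, OCR, image generation) move onto the new PdfPipelineOptions subclass. A minimal configuration sketch, using only the fields visible in this diff:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

# PDF-specific options now live on the PdfPipelineOptions subclass.
opts = PdfPipelineOptions()
opts.do_ocr = True                    # replace programmatic PDF text with OCR results
opts.do_table_structure = True        # run table structure extraction
opts.images_scale = 2.0               # render page images at twice the default scale
opts.generate_page_images = True      # keep full-page images in the output
opts.generate_picture_images = False  # skip cropped figure images
```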
docling/datamodel/settings.py

@@ -1,4 +1,5 @@
 import sys
+from pathlib import Path
 
 from pydantic import BaseModel
 from pydantic_settings import BaseSettings
@@ -14,6 +15,7 @@ class BatchConcurrencySettings(BaseModel):
     doc_batch_concurrency: int = 2
     page_batch_size: int = 4
     page_batch_concurrency: int = 2
+    elements_batch_size: int = 16
 
     # doc_batch_size: int = 1
     # doc_batch_concurrency: int = 1
@@ -25,8 +27,21 @@
     # To force models into single core: export OMP_NUM_THREADS=1
 
 
+class DebugSettings(BaseModel):
+    visualize_cells: bool = False
+    visualize_ocr: bool = False
+    visualize_layout: bool = False
+    visualize_tables: bool = False
+
+    profile_pipeline_timings: bool = False
+
+    # Path used to output debug information.
+    debug_output_path: str = str(Path.cwd() / "debug")
+
+
 class AppSettings(BaseSettings):
     perf: BatchConcurrencySettings
+    debug: DebugSettings
 
 
-settings = AppSettings(perf=BatchConcurrencySettings())
+settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
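
The new DebugSettings group hangs off the global settings singleton, so visualization dumps and pipeline timing can be toggled at runtime without touching pipeline options. A short sketch using only the fields added above (the output directory is a placeholder):

```python
from docling.datamodel.settings import settings

# Collect per-stage timings and dump layout visualizations while converting.
settings.debug.profile_pipeline_timings = True
settings.debug.visualize_layout = True
settings.debug.debug_output_path = "./debug"  # placeholder directory
```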
docling/document_converter.py

@@ -1,297 +1,286 @@
-import functools
 import logging
-import tempfile
+import sys
 import time
-import traceback
+from functools import partial
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
-
-import requests
-from PIL import ImageDraw
-from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
-
-from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.datamodel.base_models import (
-    AssembledUnit,
-    AssembleOptions,
-    ConversionStatus,
-    DoclingComponentType,
-    ErrorItem,
-    Page,
-)
+from typing import Dict, Iterable, Iterator, List, Optional, Type
+
+from pydantic import BaseModel, ConfigDict, model_validator, validate_call
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.asciidoc_backend import AsciiDocBackend
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
 from docling.datamodel.document import (
     ConversionResult,
-    DocumentConversionInput,
     InputDocument,
+    _DocumentConversionInput,
 )
 from docling.datamodel.pipeline_options import PipelineOptions
-from docling.datamodel.settings import settings
-from docling.models.ds_glm_model import GlmModel
-from docling.models.page_assemble_model import PageAssembleModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-from docling.pipeline.standard_model_pipeline import StandardModelPipeline
-from docling.utils.utils import chunkify, create_hash
+from docling.datamodel.settings import DocumentLimits, settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.utils.utils import chunkify
 
 _log = logging.getLogger(__name__)
 
 
-class DocumentConverter:
-    _default_download_filename = "file.pdf"
+class FormatOption(BaseModel):
+    pipeline_cls: Type[BasePipeline]
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Type[AbstractDocumentBackend]
 
-    def __init__(
-        self,
-        artifacts_path: Optional[Union[Path, str]] = None,
-        pipeline_options: PipelineOptions = PipelineOptions(),
-        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-        assemble_options: AssembleOptions = AssembleOptions(),
-    ):
-        if not artifacts_path:
-            artifacts_path = self.download_models_hf()
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
-        artifacts_path = Path(artifacts_path)
+    @model_validator(mode="after")
+    def set_optional_field_default(self) -> "FormatOption":
+        if self.pipeline_options is None:
+            self.pipeline_options = self.pipeline_cls.get_default_options()
+        return self
 
-        self.model_pipeline = pipeline_cls(
-            artifacts_path=artifacts_path, pipeline_options=pipeline_options
-        )
 
-        self.page_assemble_model = PageAssembleModel(config={})
-        self.glm_model = GlmModel(config={})
-        self.pdf_backend = pdf_backend
-        self.assemble_options = assemble_options
-
-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        from huggingface_hub import snapshot_download
-
-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models",
-            force_download=force,
-            local_dir=local_dir,
-            revision="v2.0.0",
-        )
+class WordFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
 
-        return Path(download_path)
 
-    def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
+class PowerpointFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
 
-        for input_batch in chunkify(
-            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
-        ):
-            _log.info(f"Going to convert document batch...")
-            # parallel processing only within input_batch
-            # with ThreadPoolExecutor(
-            #    max_workers=settings.perf.doc_batch_concurrency
-            # ) as pool:
-            #   yield from pool.map(self.process_document, input_batch)
 
-            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-            yield from map(self._process_document, input_batch)
-
-    def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
-        """Convert a single document.
-
-        Args:
-            source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
-
-        Raises:
-            ValueError: If source is of unexpected type.
-            RuntimeError: If conversion fails.
-
-        Returns:
-            ConversionResult: The conversion result object.
-        """
-        with tempfile.TemporaryDirectory() as temp_dir:
-            try:
-                http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                res = requests.get(http_url, stream=True)
-                res.raise_for_status()
-                fname = None
-                # try to get filename from response header
-                if cont_disp := res.headers.get("Content-Disposition"):
-                    for par in cont_disp.strip().split(";"):
-                        # currently only handling directive "filename" (not "*filename")
-                        if (split := par.split("=")) and split[0].strip() == "filename":
-                            fname = "=".join(split[1:]).strip().strip("'\"") or None
-                            break
-                # otherwise, use name from URL:
-                if fname is None:
-                    fname = Path(http_url.path).name or self._default_download_filename
-                local_path = Path(temp_dir) / fname
-                with open(local_path, "wb") as f:
-                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                        f.write(chunk)
-            except ValidationError:
-                try:
-                    local_path = TypeAdapter(Path).validate_python(source)
-                except ValidationError:
-                    raise ValueError(
-                        f"Unexpected file path type encountered: {type(source)}"
-                    )
-            conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-            conv_res_iter = self.convert(conv_inp)
-            conv_res: ConversionResult = next(conv_res_iter)
-            if conv_res.status not in {
-                ConversionStatus.SUCCESS,
-                ConversionStatus.PARTIAL_SUCCESS,
-            }:
-                raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-            return conv_res
+class MarkdownFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
 
-    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-        start_doc_time = time.time()
-        conv_res = ConversionResult(input=in_doc)
 
-        _log.info(f"Processing document {in_doc.file.name}")
+class AsciiDocFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = AsciiDocBackend
 
-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
 
-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))
+class HTMLFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
 
-        all_assembled_pages = []
 
-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
-                # Pipeline
+class PdfFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
 
-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self._initialize_page, in_doc), page_batch
-                )
 
-                # 2. Populate page image
-                pages_with_images = map(
-                    functools.partial(self._populate_page_images, in_doc), init_pages
-                )
+class ImageFormatOption(FormatOption):
+    pipeline_cls: Type = StandardPdfPipeline
+    backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
 
-                # 3. Populate programmatic page cells
-                pages_with_cells = map(
-                    functools.partial(self._parse_page_cells, in_doc),
-                    pages_with_images,
-                )
 
-                # 4. Run pipeline stages
-                pipeline_pages = self.model_pipeline.apply(pages_with_cells)
+_format_to_default_options = {
+    InputFormat.DOCX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+    ),
+    InputFormat.PPTX: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+    ),
+    InputFormat.MD: FormatOption(
+        pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+    ),
+    InputFormat.ASCIIDOC: FormatOption(
+        pipeline_cls=SimplePipeline, backend=AsciiDocBackend
+    ),
+    InputFormat.HTML: FormatOption(
+        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+    ),
+    InputFormat.IMAGE: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+    InputFormat.PDF: FormatOption(
+        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+    ),
+}
 
-                # 5. Assemble page elements (per page)
-                assembled_pages = self.page_assemble_model(pipeline_pages)
 
-                # exhaust assembled_pages
-                for assembled_page in assembled_pages:
-                    # Free up mem resources before moving on with next batch
+class DocumentConverter:
+    _default_download_filename = "file"
 
-                    # Remove page images (can be disabled)
-                    if self.assemble_options.images_scale is None:
-                        assembled_page._image_cache = {}
+    def __init__(
+        self,
+        allowed_formats: Optional[List[InputFormat]] = None,
+        format_options: Optional[Dict[InputFormat, FormatOption]] = None,
+    ):
+        self.allowed_formats = allowed_formats
+        self.format_to_options = format_options
+
+        if self.allowed_formats is None:
+            # if self.format_to_options is not None:
+            #    self.allowed_formats = self.format_to_options.keys()
+            # else:
+            self.allowed_formats = [e for e in InputFormat]  # all formats
+
+        if self.format_to_options is None:
+            self.format_to_options = _format_to_default_options
+        else:
+            for f in self.allowed_formats:
+                if f not in self.format_to_options.keys():
+                    _log.debug(f"Requested format {f} will use default options.")
+                    self.format_to_options[f] = _format_to_default_options[f]
+
+        remove_keys = []
+        for f in self.format_to_options.keys():
+            if f not in self.allowed_formats:
+                remove_keys.append(f)
+
+        for f in remove_keys:
+            self.format_to_options.pop(f)
+
+        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+
+    def initialize_pipeline(self, format: InputFormat):
+        """Initialize the conversion pipeline for the selected format."""
+        self._get_pipeline(doc_format=format)
+
+    @validate_call(config=ConfigDict(strict=True))
+    def convert(
+        self,
+        source: Path | str | DocumentStream,  # TODO review naming
+        raises_on_error: bool = True,
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> ConversionResult:
+
+        all_res = self.convert_all(
+            source=[source],
+            raises_on_error=raises_on_error,
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        return next(all_res)
 
-                    # Unload backend
-                    assembled_page._backend.unload()
+    @validate_call(config=ConfigDict(strict=True))
+    def convert_all(
+        self,
+        source: Iterable[Path | str | DocumentStream],  # TODO review naming
+        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+        max_num_pages: int = sys.maxsize,
+        max_file_size: int = sys.maxsize,
+    ) -> Iterator[ConversionResult]:
+        limits = DocumentLimits(
+            max_num_pages=max_num_pages,
+            max_file_size=max_file_size,
+        )
+        conv_input = _DocumentConversionInput(
+            path_or_stream_iterator=source,
+            limit=limits,
+        )
+        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+        for conv_res in conv_res_iter:
+            if raises_on_error and conv_res.status not in {
+                ConversionStatus.SUCCESS,
+                ConversionStatus.PARTIAL_SUCCESS,
+            }:
+                raise RuntimeError(
+                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
+                )
+            else:
+                yield conv_res
 
-                    all_assembled_pages.append(assembled_page)
+    def _convert(
+        self, conv_input: _DocumentConversionInput, raises_on_error: bool
+    ) -> Iterator[ConversionResult]:
+        assert self.format_to_options is not None
 
-            end_pb_time = time.time() - start_pb_time
-            _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+        start_time = time.monotonic()
 
-        conv_res.pages = all_assembled_pages
-        self._assemble_doc(conv_res)
+        for input_batch in chunkify(
+            conv_input.docs(self.format_to_options),
+            settings.perf.doc_batch_size,  # pass format_options
+        ):
+            _log.info(f"Going to convert document batch...")
 
-        status = ConversionStatus.SUCCESS
-        for page in conv_res.pages:
-            if not page._backend.is_valid():
-                conv_res.errors.append(
-                    ErrorItem(
-                        component_type=DoclingComponentType.PDF_BACKEND,
-                        module_name=type(page._backend).__name__,
-                        error_message=f"Page {page.page_no} failed to parse.",
-                    )
+            # parallel processing only within input_batch
+            # with ThreadPoolExecutor(
+            #    max_workers=settings.perf.doc_batch_concurrency
+            # ) as pool:
+            #   yield from pool.map(self.process_document, input_batch)
+            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+
+            for item in map(
+                partial(self._process_document, raises_on_error=raises_on_error),
+                input_batch,
+            ):
+                elapsed = time.monotonic() - start_time
+                start_time = time.monotonic()
+
+                if item is not None:
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
                     )
-                    status = ConversionStatus.PARTIAL_SUCCESS
-
-            conv_res.status = status
-
-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.info(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
+                    yield item
+                else:
+                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
+
+    def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
+        assert self.format_to_options is not None
+
+        fopt = self.format_to_options.get(doc_format)
+
+        if fopt is None:
+            raise RuntimeError(f"Could not get pipeline for {doc_format}")
+        else:
+            pipeline_class = fopt.pipeline_cls
+            pipeline_options = fopt.pipeline_options
+
+        assert pipeline_options is not None
+        # TODO this will ignore if different options have been defined for the same pipeline class.
+        if (
+            pipeline_class not in self.initialized_pipelines
+            or self.initialized_pipelines[pipeline_class].pipeline_options
+            != pipeline_options
+        ):
+            self.initialized_pipelines[pipeline_class] = pipeline_class(
+                pipeline_options=pipeline_options
             )
+        return self.initialized_pipelines[pipeline_class]
 
-        finally:
-            # Always unload the PDF backend, even in case of failure
-            if in_doc._backend:
-                in_doc._backend.unload()
+    def _process_document(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> Optional[ConversionResult]:
+        assert self.allowed_formats is not None
+        assert in_doc.format in self.allowed_formats
 
-        end_doc_time = time.time() - start_doc_time
-        _log.info(
-            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-        )
+        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
 
         return conv_res
 
-    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)
-        page.size = page._backend.get_size()
-        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
-
-        return page
-
-    # Generate the page image and store it in the page object
-    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        # default scale
-        page.get_image(
-            scale=1.0
-        )  # puts the page image on the image cache at default scale
-
-        # user requested scales
-        if self.assemble_options.images_scale is not None:
-            page._default_image_scale = self.assemble_options.images_scale
-            page.get_image(
-                scale=self.assemble_options.images_scale
-            )  # this will trigger storing the image in the internal cache
-
-        return page
-
-    # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-        page.cells = page._backend.get_text_cells()
-
-        # DEBUG code:
-        def draw_text_boxes(image, cells):
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
-
-        # draw_text_boxes(page.get_image(scale=1.0), cells)
-
-        return page
-
-    def _assemble_doc(self, conv_res: ConversionResult):
-        all_elements = []
-        all_headers = []
-        all_body = []
-
-        for p in conv_res.pages:
-
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
-
-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )
+    def _execute_pipeline(
+        self, in_doc: InputDocument, raises_on_error: bool
+    ) -> ConversionResult:
+        if in_doc.valid:
+            pipeline = self._get_pipeline(in_doc.format)
+            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                if raises_on_error:
+                    raise RuntimeError(
+                        f"No pipeline could be initialized for {in_doc.file}."
+                    )
+                else:
+                    conv_res = ConversionResult(input=in_doc)
+                    conv_res.status = ConversionStatus.FAILURE
+                    return conv_res
+
+            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
 
-        conv_res.output = self.glm_model(conv_res)
+        else:
+            if raises_on_error:
+                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+
+            else:
+                # invalid doc or not of desired format
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                # TODO add error log why it failed.
+
+        return conv_res
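
In sum, DocumentConverter is now format-driven: convert_single() and the model/backend constructor arguments are gone, and callers instead declare allowed formats plus per-format FormatOption overrides. A usage sketch assembled from the classes in this diff (the file name is a placeholder):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pdf_opts = PdfPipelineOptions()
pdf_opts.generate_page_images = True

converter = DocumentConverter(
    # Formats outside this list are dropped from format_to_options.
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={
        # PDF uses StandardPdfPipeline with custom options; DOCX falls back
        # to the SimplePipeline defaults in _format_to_default_options.
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
    },
)

# convert() wraps convert_all() for a single source; with raises_on_error=True
# (the default) any non-success status raises RuntimeError.
result = converter.convert("report.pdf")  # placeholder path
print(result.status)
```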
docling/models/base_model.py

@@ -0,0 +1,28 @@
+from abc import ABC, abstractmethod
+from typing import Any, Iterable
+
+from docling_core.types.doc import DoclingDocument, NodeItem
+
+from docling.datamodel.base_models import Page
+from docling.datamodel.document import ConversionResult
+
+
+class BasePageModel(ABC):
+    @abstractmethod
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        pass
+
+
+class BaseEnrichmentModel(ABC):
+
+    @abstractmethod
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        pass
+
+    @abstractmethod
+    def __call__(
+        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
+    ) -> Iterable[Any]:
+        pass
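
docling/models/base_model.py is a new extension point: BasePageModel is the interface for per-page pipeline stages, while BaseEnrichmentModel filters elements of the assembled DoclingDocument via is_processable and then processes them in batches. A minimal conforming subclass as a sketch; the picture-counting behavior is invented purely for illustration:

```python
from typing import Any, Iterable

from docling_core.types.doc import DoclingDocument, NodeItem, PictureItem

from docling.models.base_model import BaseEnrichmentModel


class PictureCounter(BaseEnrichmentModel):
    """Toy enrichment model: counts the pictures it is asked to enrich."""

    def __init__(self) -> None:
        self.picture_count = 0

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        # Only route PictureItem elements into __call__.
        return isinstance(element, PictureItem)

    def __call__(
        self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
    ) -> Iterable[Any]:
        for element in element_batch:
            self.picture_count += 1
            yield element  # pass elements through unchanged
```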