docling 1.19.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.1.dist-info/METADATA +0 -380
  35. docling-1.19.1.dist-info/RECORD +0 -34
  36. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
docling/datamodel/pipeline_options.py

@@ -1,4 +1,5 @@
  from enum import Enum, auto
+ from pathlib import Path
  from typing import List, Literal, Optional, Union

  from pydantic import BaseModel, ConfigDict, Field
@@ -58,6 +59,13 @@ class TesseractOcrOptions(OcrOptions):


  class PipelineOptions(BaseModel):
+     create_legacy_output: bool = (
+         True  # This default will be set to False in a future version of docling
+     )
+
+
+ class PdfPipelineOptions(PipelineOptions):
+     artifacts_path: Optional[Union[Path, str]] = None
      do_table_structure: bool = True  # True: perform table structure extraction
      do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

@@ -65,3 +73,8 @@ class PipelineOptions(BaseModel):
      ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
          Field(EasyOcrOptions(), discriminator="kind")
      )
+
+     images_scale: float = 1.0
+     generate_page_images: bool = False
+     generate_picture_images: bool = False
+     generate_table_images: bool = False
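
With these additions, image generation is configured on the pipeline options rather than on the old AssembleOptions. A minimal usage sketch based only on the fields shown above (the option values are illustrative):

    from docling.datamodel.pipeline_options import PdfPipelineOptions

    opts = PdfPipelineOptions(
        do_table_structure=True,    # perform table structure extraction
        do_ocr=True,                # OCR replaces programmatic PDF text
        images_scale=2.0,           # render page images at 2x scale
        generate_page_images=True,  # keep a full-page image per page
    )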
docling/datamodel/settings.py

@@ -14,6 +14,7 @@ class BatchConcurrencySettings(BaseModel):
      doc_batch_concurrency: int = 2
      page_batch_size: int = 4
      page_batch_concurrency: int = 2
+     elements_batch_size: int = 16

      # doc_batch_size: int = 1
      # doc_batch_concurrency: int = 1
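
A sketch of tuning the new field at runtime via the global settings object that the converter code below already references; the exact consumer of elements_batch_size (likely the new enrichment step in base_pipeline.py) is an assumption, and the value is illustrative:

    from docling.datamodel.settings import settings

    # Presumed batch size when feeding document elements to enrichment models.
    settings.perf.elements_batch_size = 32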
docling/document_converter.py

@@ -1,297 +1,260 @@
- import functools
  import logging
- import tempfile
+ import sys
  import time
- import traceback
+ from functools import partial
  from pathlib import Path
- from typing import Iterable, Optional, Type, Union
-
- import requests
- from PIL import ImageDraw
- from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
-
- from docling.backend.abstract_backend import PdfDocumentBackend
- from docling.datamodel.base_models import (
-     AssembledUnit,
-     AssembleOptions,
-     ConversionStatus,
-     DoclingComponentType,
-     ErrorItem,
-     Page,
- )
+ from typing import Dict, Iterable, Iterator, List, Optional, Type
+
+ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
+
+ from docling.backend.abstract_backend import AbstractDocumentBackend
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+ from docling.backend.html_backend import HTMLDocumentBackend
+ from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+ from docling.backend.msword_backend import MsWordDocumentBackend
+ from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
  from docling.datamodel.document import (
      ConversionResult,
-     DocumentConversionInput,
      InputDocument,
+     _DocumentConversionInput,
  )
  from docling.datamodel.pipeline_options import PipelineOptions
- from docling.datamodel.settings import settings
- from docling.models.ds_glm_model import GlmModel
- from docling.models.page_assemble_model import PageAssembleModel
- from docling.pipeline.base_model_pipeline import BaseModelPipeline
- from docling.pipeline.standard_model_pipeline import StandardModelPipeline
- from docling.utils.utils import chunkify, create_hash
+ from docling.datamodel.settings import DocumentLimits, settings
+ from docling.pipeline.base_pipeline import BasePipeline
+ from docling.pipeline.simple_pipeline import SimplePipeline
+ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+ from docling.utils.utils import chunkify

  _log = logging.getLogger(__name__)


- class DocumentConverter:
-     _default_download_filename = "file.pdf"
+ class FormatOption(BaseModel):
+     pipeline_cls: Type[BasePipeline]
+     pipeline_options: Optional[PipelineOptions] = None
+     backend: Type[AbstractDocumentBackend]

-     def __init__(
-         self,
-         artifacts_path: Optional[Union[Path, str]] = None,
-         pipeline_options: PipelineOptions = PipelineOptions(),
-         pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-         pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-         assemble_options: AssembleOptions = AssembleOptions(),
-     ):
-         if not artifacts_path:
-             artifacts_path = self.download_models_hf()
+     model_config = ConfigDict(arbitrary_types_allowed=True)

-         artifacts_path = Path(artifacts_path)
+     @model_validator(mode="after")
+     def set_optional_field_default(self) -> "FormatOption":
+         if self.pipeline_options is None:
+             self.pipeline_options = self.pipeline_cls.get_default_options()
+         return self

-         self.model_pipeline = pipeline_cls(
-             artifacts_path=artifacts_path, pipeline_options=pipeline_options
-         )

-         self.page_assemble_model = PageAssembleModel(config={})
-         self.glm_model = GlmModel(config={})
-         self.pdf_backend = pdf_backend
-         self.assemble_options = assemble_options
-
-     @staticmethod
-     def download_models_hf(
-         local_dir: Optional[Path] = None, force: bool = False
-     ) -> Path:
-         from huggingface_hub import snapshot_download
-
-         download_path = snapshot_download(
-             repo_id="ds4sd/docling-models",
-             force_download=force,
-             local_dir=local_dir,
-             revision="v2.0.0",
-         )
+ class WordFormatOption(FormatOption):
+     pipeline_cls: Type = SimplePipeline
+     backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend

-         return Path(download_path)

-     def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
+ class PowerpointFormatOption(FormatOption):
+     pipeline_cls: Type = SimplePipeline
+     backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend

-         for input_batch in chunkify(
-             input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
-         ):
-             _log.info(f"Going to convert document batch...")
-             # parallel processing only within input_batch
-             # with ThreadPoolExecutor(
-             #    max_workers=settings.perf.doc_batch_concurrency
-             # ) as pool:
-             #   yield from pool.map(self.process_document, input_batch)

-             # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-             yield from map(self._process_document, input_batch)
-
-     def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
-         """Convert a single document.
-
-         Args:
-             source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.
-
-         Raises:
-             ValueError: If source is of unexpected type.
-             RuntimeError: If conversion fails.
-
-         Returns:
-             ConversionResult: The conversion result object.
-         """
-         with tempfile.TemporaryDirectory() as temp_dir:
-             try:
-                 http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
-                 res = requests.get(http_url, stream=True)
-                 res.raise_for_status()
-                 fname = None
-                 # try to get filename from response header
-                 if cont_disp := res.headers.get("Content-Disposition"):
-                     for par in cont_disp.strip().split(";"):
-                         # currently only handling directive "filename" (not "*filename")
-                         if (split := par.split("=")) and split[0].strip() == "filename":
-                             fname = "=".join(split[1:]).strip().strip("'\"") or None
-                             break
-                 # otherwise, use name from URL:
-                 if fname is None:
-                     fname = Path(http_url.path).name or self._default_download_filename
-                 local_path = Path(temp_dir) / fname
-                 with open(local_path, "wb") as f:
-                     for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
-                         f.write(chunk)
-             except ValidationError:
-                 try:
-                     local_path = TypeAdapter(Path).validate_python(source)
-                 except ValidationError:
-                     raise ValueError(
-                         f"Unexpected file path type encountered: {type(source)}"
-                     )
-             conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
-             conv_res_iter = self.convert(conv_inp)
-             conv_res: ConversionResult = next(conv_res_iter)
-             if conv_res.status not in {
-                 ConversionStatus.SUCCESS,
-                 ConversionStatus.PARTIAL_SUCCESS,
-             }:
-                 raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
-             return conv_res
+ class HTMLFormatOption(FormatOption):
+     pipeline_cls: Type = SimplePipeline
+     backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend

-     def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-         start_doc_time = time.time()
-         conv_res = ConversionResult(input=in_doc)

-         _log.info(f"Processing document {in_doc.file.name}")
+ class PdfFormatOption(FormatOption):
+     pipeline_cls: Type = StandardPdfPipeline
+     backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend

-         if not in_doc.valid:
-             conv_res.status = ConversionStatus.FAILURE
-             return conv_res

-         for i in range(0, in_doc.page_count):
-             conv_res.pages.append(Page(page_no=i))
+ class ImageFormatOption(FormatOption):
+     pipeline_cls: Type = StandardPdfPipeline
+     backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend

-         all_assembled_pages = []

-         try:
-             # Iterate batches of pages (page_batch_size) in the doc
-             for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                 start_pb_time = time.time()
-                 # Pipeline
+ _format_to_default_options = {
+     InputFormat.DOCX: FormatOption(
+         pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+     ),
+     InputFormat.PPTX: FormatOption(
+         pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+     ),
+     InputFormat.HTML: FormatOption(
+         pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+     ),
+     InputFormat.IMAGE: FormatOption(
+         pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+     ),
+     InputFormat.PDF: FormatOption(
+         pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+     ),
+ }

-                 # 1. Initialise the page resources
-                 init_pages = map(
-                     functools.partial(self._initialize_page, in_doc), page_batch
-                 )

-                 # 2. Populate page image
-                 pages_with_images = map(
-                     functools.partial(self._populate_page_images, in_doc), init_pages
-                 )
+ class DocumentConverter:
+     _default_download_filename = "file"

-                 # 3. Populate programmatic page cells
-                 pages_with_cells = map(
-                     functools.partial(self._parse_page_cells, in_doc),
-                     pages_with_images,
+     def __init__(
+         self,
+         allowed_formats: Optional[List[InputFormat]] = None,
+         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
+     ):
+         self.allowed_formats = allowed_formats
+         self.format_to_options = format_options
+
+         if self.allowed_formats is None:
+             # if self.format_to_options is not None:
+             #    self.allowed_formats = self.format_to_options.keys()
+             # else:
+             self.allowed_formats = [e for e in InputFormat]  # all formats
+
+         if self.format_to_options is None:
+             self.format_to_options = _format_to_default_options
+         else:
+             for f in self.allowed_formats:
+                 if f not in self.format_to_options.keys():
+                     _log.debug(f"Requested format {f} will use default options.")
+                     self.format_to_options[f] = _format_to_default_options[f]
+
+             remove_keys = []
+             for f in self.format_to_options.keys():
+                 if f not in self.allowed_formats:
+                     remove_keys.append(f)
+
+             for f in remove_keys:
+                 self.format_to_options.pop(f)
+
+         self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
+
+     @validate_call(config=ConfigDict(strict=True))
+     def convert(
+         self,
+         source: Path | str | DocumentStream,  # TODO review naming
+         raises_on_error: bool = True,
+         max_num_pages: int = sys.maxsize,
+         max_file_size: int = sys.maxsize,
+     ) -> ConversionResult:
+
+         all_res = self.convert_all(
+             source=[source],
+             raises_on_error=raises_on_error,
+             max_num_pages=max_num_pages,
+             max_file_size=max_file_size,
+         )
+         return next(all_res)
+
+     @validate_call(config=ConfigDict(strict=True))
+     def convert_all(
+         self,
+         source: Iterable[Path | str | DocumentStream],  # TODO review naming
+         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
+         max_num_pages: int = sys.maxsize,
+         max_file_size: int = sys.maxsize,
+     ) -> Iterator[ConversionResult]:
+         limits = DocumentLimits(
+             max_num_pages=max_num_pages,
+             max_file_size=max_file_size,
+         )
+         conv_input = _DocumentConversionInput(
+             path_or_stream_iterator=source,
+             limit=limits,
+         )
+         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+         for conv_res in conv_res_iter:
+             if raises_on_error and conv_res.status not in {
+                 ConversionStatus.SUCCESS,
+                 ConversionStatus.PARTIAL_SUCCESS,
+             }:
+                 raise RuntimeError(
+                     f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                  )
+             else:
+                 yield conv_res

-                 # 4. Run pipeline stages
-                 pipeline_pages = self.model_pipeline.apply(pages_with_cells)
+     def _convert(
+         self, conv_input: _DocumentConversionInput, raises_on_error: bool
+     ) -> Iterator[ConversionResult]:
+         assert self.format_to_options is not None

-                 # 5. Assemble page elements (per page)
-                 assembled_pages = self.page_assemble_model(pipeline_pages)
+         for input_batch in chunkify(
+             conv_input.docs(self.format_to_options),
+             settings.perf.doc_batch_size,  # pass format_options
+         ):
+             _log.info(f"Going to convert document batch...")
+             # parallel processing only within input_batch
+             # with ThreadPoolExecutor(
+             #    max_workers=settings.perf.doc_batch_concurrency
+             # ) as pool:
+             #   yield from pool.map(self.process_document, input_batch)

-                 # exhaust assembled_pages
-                 for assembled_page in assembled_pages:
-                     # Free up mem resources before moving on with next batch
+             # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+             for item in map(
+                 partial(self._process_document, raises_on_error=raises_on_error),
+                 input_batch,
+             ):
+                 if item is not None:
+                     yield item
+
+     def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
+         assert self.format_to_options is not None
+
+         fopt = self.format_to_options.get(doc.format)
+
+         if fopt is None:
+             raise RuntimeError(f"Could not get pipeline for document {doc.file}")
+         else:
+             pipeline_class = fopt.pipeline_cls
+             pipeline_options = fopt.pipeline_options
+
+         assert pipeline_options is not None
+         # TODO this will ignore if different options have been defined for the same pipeline class.
+         if (
+             pipeline_class not in self.initialized_pipelines
+             or self.initialized_pipelines[pipeline_class].pipeline_options
+             != pipeline_options
+         ):
+             self.initialized_pipelines[pipeline_class] = pipeline_class(
+                 pipeline_options=pipeline_options
+             )
+         return self.initialized_pipelines[pipeline_class]

-                     # Remove page images (can be disabled)
-                     if self.assemble_options.images_scale is None:
-                         assembled_page._image_cache = {}
+     def _process_document(
+         self, in_doc: InputDocument, raises_on_error: bool
+     ) -> Optional[ConversionResult]:
+         assert self.allowed_formats is not None
+         assert in_doc.format in self.allowed_formats

-                     # Unload backend
-                     assembled_page._backend.unload()
+         start_doc_time = time.time()

-                     all_assembled_pages.append(assembled_page)
+         conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

-                 end_pb_time = time.time() - start_pb_time
-                 _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+         end_doc_time = time.time() - start_doc_time
+         _log.info(
+             f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
+         )

-             conv_res.pages = all_assembled_pages
-             self._assemble_doc(conv_res)
+         return conv_res

-             status = ConversionStatus.SUCCESS
-             for page in conv_res.pages:
-                 if not page._backend.is_valid():
-                     conv_res.errors.append(
-                         ErrorItem(
-                             component_type=DoclingComponentType.PDF_BACKEND,
-                             module_name=type(page._backend).__name__,
-                             error_message=f"Page {page.page_no} failed to parse.",
-                         )
+     def _execute_pipeline(
+         self, in_doc: InputDocument, raises_on_error: bool
+     ) -> ConversionResult:
+         if in_doc.valid:
+             pipeline = self._get_pipeline(in_doc)
+             if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                 if raises_on_error:
+                     raise RuntimeError(
+                         f"No pipeline could be initialized for {in_doc.file}."
                      )
-                     status = ConversionStatus.PARTIAL_SUCCESS
+                 else:
+                     conv_res = ConversionResult(input=in_doc)
+                     conv_res.status = ConversionStatus.FAILURE
+                     return conv_res

-             conv_res.status = status
+             conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)

-         except Exception as e:
-             conv_res.status = ConversionStatus.FAILURE
-             trace = "\n".join(traceback.format_exception(e))
-             _log.info(
-                 f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                 f"{trace}"
-             )
+         else:
+             if raises_on_error:
+                 raise RuntimeError(f"Input document {in_doc.file} is not valid.")

-         finally:
-             # Always unload the PDF backend, even in case of failure
-             if in_doc._backend:
-                 in_doc._backend.unload()
-
-         end_doc_time = time.time() - start_doc_time
-         _log.info(
-             f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-         )
+             else:
+                 # invalid doc or not of desired format
+                 conv_res = ConversionResult(input=in_doc)
+                 conv_res.status = ConversionStatus.FAILURE
+                 # TODO add error log why it failed.

          return conv_res
-
-     # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-     def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-         page._backend = doc._backend.load_page(page.page_no)
-         page.size = page._backend.get_size()
-         page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
-
-         return page
-
-     # Generate the page image and store it in the page object
-     def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-         # default scale
-         page.get_image(
-             scale=1.0
-         )  # puts the page image on the image cache at default scale
-
-         # user requested scales
-         if self.assemble_options.images_scale is not None:
-             page._default_image_scale = self.assemble_options.images_scale
-             page.get_image(
-                 scale=self.assemble_options.images_scale
-             )  # this will trigger storing the image in the internal cache
-
-         return page
-
-     # Extract and populate the page cells and store it in the page object
-     def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-         page.cells = page._backend.get_text_cells()
-
-         # DEBUG code:
-         def draw_text_boxes(image, cells):
-             draw = ImageDraw.Draw(image)
-             for c in cells:
-                 x0, y0, x1, y1 = c.bbox.as_tuple()
-                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-             image.show()
-
-         # draw_text_boxes(page.get_image(scale=1.0), cells)
-
-         return page
-
-     def _assemble_doc(self, conv_res: ConversionResult):
-         all_elements = []
-         all_headers = []
-         all_body = []
-
-         for p in conv_res.pages:
-
-             for el in p.assembled.body:
-                 all_body.append(el)
-             for el in p.assembled.headers:
-                 all_headers.append(el)
-             for el in p.assembled.elements:
-                 all_elements.append(el)
-
-         conv_res.assembled = AssembledUnit(
-             elements=all_elements, headers=all_headers, body=all_body
-         )
-
-         conv_res.output = self.glm_model(conv_res)
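
The net effect is a format-driven API: instead of convert_single() and a PDF-only convert(), DocumentConverter now accepts any supported InputFormat and maps each one to a FormatOption bundling a pipeline class, its options, and a backend. A hedged usage sketch built from the classes shown in this diff (the file name and option values are placeholders):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    pipeline_options = PdfPipelineOptions(do_ocr=False)

    converter = DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        },
    )

    # convert() raises on failure by default; pass raises_on_error=False
    # to receive a ConversionResult with a FAILURE status instead.
    result = converter.convert("report.pdf", max_num_pages=100)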
docling/models/base_model.py (new file)

@@ -0,0 +1,25 @@
+ from abc import ABC, abstractmethod
+ from typing import Any, Iterable
+
+ from docling_core.types.doc import DoclingDocument, NodeItem
+
+ from docling.datamodel.base_models import Page
+
+
+ class BasePageModel(ABC):
+     @abstractmethod
+     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+         pass
+
+
+ class BaseEnrichmentModel(ABC):
+
+     @abstractmethod
+     def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+         pass
+
+     @abstractmethod
+     def __call__(
+         self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
+     ) -> Iterable[Any]:
+         pass
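
These two ABCs define the extension points for the new pipelines: page models transform batches of pages, while enrichment models post-process elements of the assembled DoclingDocument. A minimal sketch of an enrichment model against this interface (the class and its filtering logic are hypothetical; PictureItem is assumed to be importable from docling_core.types.doc):

    from typing import Any, Iterable

    from docling_core.types.doc import DoclingDocument, NodeItem, PictureItem

    from docling.models.base_model import BaseEnrichmentModel


    class PictureTaggerModel(BaseEnrichmentModel):
        # Hypothetical example: only picture elements are processable.
        def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
            return isinstance(element, PictureItem)

        def __call__(
            self, doc: DoclingDocument, element_batch: Iterable[NodeItem]
        ) -> Iterable[Any]:
            for element in element_batch:
                # A real model would annotate or classify each element here.
                yield element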
docling/models/base_ocr_model.py

@@ -1,14 +1,15 @@
  import copy
  import logging
  from abc import abstractmethod
- from typing import Iterable, List, Tuple
+ from typing import Iterable, List

  import numpy as np
+ from docling_core.types.doc import BoundingBox, CoordOrigin
  from PIL import Image, ImageDraw
  from rtree import index
  from scipy.ndimage import find_objects, label

- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+ from docling.datamodel.base_models import OcrCell, Page
  from docling.datamodel.pipeline_options import OcrOptions

  _log = logging.getLogger(__name__)
@@ -20,8 +21,9 @@ class BaseOcrModel:
          self.options = options

      # Computes the optimum amount and coordinates of rectangles to OCR on a given page
-     def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
+     def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
          BITMAP_COVERAGE_TRESHOLD = 0.75
+         assert page.size is not None

          def find_ocr_rects(size, bitmap_rects):
              image = Image.new(
@@ -60,7 +62,10 @@ class BaseOcrModel:

              return (area_frac, bounding_boxes)  # fraction covered  # boxes

-         bitmap_rects = page._backend.get_bitmap_rects()
+         if page._backend is not None:
+             bitmap_rects = page._backend.get_bitmap_rects()
+         else:
+             bitmap_rects = []
          coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

          # return full-page rectangle if sufficiently covered with bitmaps
@@ -75,7 +80,7 @@ class BaseOcrModel:
              )
          ]
          # return individual rectangles if the bitmap coverage is smaller
-         elif coverage < BITMAP_COVERAGE_TRESHOLD:
+         else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
              return ocr_rects

      # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
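
Two behavioral changes follow from these hunks: get_ocr_rects() now returns the list of rectangles directly instead of a tuple, and the final branch handles the remaining coverage explicitly instead of re-testing the threshold. A sketch of how a subclass might consume the new contract (the class body is illustrative, not part of the package):

    from typing import Iterable

    from docling.datamodel.base_models import Page
    from docling.models.base_ocr_model import BaseOcrModel


    class NoopOcrModel(BaseOcrModel):
        # Illustrative subclass: walks the OCR rectangles without running OCR.
        def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
            for page in page_batch:
                for rect in self.get_ocr_rects(page):  # now List[BoundingBox]
                    pass  # a real model would run OCR on the region in `rect`
                yield page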