docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
@@ -1,43 +1,58 @@
1
1
  import logging
2
2
  from typing import Iterable
3
3
 
4
- import numpy
4
+ from docling_core.types.doc import BoundingBox, CoordOrigin
5
5
 
6
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
- from docling.datamodel.pipeline_options import TesseractCliOcrOptions
6
+ from docling.datamodel.base_models import OcrCell, Page
7
+ from docling.datamodel.document import ConversionResult
8
+ from docling.datamodel.pipeline_options import TesseractOcrOptions
9
+ from docling.datamodel.settings import settings
8
10
  from docling.models.base_ocr_model import BaseOcrModel
11
+ from docling.utils.profiling import TimeRecorder
9
12
 
10
13
  _log = logging.getLogger(__name__)
11
14
 
12
15
 
13
16
  class TesseractOcrModel(BaseOcrModel):
14
- def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
17
+ def __init__(self, enabled: bool, options: TesseractOcrOptions):
15
18
  super().__init__(enabled=enabled, options=options)
16
- self.options: TesseractCliOcrOptions
19
+ self.options: TesseractOcrOptions
17
20
 
18
21
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
19
22
  self.reader = None
20
23
 
21
24
  if self.enabled:
22
- setup_errmsg = (
25
+ install_errmsg = (
23
26
  "tesserocr is not correctly installed. "
24
27
  "Please install it via `pip install tesserocr` to use this OCR engine. "
25
- "Note that tesserocr might have to be manually compiled for working with"
28
+ "Note that tesserocr might have to be manually compiled for working with "
26
29
  "your Tesseract installation. The Docling documentation provides examples for it. "
27
- "Alternatively, Docling has support for other OCR engines. See the documentation."
30
+ "Alternatively, Docling has support for other OCR engines. See the documentation: "
31
+ "https://ds4sd.github.io/docling/installation/"
28
32
  )
33
+ missing_langs_errmsg = (
34
+ "tesserocr is not correctly configured. No language models have been detected. "
35
+ "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
36
+ "You can find more information how to setup other OCR engines in Docling "
37
+ "documentation: "
38
+ "https://ds4sd.github.io/docling/installation/"
39
+ )
40
+
29
41
  try:
30
42
  import tesserocr
31
43
  except ImportError:
32
- raise ImportError(setup_errmsg)
33
-
44
+ raise ImportError(install_errmsg)
34
45
  try:
35
46
  tesseract_version = tesserocr.tesseract_version()
36
- _log.debug("Initializing TesserOCR: %s", tesseract_version)
37
47
  except:
38
- raise ImportError(setup_errmsg)
48
+ raise ImportError(install_errmsg)
49
+
50
+ _, tesserocr_languages = tesserocr.get_languages()
51
+ if not tesserocr_languages:
52
+ raise ImportError(missing_langs_errmsg)
39
53
 
40
54
  # Initialize the tesseractAPI
55
+ _log.debug("Initializing TesserOCR: %s", tesseract_version)
41
56
  lang = "+".join(self.options.lang)
42
57
  if self.options.path is not None:
43
58
  self.reader = tesserocr.PyTessBaseAPI(
@@ -61,62 +76,79 @@ class TesseractOcrModel(BaseOcrModel):
61
76
  # Finalize the tesseractAPI
62
77
  self.reader.End()
63
78
 
64
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
79
+ def __call__(
80
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
81
+ ) -> Iterable[Page]:
65
82
 
66
83
  if not self.enabled:
67
84
  yield from page_batch
68
85
  return
69
86
 
70
87
  for page in page_batch:
71
- ocr_rects = self.get_ocr_rects(page)
72
-
73
- all_ocr_cells = []
74
- for ocr_rect in ocr_rects:
75
- # Skip zero area boxes
76
- if ocr_rect.area() == 0:
77
- continue
78
- high_res_image = page._backend.get_page_image(
79
- scale=self.scale, cropbox=ocr_rect
80
- )
88
+ assert page._backend is not None
89
+ if not page._backend.is_valid():
90
+ yield page
91
+ else:
92
+ with TimeRecorder(conv_res, "ocr"):
81
93
 
82
- # Retrieve text snippets with their bounding boxes
83
- self.reader.SetImage(high_res_image)
84
- boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
85
-
86
- cells = []
87
- for ix, (im, box, _, _) in enumerate(boxes):
88
- # Set the area of interest. Tesseract uses Bottom-Left for the origin
89
- self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
90
-
91
- # Extract text within the bounding box
92
- text = self.reader.GetUTF8Text().strip()
93
- confidence = self.reader.MeanTextConf()
94
- left = box["x"] / self.scale
95
- bottom = box["y"] / self.scale
96
- right = (box["x"] + box["w"]) / self.scale
97
- top = (box["y"] + box["h"]) / self.scale
98
-
99
- cells.append(
100
- OcrCell(
101
- id=ix,
102
- text=text,
103
- confidence=confidence,
104
- bbox=BoundingBox.from_tuple(
105
- coord=(left, top, right, bottom),
106
- origin=CoordOrigin.TOPLEFT,
107
- ),
94
+ assert self.reader is not None
95
+
96
+ ocr_rects = self.get_ocr_rects(page)
97
+
98
+ all_ocr_cells = []
99
+ for ocr_rect in ocr_rects:
100
+ # Skip zero area boxes
101
+ if ocr_rect.area() == 0:
102
+ continue
103
+ high_res_image = page._backend.get_page_image(
104
+ scale=self.scale, cropbox=ocr_rect
108
105
  )
109
- )
110
106
 
111
- # del high_res_image
112
- all_ocr_cells.extend(cells)
107
+ # Retrieve text snippets with their bounding boxes
108
+ self.reader.SetImage(high_res_image)
109
+ boxes = self.reader.GetComponentImages(
110
+ self.reader_RIL.TEXTLINE, True
111
+ )
113
112
 
114
- ## Remove OCR cells which overlap with programmatic cells.
115
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
113
+ cells = []
114
+ for ix, (im, box, _, _) in enumerate(boxes):
115
+ # Set the area of interest. Tesseract uses Bottom-Left for the origin
116
+ self.reader.SetRectangle(
117
+ box["x"], box["y"], box["w"], box["h"]
118
+ )
119
+
120
+ # Extract text within the bounding box
121
+ text = self.reader.GetUTF8Text().strip()
122
+ confidence = self.reader.MeanTextConf()
123
+ left = box["x"] / self.scale
124
+ bottom = box["y"] / self.scale
125
+ right = (box["x"] + box["w"]) / self.scale
126
+ top = (box["y"] + box["h"]) / self.scale
127
+
128
+ cells.append(
129
+ OcrCell(
130
+ id=ix,
131
+ text=text,
132
+ confidence=confidence,
133
+ bbox=BoundingBox.from_tuple(
134
+ coord=(left, top, right, bottom),
135
+ origin=CoordOrigin.TOPLEFT,
136
+ ),
137
+ )
138
+ )
139
+
140
+ # del high_res_image
141
+ all_ocr_cells.extend(cells)
142
+
143
+ ## Remove OCR cells which overlap with programmatic cells.
144
+ filtered_ocr_cells = self.filter_ocr_cells(
145
+ all_ocr_cells, page.cells
146
+ )
116
147
 
117
- page.cells.extend(filtered_ocr_cells)
148
+ page.cells.extend(filtered_ocr_cells)
118
149
 
119
- # DEBUG code:
120
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
150
+ # DEBUG code:
151
+ if settings.debug.visualize_ocr:
152
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
121
153
 
122
- yield page
154
+ yield page
@@ -0,0 +1,189 @@
1
+ import functools
2
+ import logging
3
+ import time
4
+ import traceback
5
+ from abc import ABC, abstractmethod
6
+ from typing import Callable, Iterable, List
7
+
8
+ from docling_core.types.doc import DoclingDocument, NodeItem
9
+
10
+ from docling.backend.abstract_backend import AbstractDocumentBackend
11
+ from docling.backend.pdf_backend import PdfDocumentBackend
12
+ from docling.datamodel.base_models import (
13
+ ConversionStatus,
14
+ DoclingComponentType,
15
+ ErrorItem,
16
+ Page,
17
+ )
18
+ from docling.datamodel.document import ConversionResult, InputDocument
19
+ from docling.datamodel.pipeline_options import PipelineOptions
20
+ from docling.datamodel.settings import settings
21
+ from docling.models.base_model import BaseEnrichmentModel
22
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
23
+ from docling.utils.utils import chunkify
24
+
25
+ _log = logging.getLogger(__name__)
26
+
27
+
28
+ class BasePipeline(ABC):
29
+ def __init__(self, pipeline_options: PipelineOptions):
30
+ self.pipeline_options = pipeline_options
31
+ self.build_pipe: List[Callable] = []
32
+ self.enrichment_pipe: List[BaseEnrichmentModel] = []
33
+
34
+ def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
35
+ conv_res = ConversionResult(input=in_doc)
36
+
37
+ _log.info(f"Processing document {in_doc.file.name}")
38
+ try:
39
+ with TimeRecorder(
40
+ conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
41
+ ):
42
+ # These steps are building and assembling the structure of the
43
+ # output DoclingDocument
44
+ conv_res = self._build_document(conv_res)
45
+ conv_res = self._assemble_document(conv_res)
46
+ # From this stage, all operations should rely only on conv_res.output
47
+ conv_res = self._enrich_document(conv_res)
48
+ conv_res.status = self._determine_status(conv_res)
49
+ except Exception as e:
50
+ conv_res.status = ConversionStatus.FAILURE
51
+ if raises_on_error:
52
+ raise e
53
+
54
+ return conv_res
55
+
56
+ @abstractmethod
57
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
58
+ pass
59
+
60
+ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
61
+ return conv_res
62
+
63
+ def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
64
+
65
+ def _filter_elements(
66
+ doc: DoclingDocument, model: BaseEnrichmentModel
67
+ ) -> Iterable[NodeItem]:
68
+ for element, _level in doc.iterate_items():
69
+ if model.is_processable(doc=doc, element=element):
70
+ yield element
71
+
72
+ with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
73
+ for model in self.enrichment_pipe:
74
+ for element_batch in chunkify(
75
+ _filter_elements(conv_res.document, model),
76
+ settings.perf.elements_batch_size,
77
+ ):
78
+ # TODO: currently we assume the element itself is modified, because
79
+ # we don't have an interface to save the element back to the document
80
+ for element in model(
81
+ doc=conv_res.document, element_batch=element_batch
82
+ ): # Must exhaust!
83
+ pass
84
+
85
+ return conv_res
86
+
87
+ @abstractmethod
88
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
89
+ pass
90
+
91
+ @classmethod
92
+ @abstractmethod
93
+ def get_default_options(cls) -> PipelineOptions:
94
+ pass
95
+
96
+ @classmethod
97
+ @abstractmethod
98
+ def is_backend_supported(cls, backend: AbstractDocumentBackend):
99
+ pass
100
+
101
+ # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
102
+ # for model in self.build_pipe:
103
+ # element_batch = model(element_batch)
104
+ #
105
+ # yield from element_batch
106
+
107
+
108
+ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
109
+
110
+ def _apply_on_pages(
111
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
112
+ ) -> Iterable[Page]:
113
+ for model in self.build_pipe:
114
+ page_batch = model(conv_res, page_batch)
115
+
116
+ yield from page_batch
117
+
118
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
119
+
120
+ if not isinstance(conv_res.input._backend, PdfDocumentBackend):
121
+ raise RuntimeError(
122
+ f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
123
+ f"Can not convert this with a PDF pipeline. "
124
+ f"Please check your format configuration on DocumentConverter."
125
+ )
126
+ # conv_res.status = ConversionStatus.FAILURE
127
+ # return conv_res
128
+
129
+ with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
130
+
131
+ for i in range(0, conv_res.input.page_count):
132
+ conv_res.pages.append(Page(page_no=i))
133
+
134
+ try:
135
+ # Iterate batches of pages (page_batch_size) in the doc
136
+ for page_batch in chunkify(
137
+ conv_res.pages, settings.perf.page_batch_size
138
+ ):
139
+ start_pb_time = time.time()
140
+
141
+ # 1. Initialise the page resources
142
+ init_pages = map(
143
+ functools.partial(self.initialize_page, conv_res), page_batch
144
+ )
145
+
146
+ # 2. Run pipeline stages
147
+ pipeline_pages = self._apply_on_pages(conv_res, init_pages)
148
+
149
+ for p in pipeline_pages: # Must exhaust!
150
+ pass
151
+
152
+ end_pb_time = time.time() - start_pb_time
153
+ _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
154
+
155
+ except Exception as e:
156
+ conv_res.status = ConversionStatus.FAILURE
157
+ trace = "\n".join(traceback.format_exception(e))
158
+ _log.warning(
159
+ f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n"
160
+ f"{trace}"
161
+ )
162
+ raise e
163
+
164
+ finally:
165
+ # Always unload the PDF backend, even in case of failure
166
+ if conv_res.input._backend:
167
+ conv_res.input._backend.unload()
168
+
169
+ return conv_res
170
+
171
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
172
+ status = ConversionStatus.SUCCESS
173
+ for page in conv_res.pages:
174
+ if page._backend is None or not page._backend.is_valid():
175
+ conv_res.errors.append(
176
+ ErrorItem(
177
+ component_type=DoclingComponentType.DOCUMENT_BACKEND,
178
+ module_name=type(page._backend).__name__,
179
+ error_message=f"Page {page.page_no} failed to parse.",
180
+ )
181
+ )
182
+ status = ConversionStatus.PARTIAL_SUCCESS
183
+
184
+ return status
185
+
186
+ # Initialise and load resources for a page
187
+ @abstractmethod
188
+ def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
189
+ pass
@@ -0,0 +1,56 @@
1
+ import logging
2
+
3
+ from docling.backend.abstract_backend import (
4
+ AbstractDocumentBackend,
5
+ DeclarativeDocumentBackend,
6
+ )
7
+ from docling.datamodel.base_models import ConversionStatus
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import PipelineOptions
10
+ from docling.pipeline.base_pipeline import BasePipeline
11
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
12
+
13
+ _log = logging.getLogger(__name__)
14
+
15
+
16
+ class SimplePipeline(BasePipeline):
17
+ """SimpleModelPipeline.
18
+
19
+ This class is used at the moment for formats / backends
20
+ which produce straight DoclingDocument output.
21
+ """
22
+
23
+ def __init__(self, pipeline_options: PipelineOptions):
24
+ super().__init__(pipeline_options)
25
+
26
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
27
+
28
+ if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
29
+ raise RuntimeError(
30
+ f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
31
+ f"Can not convert this with simple pipeline. "
32
+ f"Please check your format configuration on DocumentConverter."
33
+ )
34
+ # conv_res.status = ConversionStatus.FAILURE
35
+ # return conv_res
36
+
37
+ # Instead of running a page-level pipeline to build up the document structure,
38
+ # the backend is expected to be of type DeclarativeDocumentBackend, which can output
39
+ # a DoclingDocument straight.
40
+ with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
41
+ conv_res.document = conv_res.input._backend.convert()
42
+ return conv_res
43
+
44
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
45
+ # This is called only if the previous steps didn't raise.
46
+ # Since we don't have anything else to evaluate, we can
47
+ # safely return SUCCESS.
48
+ return ConversionStatus.SUCCESS
49
+
50
+ @classmethod
51
+ def get_default_options(cls) -> PipelineOptions:
52
+ return PipelineOptions()
53
+
54
+ @classmethod
55
+ def is_backend_supported(cls, backend: AbstractDocumentBackend):
56
+ return isinstance(backend, DeclarativeDocumentBackend)
@@ -0,0 +1,201 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
6
+
7
+ from docling.backend.abstract_backend import AbstractDocumentBackend
8
+ from docling.backend.pdf_backend import PdfDocumentBackend
9
+ from docling.datamodel.base_models import AssembledUnit, Page
10
+ from docling.datamodel.document import ConversionResult
11
+ from docling.datamodel.pipeline_options import (
12
+ EasyOcrOptions,
13
+ PdfPipelineOptions,
14
+ TesseractCliOcrOptions,
15
+ TesseractOcrOptions,
16
+ )
17
+ from docling.models.base_ocr_model import BaseOcrModel
18
+ from docling.models.ds_glm_model import GlmModel, GlmOptions
19
+ from docling.models.easyocr_model import EasyOcrModel
20
+ from docling.models.layout_model import LayoutModel
21
+ from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
22
+ from docling.models.page_preprocessing_model import (
23
+ PagePreprocessingModel,
24
+ PagePreprocessingOptions,
25
+ )
26
+ from docling.models.table_structure_model import TableStructureModel
27
+ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
28
+ from docling.models.tesseract_ocr_model import TesseractOcrModel
29
+ from docling.pipeline.base_pipeline import PaginatedPipeline
30
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
31
+
32
+ _log = logging.getLogger(__name__)
33
+
34
+
35
+ class StandardPdfPipeline(PaginatedPipeline):
36
+ _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
37
+ _table_model_path = "model_artifacts/tableformer"
38
+
39
+ def __init__(self, pipeline_options: PdfPipelineOptions):
40
+ super().__init__(pipeline_options)
41
+ self.pipeline_options: PdfPipelineOptions
42
+
43
+ if pipeline_options.artifacts_path is None:
44
+ self.artifacts_path = self.download_models_hf()
45
+ else:
46
+ self.artifacts_path = Path(pipeline_options.artifacts_path)
47
+
48
+ keep_images = (
49
+ self.pipeline_options.generate_page_images
50
+ or self.pipeline_options.generate_picture_images
51
+ or self.pipeline_options.generate_table_images
52
+ )
53
+
54
+ self.glm_model = GlmModel(options=GlmOptions())
55
+
56
+ if (ocr_model := self.get_ocr_model()) is None:
57
+ raise RuntimeError(
58
+ f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
59
+ )
60
+
61
+ self.build_pipe = [
62
+ # Pre-processing
63
+ PagePreprocessingModel(
64
+ options=PagePreprocessingOptions(
65
+ images_scale=pipeline_options.images_scale
66
+ )
67
+ ),
68
+ # OCR
69
+ ocr_model,
70
+ # Layout model
71
+ LayoutModel(
72
+ artifacts_path=self.artifacts_path
73
+ / StandardPdfPipeline._layout_model_path
74
+ ),
75
+ # Table structure model
76
+ TableStructureModel(
77
+ enabled=pipeline_options.do_table_structure,
78
+ artifacts_path=self.artifacts_path
79
+ / StandardPdfPipeline._table_model_path,
80
+ options=pipeline_options.table_structure_options,
81
+ ),
82
+ # Page assemble
83
+ PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
84
+ ]
85
+
86
+ self.enrichment_pipe = [
87
+ # Other models working on `NodeItem` elements in the DoclingDocument
88
+ ]
89
+
90
+ @staticmethod
91
+ def download_models_hf(
92
+ local_dir: Optional[Path] = None, force: bool = False
93
+ ) -> Path:
94
+ from huggingface_hub import snapshot_download
95
+
96
+ download_path = snapshot_download(
97
+ repo_id="ds4sd/docling-models",
98
+ force_download=force,
99
+ local_dir=local_dir,
100
+ revision="v2.0.1",
101
+ )
102
+
103
+ return Path(download_path)
104
+
105
+ def get_ocr_model(self) -> Optional[BaseOcrModel]:
106
+ if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
107
+ return EasyOcrModel(
108
+ enabled=self.pipeline_options.do_ocr,
109
+ options=self.pipeline_options.ocr_options,
110
+ )
111
+ elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
112
+ return TesseractOcrCliModel(
113
+ enabled=self.pipeline_options.do_ocr,
114
+ options=self.pipeline_options.ocr_options,
115
+ )
116
+ elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
117
+ return TesseractOcrModel(
118
+ enabled=self.pipeline_options.do_ocr,
119
+ options=self.pipeline_options.ocr_options,
120
+ )
121
+ return None
122
+
123
+ def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
124
+ with TimeRecorder(conv_res, "page_init"):
125
+ page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
126
+ if page._backend is not None and page._backend.is_valid():
127
+ page.size = page._backend.get_size()
128
+
129
+ return page
130
+
131
+ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
132
+ all_elements = []
133
+ all_headers = []
134
+ all_body = []
135
+
136
+ with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
137
+ for p in conv_res.pages:
138
+ if p.assembled is not None:
139
+ for el in p.assembled.body:
140
+ all_body.append(el)
141
+ for el in p.assembled.headers:
142
+ all_headers.append(el)
143
+ for el in p.assembled.elements:
144
+ all_elements.append(el)
145
+
146
+ conv_res.assembled = AssembledUnit(
147
+ elements=all_elements, headers=all_headers, body=all_body
148
+ )
149
+
150
+ conv_res.document = self.glm_model(conv_res)
151
+
152
+ # Generate page images in the output
153
+ if self.pipeline_options.generate_page_images:
154
+ for page in conv_res.pages:
155
+ assert page.image is not None
156
+ page_no = page.page_no + 1
157
+ conv_res.document.pages[page_no].image = ImageRef.from_pil(
158
+ page.image, dpi=int(72 * self.pipeline_options.images_scale)
159
+ )
160
+
161
+ # Generate images of the requested element types
162
+ if (
163
+ self.pipeline_options.generate_picture_images
164
+ or self.pipeline_options.generate_table_images
165
+ ):
166
+ scale = self.pipeline_options.images_scale
167
+ for element, _level in conv_res.document.iterate_items():
168
+ if not isinstance(element, DocItem) or len(element.prov) == 0:
169
+ continue
170
+ if (
171
+ isinstance(element, PictureItem)
172
+ and self.pipeline_options.generate_picture_images
173
+ ) or (
174
+ isinstance(element, TableItem)
175
+ and self.pipeline_options.generate_table_images
176
+ ):
177
+ page_ix = element.prov[0].page_no - 1
178
+ page = conv_res.pages[page_ix]
179
+ assert page.size is not None
180
+ assert page.image is not None
181
+
182
+ crop_bbox = (
183
+ element.prov[0]
184
+ .bbox.scaled(scale=scale)
185
+ .to_top_left_origin(page_height=page.size.height * scale)
186
+ )
187
+
188
+ cropped_im = page.image.crop(crop_bbox.as_tuple())
189
+ element.image = ImageRef.from_pil(
190
+ cropped_im, dpi=int(72 * scale)
191
+ )
192
+
193
+ return conv_res
194
+
195
+ @classmethod
196
+ def get_default_options(cls) -> PdfPipelineOptions:
197
+ return PdfPipelineOptions()
198
+
199
+ @classmethod
200
+ def is_backend_supported(cls, backend: AbstractDocumentBackend):
201
+ return isinstance(backend, PdfDocumentBackend)
docling/utils/export.py CHANGED
@@ -1,9 +1,10 @@
1
1
  import logging
2
2
  from typing import Any, Dict, Iterable, List, Tuple, Union
3
3
 
4
- from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
4
+ from docling_core.types.doc import BoundingBox, CoordOrigin
5
+ from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
5
6
 
6
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
7
+ from docling.datamodel.base_models import OcrCell
7
8
  from docling.datamodel.document import ConversionResult, Page
8
9
 
9
10
  _log = logging.getLogger(__name__)
@@ -40,7 +41,7 @@ def generate_multimodal_pages(
40
41
  end_ix = 0
41
42
  doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
42
43
 
43
- doc = doc_result.output
44
+ doc = doc_result.legacy_document
44
45
 
45
46
  def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
46
47
  segments = []