docling 1.19.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.1.dist-info/METADATA +0 -380
  35. docling-1.19.1.dist-info/RECORD +0 -34
  36. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
docling/models/table_structure_model.py CHANGED
@@ -3,29 +3,25 @@ from pathlib import Path
3
3
  from typing import Iterable, List
4
4
 
5
5
  import numpy
6
+ from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
6
7
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
7
8
  from PIL import ImageDraw
8
9
 
9
- from docling.datamodel.base_models import (
10
- BoundingBox,
11
- Page,
12
- TableCell,
13
- TableElement,
14
- TableStructurePrediction,
15
- )
16
- from docling.datamodel.pipeline_options import TableFormerMode
10
+ from docling.datamodel.base_models import Page, Table, TableStructurePrediction
11
+ from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
12
+ from docling.models.base_model import BasePageModel
17
13
 
18
14
 
19
- class TableStructureModel:
20
- def __init__(self, config):
21
- self.config = config
22
- self.do_cell_matching = config["do_cell_matching"]
23
- self.mode = config["mode"]
15
+ class TableStructureModel(BasePageModel):
16
+ def __init__(
17
+ self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
18
+ ):
19
+ self.options = options
20
+ self.do_cell_matching = self.options.do_cell_matching
21
+ self.mode = self.options.mode
24
22
 
25
- self.enabled = config["enabled"]
23
+ self.enabled = enabled
26
24
  if self.enabled:
27
- artifacts_path: Path = config["artifacts_path"]
28
-
29
25
  if self.mode == TableFormerMode.ACCURATE:
30
26
  artifacts_path = artifacts_path / "fat"
31
27
 
@@ -39,7 +35,9 @@ class TableStructureModel:
39
35
  self.tf_predictor = TFPredictor(self.tm_config)
40
36
  self.scale = 2.0 # Scale up table input images to 144 dpi
41
37
 
42
- def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
38
+ def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
39
+ assert page._backend is not None
40
+
43
41
  image = (
44
42
  page._backend.get_page_image()
45
43
  ) # make new image to avoid drawing on the saved ones
@@ -50,17 +48,18 @@ class TableStructureModel:
50
48
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
51
49
 
52
50
  for tc in table_element.table_cells:
53
- x0, y0, x1, y1 = tc.bbox.as_tuple()
54
- if tc.column_header:
55
- width = 3
56
- else:
57
- width = 1
58
- draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
59
- draw.text(
60
- (x0 + 3, y0 + 3),
61
- text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
62
- fill="black",
63
- )
51
+ if tc.bbox is not None:
52
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
53
+ if tc.column_header:
54
+ width = 3
55
+ else:
56
+ width = 1
57
+ draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
58
+ draw.text(
59
+ (x0 + 3, y0 + 3),
60
+ text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
61
+ fill="black",
62
+ )
64
63
 
65
64
  image.show()
66
65
 
@@ -71,6 +70,9 @@ class TableStructureModel:
71
70
  return
72
71
 
73
72
  for page in page_batch:
73
+ assert page._backend is not None
74
+ assert page.predictions.layout is not None
75
+ assert page.size is not None
74
76
 
75
77
  page.predictions.tablestructure = TableStructurePrediction() # dummy
76
78
 
@@ -85,7 +87,7 @@ class TableStructureModel:
85
87
  ],
86
88
  )
87
89
  for cluster in page.predictions.layout.clusters
88
- if cluster.label == "Table"
90
+ if cluster.label == DocItemLabel.TABLE
89
91
  ]
90
92
  if not len(in_tables):
91
93
  yield page
@@ -132,7 +134,7 @@ class TableStructureModel:
132
134
  element["bbox"]["token"] = text_piece
133
135
 
134
136
  tc = TableCell.model_validate(element)
135
- if self.do_cell_matching:
137
+ if self.do_cell_matching and tc.bbox is not None:
136
138
  tc.bbox = tc.bbox.scaled(1 / self.scale)
137
139
  table_cells.append(tc)
138
140
 
@@ -141,7 +143,7 @@ class TableStructureModel:
141
143
  num_cols = table_out["predict_details"]["num_cols"]
142
144
  otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
143
145
 
144
- tbl = TableElement(
146
+ tbl = Table(
145
147
  otsl_seq=otsl_seq,
146
148
  table_cells=table_cells,
147
149
  num_rows=num_rows,
@@ -149,7 +151,7 @@ class TableStructureModel:
149
151
  id=table_cluster.id,
150
152
  page_no=page.page_no,
151
153
  cluster=table_cluster,
152
- label="Table",
154
+ label=DocItemLabel.TABLE,
153
155
  )
154
156
 
155
157
  page.predictions.tablestructure.table_map[table_cluster.id] = tbl
docling/models/tesseract_ocr_cli_model.py CHANGED
@@ -2,11 +2,12 @@ import io
2
2
  import logging
3
3
  import tempfile
4
4
  from subprocess import DEVNULL, PIPE, Popen
5
- from typing import Iterable, Tuple
5
+ from typing import Iterable, Optional, Tuple
6
6
 
7
7
  import pandas as pd
8
+ from docling_core.types.doc import BoundingBox, CoordOrigin
8
9
 
9
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
10
+ from docling.datamodel.base_models import OcrCell, Page
10
11
  from docling.datamodel.pipeline_options import TesseractCliOcrOptions
11
12
  from docling.models.base_ocr_model import BaseOcrModel
12
13
 
@@ -21,8 +22,8 @@ class TesseractOcrCliModel(BaseOcrModel):
21
22
 
22
23
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
23
24
 
24
- self._name = None
25
- self._version = None
25
+ self._name: Optional[str] = None
26
+ self._version: Optional[str] = None
26
27
 
27
28
  if self.enabled:
28
29
  try:
@@ -39,7 +40,7 @@ class TesseractOcrCliModel(BaseOcrModel):
39
40
  def _get_name_and_version(self) -> Tuple[str, str]:
40
41
 
41
42
  if self._name != None and self._version != None:
42
- return self._name, self._version
43
+ return self._name, self._version # type: ignore
43
44
 
44
45
  cmd = [self.options.tesseract_cmd, "--version"]
45
46
 
@@ -108,6 +109,8 @@ class TesseractOcrCliModel(BaseOcrModel):
108
109
  return
109
110
 
110
111
  for page in page_batch:
112
+ assert page._backend is not None
113
+
111
114
  ocr_rects = self.get_ocr_rects(page)
112
115
 
113
116
  all_ocr_cells = []
docling/models/tesseract_ocr_model.py CHANGED
@@ -1,19 +1,19 @@
1
1
  import logging
2
2
  from typing import Iterable
3
3
 
4
- import numpy
4
+ from docling_core.types.doc import BoundingBox, CoordOrigin
5
5
 
6
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
- from docling.datamodel.pipeline_options import TesseractCliOcrOptions
6
+ from docling.datamodel.base_models import OcrCell, Page
7
+ from docling.datamodel.pipeline_options import TesseractOcrOptions
8
8
  from docling.models.base_ocr_model import BaseOcrModel
9
9
 
10
10
  _log = logging.getLogger(__name__)
11
11
 
12
12
 
13
13
  class TesseractOcrModel(BaseOcrModel):
14
- def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
14
+ def __init__(self, enabled: bool, options: TesseractOcrOptions):
15
15
  super().__init__(enabled=enabled, options=options)
16
- self.options: TesseractCliOcrOptions
16
+ self.options: TesseractOcrOptions
17
17
 
18
18
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
19
19
  self.reader = None
@@ -68,6 +68,9 @@ class TesseractOcrModel(BaseOcrModel):
68
68
  return
69
69
 
70
70
  for page in page_batch:
71
+ assert page._backend is not None
72
+ assert self.reader is not None
73
+
71
74
  ocr_rects = self.get_ocr_rects(page)
72
75
 
73
76
  all_ocr_cells = []
docling/pipeline/base_pipeline.py ADDED
@@ -0,0 +1,190 @@
1
+ import functools
2
+ import logging
3
+ import time
4
+ import traceback
5
+ from abc import ABC, abstractmethod
6
+ from typing import Callable, Iterable, List
7
+
8
+ from docling_core.types.doc import DoclingDocument, NodeItem
9
+
10
+ from docling.backend.abstract_backend import AbstractDocumentBackend
11
+ from docling.backend.pdf_backend import PdfDocumentBackend
12
+ from docling.datamodel.base_models import (
13
+ ConversionStatus,
14
+ DoclingComponentType,
15
+ ErrorItem,
16
+ Page,
17
+ )
18
+ from docling.datamodel.document import ConversionResult, InputDocument
19
+ from docling.datamodel.pipeline_options import PipelineOptions
20
+ from docling.datamodel.settings import settings
21
+ from docling.models.base_model import BaseEnrichmentModel
22
+ from docling.utils.utils import chunkify
23
+
24
+ _log = logging.getLogger(__name__)
25
+
26
+
27
+ class BasePipeline(ABC):
28
+ def __init__(self, pipeline_options: PipelineOptions):
29
+ self.pipeline_options = pipeline_options
30
+ self.build_pipe: List[Callable] = []
31
+ self.enrichment_pipe: List[BaseEnrichmentModel] = []
32
+
33
+ def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
34
+ conv_res = ConversionResult(input=in_doc)
35
+
36
+ _log.info(f"Processing document {in_doc.file.name}")
37
+ try:
38
+ # These steps are building and assembling the structure of the
39
+ # output DoclingDocument
40
+ conv_res = self._build_document(in_doc, conv_res)
41
+ conv_res = self._assemble_document(in_doc, conv_res)
42
+ # From this stage, all operations should rely only on conv_res.output
43
+ conv_res = self._enrich_document(in_doc, conv_res)
44
+ conv_res.status = self._determine_status(in_doc, conv_res)
45
+ except Exception as e:
46
+ conv_res.status = ConversionStatus.FAILURE
47
+ if raises_on_error:
48
+ raise e
49
+
50
+ return conv_res
51
+
52
+ @abstractmethod
53
+ def _build_document(
54
+ self, in_doc: InputDocument, conv_res: ConversionResult
55
+ ) -> ConversionResult:
56
+ pass
57
+
58
+ def _assemble_document(
59
+ self, in_doc: InputDocument, conv_res: ConversionResult
60
+ ) -> ConversionResult:
61
+ return conv_res
62
+
63
+ def _enrich_document(
64
+ self, in_doc: InputDocument, conv_res: ConversionResult
65
+ ) -> ConversionResult:
66
+
67
+ def _filter_elements(
68
+ doc: DoclingDocument, model: BaseEnrichmentModel
69
+ ) -> Iterable[NodeItem]:
70
+ for element, _level in doc.iterate_items():
71
+ if model.is_processable(doc=doc, element=element):
72
+ yield element
73
+
74
+ for model in self.enrichment_pipe:
75
+ for element_batch in chunkify(
76
+ _filter_elements(conv_res.document, model),
77
+ settings.perf.elements_batch_size,
78
+ ):
79
+ # TODO: currently we assume the element itself is modified, because
80
+ # we don't have an interface to save the element back to the document
81
+ for element in model(
82
+ doc=conv_res.document, element_batch=element_batch
83
+ ): # Must exhaust!
84
+ pass
85
+
86
+ return conv_res
87
+
88
+ @abstractmethod
89
+ def _determine_status(
90
+ self, in_doc: InputDocument, conv_res: ConversionResult
91
+ ) -> ConversionStatus:
92
+ pass
93
+
94
+ @classmethod
95
+ @abstractmethod
96
+ def get_default_options(cls) -> PipelineOptions:
97
+ pass
98
+
99
+ @classmethod
100
+ @abstractmethod
101
+ def is_backend_supported(cls, backend: AbstractDocumentBackend):
102
+ pass
103
+
104
+ # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
105
+ # for model in self.build_pipe:
106
+ # element_batch = model(element_batch)
107
+ #
108
+ # yield from element_batch
109
+
110
+
111
+ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
112
+
113
+ def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
114
+ for model in self.build_pipe:
115
+ page_batch = model(page_batch)
116
+
117
+ yield from page_batch
118
+
119
+ def _build_document(
120
+ self, in_doc: InputDocument, conv_res: ConversionResult
121
+ ) -> ConversionResult:
122
+
123
+ if not isinstance(in_doc._backend, PdfDocumentBackend):
124
+ raise RuntimeError(
125
+ f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
126
+ f"Can not convert this with a PDF pipeline. "
127
+ f"Please check your format configuration on DocumentConverter."
128
+ )
129
+ # conv_res.status = ConversionStatus.FAILURE
130
+ # return conv_res
131
+
132
+ for i in range(0, in_doc.page_count):
133
+ conv_res.pages.append(Page(page_no=i))
134
+
135
+ try:
136
+ # Iterate batches of pages (page_batch_size) in the doc
137
+ for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
138
+ start_pb_time = time.time()
139
+
140
+ # 1. Initialise the page resources
141
+ init_pages = map(
142
+ functools.partial(self.initialize_page, in_doc), page_batch
143
+ )
144
+
145
+ # 2. Run pipeline stages
146
+ pipeline_pages = self._apply_on_pages(init_pages)
147
+
148
+ for p in pipeline_pages: # Must exhaust!
149
+ pass
150
+
151
+ end_pb_time = time.time() - start_pb_time
152
+ _log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
153
+
154
+ except Exception as e:
155
+ conv_res.status = ConversionStatus.FAILURE
156
+ trace = "\n".join(traceback.format_exception(e))
157
+ _log.warning(
158
+ f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
159
+ f"{trace}"
160
+ )
161
+ raise e
162
+
163
+ finally:
164
+ # Always unload the PDF backend, even in case of failure
165
+ if in_doc._backend:
166
+ in_doc._backend.unload()
167
+
168
+ return conv_res
169
+
170
+ def _determine_status(
171
+ self, in_doc: InputDocument, conv_res: ConversionResult
172
+ ) -> ConversionStatus:
173
+ status = ConversionStatus.SUCCESS
174
+ for page in conv_res.pages:
175
+ if page._backend is None or not page._backend.is_valid():
176
+ conv_res.errors.append(
177
+ ErrorItem(
178
+ component_type=DoclingComponentType.DOCUMENT_BACKEND,
179
+ module_name=type(page._backend).__name__,
180
+ error_message=f"Page {page.page_no} failed to parse.",
181
+ )
182
+ )
183
+ status = ConversionStatus.PARTIAL_SUCCESS
184
+
185
+ return status
186
+
187
+ # Initialise and load resources for a page
188
+ @abstractmethod
189
+ def initialize_page(self, doc: InputDocument, page: Page) -> Page:
190
+ pass
docling/pipeline/simple_pipeline.py ADDED
@@ -0,0 +1,59 @@
1
+ import logging
2
+
3
+ from docling.backend.abstract_backend import (
4
+ AbstractDocumentBackend,
5
+ DeclarativeDocumentBackend,
6
+ )
7
+ from docling.datamodel.base_models import ConversionStatus
8
+ from docling.datamodel.document import ConversionResult, InputDocument
9
+ from docling.datamodel.pipeline_options import PipelineOptions
10
+ from docling.pipeline.base_pipeline import BasePipeline
11
+
12
+ _log = logging.getLogger(__name__)
13
+
14
+
15
+ class SimplePipeline(BasePipeline):
16
+ """SimpleModelPipeline.
17
+
18
+ This class is used at the moment for formats / backends
19
+ which produce straight DoclingDocument output.
20
+ """
21
+
22
+ def __init__(self, pipeline_options: PipelineOptions):
23
+ super().__init__(pipeline_options)
24
+
25
+ def _build_document(
26
+ self, in_doc: InputDocument, conv_res: ConversionResult
27
+ ) -> ConversionResult:
28
+
29
+ if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
30
+ raise RuntimeError(
31
+ f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
32
+ f"Can not convert this with simple pipeline. "
33
+ f"Please check your format configuration on DocumentConverter."
34
+ )
35
+ # conv_res.status = ConversionStatus.FAILURE
36
+ # return conv_res
37
+
38
+ # Instead of running a page-level pipeline to build up the document structure,
39
+ # the backend is expected to be of type DeclarativeDocumentBackend, which can output
40
+ # a DoclingDocument straight.
41
+
42
+ conv_res.document = in_doc._backend.convert()
43
+ return conv_res
44
+
45
+ def _determine_status(
46
+ self, in_doc: InputDocument, conv_res: ConversionResult
47
+ ) -> ConversionStatus:
48
+ # This is called only if the previous steps didn't raise.
49
+ # Since we don't have anything else to evaluate, we can
50
+ # safely return SUCCESS.
51
+ return ConversionStatus.SUCCESS
52
+
53
+ @classmethod
54
+ def get_default_options(cls) -> PipelineOptions:
55
+ return PipelineOptions()
56
+
57
+ @classmethod
58
+ def is_backend_supported(cls, backend: AbstractDocumentBackend):
59
+ return isinstance(backend, DeclarativeDocumentBackend)
docling/pipeline/standard_pdf_pipeline.py ADDED
@@ -0,0 +1,198 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
6
+
7
+ from docling.backend.abstract_backend import AbstractDocumentBackend
8
+ from docling.backend.pdf_backend import PdfDocumentBackend
9
+ from docling.datamodel.base_models import AssembledUnit, Page
10
+ from docling.datamodel.document import ConversionResult, InputDocument
11
+ from docling.datamodel.pipeline_options import (
12
+ EasyOcrOptions,
13
+ PdfPipelineOptions,
14
+ TesseractCliOcrOptions,
15
+ TesseractOcrOptions,
16
+ )
17
+ from docling.models.base_ocr_model import BaseOcrModel
18
+ from docling.models.ds_glm_model import GlmModel, GlmOptions
19
+ from docling.models.easyocr_model import EasyOcrModel
20
+ from docling.models.layout_model import LayoutModel
21
+ from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
22
+ from docling.models.page_preprocessing_model import (
23
+ PagePreprocessingModel,
24
+ PagePreprocessingOptions,
25
+ )
26
+ from docling.models.table_structure_model import TableStructureModel
27
+ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
28
+ from docling.models.tesseract_ocr_model import TesseractOcrModel
29
+ from docling.pipeline.base_pipeline import PaginatedPipeline
30
+
31
+ _log = logging.getLogger(__name__)
32
+
33
+
34
+ class StandardPdfPipeline(PaginatedPipeline):
35
+ _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
36
+ _table_model_path = "model_artifacts/tableformer"
37
+
38
+ def __init__(self, pipeline_options: PdfPipelineOptions):
39
+ super().__init__(pipeline_options)
40
+ self.pipeline_options: PdfPipelineOptions
41
+
42
+ if pipeline_options.artifacts_path is None:
43
+ self.artifacts_path = self.download_models_hf()
44
+ else:
45
+ self.artifacts_path = Path(pipeline_options.artifacts_path)
46
+
47
+ keep_images = (
48
+ self.pipeline_options.generate_page_images
49
+ or self.pipeline_options.generate_picture_images
50
+ or self.pipeline_options.generate_table_images
51
+ )
52
+
53
+ self.glm_model = GlmModel(options=GlmOptions())
54
+
55
+ if (ocr_model := self.get_ocr_model()) is None:
56
+ raise RuntimeError(
57
+ f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
58
+ )
59
+
60
+ self.build_pipe = [
61
+ # Pre-processing
62
+ PagePreprocessingModel(
63
+ options=PagePreprocessingOptions(
64
+ images_scale=pipeline_options.images_scale
65
+ )
66
+ ),
67
+ # OCR
68
+ ocr_model,
69
+ # Layout model
70
+ LayoutModel(
71
+ artifacts_path=self.artifacts_path
72
+ / StandardPdfPipeline._layout_model_path
73
+ ),
74
+ # Table structure model
75
+ TableStructureModel(
76
+ enabled=pipeline_options.do_table_structure,
77
+ artifacts_path=self.artifacts_path
78
+ / StandardPdfPipeline._table_model_path,
79
+ options=pipeline_options.table_structure_options,
80
+ ),
81
+ # Page assemble
82
+ PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
83
+ ]
84
+
85
+ self.enrichment_pipe = [
86
+ # Other models working on `NodeItem` elements in the DoclingDocument
87
+ ]
88
+
89
+ @staticmethod
90
+ def download_models_hf(
91
+ local_dir: Optional[Path] = None, force: bool = False
92
+ ) -> Path:
93
+ from huggingface_hub import snapshot_download
94
+
95
+ download_path = snapshot_download(
96
+ repo_id="ds4sd/docling-models",
97
+ force_download=force,
98
+ local_dir=local_dir,
99
+ revision="v2.0.1",
100
+ )
101
+
102
+ return Path(download_path)
103
+
104
+ def get_ocr_model(self) -> Optional[BaseOcrModel]:
105
+ if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
106
+ return EasyOcrModel(
107
+ enabled=self.pipeline_options.do_ocr,
108
+ options=self.pipeline_options.ocr_options,
109
+ )
110
+ elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
111
+ return TesseractOcrCliModel(
112
+ enabled=self.pipeline_options.do_ocr,
113
+ options=self.pipeline_options.ocr_options,
114
+ )
115
+ elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
116
+ return TesseractOcrModel(
117
+ enabled=self.pipeline_options.do_ocr,
118
+ options=self.pipeline_options.ocr_options,
119
+ )
120
+ return None
121
+
122
+ def initialize_page(self, doc: InputDocument, page: Page) -> Page:
123
+ page._backend = doc._backend.load_page(page.page_no) # type: ignore
124
+ if page._backend is not None and page._backend.is_valid():
125
+ page.size = page._backend.get_size()
126
+
127
+ return page
128
+
129
+ def _assemble_document(
130
+ self, in_doc: InputDocument, conv_res: ConversionResult
131
+ ) -> ConversionResult:
132
+ all_elements = []
133
+ all_headers = []
134
+ all_body = []
135
+
136
+ for p in conv_res.pages:
137
+ assert p.assembled is not None
138
+ for el in p.assembled.body:
139
+ all_body.append(el)
140
+ for el in p.assembled.headers:
141
+ all_headers.append(el)
142
+ for el in p.assembled.elements:
143
+ all_elements.append(el)
144
+
145
+ conv_res.assembled = AssembledUnit(
146
+ elements=all_elements, headers=all_headers, body=all_body
147
+ )
148
+
149
+ conv_res.document = self.glm_model(conv_res)
150
+
151
+ # Generate page images in the output
152
+ if self.pipeline_options.generate_page_images:
153
+ for page in conv_res.pages:
154
+ assert page.image is not None
155
+ page_no = page.page_no + 1
156
+ conv_res.document.pages[page_no].image = ImageRef.from_pil(
157
+ page.image, dpi=int(72 * self.pipeline_options.images_scale)
158
+ )
159
+
160
+ # Generate images of the requested element types
161
+ if (
162
+ self.pipeline_options.generate_picture_images
163
+ or self.pipeline_options.generate_table_images
164
+ ):
165
+ scale = self.pipeline_options.images_scale
166
+ for element, _level in conv_res.document.iterate_items():
167
+ if not isinstance(element, DocItem) or len(element.prov) == 0:
168
+ continue
169
+ if (
170
+ isinstance(element, PictureItem)
171
+ and self.pipeline_options.generate_picture_images
172
+ ) or (
173
+ isinstance(element, TableItem)
174
+ and self.pipeline_options.generate_table_images
175
+ ):
176
+ page_ix = element.prov[0].page_no - 1
177
+ page = conv_res.pages[page_ix]
178
+ assert page.size is not None
179
+ assert page.image is not None
180
+
181
+ crop_bbox = (
182
+ element.prov[0]
183
+ .bbox.scaled(scale=scale)
184
+ .to_top_left_origin(page_height=page.size.height * scale)
185
+ )
186
+
187
+ cropped_im = page.image.crop(crop_bbox.as_tuple())
188
+ element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))
189
+
190
+ return conv_res
191
+
192
+ @classmethod
193
+ def get_default_options(cls) -> PdfPipelineOptions:
194
+ return PdfPipelineOptions()
195
+
196
+ @classmethod
197
+ def is_backend_supported(cls, backend: AbstractDocumentBackend):
198
+ return isinstance(backend, PdfDocumentBackend)
docling/utils/export.py CHANGED
@@ -1,9 +1,10 @@
1
1
  import logging
2
2
  from typing import Any, Dict, Iterable, List, Tuple, Union
3
3
 
4
- from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
4
+ from docling_core.types.doc import BoundingBox, CoordOrigin
5
+ from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
5
6
 
6
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
7
+ from docling.datamodel.base_models import OcrCell
7
8
  from docling.datamodel.document import ConversionResult, Page
8
9
 
9
10
  _log = logging.getLogger(__name__)
@@ -40,7 +41,7 @@ def generate_multimodal_pages(
40
41
  end_ix = 0
41
42
  doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
42
43
 
43
- doc = doc_result.output
44
+ doc = doc_result.legacy_document
44
45
 
45
46
  def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
46
47
  segments = []