docling-1.20.0-py3-none-any.whl → docling-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +15 -11
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.20.0.dist-info/METADATA +0 -380
  35. docling-1.20.0.dist-info/RECORD +0 -35
  36. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.20.0.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
docling/models/ds_glm_model.py +209 -20
@@ -1,39 +1,228 @@
1
1
  import copy
2
2
  import random
3
+ from typing import List, Union
3
4
 
4
5
  from deepsearch_glm.nlp_utils import init_nlp_model
5
- from deepsearch_glm.utils.doc_utils import to_legacy_document_format
6
+ from deepsearch_glm.utils.doc_utils import to_docling_document
6
7
  from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
7
8
  from docling_core.types import BaseText
8
9
  from docling_core.types import Document as DsDocument
9
- from docling_core.types import Ref
10
+ from docling_core.types import DocumentDescription as DsDocumentDescription
11
+ from docling_core.types import FileInfoObject as DsFileInfoObject
12
+ from docling_core.types import PageDimensions, PageReference, Prov, Ref
13
+ from docling_core.types import Table as DsSchemaTable
14
+ from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
15
+ from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
16
+ from docling_core.types.legacy_doc.base import Figure, TableCell
10
17
  from PIL import ImageDraw
18
+ from pydantic import BaseModel, ConfigDict
11
19
 
12
- from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
13
- from docling.datamodel.document import ConversionResult
20
+ from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
21
+ from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
22
+ from docling.utils.utils import create_hash
23
+
24
+
25
+ class GlmOptions(BaseModel):
26
+ model_config = ConfigDict(protected_namespaces=())
27
+
28
+ model_names: str = "" # e.g. "language;term;reference"
14
29
 
15
30
 
16
31
  class GlmModel:
17
- def __init__(self, config):
18
- self.config = config
19
- self.model_names = self.config.get(
20
- "model_names", ""
21
- ) # "language;term;reference"
32
+ def __init__(self, options: GlmOptions):
33
+ self.options = options
34
+
22
35
  load_pretrained_nlp_models()
23
- # model = init_nlp_model(model_names="language;term;reference")
24
- model = init_nlp_model(model_names=self.model_names)
25
- self.model = model
36
+ self.model = init_nlp_model(model_names=self.options.model_names)
37
+
38
+ def _to_legacy_document(self, conv_res) -> DsDocument:
39
+ title = ""
40
+ desc: DsDocumentDescription = DsDocumentDescription(logs=[])
41
+
42
+ page_hashes = [
43
+ PageReference(
44
+ hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
45
+ page=p.page_no + 1,
46
+ model="default",
47
+ )
48
+ for p in conv_res.pages
49
+ ]
50
+
51
+ file_info = DsFileInfoObject(
52
+ filename=conv_res.input.file.name,
53
+ document_hash=conv_res.input.document_hash,
54
+ num_pages=conv_res.input.page_count,
55
+ page_hashes=page_hashes,
56
+ )
57
+
58
+ main_text: List[Union[Ref, BaseText]] = []
59
+ tables: List[DsSchemaTable] = []
60
+ figures: List[Figure] = []
61
+
62
+ page_no_to_page = {p.page_no: p for p in conv_res.pages}
63
+
64
+ for element in conv_res.assembled.elements:
65
+ # Convert bboxes to lower-left origin.
66
+ target_bbox = DsBoundingBox(
67
+ element.cluster.bbox.to_bottom_left_origin(
68
+ page_no_to_page[element.page_no].size.height
69
+ ).as_tuple()
70
+ )
71
+
72
+ if isinstance(element, TextElement):
73
+ main_text.append(
74
+ BaseText(
75
+ text=element.text,
76
+ obj_type=layout_label_to_ds_type.get(element.label),
77
+ name=element.label,
78
+ prov=[
79
+ Prov(
80
+ bbox=target_bbox,
81
+ page=element.page_no + 1,
82
+ span=[0, len(element.text)],
83
+ )
84
+ ],
85
+ )
86
+ )
87
+ elif isinstance(element, Table):
88
+ index = len(tables)
89
+ ref_str = f"#/tables/{index}"
90
+ main_text.append(
91
+ Ref(
92
+ name=element.label,
93
+ obj_type=layout_label_to_ds_type.get(element.label),
94
+ ref=ref_str,
95
+ ),
96
+ )
97
+
98
+ # Initialise empty table data grid (only empty cells)
99
+ table_data = [
100
+ [
101
+ TableCell(
102
+ text="",
103
+ # bbox=[0,0,0,0],
104
+ spans=[[i, j]],
105
+ obj_type="body",
106
+ )
107
+ for j in range(element.num_cols)
108
+ ]
109
+ for i in range(element.num_rows)
110
+ ]
26
111
 
27
- def __call__(self, conv_res: ConversionResult) -> DsDocument:
28
- ds_doc = conv_res._to_ds_document()
112
+ # Overwrite cells in table data for which there is actual cell content.
113
+ for cell in element.table_cells:
114
+ for i in range(
115
+ min(cell.start_row_offset_idx, element.num_rows),
116
+ min(cell.end_row_offset_idx, element.num_rows),
117
+ ):
118
+ for j in range(
119
+ min(cell.start_col_offset_idx, element.num_cols),
120
+ min(cell.end_col_offset_idx, element.num_cols),
121
+ ):
122
+ celltype = "body"
123
+ if cell.column_header:
124
+ celltype = "col_header"
125
+ elif cell.row_header:
126
+ celltype = "row_header"
127
+ elif cell.row_section:
128
+ celltype = "row_section"
129
+
130
+ def make_spans(cell):
131
+ for rspan in range(
132
+ min(cell.start_row_offset_idx, element.num_rows),
133
+ min(cell.end_row_offset_idx, element.num_rows),
134
+ ):
135
+ for cspan in range(
136
+ min(
137
+ cell.start_col_offset_idx, element.num_cols
138
+ ),
139
+ min(cell.end_col_offset_idx, element.num_cols),
140
+ ):
141
+ yield [rspan, cspan]
142
+
143
+ spans = list(make_spans(cell))
144
+ if cell.bbox is not None:
145
+ bbox = cell.bbox.to_bottom_left_origin(
146
+ page_no_to_page[element.page_no].size.height
147
+ ).as_tuple()
148
+ else:
149
+ bbox = None
150
+
151
+ table_data[i][j] = TableCell(
152
+ text=cell.text,
153
+ bbox=bbox,
154
+ # col=j,
155
+ # row=i,
156
+ spans=spans,
157
+ obj_type=celltype,
158
+ # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
159
+ # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
160
+ )
161
+
162
+ tables.append(
163
+ DsSchemaTable(
164
+ num_cols=element.num_cols,
165
+ num_rows=element.num_rows,
166
+ obj_type=layout_label_to_ds_type.get(element.label),
167
+ data=table_data,
168
+ prov=[
169
+ Prov(
170
+ bbox=target_bbox,
171
+ page=element.page_no + 1,
172
+ span=[0, 0],
173
+ )
174
+ ],
175
+ )
176
+ )
177
+
178
+ elif isinstance(element, FigureElement):
179
+ index = len(figures)
180
+ ref_str = f"#/figures/{index}"
181
+ main_text.append(
182
+ Ref(
183
+ name=element.label,
184
+ obj_type=layout_label_to_ds_type.get(element.label),
185
+ ref=ref_str,
186
+ ),
187
+ )
188
+ figures.append(
189
+ Figure(
190
+ prov=[
191
+ Prov(
192
+ bbox=target_bbox,
193
+ page=element.page_no + 1,
194
+ span=[0, 0],
195
+ )
196
+ ],
197
+ obj_type=layout_label_to_ds_type.get(element.label),
198
+ # data=[[]],
199
+ )
200
+ )
201
+
202
+ page_dimensions = [
203
+ PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
204
+ for p in conv_res.pages
205
+ ]
206
+
207
+ ds_doc: DsDocument = DsDocument(
208
+ name=title,
209
+ description=desc,
210
+ file_info=file_info,
211
+ main_text=main_text,
212
+ tables=tables,
213
+ figures=figures,
214
+ page_dimensions=page_dimensions,
215
+ )
216
+
217
+ return ds_doc
218
+
219
+ def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
220
+ ds_doc = self._to_legacy_document(conv_res)
29
221
  ds_doc_dict = ds_doc.model_dump(by_alias=True)
30
222
 
31
223
  glm_doc = self.model.apply_on_doc(ds_doc_dict)
32
- ds_doc_dict = to_legacy_document_format(
33
- glm_doc, ds_doc_dict, update_name_label=True
34
- )
35
224
 
36
- exported_doc = DsDocument.model_validate(ds_doc_dict)
225
+ docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
37
226
 
38
227
  # DEBUG code:
39
228
  def draw_clusters_and_cells(ds_document, page_no):
@@ -48,7 +237,7 @@ class GlmModel:
48
237
  if arr == "tables":
49
238
  prov = ds_document.tables[index].prov[0]
50
239
  elif arr == "figures":
51
- prov = ds_document.figures[index].prov[0]
240
+ prov = ds_document.pictures[index].prov[0]
52
241
  else:
53
242
  prov = None
54
243
 
@@ -83,4 +272,4 @@ class GlmModel:
83
272
  # draw_clusters_and_cells(ds_doc, 0)
84
273
  # draw_clusters_and_cells(exported_doc, 0)
85
274
 
86
- return exported_doc
275
+ return docling_doc
docling/models/easyocr_model.py +4 -1
@@ -2,8 +2,9 @@ import logging
2
2
  from typing import Iterable
3
3
 
4
4
  import numpy
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
5
6
 
6
- from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.datamodel.base_models import OcrCell, Page
7
8
  from docling.datamodel.pipeline_options import EasyOcrOptions
8
9
  from docling.models.base_ocr_model import BaseOcrModel
9
10
 
@@ -39,6 +40,8 @@ class EasyOcrModel(BaseOcrModel):
39
40
  return
40
41
 
41
42
  for page in page_batch:
43
+ assert page._backend is not None
44
+
42
45
  ocr_rects = self.get_ocr_rects(page)
43
46
 
44
47
  all_ocr_cells = []
docling/models/layout_model.py +73 -61
@@ -2,8 +2,10 @@ import copy
2
2
  import logging
3
3
  import random
4
4
  import time
5
+ from pathlib import Path
5
6
  from typing import Iterable, List
6
7
 
8
+ from docling_core.types.doc import CoordOrigin, DocItemLabel
7
9
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
8
10
  from PIL import ImageDraw
9
11
 
@@ -11,74 +13,73 @@ from docling.datamodel.base_models import (
11
13
  BoundingBox,
12
14
  Cell,
13
15
  Cluster,
14
- CoordOrigin,
15
16
  LayoutPrediction,
16
17
  Page,
17
18
  )
19
+ from docling.models.base_model import BasePageModel
18
20
  from docling.utils import layout_utils as lu
19
21
 
20
22
  _log = logging.getLogger(__name__)
21
23
 
22
24
 
23
- class LayoutModel:
25
+ class LayoutModel(BasePageModel):
24
26
 
25
27
  TEXT_ELEM_LABELS = [
26
- "Text",
27
- "Footnote",
28
- "Caption",
29
- "Checkbox-Unselected",
30
- "Checkbox-Selected",
31
- "Section-header",
32
- "Page-header",
33
- "Page-footer",
34
- "Code",
35
- "List-item",
36
- # "Title"
28
+ DocItemLabel.TEXT,
29
+ DocItemLabel.FOOTNOTE,
30
+ DocItemLabel.CAPTION,
31
+ DocItemLabel.CHECKBOX_UNSELECTED,
32
+ DocItemLabel.CHECKBOX_SELECTED,
33
+ DocItemLabel.SECTION_HEADER,
34
+ DocItemLabel.PAGE_HEADER,
35
+ DocItemLabel.PAGE_FOOTER,
36
+ DocItemLabel.CODE,
37
+ DocItemLabel.LIST_ITEM,
37
38
  # "Formula",
38
39
  ]
39
- PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
40
+ PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
40
41
 
41
- TABLE_LABEL = "Table"
42
- FIGURE_LABEL = "Picture"
43
- FORMULA_LABEL = "Formula"
42
+ TABLE_LABEL = DocItemLabel.TABLE
43
+ FIGURE_LABEL = DocItemLabel.PICTURE
44
+ FORMULA_LABEL = DocItemLabel.FORMULA
44
45
 
45
- def __init__(self, config):
46
- self.config = config
47
- self.layout_predictor = LayoutPredictor(
48
- config["artifacts_path"]
49
- ) # TODO temporary
46
+ def __init__(self, artifacts_path: Path):
47
+ self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
50
48
 
51
- def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
49
+ def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
52
50
  MIN_INTERSECTION = 0.2
53
51
  CLASS_THRESHOLDS = {
54
- "Caption": 0.35,
55
- "Footnote": 0.35,
56
- "Formula": 0.35,
57
- "List-item": 0.35,
58
- "Page-footer": 0.35,
59
- "Page-header": 0.35,
60
- "Picture": 0.2, # low threshold adjust to capture chemical structures for examples.
61
- "Section-header": 0.45,
62
- "Table": 0.35,
63
- "Text": 0.45,
64
- "Title": 0.45,
65
- "Document Index": 0.45,
66
- "Code": 0.45,
67
- "Checkbox-Selected": 0.45,
68
- "Checkbox-Unselected": 0.45,
69
- "Form": 0.45,
70
- "Key-Value Region": 0.45,
52
+ DocItemLabel.CAPTION: 0.35,
53
+ DocItemLabel.FOOTNOTE: 0.35,
54
+ DocItemLabel.FORMULA: 0.35,
55
+ DocItemLabel.LIST_ITEM: 0.35,
56
+ DocItemLabel.PAGE_FOOTER: 0.35,
57
+ DocItemLabel.PAGE_HEADER: 0.35,
58
+ DocItemLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples.
59
+ DocItemLabel.SECTION_HEADER: 0.45,
60
+ DocItemLabel.TABLE: 0.35,
61
+ DocItemLabel.TEXT: 0.45,
62
+ DocItemLabel.TITLE: 0.45,
63
+ DocItemLabel.DOCUMENT_INDEX: 0.45,
64
+ DocItemLabel.CODE: 0.45,
65
+ DocItemLabel.CHECKBOX_SELECTED: 0.45,
66
+ DocItemLabel.CHECKBOX_UNSELECTED: 0.45,
67
+ DocItemLabel.FORM: 0.45,
68
+ DocItemLabel.KEY_VALUE_REGION: 0.45,
71
69
  }
72
70
 
73
- CLASS_REMAPPINGS = {"Document Index": "Table", "Title": "Section-header"}
71
+ CLASS_REMAPPINGS = {
72
+ DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
73
+ DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
74
+ }
74
75
 
75
76
  _log.debug("================= Start postprocess function ====================")
76
77
  start_time = time.time()
77
78
  # Apply Confidence Threshold to cluster predictions
78
79
  # confidence = self.conf_threshold
79
- clusters_out = []
80
+ clusters_mod = []
80
81
 
81
- for cluster in clusters:
82
+ for cluster in clusters_in:
82
83
  confidence = CLASS_THRESHOLDS[cluster.label]
83
84
  if cluster.confidence >= confidence:
84
85
  # annotation["created_by"] = "high_conf_pred"
@@ -86,10 +87,10 @@ class LayoutModel:
86
87
  # Remap class labels where needed.
87
88
  if cluster.label in CLASS_REMAPPINGS.keys():
88
89
  cluster.label = CLASS_REMAPPINGS[cluster.label]
89
- clusters_out.append(cluster)
90
+ clusters_mod.append(cluster)
90
91
 
91
92
  # map to dictionary clusters and cells, with bottom left origin
92
- clusters = [
93
+ clusters_orig = [
93
94
  {
94
95
  "id": c.id,
95
96
  "bbox": list(
@@ -99,7 +100,7 @@ class LayoutModel:
99
100
  "cell_ids": [],
100
101
  "type": c.label,
101
102
  }
102
- for c in clusters
103
+ for c in clusters_in
103
104
  ]
104
105
 
105
106
  clusters_out = [
@@ -113,9 +114,11 @@ class LayoutModel:
113
114
  "cell_ids": [],
114
115
  "type": c.label,
115
116
  }
116
- for c in clusters_out
117
+ for c in clusters_mod
117
118
  ]
118
119
 
120
+ del clusters_mod
121
+
119
122
  raw_cells = [
120
123
  {
121
124
  "id": c.id,
@@ -149,7 +152,7 @@ class LayoutModel:
149
152
 
150
153
  # Assign orphan cells with lower confidence predictions
151
154
  clusters_out, orphan_cell_indices = lu.assign_orphans_with_low_conf_pred(
152
- clusters_out, clusters, raw_cells, orphan_cell_indices
155
+ clusters_out, clusters_orig, raw_cells, orphan_cell_indices
153
156
  )
154
157
 
155
158
  # Refresh the cell_ids assignment, after creating new clusters using low conf predictions
@@ -178,7 +181,7 @@ class LayoutModel:
178
181
  ) = lu.cell_id_state_map(clusters_out, cell_count)
179
182
 
180
183
  clusters_out, orphan_cell_indices = lu.set_orphan_as_text(
181
- clusters_out, clusters, raw_cells, orphan_cell_indices
184
+ clusters_out, clusters_orig, raw_cells, orphan_cell_indices
182
185
  )
183
186
 
184
187
  _log.debug("---- 5. Merge Cells & and adapt the bounding boxes")
@@ -237,46 +240,55 @@ class LayoutModel:
237
240
  end_time = time.time() - start_time
238
241
  _log.debug(f"Finished post processing in seconds={end_time:.3f}")
239
242
 
240
- cells_out = [
243
+ cells_out_new = [
241
244
  Cell(
242
- id=c["id"],
245
+ id=c["id"], # type: ignore
243
246
  bbox=BoundingBox.from_tuple(
244
- coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
247
+ coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
245
248
  ).to_top_left_origin(page_height),
246
- text=c["text"],
249
+ text=c["text"], # type: ignore
247
250
  )
248
251
  for c in cells_out
249
252
  ]
253
+
254
+ del cells_out
255
+
250
256
  clusters_out_new = []
251
257
  for c in clusters_out:
252
- cluster_cells = [ccell for ccell in cells_out if ccell.id in c["cell_ids"]]
258
+ cluster_cells = [
259
+ ccell for ccell in cells_out_new if ccell.id in c["cell_ids"] # type: ignore
260
+ ]
253
261
  c_new = Cluster(
254
- id=c["id"],
262
+ id=c["id"], # type: ignore
255
263
  bbox=BoundingBox.from_tuple(
256
- coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
264
+ coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT # type: ignore
257
265
  ).to_top_left_origin(page_height),
258
- confidence=c["confidence"],
259
- label=c["type"],
266
+ confidence=c["confidence"], # type: ignore
267
+ label=DocItemLabel(c["type"]),
260
268
  cells=cluster_cells,
261
269
  )
262
270
  clusters_out_new.append(c_new)
263
271
 
264
- return clusters_out_new, cells_out
272
+ return clusters_out_new, cells_out_new
265
273
 
266
274
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
267
275
  for page in page_batch:
276
+ assert page.size is not None
277
+
268
278
  clusters = []
269
279
  for ix, pred_item in enumerate(
270
280
  self.layout_predictor.predict(page.get_image(scale=1.0))
271
281
  ):
282
+ label = DocItemLabel(
283
+ pred_item["label"].lower().replace(" ", "_").replace("-", "_")
284
+ ) # Temporary, until docling-ibm-model uses docling-core types
272
285
  cluster = Cluster(
273
286
  id=ix,
274
- label=pred_item["label"],
287
+ label=label,
275
288
  confidence=pred_item["confidence"],
276
289
  bbox=BoundingBox.model_validate(pred_item),
277
290
  cells=[],
278
291
  )
279
-
280
292
  clusters.append(cluster)
281
293
 
282
294
  # Map cells to clusters
docling/models/page_assemble_model.py +21 -5
@@ -2,22 +2,29 @@ import logging
2
2
  import re
3
3
  from typing import Iterable, List
4
4
 
5
+ from pydantic import BaseModel
6
+
5
7
  from docling.datamodel.base_models import (
6
8
  AssembledUnit,
7
9
  FigureElement,
8
10
  Page,
9
11
  PageElement,
10
- TableElement,
12
+ Table,
11
13
  TextElement,
12
14
  )
15
+ from docling.models.base_model import BasePageModel
13
16
  from docling.models.layout_model import LayoutModel
14
17
 
15
18
  _log = logging.getLogger(__name__)
16
19
 
17
20
 
18
- class PageAssembleModel:
19
- def __init__(self, config):
20
- self.config = config
21
+ class PageAssembleOptions(BaseModel):
22
+ keep_images: bool = False
23
+
24
+
25
+ class PageAssembleModel(BasePageModel):
26
+ def __init__(self, options: PageAssembleOptions):
27
+ self.options = options
21
28
 
22
29
  def sanitize_text(self, lines):
23
30
  if len(lines) <= 1:
@@ -46,6 +53,8 @@ class PageAssembleModel:
46
53
 
47
54
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
48
55
  for page in page_batch:
56
+ assert page._backend is not None
57
+ assert page.predictions.layout is not None
49
58
  # assembles some JSON output page by page.
50
59
 
51
60
  elements: List[PageElement] = []
@@ -84,7 +93,7 @@ class PageAssembleModel:
84
93
  if (
85
94
  not tbl
86
95
  ): # fallback: add table without structure, if it isn't present
87
- tbl = TableElement(
96
+ tbl = Table(
88
97
  label=cluster.label,
89
98
  id=cluster.id,
90
99
  text="",
@@ -145,4 +154,11 @@ class PageAssembleModel:
145
154
  elements=elements, headers=headers, body=body
146
155
  )
147
156
 
157
+ # Remove page images (can be disabled)
158
+ if not self.options.keep_images:
159
+ page._image_cache = {}
160
+
161
+ # Unload backend
162
+ page._backend.unload()
163
+
148
164
  yield page
docling/models/page_preprocessing_model.py +57 -0
@@ -0,0 +1,57 @@
1
+ from typing import Iterable, Optional
2
+
3
+ from PIL import ImageDraw
4
+ from pydantic import BaseModel
5
+
6
+ from docling.datamodel.base_models import Page
7
+ from docling.models.base_model import BasePageModel
8
+
9
+
10
+ class PagePreprocessingOptions(BaseModel):
11
+ images_scale: Optional[float]
12
+
13
+
14
+ class PagePreprocessingModel(BasePageModel):
15
+ def __init__(self, options: PagePreprocessingOptions):
16
+ self.options = options
17
+
18
+ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
19
+ for page in page_batch:
20
+ page = self._populate_page_images(page)
21
+ page = self._parse_page_cells(page)
22
+ yield page
23
+
24
+ # Generate the page image and store it in the page object
25
+ def _populate_page_images(self, page: Page) -> Page:
26
+ # default scale
27
+ page.get_image(
28
+ scale=1.0
29
+ ) # puts the page image on the image cache at default scale
30
+
31
+ images_scale = self.options.images_scale
32
+ # user requested scales
33
+ if images_scale is not None:
34
+ page._default_image_scale = images_scale
35
+ page.get_image(
36
+ scale=images_scale
37
+ ) # this will trigger storing the image in the internal cache
38
+
39
+ return page
40
+
41
+ # Extract and populate the page cells and store it in the page object
42
+ def _parse_page_cells(self, page: Page) -> Page:
43
+ assert page._backend is not None
44
+
45
+ page.cells = list(page._backend.get_text_cells())
46
+
47
+ # DEBUG code:
48
+ def draw_text_boxes(image, cells):
49
+ draw = ImageDraw.Draw(image)
50
+ for c in cells:
51
+ x0, y0, x1, y1 = c.bbox.as_tuple()
52
+ draw.rectangle([(x0, y0), (x1, y1)], outline="red")
53
+ image.show()
54
+
55
+ # draw_text_boxes(page.get_image(scale=1.0), cells)
56
+
57
+ return page