docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +192 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +19 -1
  14. docling/backend/msword_backend.py +68 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +135 -53
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +54 -32
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/hf_mlx_model.py +137 -0
  31. docling/models/ocr_mac_model.py +39 -11
  32. docling/models/page_preprocessing_model.py +4 -0
  33. docling/models/picture_description_api_model.py +20 -3
  34. docling/models/picture_description_base_model.py +19 -3
  35. docling/models/picture_description_vlm_model.py +14 -2
  36. docling/models/plugins/__init__.py +0 -0
  37. docling/models/plugins/defaults.py +28 -0
  38. docling/models/rapid_ocr_model.py +34 -13
  39. docling/models/table_structure_model.py +13 -4
  40. docling/models/tesseract_ocr_cli_model.py +40 -15
  41. docling/models/tesseract_ocr_model.py +37 -12
  42. docling/pipeline/standard_pdf_pipeline.py +25 -78
  43. docling/pipeline/vlm_pipeline.py +78 -398
  44. docling/utils/export.py +8 -6
  45. docling/utils/layout_postprocessor.py +26 -23
  46. docling/utils/visualization.py +1 -1
  47. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
  48. docling-2.28.0.dist-info/RECORD +84 -0
  49. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
  50. docling-2.26.0.dist-info/RECORD +0 -72
  51. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
  52. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
@@ -1,30 +1,13 @@
1
- import itertools
2
1
  import logging
3
- import re
4
2
  import warnings
5
3
  from io import BytesIO
6
-
7
- # from io import BytesIO
8
4
  from pathlib import Path
9
- from typing import Optional
5
+ from typing import List, Optional, Union, cast
10
6
 
11
- from docling_core.types import DoclingDocument
12
- from docling_core.types.doc import (
13
- BoundingBox,
14
- DocItem,
15
- DocItemLabel,
16
- DoclingDocument,
17
- GroupLabel,
18
- ImageRef,
19
- ImageRefMode,
20
- PictureItem,
21
- ProvenanceItem,
22
- Size,
23
- TableCell,
24
- TableData,
25
- TableItem,
26
- )
27
- from docling_core.types.doc.tokens import DocumentToken, TableToken
7
+ # from docling_core.types import DoclingDocument
8
+ from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
9
+ from docling_core.types.doc.document import DocTagsDocument
10
+ from PIL import Image as PILImage
28
11
 
29
12
  from docling.backend.abstract_backend import AbstractDocumentBackend
30
13
  from docling.backend.md_backend import MarkdownDocumentBackend
@@ -32,11 +15,12 @@ from docling.backend.pdf_backend import PdfDocumentBackend
32
15
  from docling.datamodel.base_models import InputFormat, Page
33
16
  from docling.datamodel.document import ConversionResult, InputDocument
34
17
  from docling.datamodel.pipeline_options import (
35
- PdfPipelineOptions,
18
+ InferenceFramework,
36
19
  ResponseFormat,
37
20
  VlmPipelineOptions,
38
21
  )
39
22
  from docling.datamodel.settings import settings
23
+ from docling.models.hf_mlx_model import HuggingFaceMlxModel
40
24
  from docling.models.hf_vlm_model import HuggingFaceVlmModel
41
25
  from docling.pipeline.base_pipeline import PaginatedPipeline
42
26
  from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -50,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
50
34
  super().__init__(pipeline_options)
51
35
  self.keep_backend = True
52
36
 
53
- warnings.warn(
54
- "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
55
- category=UserWarning,
56
- stacklevel=2,
57
- )
58
-
59
37
  self.pipeline_options: VlmPipelineOptions
60
38
 
61
39
  artifacts_path: Optional[Path] = None
@@ -79,14 +57,27 @@ class VlmPipeline(PaginatedPipeline):
79
57
 
80
58
  self.keep_images = self.pipeline_options.generate_page_images
81
59
 
82
- self.build_pipe = [
83
- HuggingFaceVlmModel(
84
- enabled=True, # must be always enabled for this pipeline to make sense.
85
- artifacts_path=artifacts_path,
86
- accelerator_options=pipeline_options.accelerator_options,
87
- vlm_options=self.pipeline_options.vlm_options,
88
- ),
89
- ]
60
+ if (
61
+ self.pipeline_options.vlm_options.inference_framework
62
+ == InferenceFramework.MLX
63
+ ):
64
+ self.build_pipe = [
65
+ HuggingFaceMlxModel(
66
+ enabled=True, # must be always enabled for this pipeline to make sense.
67
+ artifacts_path=artifacts_path,
68
+ accelerator_options=pipeline_options.accelerator_options,
69
+ vlm_options=self.pipeline_options.vlm_options,
70
+ ),
71
+ ]
72
+ else:
73
+ self.build_pipe = [
74
+ HuggingFaceVlmModel(
75
+ enabled=True, # must be always enabled for this pipeline to make sense.
76
+ artifacts_path=artifacts_path,
77
+ accelerator_options=pipeline_options.accelerator_options,
78
+ vlm_options=self.pipeline_options.vlm_options,
79
+ ),
80
+ ]
90
81
 
91
82
  self.enrichment_pipe = [
92
83
  # Other models working on `NodeItem` elements in the DoclingDocument
@@ -100,6 +91,17 @@ class VlmPipeline(PaginatedPipeline):
100
91
 
101
92
  return page
102
93
 
94
+ def extract_text_from_backend(
95
+ self, page: Page, bbox: Union[BoundingBox, None]
96
+ ) -> str:
97
+ # Convert bounding box normalized to 0-100 into page coordinates for cropping
98
+ text = ""
99
+ if bbox:
100
+ if page.size:
101
+ if page._backend:
102
+ text = page._backend.get_text_in_rect(bbox)
103
+ return text
104
+
103
105
  def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
104
106
  with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
105
107
 
@@ -107,7 +109,45 @@ class VlmPipeline(PaginatedPipeline):
107
109
  self.pipeline_options.vlm_options.response_format
108
110
  == ResponseFormat.DOCTAGS
109
111
  ):
110
- conv_res.document = self._turn_tags_into_doc(conv_res.pages)
112
+ doctags_list = []
113
+ image_list = []
114
+ for page in conv_res.pages:
115
+ predicted_doctags = ""
116
+ img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
117
+ if page.predictions.vlm_response:
118
+ predicted_doctags = page.predictions.vlm_response.text
119
+ if page.image:
120
+ img = page.image
121
+ image_list.append(img)
122
+ doctags_list.append(predicted_doctags)
123
+
124
+ doctags_list_c = cast(List[Union[Path, str]], doctags_list)
125
+ image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
126
+ doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
127
+ doctags_list_c, image_list_c
128
+ )
129
+ conv_res.document.load_from_doctags(doctags_doc)
130
+
131
+ # If forced backend text, replace model predicted text with backend one
132
+ if page.size:
133
+ if self.force_backend_text:
134
+ scale = self.pipeline_options.images_scale
135
+ for element, _level in conv_res.document.iterate_items():
136
+ if (
137
+ not isinstance(element, TextItem)
138
+ or len(element.prov) == 0
139
+ ):
140
+ continue
141
+ crop_bbox = (
142
+ element.prov[0]
143
+ .bbox.scaled(scale=scale)
144
+ .to_top_left_origin(
145
+ page_height=page.size.height * scale
146
+ )
147
+ )
148
+ txt = self.extract_text_from_backend(page, crop_bbox)
149
+ element.text = txt
150
+ element.orig = txt
111
151
  elif (
112
152
  self.pipeline_options.vlm_options.response_format
113
153
  == ResponseFormat.MARKDOWN
@@ -165,366 +205,6 @@ class VlmPipeline(PaginatedPipeline):
165
205
  )
166
206
  return backend.convert()
167
207
 
168
- def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
169
- ###############################################
170
- # Tag definitions and color mappings
171
- ###############################################
172
-
173
- # Maps the recognized tag to a Docling label.
174
- # Code items will be given DocItemLabel.CODE
175
- tag_to_doclabel = {
176
- "title": DocItemLabel.TITLE,
177
- "document_index": DocItemLabel.DOCUMENT_INDEX,
178
- "otsl": DocItemLabel.TABLE,
179
- "section_header_level_1": DocItemLabel.SECTION_HEADER,
180
- "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
181
- "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
182
- "text": DocItemLabel.TEXT,
183
- "page_header": DocItemLabel.PAGE_HEADER,
184
- "page_footer": DocItemLabel.PAGE_FOOTER,
185
- "formula": DocItemLabel.FORMULA,
186
- "caption": DocItemLabel.CAPTION,
187
- "picture": DocItemLabel.PICTURE,
188
- "list_item": DocItemLabel.LIST_ITEM,
189
- "footnote": DocItemLabel.FOOTNOTE,
190
- "code": DocItemLabel.CODE,
191
- }
192
-
193
- # Maps each tag to an associated bounding box color.
194
- tag_to_color = {
195
- "title": "blue",
196
- "document_index": "darkblue",
197
- "otsl": "green",
198
- "section_header_level_1": "purple",
199
- "checkbox_selected": "black",
200
- "checkbox_unselected": "gray",
201
- "text": "red",
202
- "page_header": "orange",
203
- "page_footer": "cyan",
204
- "formula": "pink",
205
- "caption": "magenta",
206
- "picture": "yellow",
207
- "list_item": "brown",
208
- "footnote": "darkred",
209
- "code": "lightblue",
210
- }
211
-
212
- def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
213
- """Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
214
- coords = re.findall(r"<loc_(\d+)>", text_chunk)
215
- if len(coords) == 4:
216
- l, t, r, b = map(float, coords)
217
- return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
218
- return None
219
-
220
- def extract_inner_text(text_chunk: str) -> str:
221
- """Strips all <...> tags inside the chunk to get the raw text content."""
222
- return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
223
-
224
- def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
225
- # Convert bounding box normalized to 0-100 into page coordinates for cropping
226
- text = ""
227
- if bbox:
228
- if page.size:
229
- bbox.l = bbox.l * page.size.width
230
- bbox.t = bbox.t * page.size.height
231
- bbox.r = bbox.r * page.size.width
232
- bbox.b = bbox.b * page.size.height
233
- if page._backend:
234
- text = page._backend.get_text_in_rect(bbox)
235
- return text
236
-
237
- def otsl_parse_texts(texts, tokens):
238
- split_word = TableToken.OTSL_NL.value
239
- split_row_tokens = [
240
- list(y)
241
- for x, y in itertools.groupby(tokens, lambda z: z == split_word)
242
- if not x
243
- ]
244
- table_cells = []
245
- r_idx = 0
246
- c_idx = 0
247
-
248
- def count_right(tokens, c_idx, r_idx, which_tokens):
249
- span = 0
250
- c_idx_iter = c_idx
251
- while tokens[r_idx][c_idx_iter] in which_tokens:
252
- c_idx_iter += 1
253
- span += 1
254
- if c_idx_iter >= len(tokens[r_idx]):
255
- return span
256
- return span
257
-
258
- def count_down(tokens, c_idx, r_idx, which_tokens):
259
- span = 0
260
- r_idx_iter = r_idx
261
- while tokens[r_idx_iter][c_idx] in which_tokens:
262
- r_idx_iter += 1
263
- span += 1
264
- if r_idx_iter >= len(tokens):
265
- return span
266
- return span
267
-
268
- for i, text in enumerate(texts):
269
- cell_text = ""
270
- if text in [
271
- TableToken.OTSL_FCEL.value,
272
- TableToken.OTSL_ECEL.value,
273
- TableToken.OTSL_CHED.value,
274
- TableToken.OTSL_RHED.value,
275
- TableToken.OTSL_SROW.value,
276
- ]:
277
- row_span = 1
278
- col_span = 1
279
- right_offset = 1
280
- if text != TableToken.OTSL_ECEL.value:
281
- cell_text = texts[i + 1]
282
- right_offset = 2
283
-
284
- # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
285
- next_right_cell = ""
286
- if i + right_offset < len(texts):
287
- next_right_cell = texts[i + right_offset]
288
-
289
- next_bottom_cell = ""
290
- if r_idx + 1 < len(split_row_tokens):
291
- if c_idx < len(split_row_tokens[r_idx + 1]):
292
- next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
293
-
294
- if next_right_cell in [
295
- TableToken.OTSL_LCEL.value,
296
- TableToken.OTSL_XCEL.value,
297
- ]:
298
- # we have horisontal spanning cell or 2d spanning cell
299
- col_span += count_right(
300
- split_row_tokens,
301
- c_idx + 1,
302
- r_idx,
303
- [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
304
- )
305
- if next_bottom_cell in [
306
- TableToken.OTSL_UCEL.value,
307
- TableToken.OTSL_XCEL.value,
308
- ]:
309
- # we have a vertical spanning cell or 2d spanning cell
310
- row_span += count_down(
311
- split_row_tokens,
312
- c_idx,
313
- r_idx + 1,
314
- [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
315
- )
316
-
317
- table_cells.append(
318
- TableCell(
319
- text=cell_text.strip(),
320
- row_span=row_span,
321
- col_span=col_span,
322
- start_row_offset_idx=r_idx,
323
- end_row_offset_idx=r_idx + row_span,
324
- start_col_offset_idx=c_idx,
325
- end_col_offset_idx=c_idx + col_span,
326
- )
327
- )
328
- if text in [
329
- TableToken.OTSL_FCEL.value,
330
- TableToken.OTSL_ECEL.value,
331
- TableToken.OTSL_CHED.value,
332
- TableToken.OTSL_RHED.value,
333
- TableToken.OTSL_SROW.value,
334
- TableToken.OTSL_LCEL.value,
335
- TableToken.OTSL_UCEL.value,
336
- TableToken.OTSL_XCEL.value,
337
- ]:
338
- c_idx += 1
339
- if text == TableToken.OTSL_NL.value:
340
- r_idx += 1
341
- c_idx = 0
342
- return table_cells, split_row_tokens
343
-
344
- def otsl_extract_tokens_and_text(s: str):
345
- # Pattern to match anything enclosed by < > (including the angle brackets themselves)
346
- pattern = r"(<[^>]+>)"
347
- # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
348
- tokens = re.findall(pattern, s)
349
- # Remove any tokens that start with "<loc_"
350
- tokens = [
351
- token
352
- for token in tokens
353
- if not (
354
- token.startswith(rf"<{DocumentToken.LOC.value}")
355
- or token
356
- in [
357
- rf"<{DocumentToken.OTSL.value}>",
358
- rf"</{DocumentToken.OTSL.value}>",
359
- ]
360
- )
361
- ]
362
- # Split the string by those tokens to get the in-between text
363
- text_parts = re.split(pattern, s)
364
- text_parts = [
365
- token
366
- for token in text_parts
367
- if not (
368
- token.startswith(rf"<{DocumentToken.LOC.value}")
369
- or token
370
- in [
371
- rf"<{DocumentToken.OTSL.value}>",
372
- rf"</{DocumentToken.OTSL.value}>",
373
- ]
374
- )
375
- ]
376
- # Remove any empty or purely whitespace strings from text_parts
377
- text_parts = [part for part in text_parts if part.strip()]
378
-
379
- return tokens, text_parts
380
-
381
- def parse_table_content(otsl_content: str) -> TableData:
382
- tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
383
- table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
384
-
385
- return TableData(
386
- num_rows=len(split_row_tokens),
387
- num_cols=(
388
- max(len(row) for row in split_row_tokens) if split_row_tokens else 0
389
- ),
390
- table_cells=table_cells,
391
- )
392
-
393
- doc = DoclingDocument(name="Document")
394
- for pg_idx, page in enumerate(pages):
395
- xml_content = ""
396
- predicted_text = ""
397
- if page.predictions.vlm_response:
398
- predicted_text = page.predictions.vlm_response.text
399
- image = page.image
400
-
401
- page_no = pg_idx + 1
402
- bounding_boxes = []
403
-
404
- if page.size:
405
- pg_width = page.size.width
406
- pg_height = page.size.height
407
- size = Size(width=pg_width, height=pg_height)
408
- parent_page = doc.add_page(page_no=page_no, size=size)
409
-
410
- """
411
- 1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
412
- 2. For each chunk, extracts bounding box (if any) and inner text.
413
- 3. Adds the item to a DoclingDocument structure with the right label.
414
- 4. Tracks bounding boxes + color in a separate list for later visualization.
415
- """
416
-
417
- # Regex for all recognized tags
418
- tag_pattern = (
419
- rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
420
- rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
421
- rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
422
- rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
423
- rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
424
- rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
425
- rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
426
- )
427
-
428
- # DocumentToken.OTSL
429
- pattern = re.compile(tag_pattern, re.DOTALL)
430
-
431
- # Go through each match in order
432
- for match in pattern.finditer(predicted_text):
433
- full_chunk = match.group(0)
434
- tag_name = match.group("tag")
435
-
436
- bbox = extract_bounding_box(full_chunk)
437
- doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
438
- color = tag_to_color.get(tag_name, "white")
439
-
440
- # Store bounding box + color
441
- if bbox:
442
- bounding_boxes.append((bbox, color))
443
-
444
- if tag_name == DocumentToken.OTSL.value:
445
- table_data = parse_table_content(full_chunk)
446
- bbox = extract_bounding_box(full_chunk)
447
-
448
- if bbox:
449
- prov = ProvenanceItem(
450
- bbox=bbox.resize_by_scale(pg_width, pg_height),
451
- charspan=(0, 0),
452
- page_no=page_no,
453
- )
454
- doc.add_table(data=table_data, prov=prov)
455
- else:
456
- doc.add_table(data=table_data)
457
-
458
- elif tag_name == DocItemLabel.PICTURE:
459
- text_caption_content = extract_inner_text(full_chunk)
460
- if image:
461
- if bbox:
462
- im_width, im_height = image.size
463
-
464
- crop_box = (
465
- int(bbox.l * im_width),
466
- int(bbox.t * im_height),
467
- int(bbox.r * im_width),
468
- int(bbox.b * im_height),
469
- )
470
- cropped_image = image.crop(crop_box)
471
- pic = doc.add_picture(
472
- parent=None,
473
- image=ImageRef.from_pil(image=cropped_image, dpi=72),
474
- prov=(
475
- ProvenanceItem(
476
- bbox=bbox.resize_by_scale(pg_width, pg_height),
477
- charspan=(0, 0),
478
- page_no=page_no,
479
- )
480
- ),
481
- )
482
- # If there is a caption to an image, add it as well
483
- if len(text_caption_content) > 0:
484
- caption_item = doc.add_text(
485
- label=DocItemLabel.CAPTION,
486
- text=text_caption_content,
487
- parent=None,
488
- )
489
- pic.captions.append(caption_item.get_ref())
490
- else:
491
- if bbox:
492
- # In case we don't have access to an binary of an image
493
- doc.add_picture(
494
- parent=None,
495
- prov=ProvenanceItem(
496
- bbox=bbox, charspan=(0, 0), page_no=page_no
497
- ),
498
- )
499
- # If there is a caption to an image, add it as well
500
- if len(text_caption_content) > 0:
501
- caption_item = doc.add_text(
502
- label=DocItemLabel.CAPTION,
503
- text=text_caption_content,
504
- parent=None,
505
- )
506
- pic.captions.append(caption_item.get_ref())
507
- else:
508
- # For everything else, treat as text
509
- if self.force_backend_text:
510
- text_content = extract_text_from_backend(page, bbox)
511
- else:
512
- text_content = extract_inner_text(full_chunk)
513
- doc.add_text(
514
- label=doc_label,
515
- text=text_content,
516
- prov=(
517
- ProvenanceItem(
518
- bbox=bbox.resize_by_scale(pg_width, pg_height),
519
- charspan=(0, len(text_content)),
520
- page_no=page_no,
521
- )
522
- if bbox
523
- else None
524
- ),
525
- )
526
- return doc
527
-
528
208
  @classmethod
529
209
  def get_default_options(cls) -> VlmPipelineOptions:
530
210
  return VlmPipelineOptions()
docling/utils/export.py CHANGED
@@ -2,9 +2,9 @@ import logging
2
2
  from typing import Any, Dict, Iterable, List, Tuple, Union
3
3
 
4
4
  from docling_core.types.doc import BoundingBox, CoordOrigin
5
+ from docling_core.types.doc.page import TextCell
5
6
  from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
6
7
 
7
- from docling.datamodel.base_models import OcrCell
8
8
  from docling.datamodel.document import ConversionResult, Page
9
9
 
10
10
  _log = logging.getLogger(__name__)
@@ -86,11 +86,13 @@ def generate_multimodal_pages(
86
86
  if page.size is None:
87
87
  return cells
88
88
  for cell in page.cells:
89
- new_bbox = cell.bbox.to_top_left_origin(
90
- page_height=page.size.height
91
- ).normalized(page_size=page.size)
92
- is_ocr = isinstance(cell, OcrCell)
93
- ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
89
+ new_bbox = (
90
+ cell.rect.to_bounding_box()
91
+ .to_top_left_origin(page_height=page.size.height)
92
+ .normalized(page_size=page.size)
93
+ )
94
+ is_ocr = cell.from_ocr
95
+ ocr_confidence = cell.confidence
94
96
  cells.append(
95
97
  {
96
98
  "text": cell.text,