docling 2.36.0__py3-none-any.whl → 2.37.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
20
20
  from PIL import Image, UnidentifiedImageError
21
21
  from pptx import Presentation
22
22
  from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
23
+ from pptx.oxml.text import CT_TextLineBreak
23
24
 
24
25
  from docling.backend.abstract_backend import (
25
26
  DeclarativeDocumentBackend,
@@ -120,136 +121,91 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
120
121
 
121
122
  return prov
122
123
 
123
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
124
- is_a_list = False
124
+ def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
125
125
  is_list_group_created = False
126
126
  enum_list_item_value = 0
127
127
  new_list = None
128
- bullet_type = "None"
129
- list_label = GroupLabel.LIST
130
128
  doc_label = DocItemLabel.LIST_ITEM
131
129
  prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
132
130
 
133
- # Identify if shape contains lists
134
- for paragraph in shape.text_frame.paragraphs:
135
- # Check if paragraph is a bullet point using the `element` XML
131
+ def is_list_item(paragraph):
132
+ """Check if the paragraph is a list item."""
136
133
  p = paragraph._element
137
134
  if (
138
135
  p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
139
136
  is not None
140
137
  ):
141
- bullet_type = "Bullet"
142
- is_a_list = True
138
+ return (True, "Bullet")
143
139
  elif (
144
140
  p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
145
141
  is not None
146
142
  ):
147
- bullet_type = "Numbered"
148
- is_a_list = True
149
- else:
150
- is_a_list = False
151
-
152
- if paragraph.level > 0:
143
+ return (True, "Numbered")
144
+ elif paragraph.level > 0:
153
145
  # Most likely a sub-list
154
- is_a_list = True
155
-
156
- if is_a_list:
157
- # Determine if this is an unordered list or an ordered list.
158
- # Set GroupLabel.ORDERED_LIST when it fits.
159
- if bullet_type == "Numbered":
160
- list_label = GroupLabel.ORDERED_LIST
161
-
162
- if is_a_list:
163
- _log.debug("LIST DETECTED!")
146
+ return (True, "None")
164
147
  else:
165
- _log.debug("No List")
166
-
167
- # If there is a list inside of the shape, create a new docling list to assign list items to
168
- # if is_a_list:
169
- # new_list = doc.add_group(
170
- # label=list_label, name=f"list", parent=parent_slide
171
- # )
148
+ return (False, "None")
172
149
 
173
150
  # Iterate through paragraphs to build up text
174
151
  for paragraph in shape.text_frame.paragraphs:
175
- # p_text = paragraph.text.strip()
152
+ is_a_list, bullet_type = is_list_item(paragraph)
176
153
  p = paragraph._element
177
- enum_list_item_value += 1
178
- inline_paragraph_text = ""
179
- inline_list_item_text = ""
180
-
181
- for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
182
- if len(e.text.strip()) > 0:
183
- e_is_a_list_item = False
184
- is_numbered = False
185
- if (
186
- p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
187
- is not None
188
- ):
189
- bullet_type = "Bullet"
190
- e_is_a_list_item = True
191
- elif (
192
- p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
193
- is not None
194
- ):
195
- bullet_type = "Numbered"
196
- is_numbered = True
197
- e_is_a_list_item = True
198
- else:
199
- e_is_a_list_item = False
200
-
201
- if e_is_a_list_item:
202
- if len(inline_paragraph_text) > 0:
203
- # output accumulated inline text:
204
- doc.add_text(
205
- label=doc_label,
206
- parent=parent_slide,
207
- text=inline_paragraph_text,
208
- prov=prov,
209
- )
210
- # Set marker and enumerated arguments if this is an enumeration element.
211
- inline_list_item_text += e.text
212
- # print(e.text)
213
- else:
214
- # Assign proper label to the text, depending if it's a Title or Section Header
215
- # For other types of text, assign - PARAGRAPH
216
- doc_label = DocItemLabel.PARAGRAPH
217
- if shape.is_placeholder:
218
- placeholder_type = shape.placeholder_format.type
219
- if placeholder_type in [
220
- PP_PLACEHOLDER.CENTER_TITLE,
221
- PP_PLACEHOLDER.TITLE,
222
- ]:
223
- # It's a title
224
- doc_label = DocItemLabel.TITLE
225
- elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
226
- DocItemLabel.SECTION_HEADER
227
- enum_list_item_value = 0
228
- inline_paragraph_text += e.text
229
-
230
- if len(inline_paragraph_text) > 0:
231
- # output accumulated inline text:
232
- doc.add_text(
233
- label=doc_label,
234
- parent=parent_slide,
235
- text=inline_paragraph_text,
236
- prov=prov,
237
- )
238
154
 
239
- if len(inline_list_item_text) > 0:
155
+ # Convert line breaks to spaces and accumulate text
156
+ p_text = ""
157
+ for e in p.content_children:
158
+ if isinstance(e, CT_TextLineBreak):
159
+ p_text += " "
160
+ else:
161
+ p_text += e.text
162
+
163
+ if is_a_list:
240
164
  enum_marker = ""
241
- if is_numbered:
242
- enum_marker = str(enum_list_item_value) + "."
165
+ enumerated = bullet_type == "Numbered"
166
+
243
167
  if not is_list_group_created:
244
168
  new_list = doc.add_group(
245
- label=list_label, name="list", parent=parent_slide
169
+ label=GroupLabel.ORDERED_LIST
170
+ if enumerated
171
+ else GroupLabel.LIST,
172
+ name="list",
173
+ parent=parent_slide,
246
174
  )
247
175
  is_list_group_created = True
176
+ enum_list_item_value = 0
177
+
178
+ if enumerated:
179
+ enum_list_item_value += 1
180
+ enum_marker = str(enum_list_item_value) + "."
181
+
248
182
  doc.add_list_item(
249
183
  marker=enum_marker,
250
- enumerated=is_numbered,
184
+ enumerated=enumerated,
251
185
  parent=new_list,
252
- text=inline_list_item_text,
186
+ text=p_text,
187
+ prov=prov,
188
+ )
189
+ else: # is paragraph not a list item
190
+ # Assign proper label to the text, depending if it's a Title or Section Header
191
+ # For other types of text, assign - PARAGRAPH
192
+ doc_label = DocItemLabel.PARAGRAPH
193
+ if shape.is_placeholder:
194
+ placeholder_type = shape.placeholder_format.type
195
+ if placeholder_type in [
196
+ PP_PLACEHOLDER.CENTER_TITLE,
197
+ PP_PLACEHOLDER.TITLE,
198
+ ]:
199
+ # It's a title
200
+ doc_label = DocItemLabel.TITLE
201
+ elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
202
+ DocItemLabel.SECTION_HEADER
203
+
204
+ # output accumulated inline text:
205
+ doc.add_text(
206
+ label=doc_label,
207
+ parent=parent_slide,
208
+ text=p_text,
253
209
  prov=prov,
254
210
  )
255
211
  return
@@ -423,18 +379,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
423
379
  # Handle notes slide
424
380
  if slide.has_notes_slide:
425
381
  notes_slide = slide.notes_slide
426
- notes_text = notes_slide.notes_text_frame.text.strip()
427
- if notes_text:
428
- bbox = BoundingBox(l=0, t=0, r=0, b=0)
429
- prov = ProvenanceItem(
430
- page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
431
- )
432
- doc.add_text(
433
- label=DocItemLabel.TEXT,
434
- parent=parent_slide,
435
- text=notes_text,
436
- prov=prov,
437
- content_layer=ContentLayer.FURNITURE,
438
- )
382
+ if notes_slide.notes_text_frame is not None:
383
+ notes_text = notes_slide.notes_text_frame.text.strip()
384
+ if notes_text:
385
+ bbox = BoundingBox(l=0, t=0, r=0, b=0)
386
+ prov = ProvenanceItem(
387
+ page_no=slide_ind + 1,
388
+ charspan=[0, len(notes_text)],
389
+ bbox=bbox,
390
+ )
391
+ doc.add_text(
392
+ label=DocItemLabel.TEXT,
393
+ parent=parent_slide,
394
+ text=notes_text,
395
+ prov=prov,
396
+ content_layer=ContentLayer.FURNITURE,
397
+ )
439
398
 
440
399
  return doc
@@ -60,8 +60,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
60
60
  self.equation_bookends: str = "<eq>{EQ}</eq>"
61
61
  # Track processed textbox elements to avoid duplication
62
62
  self.processed_textbox_elements: List[int] = []
63
- # Track content hash of processed paragraphs to avoid duplicate content
64
- self.processed_paragraph_content: List[str] = []
65
63
 
66
64
  for i in range(-1, self.max_levels):
67
65
  self.parents[i] = None
@@ -593,9 +591,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
593
591
  # Add the sorted paragraphs to our processing list
594
592
  all_paragraphs.extend(sorted_container_paragraphs)
595
593
 
594
+ # Track processed paragraphs to avoid duplicates (same content and position)
595
+ processed_paragraphs = set()
596
+
596
597
  # Process all the paragraphs
597
- for p, _ in all_paragraphs:
598
- self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
598
+ for p, position in all_paragraphs:
599
+ # Create paragraph object to get text content
600
+ paragraph = Paragraph(p, docx_obj)
601
+ text_content = paragraph.text
602
+
603
+ # Create a unique identifier based on content and position
604
+ paragraph_id = (text_content, position)
605
+
606
+ # Skip if this paragraph (same content and position) was already processed
607
+ if paragraph_id in processed_paragraphs:
608
+ _log.debug(
609
+ f"Skipping duplicate paragraph: content='{text_content[:50]}...', position={position}"
610
+ )
611
+ continue
612
+
613
+ # Mark this paragraph as processed
614
+ processed_paragraphs.add(paragraph_id)
615
+
616
+ self._handle_text_elements(p, docx_obj, doc)
599
617
 
600
618
  # Restore original parent
601
619
  self.parents[level] = original_parent
@@ -669,22 +687,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
669
687
  element: BaseOxmlElement,
670
688
  docx_obj: DocxDocument,
671
689
  doc: DoclingDocument,
672
- is_from_textbox: bool = False,
673
690
  ) -> None:
674
691
  paragraph = Paragraph(element, docx_obj)
675
692
 
676
- # Skip if from a textbox and this exact paragraph content was already processed
677
- # Skip if from a textbox and this exact paragraph content was already processed
678
- raw_text = paragraph.text
679
- if is_from_textbox and raw_text:
680
- # Create a simple hash of content to detect duplicates
681
- content_hash = f"{len(raw_text)}:{raw_text[:50]}"
682
- if content_hash in self.processed_paragraph_content:
683
- _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
684
- return
685
- self.processed_paragraph_content.append(content_hash)
686
-
687
- text, equations = self._handle_equations_in_text(element=element, text=raw_text)
693
+ text, equations = self._handle_equations_in_text(
694
+ element=element, text=paragraph.text
695
+ )
688
696
 
689
697
  if text is None:
690
698
  return
@@ -750,7 +758,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
750
758
  self._add_header(doc, p_level, text, is_numbered_style)
751
759
 
752
760
  elif len(equations) > 0:
753
- if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
761
+ if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
762
+ text
763
+ ) > 0:
754
764
  # Standalone equation
755
765
  level = self._get_level()
756
766
  doc.add_text(
@@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
8
8
  import pypdfium2 as pdfium
9
9
  import pypdfium2.raw as pdfium_c
10
10
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
11
- from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
11
+ from docling_core.types.doc.page import (
12
+ BoundingRectangle,
13
+ PdfPageBoundaryType,
14
+ PdfPageGeometry,
15
+ SegmentedPdfPage,
16
+ TextCell,
17
+ )
12
18
  from PIL import Image, ImageDraw
13
19
  from pypdfium2 import PdfTextPage
14
20
  from pypdfium2._helpers.misc import PdfiumError
@@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
16
22
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
17
23
  from docling.utils.locks import pypdfium2_lock
18
24
 
25
+
26
+ def get_pdf_page_geometry(
27
+ ppage: pdfium.PdfPage,
28
+ angle: float = 0.0,
29
+ boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
30
+ ) -> PdfPageGeometry:
31
+ """
32
+ Create PdfPageGeometry from a pypdfium2 PdfPage object.
33
+
34
+ Args:
35
+ ppage: pypdfium2 PdfPage object
36
+ angle: Page rotation angle in degrees (default: 0.0)
37
+ boundary_type: The boundary type for the page (default: CROP_BOX)
38
+
39
+ Returns:
40
+ PdfPageGeometry with all the different bounding boxes properly set
41
+ """
42
+ with pypdfium2_lock:
43
+ # Get the main bounding box (intersection of crop_box and media_box)
44
+ bbox_tuple = ppage.get_bbox()
45
+ bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
46
+
47
+ # Get all the different page boxes from pypdfium2
48
+ media_box_tuple = ppage.get_mediabox()
49
+ crop_box_tuple = ppage.get_cropbox()
50
+ art_box_tuple = ppage.get_artbox()
51
+ bleed_box_tuple = ppage.get_bleedbox()
52
+ trim_box_tuple = ppage.get_trimbox()
53
+
54
+ # Convert to BoundingBox objects using existing from_tuple method
55
+ # pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
56
+ # Use bbox as fallback when specific box types are not defined
57
+ media_bbox = (
58
+ BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
59
+ if media_box_tuple
60
+ else bbox
61
+ )
62
+ crop_bbox = (
63
+ BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
64
+ if crop_box_tuple
65
+ else bbox
66
+ )
67
+ art_bbox = (
68
+ BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
69
+ if art_box_tuple
70
+ else bbox
71
+ )
72
+ bleed_bbox = (
73
+ BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
74
+ if bleed_box_tuple
75
+ else bbox
76
+ )
77
+ trim_bbox = (
78
+ BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
79
+ if trim_box_tuple
80
+ else bbox
81
+ )
82
+
83
+ return PdfPageGeometry(
84
+ angle=angle,
85
+ rect=BoundingRectangle.from_bounding_box(bbox),
86
+ boundary_type=boundary_type,
87
+ art_bbox=art_bbox,
88
+ bleed_bbox=bleed_bbox,
89
+ crop_bbox=crop_bbox,
90
+ media_bbox=media_bbox,
91
+ trim_bbox=trim_bbox,
92
+ )
93
+
94
+
19
95
  if TYPE_CHECKING:
20
96
  from docling.datamodel.document import InputDocument
21
97
 
@@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
41
117
  def is_valid(self) -> bool:
42
118
  return self.valid
43
119
 
44
- def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
45
- AREA_THRESHOLD = 0 # 32 * 32
46
- page_size = self.get_size()
47
- with pypdfium2_lock:
48
- for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
49
- pos = obj.get_pos()
50
- cropbox = BoundingBox.from_tuple(
51
- pos, origin=CoordOrigin.BOTTOMLEFT
52
- ).to_top_left_origin(page_height=page_size.height)
53
-
54
- if cropbox.area() > AREA_THRESHOLD:
55
- cropbox = cropbox.scaled(scale=scale)
56
-
57
- yield cropbox
58
-
59
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
60
- with pypdfium2_lock:
61
- if not self.text_page:
62
- self.text_page = self._ppage.get_textpage()
63
-
64
- if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
65
- bbox = bbox.to_bottom_left_origin(self.get_size().height)
66
-
67
- with pypdfium2_lock:
68
- text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
69
-
70
- return text_piece
71
-
72
- def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
73
- return None
74
-
75
- def get_text_cells(self) -> Iterable[TextCell]:
120
+ def _compute_text_cells(self) -> List[TextCell]:
121
+ """Compute text cells from pypdfium."""
76
122
  with pypdfium2_lock:
77
123
  if not self.text_page:
78
124
  self.text_page = self._ppage.get_textpage()
@@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
203
249
 
204
250
  return merged_cells
205
251
 
206
- def draw_clusters_and_cells():
207
- image = (
208
- self.get_page_image()
209
- ) # make new image to avoid drawing on the saved ones
210
- draw = ImageDraw.Draw(image)
211
- for c in cells:
212
- x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
213
- cell_color = (
214
- random.randint(30, 140),
215
- random.randint(30, 140),
216
- random.randint(30, 140),
217
- )
218
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
219
- image.show()
252
+ return merge_horizontal_cells(cells)
220
253
 
221
- # before merge:
222
- # draw_clusters_and_cells()
254
+ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
255
+ AREA_THRESHOLD = 0 # 32 * 32
256
+ page_size = self.get_size()
257
+ with pypdfium2_lock:
258
+ for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
259
+ pos = obj.get_pos()
260
+ cropbox = BoundingBox.from_tuple(
261
+ pos, origin=CoordOrigin.BOTTOMLEFT
262
+ ).to_top_left_origin(page_height=page_size.height)
223
263
 
224
- cells = merge_horizontal_cells(cells)
264
+ if cropbox.area() > AREA_THRESHOLD:
265
+ cropbox = cropbox.scaled(scale=scale)
225
266
 
226
- # after merge:
227
- # draw_clusters_and_cells()
267
+ yield cropbox
228
268
 
229
- return cells
269
+ def get_text_in_rect(self, bbox: BoundingBox) -> str:
270
+ with pypdfium2_lock:
271
+ if not self.text_page:
272
+ self.text_page = self._ppage.get_textpage()
273
+
274
+ if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
275
+ bbox = bbox.to_bottom_left_origin(self.get_size().height)
276
+
277
+ with pypdfium2_lock:
278
+ text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
279
+
280
+ return text_piece
281
+
282
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
283
+ if not self.valid:
284
+ return None
285
+
286
+ text_cells = self._compute_text_cells()
287
+
288
+ # Get the PDF page geometry from pypdfium2
289
+ dimension = get_pdf_page_geometry(self._ppage)
290
+
291
+ # Create SegmentedPdfPage
292
+ return SegmentedPdfPage(
293
+ dimension=dimension,
294
+ textline_cells=text_cells,
295
+ char_cells=[],
296
+ word_cells=[],
297
+ has_textlines=len(text_cells) > 0,
298
+ has_words=False,
299
+ has_chars=False,
300
+ )
301
+
302
+ def get_text_cells(self) -> Iterable[TextCell]:
303
+ return self._compute_text_cells()
230
304
 
231
305
  def get_page_image(
232
306
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
@@ -67,10 +67,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
67
67
  InputFormat.MD: ["md"],
68
68
  InputFormat.HTML: ["html", "htm", "xhtml"],
69
69
  InputFormat.XML_JATS: ["xml", "nxml"],
70
- InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
70
+ InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
71
71
  InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
72
72
  InputFormat.CSV: ["csv"],
73
- InputFormat.XLSX: ["xlsx"],
73
+ InputFormat.XLSX: ["xlsx", "xlsm"],
74
74
  InputFormat.XML_USPTO: ["xml", "txt"],
75
75
  InputFormat.JSON_DOCLING: ["json"],
76
76
  }
@@ -232,7 +232,6 @@ class Page(BaseModel):
232
232
  page_no: int
233
233
  # page_hash: Optional[str] = None
234
234
  size: Optional[Size] = None
235
- cells: List[TextCell] = []
236
235
  parsed_page: Optional[SegmentedPdfPage] = None
237
236
  predictions: PagePredictions = PagePredictions()
238
237
  assembled: Optional[AssembledUnit] = None
@@ -245,6 +244,14 @@ class Page(BaseModel):
245
244
  float, Image
246
245
  ] = {} # Cache of images in different scales. By default it is cleared during assembling.
247
246
 
247
+ @property
248
+ def cells(self) -> List[TextCell]:
249
+ """Return text cells as a read-only view of parsed_page.textline_cells."""
250
+ if self.parsed_page is not None:
251
+ return self.parsed_page.textline_cells
252
+ else:
253
+ return []
254
+
248
255
  def get_image(
249
256
  self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
250
257
  ) -> Optional[Image]:
@@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
292
292
  ),
293
293
  )
294
294
 
295
- generate_parsed_pages: bool = False
295
+ generate_parsed_pages: Literal[True] = (
296
+ True # Always True since parsed_page is now mandatory
297
+ )
296
298
 
297
299
 
298
300
  class PdfPipeline(str, Enum):
@@ -1,5 +1,5 @@
1
1
  from enum import Enum
2
- from typing import Any, Dict, List, Literal
2
+ from typing import Any, Dict, List, Literal, Optional, Union
3
3
 
4
4
  from pydantic import AnyUrl, BaseModel
5
5
  from typing_extensions import deprecated
@@ -42,6 +42,7 @@ class InlineVlmOptions(BaseVlmOptions):
42
42
  transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
43
43
  response_format: ResponseFormat
44
44
 
45
+ torch_dtype: Optional[str] = None
45
46
  supported_devices: List[AcceleratorDevice] = [
46
47
  AcceleratorDevice.CPU,
47
48
  AcceleratorDevice.CUDA,
@@ -7,6 +7,7 @@ from typing import List, Optional, Type
7
7
 
8
8
  import numpy as np
9
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
10
+ from docling_core.types.doc.page import TextCell
10
11
  from PIL import Image, ImageDraw
11
12
  from rtree import index
12
13
  from scipy.ndimage import binary_dilation, find_objects, label
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
107
108
  return []
108
109
 
109
110
  # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
110
- def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
111
+ def _filter_ocr_cells(
112
+ self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
113
+ ) -> List[TextCell]:
111
114
  # Create R-tree index for programmatic cells
112
115
  p = index.Property()
113
116
  p.dimension = 2
@@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
130
133
  ]
131
134
  return filtered_ocr_cells
132
135
 
133
- def post_process_cells(self, ocr_cells, programmatic_cells):
136
+ def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
134
137
  r"""
135
- Post-process the ocr and programmatic cells and return the final list of of cells
138
+ Post-process the OCR cells and update the page object.
139
+ Updates parsed_page.textline_cells directly since page.cells is now read-only.
136
140
  """
141
+ # Get existing cells from the read-only property
142
+ existing_cells = page.cells
143
+
144
+ # Combine existing and OCR cells with overlap filtering
145
+ final_cells = self._combine_cells(existing_cells, ocr_cells)
146
+
147
+ assert page.parsed_page is not None
148
+
149
+ # Update parsed_page.textline_cells directly
150
+ page.parsed_page.textline_cells = final_cells
151
+ page.parsed_page.has_lines = len(final_cells) > 0
152
+
153
+ def _combine_cells(
154
+ self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
155
+ ) -> List[TextCell]:
156
+ """Combine existing and OCR cells with filtering and re-indexing."""
137
157
  if self.options.force_full_page_ocr:
138
- # If a full page OCR is forced, use only the OCR cells
139
- cells = ocr_cells
140
- return cells
141
-
142
- ## Remove OCR cells which overlap with programmatic cells.
143
- filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
144
- programmatic_cells.extend(filtered_ocr_cells)
145
- return programmatic_cells
158
+ combined = ocr_cells
159
+ else:
160
+ filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
161
+ combined = list(existing_cells) + filtered_ocr_cells
162
+
163
+ # Re-index in-place
164
+ for i, cell in enumerate(combined):
165
+ cell.index = i
166
+
167
+ return combined
146
168
 
147
169
  def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
148
170
  image = copy.deepcopy(page.image)
@@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
177
177
  all_ocr_cells.extend(cells)
178
178
 
179
179
  # Post-process the cells
180
- page.cells = self.post_process_cells(all_ocr_cells, page.cells)
180
+ self.post_process_cells(all_ocr_cells, page)
181
181
 
182
182
  # DEBUG code:
183
183
  if settings.debug.visualize_ocr: