docling 1.19.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.1.dist-info/METADATA +0 -380
  35. docling-1.19.1.dist-info/RECORD +0 -34
  36. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,509 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ import docx
7
+ from docling_core.types.doc import (
8
+ DocItemLabel,
9
+ DoclingDocument,
10
+ DocumentOrigin,
11
+ GroupLabel,
12
+ TableCell,
13
+ TableData,
14
+ )
15
+ from lxml import etree
16
+
17
+ from docling.backend.abstract_backend import DeclarativeDocumentBackend
18
+ from docling.datamodel.base_models import InputFormat
19
+ from docling.datamodel.document import InputDocument
20
+
21
+ _log = logging.getLogger(__name__)
22
+
23
+
24
+ class MsWordDocumentBackend(DeclarativeDocumentBackend):
25
+
26
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
    """Load the DOCX source and reset the hierarchy-tracking state.

    Args:
        in_doc: Input document descriptor, forwarded to the base backend.
        path_or_stream: Filesystem path or in-memory stream holding the DOCX.

    Raises:
        RuntimeError: If ``docx.Document`` fails to load the source.
    """
    super().__init__(in_doc, path_or_stream)
    # Fully-qualified name of the ``val`` attribute read from numbering elements.
    self.XML_KEY = (
        "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
    )
    # NOTE(review): this maps ``w`` to the Word 2003 namespace while XML_KEY
    # uses the 2006 one — confirm the drawing/pict XPath lookups in
    # walk_linear actually match elements of real DOCX files.
    self.xml_namespaces = {
        "w": "http://schemas.microsoft.com/office/word/2003/wordml"
    }
    # Word file:
    self.path_or_stream = path_or_stream
    self.valid = False
    # Initialise the parents for the hierarchy (keys -1 .. max_levels - 1).
    self.max_levels = 10
    self.level_at_new_list = None
    self.parents = {i: None for i in range(-1, self.max_levels)}  # type: ignore

    self.level = 0
    self.listIter = 0

    # Per-paragraph history; the leading None entries make the prev_*()
    # accessors return None before any paragraph has been processed.
    self.history = {
        "names": [None],
        "levels": [None],
        "numids": [None],
        "indents": [None],
    }

    self.docx_obj = None
    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.docx_obj = docx.Document(self.path_or_stream)
        elif isinstance(self.path_or_stream, Path):
            self.docx_obj = docx.Document(str(self.path_or_stream))
    except Exception as e:
        raise RuntimeError(
            f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
        ) from e

    # Only report a valid backend when a document object actually exists
    # (an unsupported source type leaves docx_obj as None).
    self.valid = self.docx_obj is not None
67
+
68
def is_valid(self) -> bool:
    """Return the ``valid`` flag set while loading the document in ``__init__``."""
    return self.valid
70
+
71
@classmethod
def supports_pagination(cls) -> bool:
    """Report that this backend does not support pagination."""
    return False
74
+
75
def unload(self):
    """Release the source: close it when it is a BytesIO, then drop the reference."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()
    self.path_or_stream = None
80
+
81
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
    """Return the set of input formats handled by this backend (DOCX only)."""
    return {InputFormat.DOCX}
84
+
85
def convert(self) -> DoclingDocument:
    """Parse the loaded DOCX into a DoclingDocument.

    The document name is the file stem when the source is a path and
    ``"stream"`` otherwise.

    Raises:
        RuntimeError: If the backend is not valid.
    """
    source = self.path_or_stream
    fname = source.name if isinstance(source, Path) else ""

    origin = DocumentOrigin(
        filename=fname,
        mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        binary_hash=self.document_hash,
    )
    docname = Path(fname).stem if fname else "stream"
    doc = DoclingDocument(name=docname, origin=origin)

    if not self.is_valid():
        raise RuntimeError(
            f"Cannot convert doc with {self.document_hash} because the backend failed to init."
        )

    assert self.docx_obj is not None
    return self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
110
+
111
def update_history(self, name, level, numid, ilevel):
    """Append the just-processed paragraph's style name, level, numbering id and indent to the history."""
    recorded = (
        ("names", name),
        ("levels", level),
        ("numids", numid),
        ("indents", ilevel),
    )
    for key, value in recorded:
        self.history[key].append(value)
117
+
118
def prev_name(self):
    """Return the most recently recorded style name from the history."""
    return self.history["names"][-1]
120
+
121
def prev_level(self):
    """Return the most recently recorded level from the history."""
    return self.history["levels"][-1]
123
+
124
def prev_numid(self):
    """Return the most recently recorded numbering id from the history."""
    return self.history["numids"][-1]
126
+
127
def prev_indent(self):
    """Return the most recently recorded indent level from the history."""
    return self.history["indents"][-1]
129
+
130
def get_level(self) -> int:
    """Return the first non-negative key of ``self.parents`` whose slot is empty.

    Keys are visited in dictionary order. Returns 0 when every
    non-negative slot is occupied.
    """
    for k, v in self.parents.items():
        # Identity test: an occupied slot must never count as empty just
        # because its object defines a permissive ``__eq__``.
        if k >= 0 and v is None:
            return k
    return 0
136
+
137
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
    """Walk the children of ``body`` in order and dispatch each to a handler.

    Elements whose tag ends in ``tbl`` go to ``handle_tables``; elements
    containing a ``w:drawing`` or ``w:pict`` descendant go to
    ``handle_pictures``; ``p`` elements go to ``handle_text_elements``.
    Anything else is logged and ignored.

    Returns:
        The same ``doc`` object that was passed in.
    """
    for element in body:
        tag_name = etree.QName(element).localname

        # Check for Tables
        if element.tag.endswith("tbl"):
            try:
                self.handle_tables(element, docx_obj, doc)
            except Exception:
                # Best effort: a broken table must not abort the conversion,
                # but keep the traceback so the failure can be diagnosed.
                _log.debug(
                    "could not parse a table, broken docx table", exc_info=True
                )
            continue

        # Check for Inline Images (drawings or blip elements)
        found_drawing = etree.ElementBase.xpath(
            element, ".//w:drawing", namespaces=self.xml_namespaces
        )
        found_pict = etree.ElementBase.xpath(
            element, ".//w:pict", namespaces=self.xml_namespaces
        )

        if found_drawing or found_pict:
            self.handle_pictures(element, docx_obj, doc)
        # Check for Text
        elif tag_name in ["p"]:
            self.handle_text_elements(element, docx_obj, doc)
        else:
            _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
    return doc
164
+
165
def str_to_int(self, s, default=0):
    """Convert ``s`` to an int.

    ``None`` is passed through as ``None`` (``default`` is not used for it);
    text that ``int`` rejects with ValueError yields ``default``.
    """
    if s is not None:
        try:
            return int(s)
        except ValueError:
            return default
    return None
172
+
173
def get_numId_and_ilvl(self, paragraph):
    """Read the numbering id and indent level from the paragraph's ``w:numPr``.

    Returns:
        ``(numId, ilvl)`` as ints, with ``None`` for any value that is
        missing or not an integer; ``(None, None)`` when the paragraph has
        no ``w:numPr`` element at all.
    """
    xml_elem = paragraph._element
    num_pr = xml_elem.find(".//w:numPr", namespaces=xml_elem.nsmap)
    if num_pr is None:
        return None, None  # If the paragraph is not part of a list

    raw_values = []
    for tag in ("w:numId", "w:ilvl"):
        child = num_pr.find(tag, namespaces=paragraph._element.nsmap)
        raw_values.append(None if child is None else child.get(self.XML_KEY))
    num_id, ilvl = raw_values

    return (
        self.str_to_int(num_id, default=None),
        self.str_to_int(ilvl, default=None),
    )
191
+
192
def get_label_and_level(self, paragraph):
    """Derive a ``(label, level)`` pair from the paragraph's style name.

    Rules, in order:
      * no style or no style name -> ``("Normal", None)``
      * ``"<label>:<int>"`` -> ``(label, int)``
      * two space-separated tokens containing ``"Heading"`` ->
        ``("Heading", level)``, where level is ``None`` if the other
        token is not an integer
      * anything else -> ``(style name, None)``

    A style name with a colon whose suffix is not an integer (for example
    ``"Table: Grid"``) is treated as an ordinary name instead of raising
    ValueError.
    """

    def _parse_level(token):
        # Same contract as str_to_int(token, default=None) for string input.
        try:
            return int(token)
        except ValueError:
            return None

    if paragraph.style is None:
        return "Normal", None
    label = paragraph.style.name
    if label is None:
        return "Normal", None

    if ":" in label:
        parts = label.split(":")
        if len(parts) == 2:
            level = _parse_level(parts[1])
            if level is not None:
                return parts[0], level
            # Non-numeric suffix: fall through to the generic handling.

    parts = label.split(" ")

    if "Heading" in label and len(parts) == 2:
        # Sorting makes the token order irrelevant ("Heading 1" / "1 Heading").
        parts.sort()
        label_str = ""
        label_level = 0
        if parts[0] == "Heading":
            label_str = parts[0]
            label_level = _parse_level(parts[1])
        if parts[1] == "Heading":
            label_str = parts[1]
            label_level = _parse_level(parts[0])
        return label_str, label_level

    return label, None
220
+
221
def handle_text_elements(self, element, docx_obj, doc):
    """Add one paragraph element to ``doc`` as a list item, title, heading or plain text.

    The paragraph is classified from its style name (get_label_and_level)
    and its numbering properties (get_numId_and_ilvl). Every path that adds
    something ends by recording the paragraph in the history.
    """
    paragraph = docx.text.paragraph.Paragraph(element, docx_obj)

    if paragraph.text is None:
        # _log.warn(f"paragraph has text==None")
        return

    text = paragraph.text.strip()
    # if len(text)==0 # keep empty paragraphs, they separate adjacent lists!

    # Common styles for bullet and numbered lists.
    # "List Bullet", "List Number", "List Paragraph"
    # TODO: reliably identify whether list is a numbered list or not
    # is_numbered = "List Bullet" not in paragraph.style.name
    is_numbered = False

    p_style_name, p_level = self.get_label_and_level(paragraph)
    numid, ilevel = self.get_numId_and_ilvl(paragraph)
    # print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))

    # A numbering id of 0 is handled exactly like a paragraph without numbering.
    if numid == 0:
        numid = None

    # Handle lists
    if numid is not None and ilevel is not None:
        self.add_listitem(
            element,
            docx_obj,
            doc,
            p_style_name,
            p_level,
            numid,
            ilevel,
            text,
            is_numbered,
        )
        self.update_history(p_style_name, p_level, numid, ilevel)
        return
    elif numid is None and self.prev_numid() is not None:  # Close list
        # Clear every parent slot from the level where the list was opened.
        # NOTE(review): assumes level_at_new_list is an int here — confirm it
        # cannot still be None when the previous paragraph had a numid.
        for key, val in self.parents.items():
            if key >= self.level_at_new_list:
                self.parents[key] = None
        self.level = self.level_at_new_list - 1
        self.level_at_new_list = None
    if p_style_name in ["Title"]:
        # A title resets the whole hierarchy and becomes the level-0 parent.
        for key, val in self.parents.items():
            self.parents[key] = None
        self.parents[0] = doc.add_text(
            parent=None, label=DocItemLabel.TITLE, text=text
        )
    elif "Heading" in p_style_name:
        self.add_header(element, docx_obj, doc, p_style_name, p_level, text)

    elif p_style_name in [
        "Paragraph",
        "Normal",
        "Subtitle",
        "Author",
        "Default Text",
        "List Paragraph",
        "List Bullet",
        "Quote",
    ]:
        level = self.get_level()
        doc.add_text(
            label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
        )

    else:
        # Text style names can, and will have, not only default values but user values too
        # hence we treat all other labels as pure text
        level = self.get_level()
        doc.add_text(
            label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
        )

    self.update_history(p_style_name, p_level, numid, ilevel)
    return
299
+
300
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
    """Add a heading to ``doc`` and register it as the parent for its level.

    With an integer ``curr_level`` the heading is placed relative to the
    first free level (get_level); otherwise ``self.level`` is used.
    ``element``, ``docx_obj`` and ``curr_name`` are not read here.
    """
    level = self.get_level()
    if isinstance(curr_level, int):

        if curr_level == level:
            # Heading sits exactly at the first free level.
            self.parents[level] = doc.add_heading(
                parent=self.parents[level - 1], text=text
            )

        elif curr_level > level:

            # add invisible group
            # (one section group per skipped level so the parent chain is unbroken)
            for i in range(level, curr_level):
                self.parents[i] = doc.add_group(
                    parent=self.parents[i - 1],
                    label=GroupLabel.SECTION,
                    name=f"header-{i}",
                )

            self.parents[curr_level] = doc.add_heading(
                parent=self.parents[curr_level - 1], text=text
            )

        elif curr_level < level:

            # remove the tail
            # (every slot at or below the new heading's depth is cleared first)
            for key, val in self.parents.items():
                if key >= curr_level:
                    self.parents[key] = None

            self.parents[curr_level] = doc.add_heading(
                parent=self.parents[curr_level - 1], text=text
            )

    else:
        # No usable level from the style name: fall back to self.level.
        self.parents[self.level] = doc.add_heading(
            parent=self.parents[self.level - 1], text=text
        )
    return
340
+
341
def add_listitem(
    self,
    element,
    docx_obj,
    doc,
    p_style_name,
    p_level,
    numid,
    ilevel,
    text: str,
    is_numbered=False,
):
    """Add one list item to ``doc``, opening or closing list groups as needed.

    The branch taken depends on the previous paragraph's numbering id and
    indent (prev_numid / prev_indent) compared with ``numid`` / ``ilevel``.
    ``element``, ``docx_obj``, ``p_style_name`` and ``p_level`` are not read.

    NOTE(review): when none of the four conditions below holds (different
    numid and different indent from the previous paragraph) no item is
    added — confirm that is intended.
    """
    # is_numbered = is_numbered
    enum_marker = ""

    level = self.get_level()
    if self.prev_numid() is None:  # Open new list
        # Remember where this list started; nested levels are offsets from it.
        self.level_at_new_list = level  # type: ignore

        self.parents[level] = doc.add_group(
            label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
        )

        # TODO: Set marker and enumerated arguments if this is an enumeration element.
        self.listIter += 1
        if is_numbered:
            enum_marker = str(self.listIter) + "."
            is_numbered = True
        doc.add_list_item(
            marker=enum_marker,
            enumerated=is_numbered,
            parent=self.parents[level],
            text=text,
        )

    elif (
        self.prev_numid() == numid and self.prev_indent() < ilevel
    ):  # Open indented list
        # Create one group per indent step between the previous and current depth.
        for i in range(
            self.level_at_new_list + self.prev_indent() + 1,
            self.level_at_new_list + ilevel + 1,
        ):
            # TODO: determine if this is an unordered list or an ordered list.
            # Set GroupLabel.ORDERED_LIST when it fits.
            self.listIter = 0
            if is_numbered:
                self.parents[i] = doc.add_group(
                    label=GroupLabel.ORDERED_LIST,
                    name="list",
                    parent=self.parents[i - 1],
                )
            else:
                self.parents[i] = doc.add_group(
                    label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
                )

        # TODO: Set marker and enumerated arguments if this is an enumeration element.
        self.listIter += 1
        if is_numbered:
            enum_marker = str(self.listIter) + "."
            is_numbered = True
        doc.add_list_item(
            marker=enum_marker,
            enumerated=is_numbered,
            parent=self.parents[self.level_at_new_list + ilevel],
            text=text,
        )

    elif self.prev_numid() == numid and ilevel < self.prev_indent():  # Close list
        # Drop every parent deeper than the level this item returns to.
        for k, v in self.parents.items():
            if k > self.level_at_new_list + ilevel:
                self.parents[k] = None

        # TODO: Set marker and enumerated arguments if this is an enumeration element.
        self.listIter += 1
        if is_numbered:
            enum_marker = str(self.listIter) + "."
            is_numbered = True
        doc.add_list_item(
            marker=enum_marker,
            enumerated=is_numbered,
            parent=self.parents[self.level_at_new_list + ilevel],
            text=text,
        )
        self.listIter = 0

    elif self.prev_numid() == numid or self.prev_indent() == ilevel:
        # Same list or same depth as before: append under the current deepest parent.
        # TODO: Set marker and enumerated arguments if this is an enumeration element.
        self.listIter += 1
        if is_numbered:
            enum_marker = str(self.listIter) + "."
            is_numbered = True
        doc.add_list_item(
            marker=enum_marker,
            enumerated=is_numbered,
            parent=self.parents[level - 1],
            text=text,
        )
    return
440
+
441
def handle_tables(self, element, docx_obj, doc):
    """Convert one table element into ``TableData`` and add it to ``doc``.

    Cells are visited row by row; each becomes a ``TableCell`` whose offsets
    come from a scratch occupancy grid. The table is attached under the
    parent one level above the first free hierarchy level.
    """

    # Function to check if a cell has a colspan (gridSpan)
    # NOTE(review): this XPath looks for a ``w:gridSpan`` *attribute* on the
    # cell element — confirm that is where the span is stored.
    def get_colspan(cell):
        grid_span = cell._element.xpath("@w:gridSpan")
        if grid_span:
            return int(grid_span[0])  # Return the number of columns spanned
        return 1  # Default is 1 (no colspan)

    # Function to check if a cell has a rowspan (vMerge)
    # NOTE(review): when the attribute is found this returns its raw value
    # (not an int), which the arithmetic further down does not expect.
    def get_rowspan(cell):
        v_merge = cell._element.xpath("@w:vMerge")
        if v_merge:
            return v_merge[
                0
            ]  # 'restart' indicates the beginning of a rowspan, others are continuation
        return 1

    table = docx.table.Table(element, docx_obj)

    num_rows = len(table.rows)
    num_cols = 0
    for row in table.rows:
        # Calculate the max number of columns
        num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
        # if row.cells:
        #     num_cols = max(num_cols, len(row.cells))

    # Initialize the table grid
    # (None = free slot, "" = slot already claimed by a cell)
    table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    for row_idx, row in enumerate(table.rows):
        col_idx = 0
        for c, cell in enumerate(row.cells):
            row_span = get_rowspan(cell)
            col_span = get_colspan(cell)

            # Find the next available column in the grid
            while table_grid[row_idx][col_idx] is not None:
                col_idx += 1

            # Fill the grid with the cell value, considering rowspan and colspan
            # NOTE(review): if row_span equals "restart", range() receives a
            # string and raises TypeError — confirm the intended row count.
            for i in range(row_span if row_span == "restart" else 1):
                for j in range(col_span):
                    table_grid[row_idx + i][col_idx + j] = ""

            # The loop variable is rebound here to the output cell model.
            cell = TableCell(
                text=cell.text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=row_idx,
                end_row_offset_idx=row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                col_header=False,  # col_header,
                row_header=False,  # ((not col_header) and html_cell.name=='th')
            )

            data.table_cells.append(cell)

    level = self.get_level()
    doc.add_table(data=data, parent=self.parents[level - 1])
    return
506
+
507
def handle_pictures(self, element, docx_obj, doc):
    """Add a caption-less picture item to ``doc``; ``element`` and ``docx_obj`` are not read."""
    # NOTE(review): the parent is looked up with self.level, whereas the text
    # and table handlers use get_level() - 1 — confirm this is intended.
    doc.add_picture(parent=self.parents[self.level], caption=None)
    return
@@ -0,0 +1,78 @@
1
+ from abc import ABC, abstractmethod
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Iterable, Optional, Set, Union
5
+
6
+ from docling_core.types.doc import BoundingBox, Size
7
+ from PIL import Image
8
+
9
+ from docling.backend.abstract_backend import PaginatedDocumentBackend
10
+ from docling.datamodel.base_models import Cell, InputFormat
11
+ from docling.datamodel.document import InputDocument
12
+
13
+
14
class PdfPageBackend(ABC):
    """Abstract interface that every per-page PDF backend must implement."""

    @abstractmethod
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return the page text for the region given by ``bbox``."""

    @abstractmethod
    def get_text_cells(self) -> Iterable[Cell]:
        """Return the text cells of the page."""

    @abstractmethod
    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Return bounding boxes of the page's bitmap regions.

        The parameter is named and typed like ``get_page_image``'s ``scale``
        instead of shadowing the ``float`` builtin.
        """

    @abstractmethod
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return an image of the page for the given ``scale`` and optional ``cropbox``."""

    @abstractmethod
    def get_size(self) -> Size:
        """Return the size of the page."""

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether the page backend is usable."""

    @abstractmethod
    def unload(self):
        """Release the resources held for this page."""
45
+
46
+
47
class PdfDocumentBackend(PaginatedDocumentBackend):
    """Abstract base for paginated PDF backends.

    An IMAGE input is converted to an in-memory single-document PDF during
    construction so that subclasses always read PDF data from
    ``self.path_or_stream``.
    """

    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
        """Initialise the backend, converting an IMAGE input to PDF.

        Raises:
            RuntimeError: If the input format is neither PDF nor IMAGE.
        """
        super().__init__(in_doc, path_or_stream)

        if self.input_format is not InputFormat.PDF:
            if self.input_format is InputFormat.IMAGE:
                buf = BytesIO()
                # Context manager so the opened image is released once the
                # PDF bytes have been written.
                with Image.open(self.path_or_stream) as img:
                    img.save(buf, "PDF")
                buf.seek(0)
                self.path_or_stream = buf
            else:
                raise RuntimeError(
                    f"Incompatible file format {self.input_format} was passed to a PdfDocumentBackend."
                )

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        """Return the page backend for page ``page_no``."""

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages in the document."""

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats declared by this backend (PDF)."""
        return {InputFormat.PDF}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Report that PDF backends support pagination."""
        return True
@@ -2,16 +2,20 @@ import logging
2
2
  import random
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Union
5
+ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
6
 
7
7
  import pypdfium2 as pdfium
8
8
  import pypdfium2.raw as pdfium_c
9
+ from docling_core.types.doc import BoundingBox, CoordOrigin, Size
9
10
  from PIL import Image, ImageDraw
10
- from pypdfium2 import PdfPage, PdfTextPage
11
+ from pypdfium2 import PdfTextPage
11
12
  from pypdfium2._helpers.misc import PdfiumError
12
13
 
13
- from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
14
- from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
14
+ from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15
+ from docling.datamodel.base_models import Cell
16
+
17
+ if TYPE_CHECKING:
18
+ from docling.datamodel.document import InputDocument
15
19
 
16
20
  _log = logging.getLogger(__name__)
17
21
 
@@ -222,8 +226,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
222
226
 
223
227
  return image
224
228
 
225
- def get_size(self) -> PageSize:
226
- return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
229
+ def get_size(self) -> Size:
230
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
227
231
 
228
232
  def unload(self):
229
233
  self._ppage = None
@@ -231,13 +235,14 @@ class PyPdfiumPageBackend(PdfPageBackend):
231
235
 
232
236
 
233
237
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
234
- def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
235
- super().__init__(path_or_stream, document_hash)
238
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
239
+ super().__init__(in_doc, path_or_stream)
240
+
236
241
  try:
237
- self._pdoc = pdfium.PdfDocument(path_or_stream)
242
+ self._pdoc = pdfium.PdfDocument(self.path_or_stream)
238
243
  except PdfiumError as e:
239
244
  raise RuntimeError(
240
- f"pypdfium could not load document {document_hash}"
245
+ f"pypdfium could not load document with hash {self.document_hash}"
241
246
  ) from e
242
247
 
243
248
  def page_count(self) -> int: