docling 1.6.2__py3-none-any.whl → 1.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,15 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Any, Iterable, Optional, Union
4
+ from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
5
5
 
6
6
  from PIL import Image
7
7
 
8
+ if TYPE_CHECKING:
9
+ from docling.datamodel.base_models import BoundingBox, Cell, PageSize
10
+
8
11
 
9
12
  class PdfPageBackend(ABC):
10
- def __init__(self, page_obj: Any) -> object:
11
- pass
12
13
 
13
14
  @abstractmethod
14
15
  def get_text_in_rect(self, bbox: "BoundingBox") -> str:
@@ -19,12 +20,12 @@ class PdfPageBackend(ABC):
19
20
  pass
20
21
 
21
22
  @abstractmethod
22
- def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
23
+ def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
23
24
  pass
24
25
 
25
26
  @abstractmethod
26
27
  def get_page_image(
27
- self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
28
+ self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
28
29
  ) -> Image.Image:
29
30
  pass
30
31
 
@@ -32,6 +33,10 @@ class PdfPageBackend(ABC):
32
33
  def get_size(self) -> "PageSize":
33
34
  pass
34
35
 
36
+ @abstractmethod
37
+ def is_valid(self) -> bool:
38
+ pass
39
+
35
40
  @abstractmethod
36
41
  def unload(self):
37
42
  pass
@@ -39,8 +44,9 @@ class PdfPageBackend(ABC):
39
44
 
40
45
  class PdfDocumentBackend(ABC):
41
46
  @abstractmethod
42
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
43
- pass
47
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
48
+ self.path_or_stream = path_or_stream
49
+ self.document_hash = document_hash
44
50
 
45
51
  @abstractmethod
46
52
  def load_page(self, page_no: int) -> PdfPageBackend:
@@ -56,4 +62,7 @@ class PdfDocumentBackend(ABC):
56
62
 
57
63
  @abstractmethod
58
64
  def unload(self):
59
- pass
65
+ if isinstance(self.path_or_stream, BytesIO):
66
+ self.path_or_stream.close()
67
+
68
+ self.path_or_stream = None
@@ -1,9 +1,8 @@
1
1
  import logging
2
2
  import random
3
- import time
4
3
  from io import BytesIO
5
4
  from pathlib import Path
6
- from typing import Iterable, Optional, Union
5
+ from typing import Iterable, List, Optional, Union
7
6
 
8
7
  import pypdfium2 as pdfium
9
8
  from docling_parse.docling_parse import pdf_parser
@@ -17,13 +16,26 @@ _log = logging.getLogger(__name__)
17
16
 
18
17
 
19
18
  class DoclingParsePageBackend(PdfPageBackend):
20
- def __init__(self, page_obj: PdfPage, docling_page_obj):
21
- super().__init__(page_obj)
19
+ def __init__(
20
+ self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
21
+ ):
22
22
  self._ppage = page_obj
23
- self._dpage = docling_page_obj
24
- self.text_page = None
23
+ parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
24
+
25
+ self.valid = "pages" in parsed_page
26
+ if self.valid:
27
+ self._dpage = parsed_page["pages"][0]
28
+ else:
29
+ _log.info(
30
+ f"An error occured when loading page {page_no} of document {document_hash}."
31
+ )
32
+
33
+ def is_valid(self) -> bool:
34
+ return self.valid
25
35
 
26
36
  def get_text_in_rect(self, bbox: BoundingBox) -> str:
37
+ if not self.valid:
38
+ return ""
27
39
  # Find intersecting cells on the page
28
40
  text_piece = ""
29
41
  page_size = self.get_size()
@@ -55,9 +67,12 @@ class DoclingParsePageBackend(PdfPageBackend):
55
67
  return text_piece
56
68
 
57
69
  def get_text_cells(self) -> Iterable[Cell]:
58
- cells = []
70
+ cells: List[Cell] = []
59
71
  cell_counter = 0
60
72
 
73
+ if not self.valid:
74
+ return cells
75
+
61
76
  page_size = self.get_size()
62
77
 
63
78
  parser_width = self._dpage["width"]
@@ -114,7 +129,7 @@ class DoclingParsePageBackend(PdfPageBackend):
114
129
 
115
130
  return cells
116
131
 
117
- def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
132
+ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
118
133
  AREA_THRESHOLD = 32 * 32
119
134
 
120
135
  for i in range(len(self._dpage["images"])):
@@ -129,7 +144,7 @@ class DoclingParsePageBackend(PdfPageBackend):
129
144
  yield cropbox
130
145
 
131
146
  def get_page_image(
132
- self, scale: int = 1, cropbox: Optional[BoundingBox] = None
147
+ self, scale: float = 1, cropbox: Optional[BoundingBox] = None
133
148
  ) -> Image.Image:
134
149
 
135
150
  page_size = self.get_size()
@@ -168,40 +183,41 @@ class DoclingParsePageBackend(PdfPageBackend):
168
183
  def unload(self):
169
184
  self._ppage = None
170
185
  self._dpage = None
171
- self.text_page = None
172
186
 
173
187
 
174
188
  class DoclingParseDocumentBackend(PdfDocumentBackend):
175
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
176
- super().__init__(path_or_stream)
177
- self._pdoc = pdfium.PdfDocument(path_or_stream)
178
- # Parsing cells with docling_parser call
179
- parser = pdf_parser()
189
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
190
+ super().__init__(path_or_stream, document_hash)
180
191
 
181
- start_pb_time = time.time()
192
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
193
+ self.parser = pdf_parser()
182
194
 
195
+ success = False
183
196
  if isinstance(path_or_stream, BytesIO):
184
- self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
185
- else:
186
- self._parser_doc = parser.find_cells(str(path_or_stream))
197
+ success = self.parser.load_document_from_bytesio(
198
+ document_hash, path_or_stream
199
+ )
200
+ elif isinstance(path_or_stream, Path):
201
+ success = self.parser.load_document(document_hash, str(path_or_stream))
187
202
 
188
- end_pb_time = time.time() - start_pb_time
189
- _log.info(
190
- f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
191
- )
203
+ if not success:
204
+ raise RuntimeError(
205
+ f"docling-parse could not load document {document_hash}."
206
+ )
192
207
 
193
208
  def page_count(self) -> int:
194
- return len(self._parser_doc["pages"])
209
+ return len(self._pdoc) # To be replaced with docling-parse API
195
210
 
196
211
  def load_page(self, page_no: int) -> DoclingParsePageBackend:
197
212
  return DoclingParsePageBackend(
198
- self._pdoc[page_no], self._parser_doc["pages"][page_no]
213
+ self.parser, self.document_hash, page_no, self._pdoc[page_no]
199
214
  )
200
215
 
201
216
  def is_valid(self) -> bool:
202
217
  return self.page_count() > 0
203
218
 
204
219
  def unload(self):
220
+ super().unload()
221
+ self.parser.unload_document(self.document_hash)
205
222
  self._pdoc.close()
206
223
  self._pdoc = None
207
- self._parser_doc = None
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  import random
2
3
  from io import BytesIO
3
4
  from pathlib import Path
@@ -6,19 +7,34 @@ from typing import Iterable, List, Optional, Union
6
7
  import pypdfium2 as pdfium
7
8
  import pypdfium2.raw as pdfium_c
8
9
  from PIL import Image, ImageDraw
9
- from pypdfium2 import PdfPage
10
+ from pypdfium2 import PdfPage, PdfTextPage
11
+ from pypdfium2._helpers.misc import PdfiumError
10
12
 
11
13
  from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
12
14
  from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
13
15
 
16
+ _log = logging.getLogger(__name__)
17
+
14
18
 
15
19
  class PyPdfiumPageBackend(PdfPageBackend):
16
- def __init__(self, page_obj: PdfPage):
17
- super().__init__(page_obj)
18
- self._ppage = page_obj
19
- self.text_page = None
20
+ def __init__(
21
+ self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
22
+ ):
23
+ self.valid = True # No better way to tell from pypdfium.
24
+ try:
25
+ self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
26
+ except PdfiumError as e:
27
+ _log.info(
28
+ f"An exception occured when loading page {page_no} of document {document_hash}.",
29
+ exc_info=True,
30
+ )
31
+ self.valid = False
32
+ self.text_page: Optional[PdfTextPage] = None
33
+
34
+ def is_valid(self) -> bool:
35
+ return self.valid
20
36
 
21
- def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
37
+ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
22
38
  AREA_THRESHOLD = 32 * 32
23
39
  for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
24
40
  pos = obj.get_pos()
@@ -173,7 +189,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
173
189
  return cells
174
190
 
175
191
  def get_page_image(
176
- self, scale: int = 1, cropbox: Optional[BoundingBox] = None
192
+ self, scale: float = 1, cropbox: Optional[BoundingBox] = None
177
193
  ) -> Image.Image:
178
194
 
179
195
  page_size = self.get_size()
@@ -215,19 +231,25 @@ class PyPdfiumPageBackend(PdfPageBackend):
215
231
 
216
232
 
217
233
  class PyPdfiumDocumentBackend(PdfDocumentBackend):
218
- def __init__(self, path_or_stream: Union[BytesIO, Path]):
219
- super().__init__(path_or_stream)
220
- self._pdoc = pdfium.PdfDocument(path_or_stream)
234
+ def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
235
+ super().__init__(path_or_stream, document_hash)
236
+ try:
237
+ self._pdoc = pdfium.PdfDocument(path_or_stream)
238
+ except PdfiumError as e:
239
+ raise RuntimeError(
240
+ f"pypdfium could not load document {document_hash}"
241
+ ) from e
221
242
 
222
243
  def page_count(self) -> int:
223
244
  return len(self._pdoc)
224
245
 
225
246
  def load_page(self, page_no: int) -> PyPdfiumPageBackend:
226
- return PyPdfiumPageBackend(self._pdoc[page_no])
247
+ return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
227
248
 
228
249
  def is_valid(self) -> bool:
229
250
  return self.page_count() > 0
230
251
 
231
252
  def unload(self):
253
+ super().unload()
232
254
  self._pdoc.close()
233
255
  self._pdoc = None
File without changes
docling/cli/main.py ADDED
@@ -0,0 +1,253 @@
1
+ import importlib
2
+ import json
3
+ import logging
4
+ import time
5
+ import warnings
6
+ from enum import Enum
7
+ from pathlib import Path
8
+ from typing import Annotated, Iterable, List, Optional
9
+
10
+ import typer
11
+ from docling_core.utils.file import resolve_file_source
12
+
13
+ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
15
+ from docling.datamodel.base_models import ConversionStatus
16
+ from docling.datamodel.document import ConversionResult, DocumentConversionInput
17
+ from docling.datamodel.pipeline_options import (
18
+ EasyOcrOptions,
19
+ PipelineOptions,
20
+ TesseractCliOcrOptions,
21
+ TesseractOcrOptions,
22
+ )
23
+ from docling.document_converter import DocumentConverter
24
+
25
+ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
26
+ warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
27
+
28
+ _log = logging.getLogger(__name__)
29
+ from rich.console import Console
30
+
31
+ err_console = Console(stderr=True)
32
+
33
+
34
+ app = typer.Typer(
35
+ name="Docling",
36
+ no_args_is_help=True,
37
+ add_completion=False,
38
+ pretty_exceptions_enable=False,
39
+ )
40
+
41
+
42
def version_callback(value: bool):
    """Print the versions of the Docling packages and exit.

    Wired as the eager callback of the ``--version`` option. When ``value``
    is falsy the function returns without doing anything.

    Args:
        value: Whether the ``--version`` flag was given.

    Raises:
        typer.Exit: After the version lines have been printed.
    """
    if not value:
        return

    # ``import importlib`` alone does not guarantee that the ``metadata``
    # submodule has been loaded, so import the function explicitly.
    from importlib.metadata import version

    distributions = (
        ("Docling", "docling"),
        ("Docling Core", "docling-core"),
        ("Docling IBM Models", "docling-ibm-models"),
        ("Docling Parse", "docling-parse"),
    )
    # Resolve every version before printing so that a missing distribution
    # fails before any output is produced.
    resolved = [(label, version(dist_name)) for label, dist_name in distributions]
    for label, dist_version in resolved:
        print(f"{label} version: {dist_version}")
    raise typer.Exit()
53
+
54
+
55
+ # Define an enum for the backend options
56
class Backend(str, Enum):
    """PDF backends that can be selected on the command line."""

    PYPDFIUM2 = "pypdfium2"
    DOCLING = "docling"
59
+
60
+
61
+ # Define an enum for the ocr engines
62
class OcrEngine(str, Enum):
    """OCR engines that can be selected on the command line."""

    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
66
+
67
+
68
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
):
    """Write the requested export formats for every converted document.

    For each result whose status is ``ConversionStatus.SUCCESS`` one file per
    enabled format is written to ``output_dir``, named after the stem of the
    input file. Any other status is logged as a failure. A summary of
    processed and failed documents is logged at the end.

    Args:
        conv_results: Conversion results to export.
        output_dir: Directory the files are written to; it is not created
            here.
        export_json: Write ``<stem>.json`` from ``render_as_dict()``.
        export_md: Write ``<stem>.md`` from ``render_as_markdown()``.
        export_txt: Write ``<stem>.txt`` from ``render_as_text()``.
        export_doctags: Write ``<stem>.doctags`` from ``render_as_doctags()``.
    """
    # (enabled, file suffix, label used in the log line, renderer), listed in
    # the order in which the files are written.
    exporters = (
        (export_json, ".json", "JSON", lambda res: json.dumps(res.render_as_dict())),
        (export_txt, ".txt", "Text", lambda res: res.render_as_text()),
        (export_md, ".md", "Markdown", lambda res: res.render_as_markdown()),
        (
            export_doctags,
            ".doctags",
            "Doc Tags",
            lambda res: res.render_as_doctags(),
        ),
    )
    enabled_exporters = [
        (suffix, label, render)
        for enabled, suffix, label, render in exporters
        if enabled
    ]

    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        doc_filename = conv_res.input.file.stem

        for suffix, label, render in enabled_exporters:
            fname = output_dir / f"{doc_filename}{suffix}"
            # Explicit UTF-8 so that non-ASCII document text is written the
            # same way regardless of the platform's locale encoding.
            with fname.open("w", encoding="utf-8") as fp:
                _log.info(f"writing {label} output to {fname}")
                fp.write(render(conv_res))

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )
120
+
121
+
122
+ @app.command(no_args_is_help=True)
123
+ def convert(
124
+ input_sources: Annotated[
125
+ List[str],
126
+ typer.Argument(
127
+ ...,
128
+ metavar="source",
129
+ help="PDF files to convert. Can be local file / directory paths or URL.",
130
+ ),
131
+ ],
132
+ export_json: Annotated[
133
+ bool,
134
+ typer.Option(
135
+ ..., "--json/--no-json", help="If enabled the document is exported as JSON."
136
+ ),
137
+ ] = False,
138
+ export_md: Annotated[
139
+ bool,
140
+ typer.Option(
141
+ ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
142
+ ),
143
+ ] = True,
144
+ export_txt: Annotated[
145
+ bool,
146
+ typer.Option(
147
+ ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
148
+ ),
149
+ ] = False,
150
+ export_doctags: Annotated[
151
+ bool,
152
+ typer.Option(
153
+ ...,
154
+ "--doctags/--no-doctags",
155
+ help="If enabled the document is exported as Doc Tags.",
156
+ ),
157
+ ] = False,
158
+ ocr: Annotated[
159
+ bool,
160
+ typer.Option(
161
+ ..., help="If enabled, the bitmap content will be processed using OCR."
162
+ ),
163
+ ] = True,
164
+ backend: Annotated[
165
+ Backend, typer.Option(..., help="The PDF backend to use.")
166
+ ] = Backend.DOCLING,
167
+ ocr_engine: Annotated[
168
+ OcrEngine, typer.Option(..., help="The OCR engine to use.")
169
+ ] = OcrEngine.EASYOCR,
170
+ output: Annotated[
171
+ Path, typer.Option(..., help="Output directory where results are saved.")
172
+ ] = Path("."),
173
+ version: Annotated[
174
+ Optional[bool],
175
+ typer.Option(
176
+ "--version",
177
+ callback=version_callback,
178
+ is_eager=True,
179
+ help="Show version information.",
180
+ ),
181
+ ] = None,
182
+ ):
183
+ logging.basicConfig(level=logging.INFO)
184
+
185
+ input_doc_paths: List[Path] = []
186
+ for src in input_sources:
187
+ source = resolve_file_source(source=src)
188
+ if not source.exists():
189
+ err_console.print(
190
+ f"[red]Error: The input file {source} does not exist.[/red]"
191
+ )
192
+ raise typer.Abort()
193
+ elif source.is_dir():
194
+ input_doc_paths.extend(list(source.glob("**/*.pdf")))
195
+ input_doc_paths.extend(list(source.glob("**/*.PDF")))
196
+ else:
197
+ input_doc_paths.append(source)
198
+
199
+ match backend:
200
+ case Backend.PYPDFIUM2:
201
+ do_cell_matching = ocr # only do cell matching when OCR enabled
202
+ pdf_backend = PyPdfiumDocumentBackend
203
+ case Backend.DOCLING:
204
+ do_cell_matching = True
205
+ pdf_backend = DoclingParseDocumentBackend
206
+ case _:
207
+ raise RuntimeError(f"Unexpected backend type {backend}")
208
+
209
+ match ocr_engine:
210
+ case OcrEngine.EASYOCR:
211
+ ocr_options = EasyOcrOptions()
212
+ case OcrEngine.TESSERACT_CLI:
213
+ ocr_options = TesseractCliOcrOptions()
214
+ case OcrEngine.TESSERACT:
215
+ ocr_options = TesseractOcrOptions()
216
+ case _:
217
+ raise RuntimeError(f"Unexpected backend type {backend}")
218
+
219
+ pipeline_options = PipelineOptions(
220
+ do_ocr=ocr,
221
+ ocr_options=ocr_options,
222
+ do_table_structure=True,
223
+ )
224
+ pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
225
+ doc_converter = DocumentConverter(
226
+ pipeline_options=pipeline_options,
227
+ pdf_backend=pdf_backend,
228
+ )
229
+
230
+ # Define input files
231
+ input = DocumentConversionInput.from_paths(input_doc_paths)
232
+
233
+ start_time = time.time()
234
+
235
+ conv_results = doc_converter.convert(input)
236
+
237
+ output.mkdir(parents=True, exist_ok=True)
238
+ export_documents(
239
+ conv_results,
240
+ output_dir=output,
241
+ export_json=export_json,
242
+ export_md=export_md,
243
+ export_txt=export_txt,
244
+ export_doctags=export_doctags,
245
+ )
246
+
247
+ end_time = time.time() - start_time
248
+
249
+ _log.info(f"All documents were converted in {end_time:.2f} seconds.")
250
+
251
+
252
+ if __name__ == "__main__":
253
+ app()
@@ -9,6 +9,10 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
9
9
  from typing_extensions import Self
10
10
 
11
11
  from docling.backend.abstract_backend import PdfPageBackend
12
+ from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
13
+ PipelineOptions,
14
+ TableStructureOptions,
15
+ )
12
16
 
13
17
 
14
18
  class ConversionStatus(str, Enum):
@@ -16,7 +20,7 @@ class ConversionStatus(str, Enum):
16
20
  STARTED = auto()
17
21
  FAILURE = auto()
18
22
  SUCCESS = auto()
19
- SUCCESS_WITH_ERRORS = auto()
23
+ PARTIAL_SUCCESS = auto()
20
24
 
21
25
 
22
26
  class DocInputType(str, Enum):
@@ -29,6 +33,18 @@ class CoordOrigin(str, Enum):
29
33
  BOTTOMLEFT = auto()
30
34
 
31
35
 
36
+ class DoclingComponentType(str, Enum):
37
+ PDF_BACKEND = auto()
38
+ MODEL = auto()
39
+ DOC_ASSEMBLER = auto()
40
+
41
+
42
+ class ErrorItem(BaseModel):
43
+ component_type: DoclingComponentType
44
+ module_name: str
45
+ error_message: str
46
+
47
+
32
48
  class PageSize(BaseModel):
33
49
  width: float = 0.0
34
50
  height: float = 0.0
@@ -59,6 +75,15 @@ class BoundingBox(BaseModel):
59
75
 
60
76
  return out_bbox
61
77
 
78
+ def normalized(self, page_size: PageSize) -> "BoundingBox":
79
+ out_bbox = copy.deepcopy(self)
80
+ out_bbox.l /= page_size.width
81
+ out_bbox.r /= page_size.width
82
+ out_bbox.t /= page_size.height
83
+ out_bbox.b /= page_size.height
84
+
85
+ return out_bbox
86
+
62
87
  def as_tuple(self):
63
88
  if self.coord_origin == CoordOrigin.TOPLEFT:
64
89
  return (self.l, self.t, self.r, self.b)
@@ -66,7 +91,7 @@ class BoundingBox(BaseModel):
66
91
  return (self.l, self.b, self.r, self.t)
67
92
 
68
93
  @classmethod
69
- def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
94
+ def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
70
95
  if origin == CoordOrigin.TOPLEFT:
71
96
  l, t, r, b = coord[0], coord[1], coord[2], coord[3]
72
97
  if r < l:
@@ -85,7 +110,10 @@ class BoundingBox(BaseModel):
85
110
  return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
86
111
 
87
112
  def area(self) -> float:
88
- return (self.r - self.l) * (self.b - self.t)
113
+ area = (self.r - self.l) * (self.b - self.t)
114
+ if self.coord_origin == CoordOrigin.BOTTOMLEFT:
115
+ area = -area
116
+ return area
89
117
 
90
118
  def intersection_area_with(self, other: "BoundingBox") -> float:
91
119
  # Calculate intersection coordinates
@@ -225,19 +253,19 @@ class EquationPrediction(BaseModel):
225
253
 
226
254
 
227
255
  class PagePredictions(BaseModel):
228
- layout: LayoutPrediction = None
229
- tablestructure: TableStructurePrediction = None
230
- figures_classification: FigureClassificationPrediction = None
231
- equations_prediction: EquationPrediction = None
256
+ layout: Optional[LayoutPrediction] = None
257
+ tablestructure: Optional[TableStructurePrediction] = None
258
+ figures_classification: Optional[FigureClassificationPrediction] = None
259
+ equations_prediction: Optional[EquationPrediction] = None
232
260
 
233
261
 
234
262
  PageElement = Union[TextElement, TableElement, FigureElement]
235
263
 
236
264
 
237
265
  class AssembledUnit(BaseModel):
238
- elements: List[PageElement]
239
- body: List[PageElement]
240
- headers: List[PageElement]
266
+ elements: List[PageElement] = []
267
+ body: List[PageElement] = []
268
+ headers: List[PageElement] = []
241
269
 
242
270
 
243
271
  class Page(BaseModel):
@@ -246,7 +274,7 @@ class Page(BaseModel):
246
274
  page_no: int
247
275
  page_hash: Optional[str] = None
248
276
  size: Optional[PageSize] = None
249
- cells: List[Cell] = None
277
+ cells: List[Cell] = []
250
278
  predictions: PagePredictions = PagePredictions()
251
279
  assembled: Optional[AssembledUnit] = None
252
280
 
@@ -277,22 +305,6 @@ class DocumentStream(BaseModel):
277
305
  stream: BytesIO
278
306
 
279
307
 
280
- class TableStructureOptions(BaseModel):
281
- do_cell_matching: bool = (
282
- True
283
- # True: Matches predictions back to PDF cells. Can break table output if PDF cells
284
- # are merged across table columns.
285
- # False: Let table structure model define the text cells, ignore PDF cells.
286
- )
287
-
288
-
289
- class PipelineOptions(BaseModel):
290
- do_table_structure: bool = True # True: perform table structure extraction
291
- do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
292
-
293
- table_structure_options: TableStructureOptions = TableStructureOptions()
294
-
295
-
296
308
  class AssembleOptions(BaseModel):
297
309
  keep_page_images: Annotated[
298
310
  bool,