docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. docling/backend/abstract_backend.py +33 -37
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +20 -16
  4. docling/backend/docling_parse_v2_backend.py +248 -0
  5. docling/backend/html_backend.py +429 -0
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +398 -0
  8. docling/backend/msword_backend.py +496 -0
  9. docling/backend/pdf_backend.py +78 -0
  10. docling/backend/pypdfium2_backend.py +16 -11
  11. docling/cli/main.py +96 -65
  12. docling/datamodel/base_models.py +79 -193
  13. docling/datamodel/document.py +405 -320
  14. docling/datamodel/pipeline_options.py +19 -3
  15. docling/datamodel/settings.py +16 -1
  16. docling/document_converter.py +240 -251
  17. docling/models/base_model.py +28 -0
  18. docling/models/base_ocr_model.py +40 -10
  19. docling/models/ds_glm_model.py +244 -30
  20. docling/models/easyocr_model.py +57 -42
  21. docling/models/layout_model.py +158 -116
  22. docling/models/page_assemble_model.py +127 -101
  23. docling/models/page_preprocessing_model.py +79 -0
  24. docling/models/table_structure_model.py +162 -116
  25. docling/models/tesseract_ocr_cli_model.py +76 -59
  26. docling/models/tesseract_ocr_model.py +90 -58
  27. docling/pipeline/base_pipeline.py +189 -0
  28. docling/pipeline/simple_pipeline.py +56 -0
  29. docling/pipeline/standard_pdf_pipeline.py +201 -0
  30. docling/utils/export.py +4 -3
  31. docling/utils/layout_utils.py +17 -11
  32. docling/utils/profiling.py +62 -0
  33. docling-2.4.1.dist-info/METADATA +154 -0
  34. docling-2.4.1.dist-info/RECORD +45 -0
  35. docling/pipeline/base_model_pipeline.py +0 -18
  36. docling/pipeline/standard_model_pipeline.py +0 -66
  37. docling-1.19.1.dist-info/METADATA +0 -380
  38. docling-1.19.1.dist-info/RECORD +0 -34
  39. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  40. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  41. {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/cli/main.py CHANGED
@@ -5,22 +5,31 @@ import time
5
5
  import warnings
6
6
  from enum import Enum
7
7
  from pathlib import Path
8
- from typing import Annotated, Iterable, List, Optional
8
+ from typing import Annotated, Dict, Iterable, List, Optional, Type
9
9
 
10
10
  import typer
11
11
  from docling_core.utils.file import resolve_file_source
12
12
 
13
13
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
+ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
15
+ from docling.backend.pdf_backend import PdfDocumentBackend
14
16
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
15
- from docling.datamodel.base_models import ConversionStatus
16
- from docling.datamodel.document import ConversionResult, DocumentConversionInput
17
+ from docling.datamodel.base_models import (
18
+ ConversionStatus,
19
+ FormatToExtensions,
20
+ InputFormat,
21
+ OutputFormat,
22
+ )
23
+ from docling.datamodel.document import ConversionResult
17
24
  from docling.datamodel.pipeline_options import (
18
25
  EasyOcrOptions,
19
- PipelineOptions,
26
+ OcrOptions,
27
+ PdfPipelineOptions,
28
+ TableFormerMode,
20
29
  TesseractCliOcrOptions,
21
30
  TesseractOcrOptions,
22
31
  )
23
- from docling.document_converter import DocumentConverter
32
+ from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
24
33
 
25
34
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
26
35
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -53,9 +62,10 @@ def version_callback(value: bool):
53
62
 
54
63
 
55
64
  # Define an enum for the backend options
56
- class Backend(str, Enum):
65
+ class PdfBackend(str, Enum):
57
66
  PYPDFIUM2 = "pypdfium2"
58
- DOCLING = "docling"
67
+ DLPARSE_V1 = "dlparse_v1"
68
+ DLPARSE_V2 = "dlparse_v2"
59
69
 
60
70
 
61
71
  # Define an enum for the ocr engines
@@ -85,30 +95,30 @@ def export_documents(
85
95
  # Export Deep Search document JSON format:
86
96
  if export_json:
87
97
  fname = output_dir / f"{doc_filename}.json"
88
- with fname.open("w") as fp:
98
+ with fname.open("w", encoding="utf8") as fp:
89
99
  _log.info(f"writing JSON output to {fname}")
90
- fp.write(json.dumps(conv_res.render_as_dict()))
100
+ fp.write(json.dumps(conv_res.document.export_to_dict()))
91
101
 
92
102
  # Export Text format:
93
103
  if export_txt:
94
104
  fname = output_dir / f"{doc_filename}.txt"
95
- with fname.open("w") as fp:
105
+ with fname.open("w", encoding="utf8") as fp:
96
106
  _log.info(f"writing Text output to {fname}")
97
- fp.write(conv_res.render_as_text())
107
+ fp.write(conv_res.document.export_to_markdown(strict_text=True))
98
108
 
99
109
  # Export Markdown format:
100
110
  if export_md:
101
111
  fname = output_dir / f"{doc_filename}.md"
102
- with fname.open("w") as fp:
112
+ with fname.open("w", encoding="utf8") as fp:
103
113
  _log.info(f"writing Markdown output to {fname}")
104
- fp.write(conv_res.render_as_markdown())
114
+ fp.write(conv_res.document.export_to_markdown())
105
115
 
106
116
  # Export Document Tags format:
107
117
  if export_doctags:
108
118
  fname = output_dir / f"{doc_filename}.doctags"
109
- with fname.open("w") as fp:
119
+ with fname.open("w", encoding="utf8") as fp:
110
120
  _log.info(f"writing Doc Tags output to {fname}")
111
- fp.write(conv_res.render_as_doctags())
121
+ fp.write(conv_res.document.export_to_document_tokens())
112
122
 
113
123
  else:
114
124
  _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -129,44 +139,42 @@ def convert(
129
139
  help="PDF files to convert. Can be local file / directory paths or URL.",
130
140
  ),
131
141
  ],
132
- export_json: Annotated[
133
- bool,
134
- typer.Option(
135
- ..., "--json/--no-json", help="If enabled the document is exported as JSON."
136
- ),
137
- ] = False,
138
- export_md: Annotated[
139
- bool,
140
- typer.Option(
141
- ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
142
- ),
143
- ] = True,
144
- export_txt: Annotated[
145
- bool,
146
- typer.Option(
147
- ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
148
- ),
149
- ] = False,
150
- export_doctags: Annotated[
151
- bool,
152
- typer.Option(
153
- ...,
154
- "--doctags/--no-doctags",
155
- help="If enabled the document is exported as Doc Tags.",
156
- ),
157
- ] = False,
142
+ from_formats: List[InputFormat] = typer.Option(
143
+ None,
144
+ "--from",
145
+ help="Specify input formats to convert from. Defaults to all formats.",
146
+ ),
147
+ to_formats: List[OutputFormat] = typer.Option(
148
+ None, "--to", help="Specify output formats. Defaults to Markdown."
149
+ ),
158
150
  ocr: Annotated[
159
151
  bool,
160
152
  typer.Option(
161
153
  ..., help="If enabled, the bitmap content will be processed using OCR."
162
154
  ),
163
155
  ] = True,
164
- backend: Annotated[
165
- Backend, typer.Option(..., help="The PDF backend to use.")
166
- ] = Backend.DOCLING,
167
156
  ocr_engine: Annotated[
168
157
  OcrEngine, typer.Option(..., help="The OCR engine to use.")
169
158
  ] = OcrEngine.EASYOCR,
159
+ pdf_backend: Annotated[
160
+ PdfBackend, typer.Option(..., help="The PDF backend to use.")
161
+ ] = PdfBackend.DLPARSE_V1,
162
+ table_mode: Annotated[
163
+ TableFormerMode,
164
+ typer.Option(..., help="The mode to use in the table structure model."),
165
+ ] = TableFormerMode.FAST,
166
+ artifacts_path: Annotated[
167
+ Optional[Path],
168
+ typer.Option(..., help="If provided, the location of the model artifacts."),
169
+ ] = None,
170
+ abort_on_error: Annotated[
171
+ bool,
172
+ typer.Option(
173
+ ...,
174
+ "--abort-on-error/--no-abort-on-error",
175
+ help="If enabled, the bitmap content will be processed using OCR.",
176
+ ),
177
+ ] = False,
170
178
  output: Annotated[
171
179
  Path, typer.Option(..., help="Output directory where results are saved.")
172
180
  ] = Path("."),
@@ -182,6 +190,9 @@ def convert(
182
190
  ):
183
191
  logging.basicConfig(level=logging.INFO)
184
192
 
193
+ if from_formats is None:
194
+ from_formats = [e for e in InputFormat]
195
+
185
196
  input_doc_paths: List[Path] = []
186
197
  for src in input_sources:
187
198
  source = resolve_file_source(source=src)
@@ -191,48 +202,68 @@ def convert(
191
202
  )
192
203
  raise typer.Abort()
193
204
  elif source.is_dir():
194
- input_doc_paths.extend(list(source.glob("**/*.pdf")))
195
- input_doc_paths.extend(list(source.glob("**/*.PDF")))
205
+ for fmt in from_formats:
206
+ for ext in FormatToExtensions[fmt]:
207
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
208
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
196
209
  else:
197
210
  input_doc_paths.append(source)
198
211
 
199
- match backend:
200
- case Backend.PYPDFIUM2:
201
- do_cell_matching = ocr # only do cell matching when OCR enabled
202
- pdf_backend = PyPdfiumDocumentBackend
203
- case Backend.DOCLING:
204
- do_cell_matching = True
205
- pdf_backend = DoclingParseDocumentBackend
206
- case _:
207
- raise RuntimeError(f"Unexpected backend type {backend}")
212
+ if to_formats is None:
213
+ to_formats = [OutputFormat.MARKDOWN]
214
+
215
+ export_json = OutputFormat.JSON in to_formats
216
+ export_md = OutputFormat.MARKDOWN in to_formats
217
+ export_txt = OutputFormat.TEXT in to_formats
218
+ export_doctags = OutputFormat.DOCTAGS in to_formats
208
219
 
209
220
  match ocr_engine:
210
221
  case OcrEngine.EASYOCR:
211
- ocr_options = EasyOcrOptions()
222
+ ocr_options: OcrOptions = EasyOcrOptions()
212
223
  case OcrEngine.TESSERACT_CLI:
213
224
  ocr_options = TesseractCliOcrOptions()
214
225
  case OcrEngine.TESSERACT:
215
226
  ocr_options = TesseractOcrOptions()
216
227
  case _:
217
- raise RuntimeError(f"Unexpected backend type {backend}")
228
+ raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
218
229
 
219
- pipeline_options = PipelineOptions(
230
+ pipeline_options = PdfPipelineOptions(
220
231
  do_ocr=ocr,
221
232
  ocr_options=ocr_options,
222
233
  do_table_structure=True,
223
234
  )
224
- pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
235
+ pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
236
+ pipeline_options.table_structure_options.mode = table_mode
237
+
238
+ if artifacts_path is not None:
239
+ pipeline_options.artifacts_path = artifacts_path
240
+
241
+ match pdf_backend:
242
+ case PdfBackend.DLPARSE_V1:
243
+ backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
244
+ case PdfBackend.DLPARSE_V2:
245
+ backend = DoclingParseV2DocumentBackend
246
+ case PdfBackend.PYPDFIUM2:
247
+ backend = PyPdfiumDocumentBackend
248
+ case _:
249
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
250
+
251
+ format_options: Dict[InputFormat, FormatOption] = {
252
+ InputFormat.PDF: PdfFormatOption(
253
+ pipeline_options=pipeline_options,
254
+ backend=backend, # pdf_backend
255
+ )
256
+ }
225
257
  doc_converter = DocumentConverter(
226
- pipeline_options=pipeline_options,
227
- pdf_backend=pdf_backend,
258
+ allowed_formats=from_formats,
259
+ format_options=format_options,
228
260
  )
229
261
 
230
- # Define input files
231
- input = DocumentConversionInput.from_paths(input_doc_paths)
232
-
233
262
  start_time = time.time()
234
263
 
235
- conv_results = doc_converter.convert(input)
264
+ conv_results = doc_converter.convert_all(
265
+ input_doc_paths, raises_on_error=abort_on_error
266
+ )
236
267
 
237
268
  output.mkdir(parents=True, exist_ok=True)
238
269
  export_documents(
docling/datamodel/base_models.py CHANGED
@@ -1,18 +1,19 @@
1
- import copy
2
- import warnings
3
1
  from enum import Enum, auto
4
2
  from io import BytesIO
5
- from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
6
-
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Union
4
+
5
+ from docling_core.types.doc import (
6
+ BoundingBox,
7
+ DocItemLabel,
8
+ PictureDataType,
9
+ Size,
10
+ TableCell,
11
+ )
7
12
  from PIL.Image import Image
8
- from pydantic import BaseModel, ConfigDict, Field, model_validator
9
- from typing_extensions import Self
13
+ from pydantic import BaseModel, ConfigDict
10
14
 
11
- from docling.backend.abstract_backend import PdfPageBackend
12
- from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
13
- PipelineOptions,
14
- TableStructureOptions,
15
- )
15
+ if TYPE_CHECKING:
16
+ from docling.backend.pdf_backend import PdfPageBackend
16
17
 
17
18
 
18
19
  class ConversionStatus(str, Enum):
@@ -23,18 +24,67 @@ class ConversionStatus(str, Enum):
23
24
  PARTIAL_SUCCESS = auto()
24
25
 
25
26
 
27
+ class InputFormat(str, Enum):
28
+ DOCX = "docx"
29
+ PPTX = "pptx"
30
+ HTML = "html"
31
+ IMAGE = "image"
32
+ PDF = "pdf"
33
+ ASCIIDOC = "asciidoc"
34
+ MD = "md"
35
+
36
+
37
+ class OutputFormat(str, Enum):
38
+ MARKDOWN = "md"
39
+ JSON = "json"
40
+ TEXT = "text"
41
+ DOCTAGS = "doctags"
42
+
43
+
44
+ FormatToExtensions: Dict[InputFormat, List[str]] = {
45
+ InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
46
+ InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
47
+ InputFormat.PDF: ["pdf"],
48
+ InputFormat.MD: ["md"],
49
+ InputFormat.HTML: ["html", "htm", "xhtml"],
50
+ InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
51
+ InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
52
+ }
53
+
54
+ FormatToMimeType: Dict[InputFormat, List[str]] = {
55
+ InputFormat.DOCX: [
56
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
57
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
58
+ ],
59
+ InputFormat.PPTX: [
60
+ "application/vnd.openxmlformats-officedocument.presentationml.template",
61
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
62
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
63
+ ],
64
+ InputFormat.HTML: ["text/html", "application/xhtml+xml"],
65
+ InputFormat.IMAGE: [
66
+ "image/png",
67
+ "image/jpeg",
68
+ "image/tiff",
69
+ "image/gif",
70
+ "image/bmp",
71
+ ],
72
+ InputFormat.PDF: ["application/pdf"],
73
+ InputFormat.ASCIIDOC: ["text/asciidoc"],
74
+ InputFormat.MD: ["text/markdown", "text/x-markdown"],
75
+ }
76
+ MimeTypeToFormat = {
77
+ mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
78
+ }
79
+
80
+
26
81
  class DocInputType(str, Enum):
27
82
  PATH = auto()
28
83
  STREAM = auto()
29
84
 
30
85
 
31
- class CoordOrigin(str, Enum):
32
- TOPLEFT = auto()
33
- BOTTOMLEFT = auto()
34
-
35
-
36
86
  class DoclingComponentType(str, Enum):
37
- PDF_BACKEND = auto()
87
+ DOCUMENT_BACKEND = auto()
38
88
  MODEL = auto()
39
89
  DOC_ASSEMBLER = auto()
40
90
 
@@ -45,118 +95,6 @@ class ErrorItem(BaseModel):
45
95
  error_message: str
46
96
 
47
97
 
48
- class PageSize(BaseModel):
49
- width: float = 0.0
50
- height: float = 0.0
51
-
52
-
53
- class BoundingBox(BaseModel):
54
- l: float # left
55
- t: float # top
56
- r: float # right
57
- b: float # bottom
58
-
59
- coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
60
-
61
- @property
62
- def width(self):
63
- return self.r - self.l
64
-
65
- @property
66
- def height(self):
67
- return abs(self.t - self.b)
68
-
69
- def scaled(self, scale: float) -> "BoundingBox":
70
- out_bbox = copy.deepcopy(self)
71
- out_bbox.l *= scale
72
- out_bbox.r *= scale
73
- out_bbox.t *= scale
74
- out_bbox.b *= scale
75
-
76
- return out_bbox
77
-
78
- def normalized(self, page_size: PageSize) -> "BoundingBox":
79
- out_bbox = copy.deepcopy(self)
80
- out_bbox.l /= page_size.width
81
- out_bbox.r /= page_size.width
82
- out_bbox.t /= page_size.height
83
- out_bbox.b /= page_size.height
84
-
85
- return out_bbox
86
-
87
- def as_tuple(self):
88
- if self.coord_origin == CoordOrigin.TOPLEFT:
89
- return (self.l, self.t, self.r, self.b)
90
- elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
91
- return (self.l, self.b, self.r, self.t)
92
-
93
- @classmethod
94
- def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
95
- if origin == CoordOrigin.TOPLEFT:
96
- l, t, r, b = coord[0], coord[1], coord[2], coord[3]
97
- if r < l:
98
- l, r = r, l
99
- if b < t:
100
- b, t = t, b
101
-
102
- return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
103
- elif origin == CoordOrigin.BOTTOMLEFT:
104
- l, b, r, t = coord[0], coord[1], coord[2], coord[3]
105
- if r < l:
106
- l, r = r, l
107
- if b > t:
108
- b, t = t, b
109
-
110
- return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
111
-
112
- def area(self) -> float:
113
- area = (self.r - self.l) * (self.b - self.t)
114
- if self.coord_origin == CoordOrigin.BOTTOMLEFT:
115
- area = -area
116
- return area
117
-
118
- def intersection_area_with(self, other: "BoundingBox") -> float:
119
- # Calculate intersection coordinates
120
- left = max(self.l, other.l)
121
- top = max(self.t, other.t)
122
- right = min(self.r, other.r)
123
- bottom = min(self.b, other.b)
124
-
125
- # Calculate intersection dimensions
126
- width = right - left
127
- height = bottom - top
128
-
129
- # If the bounding boxes do not overlap, width or height will be negative
130
- if width <= 0 or height <= 0:
131
- return 0.0
132
-
133
- return width * height
134
-
135
- def to_bottom_left_origin(self, page_height) -> "BoundingBox":
136
- if self.coord_origin == CoordOrigin.BOTTOMLEFT:
137
- return self
138
- elif self.coord_origin == CoordOrigin.TOPLEFT:
139
- return BoundingBox(
140
- l=self.l,
141
- r=self.r,
142
- t=page_height - self.t,
143
- b=page_height - self.b,
144
- coord_origin=CoordOrigin.BOTTOMLEFT,
145
- )
146
-
147
- def to_top_left_origin(self, page_height):
148
- if self.coord_origin == CoordOrigin.TOPLEFT:
149
- return self
150
- elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
151
- return BoundingBox(
152
- l=self.l,
153
- r=self.r,
154
- t=page_height - self.t, # self.b
155
- b=page_height - self.b, # self.t
156
- coord_origin=CoordOrigin.TOPLEFT,
157
- )
158
-
159
-
160
98
  class Cell(BaseModel):
161
99
  id: int
162
100
  text: str
@@ -169,14 +107,14 @@ class OcrCell(Cell):
169
107
 
170
108
  class Cluster(BaseModel):
171
109
  id: int
172
- label: str
110
+ label: DocItemLabel
173
111
  bbox: BoundingBox
174
112
  confidence: float = 1.0
175
113
  cells: List[Cell] = []
176
114
 
177
115
 
178
116
  class BasePageElement(BaseModel):
179
- label: str
117
+ label: DocItemLabel
180
118
  id: int
181
119
  page_no: int
182
120
  cluster: Cluster
@@ -187,37 +125,7 @@ class LayoutPrediction(BaseModel):
187
125
  clusters: List[Cluster] = []
188
126
 
189
127
 
190
- class TableCell(BaseModel):
191
- bbox: BoundingBox
192
- row_span: int
193
- col_span: int
194
- start_row_offset_idx: int
195
- end_row_offset_idx: int
196
- start_col_offset_idx: int
197
- end_col_offset_idx: int
198
- text: str
199
- column_header: bool = False
200
- row_header: bool = False
201
- row_section: bool = False
202
-
203
- @model_validator(mode="before")
204
- @classmethod
205
- def from_dict_format(cls, data: Any) -> Any:
206
- if isinstance(data, Dict):
207
- text = data["bbox"].get("token", "")
208
- if not len(text):
209
- text_cells = data.pop("text_cell_bboxes", None)
210
- if text_cells:
211
- for el in text_cells:
212
- text += el["token"] + " "
213
-
214
- text = text.strip()
215
- data["text"] = text
216
-
217
- return data
218
-
219
-
220
- class TableElement(BasePageElement):
128
+ class Table(BasePageElement):
221
129
  otsl_seq: List[str]
222
130
  num_rows: int = 0
223
131
  num_cols: int = 0
@@ -225,18 +133,15 @@ class TableElement(BasePageElement):
225
133
 
226
134
 
227
135
  class TableStructurePrediction(BaseModel):
228
- table_map: Dict[int, TableElement] = {}
229
-
230
-
231
- class TextElement(BasePageElement): ...
136
+ table_map: Dict[int, Table] = {}
232
137
 
233
138
 
234
- class FigureData(BaseModel):
235
- pass
139
+ class TextElement(BasePageElement):
140
+ text: str
236
141
 
237
142
 
238
143
  class FigureElement(BasePageElement):
239
- data: Optional[FigureData] = None
144
+ annotations: List[PictureDataType] = []
240
145
  provenance: Optional[str] = None
241
146
  predicted_class: Optional[str] = None
242
147
  confidence: Optional[float] = None
@@ -259,7 +164,7 @@ class PagePredictions(BaseModel):
259
164
  equations_prediction: Optional[EquationPrediction] = None
260
165
 
261
166
 
262
- PageElement = Union[TextElement, TableElement, FigureElement]
167
+ PageElement = Union[TextElement, Table, FigureElement]
263
168
 
264
169
 
265
170
  class AssembledUnit(BaseModel):
@@ -272,13 +177,13 @@ class Page(BaseModel):
272
177
  model_config = ConfigDict(arbitrary_types_allowed=True)
273
178
 
274
179
  page_no: int
275
- page_hash: Optional[str] = None
276
- size: Optional[PageSize] = None
180
+ # page_hash: Optional[str] = None
181
+ size: Optional[Size] = None
277
182
  cells: List[Cell] = []
278
183
  predictions: PagePredictions = PagePredictions()
279
184
  assembled: Optional[AssembledUnit] = None
280
185
 
281
- _backend: Optional[PdfPageBackend] = (
186
+ _backend: Optional["PdfPageBackend"] = (
282
187
  None # Internal PDF backend. By default it is cleared during assembling.
283
188
  )
284
189
  _default_image_scale: float = 1.0 # Default image scale for external usage.
@@ -301,24 +206,5 @@ class Page(BaseModel):
301
206
  class DocumentStream(BaseModel):
302
207
  model_config = ConfigDict(arbitrary_types_allowed=True)
303
208
 
304
- filename: str
209
+ name: str
305
210
  stream: BytesIO
306
-
307
-
308
- class AssembleOptions(BaseModel):
309
- keep_page_images: Annotated[
310
- bool,
311
- Field(
312
- deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
313
- ),
314
- ] = False # False: page images are removed in the assemble step
315
- images_scale: Optional[float] = None # if set, the scale for generated images
316
-
317
- @model_validator(mode="after")
318
- def set_page_images_from_deprecated(self) -> Self:
319
- with warnings.catch_warnings():
320
- warnings.simplefilter("ignore", DeprecationWarning)
321
- default_scale = 1.0
322
- if self.keep_page_images and self.images_scale is None:
323
- self.images_scale = default_scale
324
- return self