docling 1.19.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. docling/backend/abstract_backend.py +32 -37
  2. docling/backend/docling_parse_backend.py +16 -12
  3. docling/backend/docling_parse_v2_backend.py +240 -0
  4. docling/backend/html_backend.py +425 -0
  5. docling/backend/mspowerpoint_backend.py +375 -0
  6. docling/backend/msword_backend.py +509 -0
  7. docling/backend/pdf_backend.py +78 -0
  8. docling/backend/pypdfium2_backend.py +15 -10
  9. docling/cli/main.py +61 -60
  10. docling/datamodel/base_models.py +73 -193
  11. docling/datamodel/document.py +364 -318
  12. docling/datamodel/pipeline_options.py +13 -0
  13. docling/datamodel/settings.py +1 -0
  14. docling/document_converter.py +215 -252
  15. docling/models/base_model.py +25 -0
  16. docling/models/base_ocr_model.py +10 -5
  17. docling/models/ds_glm_model.py +209 -20
  18. docling/models/easyocr_model.py +4 -1
  19. docling/models/layout_model.py +73 -61
  20. docling/models/page_assemble_model.py +21 -5
  21. docling/models/page_preprocessing_model.py +57 -0
  22. docling/models/table_structure_model.py +34 -32
  23. docling/models/tesseract_ocr_cli_model.py +8 -5
  24. docling/models/tesseract_ocr_model.py +8 -5
  25. docling/pipeline/base_pipeline.py +190 -0
  26. docling/pipeline/simple_pipeline.py +59 -0
  27. docling/pipeline/standard_pdf_pipeline.py +198 -0
  28. docling/utils/export.py +4 -3
  29. docling/utils/layout_utils.py +17 -11
  30. docling-2.0.0.dist-info/METADATA +149 -0
  31. docling-2.0.0.dist-info/RECORD +42 -0
  32. docling/pipeline/base_model_pipeline.py +0 -18
  33. docling/pipeline/standard_model_pipeline.py +0 -66
  34. docling-1.19.1.dist-info/METADATA +0 -380
  35. docling-1.19.1.dist-info/RECORD +0 -34
  36. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/LICENSE +0 -0
  37. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/WHEEL +0 -0
  38. {docling-1.19.1.dist-info → docling-2.0.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py CHANGED
@@ -5,22 +5,27 @@ import time
5
5
  import warnings
6
6
  from enum import Enum
7
7
  from pathlib import Path
8
- from typing import Annotated, Iterable, List, Optional
8
+ from typing import Annotated, Dict, Iterable, List, Optional
9
9
 
10
10
  import typer
11
11
  from docling_core.utils.file import resolve_file_source
12
12
 
13
13
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
15
- from docling.datamodel.base_models import ConversionStatus
16
- from docling.datamodel.document import ConversionResult, DocumentConversionInput
14
+ from docling.datamodel.base_models import (
15
+ ConversionStatus,
16
+ FormatToExtensions,
17
+ InputFormat,
18
+ OutputFormat,
19
+ )
20
+ from docling.datamodel.document import ConversionResult
17
21
  from docling.datamodel.pipeline_options import (
18
22
  EasyOcrOptions,
19
- PipelineOptions,
23
+ OcrOptions,
24
+ PdfPipelineOptions,
20
25
  TesseractCliOcrOptions,
21
26
  TesseractOcrOptions,
22
27
  )
23
- from docling.document_converter import DocumentConverter
28
+ from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
24
29
 
25
30
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
26
31
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -87,28 +92,28 @@ def export_documents(
87
92
  fname = output_dir / f"{doc_filename}.json"
88
93
  with fname.open("w") as fp:
89
94
  _log.info(f"writing JSON output to {fname}")
90
- fp.write(json.dumps(conv_res.render_as_dict()))
95
+ fp.write(json.dumps(conv_res.document.export_to_dict()))
91
96
 
92
97
  # Export Text format:
93
98
  if export_txt:
94
99
  fname = output_dir / f"{doc_filename}.txt"
95
100
  with fname.open("w") as fp:
96
101
  _log.info(f"writing Text output to {fname}")
97
- fp.write(conv_res.render_as_text())
102
+ fp.write(conv_res.document.export_to_markdown(strict_text=True))
98
103
 
99
104
  # Export Markdown format:
100
105
  if export_md:
101
106
  fname = output_dir / f"{doc_filename}.md"
102
107
  with fname.open("w") as fp:
103
108
  _log.info(f"writing Markdown output to {fname}")
104
- fp.write(conv_res.render_as_markdown())
109
+ fp.write(conv_res.document.export_to_markdown())
105
110
 
106
111
  # Export Document Tags format:
107
112
  if export_doctags:
108
113
  fname = output_dir / f"{doc_filename}.doctags"
109
114
  with fname.open("w") as fp:
110
115
  _log.info(f"writing Doc Tags output to {fname}")
111
- fp.write(conv_res.render_as_doctags())
116
+ fp.write(conv_res.document.export_to_document_tokens())
112
117
 
113
118
  else:
114
119
  _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -129,44 +134,31 @@ def convert(
129
134
  help="PDF files to convert. Can be local file / directory paths or URL.",
130
135
  ),
131
136
  ],
132
- export_json: Annotated[
133
- bool,
134
- typer.Option(
135
- ..., "--json/--no-json", help="If enabled the document is exported as JSON."
136
- ),
137
- ] = False,
138
- export_md: Annotated[
139
- bool,
140
- typer.Option(
141
- ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
142
- ),
143
- ] = True,
144
- export_txt: Annotated[
145
- bool,
146
- typer.Option(
147
- ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
148
- ),
149
- ] = False,
150
- export_doctags: Annotated[
151
- bool,
152
- typer.Option(
153
- ...,
154
- "--doctags/--no-doctags",
155
- help="If enabled the document is exported as Doc Tags.",
156
- ),
157
- ] = False,
137
+ from_formats: List[InputFormat] = typer.Option(
138
+ None,
139
+ "--from",
140
+ help="Specify input formats to convert from. Defaults to all formats.",
141
+ ),
142
+ to_formats: List[OutputFormat] = typer.Option(
143
+ None, "--to", help="Specify output formats. Defaults to Markdown."
144
+ ),
158
145
  ocr: Annotated[
159
146
  bool,
160
147
  typer.Option(
161
148
  ..., help="If enabled, the bitmap content will be processed using OCR."
162
149
  ),
163
150
  ] = True,
164
- backend: Annotated[
165
- Backend, typer.Option(..., help="The PDF backend to use.")
166
- ] = Backend.DOCLING,
167
151
  ocr_engine: Annotated[
168
152
  OcrEngine, typer.Option(..., help="The OCR engine to use.")
169
153
  ] = OcrEngine.EASYOCR,
154
+ abort_on_error: Annotated[
155
+ bool,
156
+ typer.Option(
157
+ ...,
158
+ "--abort-on-error/--no-abort-on-error",
159
+ help="If enabled, the bitmap content will be processed using OCR.",
160
+ ),
161
+ ] = False,
170
162
  output: Annotated[
171
163
  Path, typer.Option(..., help="Output directory where results are saved.")
172
164
  ] = Path("."),
@@ -182,6 +174,9 @@ def convert(
182
174
  ):
183
175
  logging.basicConfig(level=logging.INFO)
184
176
 
177
+ if from_formats is None:
178
+ from_formats = [e for e in InputFormat]
179
+
185
180
  input_doc_paths: List[Path] = []
186
181
  for src in input_sources:
187
182
  source = resolve_file_source(source=src)
@@ -191,48 +186,54 @@ def convert(
191
186
  )
192
187
  raise typer.Abort()
193
188
  elif source.is_dir():
194
- input_doc_paths.extend(list(source.glob("**/*.pdf")))
195
- input_doc_paths.extend(list(source.glob("**/*.PDF")))
189
+ for fmt in from_formats:
190
+ for ext in FormatToExtensions[fmt]:
191
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
192
+ input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
196
193
  else:
197
194
  input_doc_paths.append(source)
198
195
 
199
- match backend:
200
- case Backend.PYPDFIUM2:
201
- do_cell_matching = ocr # only do cell matching when OCR enabled
202
- pdf_backend = PyPdfiumDocumentBackend
203
- case Backend.DOCLING:
204
- do_cell_matching = True
205
- pdf_backend = DoclingParseDocumentBackend
206
- case _:
207
- raise RuntimeError(f"Unexpected backend type {backend}")
196
+ if to_formats is None:
197
+ to_formats = [OutputFormat.MARKDOWN]
198
+
199
+ export_json = OutputFormat.JSON in to_formats
200
+ export_md = OutputFormat.MARKDOWN in to_formats
201
+ export_txt = OutputFormat.TEXT in to_formats
202
+ export_doctags = OutputFormat.DOCTAGS in to_formats
208
203
 
209
204
  match ocr_engine:
210
205
  case OcrEngine.EASYOCR:
211
- ocr_options = EasyOcrOptions()
206
+ ocr_options: OcrOptions = EasyOcrOptions()
212
207
  case OcrEngine.TESSERACT_CLI:
213
208
  ocr_options = TesseractCliOcrOptions()
214
209
  case OcrEngine.TESSERACT:
215
210
  ocr_options = TesseractOcrOptions()
216
211
  case _:
217
- raise RuntimeError(f"Unexpected backend type {backend}")
212
+ raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
218
213
 
219
- pipeline_options = PipelineOptions(
214
+ pipeline_options = PdfPipelineOptions(
220
215
  do_ocr=ocr,
221
216
  ocr_options=ocr_options,
222
217
  do_table_structure=True,
223
218
  )
224
- pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
219
+ pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
220
+
221
+ format_options: Dict[InputFormat, FormatOption] = {
222
+ InputFormat.PDF: PdfFormatOption(
223
+ pipeline_options=pipeline_options,
224
+ backend=DoclingParseDocumentBackend, # pdf_backend
225
+ )
226
+ }
225
227
  doc_converter = DocumentConverter(
226
- pipeline_options=pipeline_options,
227
- pdf_backend=pdf_backend,
228
+ allowed_formats=from_formats,
229
+ format_options=format_options,
228
230
  )
229
231
 
230
- # Define input files
231
- input = DocumentConversionInput.from_paths(input_doc_paths)
232
-
233
232
  start_time = time.time()
234
233
 
235
- conv_results = doc_converter.convert(input)
234
+ conv_results = doc_converter.convert_all(
235
+ input_doc_paths, raises_on_error=abort_on_error
236
+ )
236
237
 
237
238
  output.mkdir(parents=True, exist_ok=True)
238
239
  export_documents(
docling/datamodel/base_models.py CHANGED
@@ -1,18 +1,19 @@
1
- import copy
2
- import warnings
3
1
  from enum import Enum, auto
4
2
  from io import BytesIO
5
- from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
6
-
3
+ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
4
+
5
+ from docling_core.types.doc import (
6
+ BoundingBox,
7
+ DocItemLabel,
8
+ PictureDataType,
9
+ Size,
10
+ TableCell,
11
+ )
7
12
  from PIL.Image import Image
8
- from pydantic import BaseModel, ConfigDict, Field, model_validator
9
- from typing_extensions import Self
13
+ from pydantic import BaseModel, ConfigDict
10
14
 
11
- from docling.backend.abstract_backend import PdfPageBackend
12
- from docling.datamodel.pipeline_options import ( # Must be imported here for backward compatibility.
13
- PipelineOptions,
14
- TableStructureOptions,
15
- )
15
+ if TYPE_CHECKING:
16
+ from docling.backend.pdf_backend import PdfPageBackend
16
17
 
17
18
 
18
19
  class ConversionStatus(str, Enum):
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
23
24
  PARTIAL_SUCCESS = auto()
24
25
 
25
26
 
27
+ class InputFormat(str, Enum):
28
+ DOCX = "docx"
29
+ PPTX = "pptx"
30
+ HTML = "html"
31
+ IMAGE = "image"
32
+ PDF = "pdf"
33
+
34
+
35
+ class OutputFormat(str, Enum):
36
+ MARKDOWN = "md"
37
+ JSON = "json"
38
+ TEXT = "text"
39
+ DOCTAGS = "doctags"
40
+
41
+
42
+ FormatToExtensions: Dict[InputFormat, List[str]] = {
43
+ InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
44
+ InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
45
+ InputFormat.PDF: ["pdf"],
46
+ InputFormat.HTML: ["html", "htm", "xhtml"],
47
+ InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
48
+ }
49
+
50
+ FormatToMimeType: Dict[InputFormat, Set[str]] = {
51
+ InputFormat.DOCX: {
52
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
53
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
54
+ },
55
+ InputFormat.PPTX: {
56
+ "application/vnd.openxmlformats-officedocument.presentationml.template",
57
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
58
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
59
+ },
60
+ InputFormat.HTML: {"text/html", "application/xhtml+xml"},
61
+ InputFormat.IMAGE: {
62
+ "image/png",
63
+ "image/jpeg",
64
+ "image/tiff",
65
+ "image/gif",
66
+ "image/bmp",
67
+ },
68
+ InputFormat.PDF: {"application/pdf"},
69
+ }
70
+ MimeTypeToFormat = {
71
+ mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
72
+ }
73
+
74
+
26
75
  class DocInputType(str, Enum):
27
76
  PATH = auto()
28
77
  STREAM = auto()
29
78
 
30
79
 
31
- class CoordOrigin(str, Enum):
32
- TOPLEFT = auto()
33
- BOTTOMLEFT = auto()
34
-
35
-
36
80
  class DoclingComponentType(str, Enum):
37
- PDF_BACKEND = auto()
81
+ DOCUMENT_BACKEND = auto()
38
82
  MODEL = auto()
39
83
  DOC_ASSEMBLER = auto()
40
84
 
@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
45
89
  error_message: str
46
90
 
47
91
 
48
- class PageSize(BaseModel):
49
- width: float = 0.0
50
- height: float = 0.0
51
-
52
-
53
- class BoundingBox(BaseModel):
54
- l: float # left
55
- t: float # top
56
- r: float # right
57
- b: float # bottom
58
-
59
- coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
60
-
61
- @property
62
- def width(self):
63
- return self.r - self.l
64
-
65
- @property
66
- def height(self):
67
- return abs(self.t - self.b)
68
-
69
- def scaled(self, scale: float) -> "BoundingBox":
70
- out_bbox = copy.deepcopy(self)
71
- out_bbox.l *= scale
72
- out_bbox.r *= scale
73
- out_bbox.t *= scale
74
- out_bbox.b *= scale
75
-
76
- return out_bbox
77
-
78
- def normalized(self, page_size: PageSize) -> "BoundingBox":
79
- out_bbox = copy.deepcopy(self)
80
- out_bbox.l /= page_size.width
81
- out_bbox.r /= page_size.width
82
- out_bbox.t /= page_size.height
83
- out_bbox.b /= page_size.height
84
-
85
- return out_bbox
86
-
87
- def as_tuple(self):
88
- if self.coord_origin == CoordOrigin.TOPLEFT:
89
- return (self.l, self.t, self.r, self.b)
90
- elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
91
- return (self.l, self.b, self.r, self.t)
92
-
93
- @classmethod
94
- def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
95
- if origin == CoordOrigin.TOPLEFT:
96
- l, t, r, b = coord[0], coord[1], coord[2], coord[3]
97
- if r < l:
98
- l, r = r, l
99
- if b < t:
100
- b, t = t, b
101
-
102
- return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
103
- elif origin == CoordOrigin.BOTTOMLEFT:
104
- l, b, r, t = coord[0], coord[1], coord[2], coord[3]
105
- if r < l:
106
- l, r = r, l
107
- if b > t:
108
- b, t = t, b
109
-
110
- return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
111
-
112
- def area(self) -> float:
113
- area = (self.r - self.l) * (self.b - self.t)
114
- if self.coord_origin == CoordOrigin.BOTTOMLEFT:
115
- area = -area
116
- return area
117
-
118
- def intersection_area_with(self, other: "BoundingBox") -> float:
119
- # Calculate intersection coordinates
120
- left = max(self.l, other.l)
121
- top = max(self.t, other.t)
122
- right = min(self.r, other.r)
123
- bottom = min(self.b, other.b)
124
-
125
- # Calculate intersection dimensions
126
- width = right - left
127
- height = bottom - top
128
-
129
- # If the bounding boxes do not overlap, width or height will be negative
130
- if width <= 0 or height <= 0:
131
- return 0.0
132
-
133
- return width * height
134
-
135
- def to_bottom_left_origin(self, page_height) -> "BoundingBox":
136
- if self.coord_origin == CoordOrigin.BOTTOMLEFT:
137
- return self
138
- elif self.coord_origin == CoordOrigin.TOPLEFT:
139
- return BoundingBox(
140
- l=self.l,
141
- r=self.r,
142
- t=page_height - self.t,
143
- b=page_height - self.b,
144
- coord_origin=CoordOrigin.BOTTOMLEFT,
145
- )
146
-
147
- def to_top_left_origin(self, page_height):
148
- if self.coord_origin == CoordOrigin.TOPLEFT:
149
- return self
150
- elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
151
- return BoundingBox(
152
- l=self.l,
153
- r=self.r,
154
- t=page_height - self.t, # self.b
155
- b=page_height - self.b, # self.t
156
- coord_origin=CoordOrigin.TOPLEFT,
157
- )
158
-
159
-
160
92
  class Cell(BaseModel):
161
93
  id: int
162
94
  text: str
@@ -169,14 +101,14 @@ class OcrCell(Cell):
169
101
 
170
102
  class Cluster(BaseModel):
171
103
  id: int
172
- label: str
104
+ label: DocItemLabel
173
105
  bbox: BoundingBox
174
106
  confidence: float = 1.0
175
107
  cells: List[Cell] = []
176
108
 
177
109
 
178
110
  class BasePageElement(BaseModel):
179
- label: str
111
+ label: DocItemLabel
180
112
  id: int
181
113
  page_no: int
182
114
  cluster: Cluster
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
187
119
  clusters: List[Cluster] = []
188
120
 
189
121
 
190
- class TableCell(BaseModel):
191
- bbox: BoundingBox
192
- row_span: int
193
- col_span: int
194
- start_row_offset_idx: int
195
- end_row_offset_idx: int
196
- start_col_offset_idx: int
197
- end_col_offset_idx: int
198
- text: str
199
- column_header: bool = False
200
- row_header: bool = False
201
- row_section: bool = False
202
-
203
- @model_validator(mode="before")
204
- @classmethod
205
- def from_dict_format(cls, data: Any) -> Any:
206
- if isinstance(data, Dict):
207
- text = data["bbox"].get("token", "")
208
- if not len(text):
209
- text_cells = data.pop("text_cell_bboxes", None)
210
- if text_cells:
211
- for el in text_cells:
212
- text += el["token"] + " "
213
-
214
- text = text.strip()
215
- data["text"] = text
216
-
217
- return data
218
-
219
-
220
- class TableElement(BasePageElement):
122
+ class Table(BasePageElement):
221
123
  otsl_seq: List[str]
222
124
  num_rows: int = 0
223
125
  num_cols: int = 0
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):
225
127
 
226
128
 
227
129
  class TableStructurePrediction(BaseModel):
228
- table_map: Dict[int, TableElement] = {}
229
-
230
-
231
- class TextElement(BasePageElement): ...
130
+ table_map: Dict[int, Table] = {}
232
131
 
233
132
 
234
- class FigureData(BaseModel):
235
- pass
133
+ class TextElement(BasePageElement):
134
+ text: str
236
135
 
237
136
 
238
137
  class FigureElement(BasePageElement):
239
- data: Optional[FigureData] = None
138
+ annotations: List[PictureDataType] = []
240
139
  provenance: Optional[str] = None
241
140
  predicted_class: Optional[str] = None
242
141
  confidence: Optional[float] = None
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
259
158
  equations_prediction: Optional[EquationPrediction] = None
260
159
 
261
160
 
262
- PageElement = Union[TextElement, TableElement, FigureElement]
161
+ PageElement = Union[TextElement, Table, FigureElement]
263
162
 
264
163
 
265
164
  class AssembledUnit(BaseModel):
@@ -272,13 +171,13 @@ class Page(BaseModel):
272
171
  model_config = ConfigDict(arbitrary_types_allowed=True)
273
172
 
274
173
  page_no: int
275
- page_hash: Optional[str] = None
276
- size: Optional[PageSize] = None
174
+ # page_hash: Optional[str] = None
175
+ size: Optional[Size] = None
277
176
  cells: List[Cell] = []
278
177
  predictions: PagePredictions = PagePredictions()
279
178
  assembled: Optional[AssembledUnit] = None
280
179
 
281
- _backend: Optional[PdfPageBackend] = (
180
+ _backend: Optional["PdfPageBackend"] = (
282
181
  None # Internal PDF backend. By default it is cleared during assembling.
283
182
  )
284
183
  _default_image_scale: float = 1.0 # Default image scale for external usage.
@@ -301,24 +200,5 @@ class Page(BaseModel):
301
200
  class DocumentStream(BaseModel):
302
201
  model_config = ConfigDict(arbitrary_types_allowed=True)
303
202
 
304
- filename: str
203
+ name: str
305
204
  stream: BytesIO
306
-
307
-
308
- class AssembleOptions(BaseModel):
309
- keep_page_images: Annotated[
310
- bool,
311
- Field(
312
- deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
313
- ),
314
- ] = False # False: page images are removed in the assemble step
315
- images_scale: Optional[float] = None # if set, the scale for generated images
316
-
317
- @model_validator(mode="after")
318
- def set_page_images_from_deprecated(self) -> Self:
319
- with warnings.catch_warnings():
320
- warnings.simplefilter("ignore", DeprecationWarning)
321
- default_scale = 1.0
322
- if self.keep_page_images and self.images_scale is None:
323
- self.images_scale = default_scale
324
- return self