docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +26 -9
- docling/backend/md_backend.py +5 -7
- docling/backend/msexcel_backend.py +271 -95
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +23 -15
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +10 -13
- docling/backend/xml/uspto_backend.py +15 -19
- docling/cli/main.py +27 -9
- docling/cli/models.py +2 -3
- docling/datamodel/base_models.py +40 -5
- docling/datamodel/document.py +18 -10
- docling/datamodel/pipeline_options.py +29 -4
- docling/document_converter.py +5 -5
- docling/models/api_vlm_model.py +66 -0
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +9 -75
- docling/models/picture_description_base_model.py +16 -5
- docling/models/picture_description_vlm_model.py +2 -3
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +8 -23
- docling/models/table_structure_model.py +2 -6
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +8 -6
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +6 -3
- docling/pipeline/vlm_pipeline.py +27 -20
- docling/utils/api_image_request.py +61 -0
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +7 -7
- docling/utils/utils.py +1 -1
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
- docling-2.31.0.dist-info/RECORD +86 -0
- docling-2.29.0.dist-info/RECORD +0 -84
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
docling/backend/pypdfium2_backend.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         self.valid = True  # No better way to tell from pypdfium.
         try:
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
-        except PdfiumError
+        except PdfiumError:
             _log.info(
                 f"An exception occurred when loading page {page_no} of document {document_hash}.",
                 exc_info=True,
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
docling/backend/xml/jats_backend.py
CHANGED
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

         doc_info: etree.DocInfo = self.tree.docinfo
         if doc_info.system_url and any(
-
+            kwd in doc_info.system_url for kwd in JATS_DTD_URL
         ):
             self.valid = True
             return
         for ent in doc_info.internalDTD.iterentities():
             if ent.system_url and any(
-
+                kwd in ent.system_url for kwd in JATS_DTD_URL
             ):
                 self.valid = True
                 return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             # TODO: once superscript is supported, add label with formatting
             aff = aff.removeprefix(f"{label[0].text}, ")
             affiliation_names.append(aff)
-        affiliation_ids_names =
-            id
-
-        }
+        affiliation_ids_names = dict(
+            zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+        )

         # Get author names and affiliation names
         for author_node in meta.xpath(
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
     def _add_abstract(
         self, doc: DoclingDocument, xml_components: XMLComponents
     ) -> None:
-
         for abstract in xml_components["abstract"]:
             text: str = abstract["content"]
             title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

         return

-    def _parse_element_citation(self, node: etree._Element) -> str:
+    def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
         citation: Citation = {
             "author_names": "",
             "title": "",
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
         if len(node.xpath("lpage")) > 0:
             citation["page"] += (
-                "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()  # noqa: RUF001
             )

         # Flatten the citation to string
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

             try:
                 self._add_table(doc, parent, table)
-            except Exception
-                _log.warning(f"Skipping unsupported table in {
-                pass
+            except Exception:
+                _log.warning(f"Skipping unsupported table in {self.file!s}")

         return

@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         )
         return

-    def _walk_linear(
+    def _walk_linear(  # noqa: C901
         self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
     ) -> str:
         skip_tags = ["term"]
docling/backend/xml/uspto_backend.py
CHANGED
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):

     @override
     def convert(self) -> DoclingDocument:
-
         if self.parser is not None:
             doc = self.parser.parse(self.patent_content)
             if doc is None:
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
         Returns:
             The patent parsed as a docling document.
         """
-        pass


 class PatentUsptoIce(PatentUspto):
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
         self.style_html = HtmlEntity()

     @override
-    def startElement(self, tag, attributes):
+    def startElement(self, tag, attributes):
         """Signal the start of an element.

         Args:
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
         self._start_registered_elements(tag, attributes)

     @override
-    def skippedEntity(self, name):
+    def skippedEntity(self, name):
         """Receive notification of a skipped entity.

         HTML entities will be skipped by the parser. This method will unescape them
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
         self.text += unescaped

     @override
-    def endElement(self, tag):
+    def endElement(self, tag):
         """Signal the end of an element.

         Args:
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
         self.style_html = HtmlEntity()

     @override
-    def startElement(self, tag, attributes):
+    def startElement(self, tag, attributes):
         """Signal the start of an element.

         Args:
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
         self._start_registered_elements(tag, attributes)

     @override
-    def skippedEntity(self, name):
+    def skippedEntity(self, name):
         """Receive notification of a skipped entity.

         HTML entities will be skipped by the parser. This method will unescape them
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
         self.text += unescaped

     @override
-    def endElement(self, tag):
+    def endElement(self, tag):
         """Signal the end of an element.

         Args:
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
         if tag in [member.value for member in self.Element]:
             if (
                 tag == self.Element.HEADING.value
-                and
+                and self.Element.SDOCL.value not in self.property
             ):
                 level_attr: str = attributes.get("LVL", "")
                 new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
             # headers except claims statement
             elif (
                 self.Element.HEADING.value in self.property
-                and
+                and self.Element.SDOCL.value not in self.property
                 and text.strip()
             ):
                 self.parents[self.level + 1] = self.doc.add_heading(
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
         self.style_html = HtmlEntity()

     @override
-    def startElement(self, tag, attributes):
+    def startElement(self, tag, attributes):
         """Signal the start of an element.

         Args:
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
         self._start_registered_elements(tag, attributes)

     @override
-    def skippedEntity(self, name):
+    def skippedEntity(self, name):
         """Receive notification of a skipped entity.

         HTML entities will be skipped by the parser. This method will unescape them
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
         self.text += unescaped

     @override
-    def endElement(self, tag):
+    def endElement(self, tag):
         """Signal the end of an element.

         Args:
@@ -1474,9 +1472,7 @@ class XmlTable:
             if cw == 0:
                 offset_w0.append(col["offset"][ic])

-        min_colinfo["offset"] = sorted(
-            list(set(col["offset"] + min_colinfo["offset"]))
-        )
+        min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))

         # add back the 0 width cols to offset list
         offset_w0 = list(set(offset_w0))
@@ -1527,7 +1523,7 @@ class XmlTable:

         return ncols_max

-    def _parse_table(self, table: Tag) -> TableData:
+    def _parse_table(self, table: Tag) -> TableData:  # noqa: C901
         """Parse the content of a table tag.

         Args:
@@ -1722,7 +1718,7 @@ class HtmlEntity:
         "0": "⁰",
         "+": "⁺",
         "-": "⁻",
-        "−": "⁻",
+        "−": "⁻",  # noqa: RUF001
         "=": "⁼",
         "(": "⁽",
         ")": "⁾",
@@ -1746,7 +1742,7 @@ class HtmlEntity:
         "0": "₀",
         "+": "₊",
         "-": "₋",
-        "−": "₋",
+        "−": "₋",  # noqa: RUF001
         "=": "₌",
         "(": "₍",
         ")": "₎",
docling/cli/main.py
CHANGED
@@ -6,14 +6,16 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Annotated, Dict,
+from typing import Annotated, Dict, List, Optional, Type

 import rich.table
 import typer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
+from rich.console import Console

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -40,6 +42,7 @@ from docling.datamodel.pipeline_options import (
     VlmModelType,
     VlmPipelineOptions,
     granite_vision_vlm_conversion_options,
+    granite_vision_vlm_ollama_conversion_options,
     smoldocling_vlm_conversion_options,
     smoldocling_vlm_mlx_conversion_options,
 )
@@ -52,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

 _log = logging.getLogger(__name__)
-from rich.console import Console

 console = Console()
 err_console = Console(stderr=True)
@@ -153,12 +155,12 @@ def export_documents(
     output_dir: Path,
     export_json: bool,
     export_html: bool,
+    export_html_split_page: bool,
     export_md: bool,
     export_txt: bool,
     export_doctags: bool,
     image_export_mode: ImageRefMode,
 ):
-
     success_count = 0
     failure_count = 0

@@ -180,7 +182,15 @@ def export_documents(
                 fname = output_dir / f"{doc_filename}.html"
                 _log.info(f"writing HTML output to {fname}")
                 conv_res.document.save_as_html(
-                    filename=fname, image_mode=image_export_mode
+                    filename=fname, image_mode=image_export_mode, split_page_view=False
+                )
+
+            # Export HTML format:
+            if export_html_split_page:
+                fname = output_dir / f"{doc_filename}.html"
+                _log.info(f"writing HTML output to {fname}")
+                conv_res.document.save_as_html(
+                    filename=fname, image_mode=image_export_mode, split_page_view=True
                 )

             # Export Text format:
@@ -223,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:


 @app.command(no_args_is_help=True)
-def convert(
+def convert(  # noqa: C901
     input_sources: Annotated[
         List[str],
         typer.Argument(
@@ -279,7 +289,7 @@ def convert(
         ...,
         help=(
             f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
-            f"{', '.join(
+            f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
             f"Use the option --show-external-plugins to see the options allowed with external plugins."
         ),
     ),
@@ -411,7 +421,7 @@ def convert(
         logging.basicConfig(level=logging.WARNING)
     elif verbose == 1:
         logging.basicConfig(level=logging.INFO)
-
+    else:
         logging.basicConfig(level=logging.DEBUG)

     settings.debug.visualize_cells = debug_visualize_cells
@@ -420,7 +430,7 @@ def convert(
     settings.debug.visualize_ocr = debug_visualize_ocr

     if from_formats is None:
-        from_formats =
+        from_formats = list(InputFormat)

     parsed_headers: Optional[Dict[str, str]] = None
     if headers is not None:
@@ -471,6 +481,7 @@ def convert(

     export_json = OutputFormat.JSON in to_formats
     export_html = OutputFormat.HTML in to_formats
+    export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
     export_md = OutputFormat.MARKDOWN in to_formats
     export_txt = OutputFormat.TEXT in to_formats
     export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -531,10 +542,16 @@ def convert(
                 backend=backend,  # pdf_backend
             )
         elif pipeline == PdfPipeline.VLM:
-            pipeline_options = VlmPipelineOptions(
+            pipeline_options = VlmPipelineOptions(
+                enable_remote_services=enable_remote_services,
+            )

             if vlm_model == VlmModelType.GRANITE_VISION:
                 pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
+                pipeline_options.vlm_options = (
+                    granite_vision_vlm_ollama_conversion_options
+                )
             elif vlm_model == VlmModelType.SMOLDOCLING:
                 pipeline_options.vlm_options = smoldocling_vlm_conversion_options
                 if sys.platform == "darwin":
@@ -578,6 +595,7 @@ def convert(
         output_dir=output,
         export_json=export_json,
         export_html=export_html,
+        export_html_split_page=export_html_split_page,
         export_md=export_md,
         export_txt=export_txt,
         export_doctags=export_doctags,
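The CLI changes above wire the new `html_split_page` output format to `save_as_html(..., split_page_view=True)` on the converted document. Below is a minimal sketch of the same export done directly from Python, under the assumption that the docling-core version pinned by 2.31.0 accepts the `split_page_view` keyword; the input and output paths are placeholders.

```python
from pathlib import Path

from docling.document_converter import DocumentConverter
from docling_core.types.doc import ImageRefMode

# Convert a document and write the split page/HTML side-by-side view,
# mirroring what the CLI does when "html_split_page" is requested.
conv_res = DocumentConverter().convert("report.pdf")  # placeholder input
conv_res.document.save_as_html(
    filename=Path("report.html"),
    image_mode=ImageRefMode.EMBEDDED,  # assumed; the CLI takes this from its image-export-mode option
    split_page_view=True,
)
```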
docling/cli/models.py
CHANGED
@@ -62,7 +62,7 @@ def download(
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=
+            help="Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
     all: Annotated[
@@ -89,14 +89,13 @@ def download(
             "Cannot simultaneously set 'all' parameter and specify models to download."
         )
     if not quiet:
-        FORMAT = "%(message)s"
         logging.basicConfig(
             level=logging.INFO,
             format="[blue]%(message)s[/blue]",
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or (
+    to_download = models or (list(_AvailableModels) if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
docling/datamodel/base_models.py
CHANGED
@@ -10,7 +10,9 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-
+
+# DO NOT REMOVE; explicitly exposed from this location
+from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
@@ -50,6 +52,7 @@ class OutputFormat(str, Enum):
     MARKDOWN = "md"
     JSON = "json"
     HTML = "html"
+    HTML_SPLIT_PAGE = "html_split_page"
     TEXT = "text"
     DOCTAGS = "doctags"

@@ -232,9 +235,9 @@ class Page(BaseModel):
         None  # Internal PDF backend. By default it is cleared during assembling.
     )
     _default_image_scale: float = 1.0  # Default image scale for external usage.
-    _image_cache: Dict[
-
-
+    _image_cache: Dict[
+        float, Image
+    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

     def get_image(
         self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
@@ -242,7 +245,7 @@ class Page(BaseModel):
         if self._backend is None:
             return self._image_cache.get(scale, None)

-        if not
+        if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
             else:
@@ -262,3 +265,35 @@ class Page(BaseModel):
     @property
     def image(self) -> Optional[Image]:
         return self.get_image(scale=self._default_image_scale)
+
+
+## OpenAI API Request / Response Models ##
+
+
+class OpenAiChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class OpenAiResponseChoice(BaseModel):
+    index: int
+    message: OpenAiChatMessage
+    finish_reason: str
+
+
+class OpenAiResponseUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+class OpenAiApiResponse(BaseModel):
+    model_config = ConfigDict(
+        protected_namespaces=(),
+    )
+
+    id: str
+    model: Optional[str] = None  # returned by openai
+    choices: List[OpenAiResponseChoice]
+    created: int
+    usage: OpenAiResponseUsage
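The OpenAI-style response models added at the end of base_models.py give the API-backed VLM path a typed view of a chat-completions payload. A small sketch of validating such a payload with them; the payload values are invented for illustration, and `model_validate` is the standard pydantic v2 entry point.

```python
from docling.datamodel.base_models import OpenAiApiResponse

# Shape of an OpenAI-compatible /v1/chat/completions response
# (values are made up; only the structure matters here).
payload = {
    "id": "chatcmpl-123",
    "model": "granite3.2-vision:2b",
    "created": 1714000000,
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": "# Page 1\n..."},
            "finish_reason": "stop",
        }
    ],
    "usage": {"prompt_tokens": 512, "completion_tokens": 128, "total_tokens": 640},
}

resp = OpenAiApiResponse.model_validate(payload)
print(resp.choices[0].message.content)
```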
docling/datamodel/document.py
CHANGED
@@ -1,13 +1,13 @@
 import csv
 import logging
 import re
+from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
     Dict,
-    Iterable,
     List,
     Literal,
     Optional,
@@ -17,6 +17,8 @@ from typing import (
 )

 import filetype
+
+# DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
     PageReference,
     Prov,
     Ref,
+    Table as DsSchemaTable,
+    TableCell,
 )
-from docling_core.types.legacy_doc.base import Table as DsSchemaTable
-from docling_core.types.legacy_doc.base import TableCell
 from docling_core.types.legacy_doc.document import (
     CCSDocumentDescription as DsDocumentDescription,
+    CCSFileInfoObject as DsFileInfoObject,
+    ExportedCCSDocument as DsDocument,
 )
-from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
-from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.profiling import ProfilingItem
-from docling.utils.utils import create_file_hash
+from docling.utils.utils import create_file_hash

 if TYPE_CHECKING:
     from docling.document_converter import FormatOption
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
             self._init_doc(backend, path_or_stream)

         elif isinstance(path_or_stream, BytesIO):
-            assert (
-
-            )
+            assert filename is not None, (
+                "Can't construct InputDocument from stream without providing filename arg."
+            )
             self.file = PurePath(filename)
             self.filesize = path_or_stream.getbuffer().nbytes

@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):


 class _DocumentConversionInput(BaseModel):
-
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
     headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
@@ -283,6 +284,13 @@ class _DocumentConversionInput(BaseModel):
             if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
+            if mime is not None and mime.lower() == "application/zip":
+                if obj.suffixes[-1].lower() == ".xlsx":
+                    mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                elif obj.suffixes[-1].lower() == ".docx":
+                    mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                elif obj.suffixes[-1].lower() == ".pptx":
+                    mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"

         elif isinstance(obj, DocumentStream):
             content = obj.stream.read(8192)
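The new branch in `_DocumentConversionInput` covers OOXML files whose content sniffing only reports the generic `application/zip` container; the file suffix is then used to pick the specific Office MIME type. A standalone sketch of the same idea, using the `filetype` package that document.py already imports; the helper name and mapping table are illustrative, not part of docling's API.

```python
from pathlib import Path
from typing import Optional

import filetype

# Suffix-based refinement of the generic zip MIME type reported for
# Office Open XML containers, mirroring the new branch shown above.
_OOXML_MIMES = {
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}


def refine_mime(path: Path) -> Optional[str]:
    """Guess a MIME type and upgrade a bare application/zip using the suffix."""
    mime = filetype.guess_mime(str(path))
    if mime is not None and mime.lower() == "application/zip":
        mime = _OOXML_MIMES.get(path.suffix.lower(), mime)
    return mime
```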
docling/datamodel/pipeline_options.py
CHANGED
@@ -213,8 +213,8 @@ class PictureDescriptionBaseOptions(BaseOptions):
     batch_size: int = 8
     scale: float = 2

-
-        0.
+    picture_area_threshold: float = (
+        0.05  # percentage of the area for a picture to processed with the models
     )


@@ -266,6 +266,7 @@ class ResponseFormat(str, Enum):
 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
+    OPENAI = "openai"


 class HuggingFaceVlmOptions(BaseVlmOptions):
@@ -284,6 +285,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")


+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
+
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    scale: float = 2.0
+    timeout: float = 60
+    response_format: ResponseFormat
+
+
 smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
     prompt="Convert this page to docling.",
@@ -307,10 +321,20 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     inference_framework=InferenceFramework.TRANSFORMERS,
 )

+granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
+    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+    params={"model": "granite3.2-vision:2b"},
+    prompt="OCR the full page to markdown.",
+    scale=1.0,
+    timeout=120,
+    response_format=ResponseFormat.MARKDOWN,
+)
+

 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
     GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"


 # Define an enum for the backend options
@@ -356,13 +380,14 @@ class PaginatedPipelineOptions(PipelineOptions):


 class VlmPipelineOptions(PaginatedPipelineOptions):
-
     generate_page_images: bool = True
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions] =
+    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
+        smoldocling_vlm_conversion_options
+    )


 class PdfPipelineOptions(PaginatedPipelineOptions):
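The new `ApiVlmOptions` preset above targets a local Ollama server through its OpenAI-compatible chat-completions endpoint. Below is a sketch of plugging it into the VLM pipeline, assuming the usual wiring via `PdfFormatOption(pipeline_cls=VlmPipeline, ...)`; the model name, prompt, and endpoint follow the `granite_vision_vlm_ollama_conversion_options` preset, and the input path is a placeholder.

```python
from pydantic import AnyUrl

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    ResponseFormat,
    VlmPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Point the VLM pipeline at an OpenAI-compatible endpoint served by Ollama.
pipeline_options = VlmPipelineOptions(
    enable_remote_services=True,  # required for API-backed models
    vlm_options=ApiVlmOptions(
        url=AnyUrl("http://localhost:11434/v1/chat/completions"),
        params={"model": "granite3.2-vision:2b"},
        prompt="OCR the full page to markdown.",
        timeout=120,
        response_format=ResponseFormat.MARKDOWN,
    ),
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("scanned.pdf")  # placeholder input path
```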
docling/document_converter.py
CHANGED
@@ -1,11 +1,11 @@
 import hashlib
 import logging
-import math
 import sys
 import time
+from collections.abc import Iterable, Iterator
 from functools import partial
 from pathlib import Path
-from typing import Dict,
+from typing import Dict, List, Optional, Tuple, Type, Union

 from pydantic import BaseModel, ConfigDict, model_validator, validate_call

@@ -172,7 +172,7 @@ class DocumentConverter:
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
-            allowed_formats if allowed_formats is not None else
+            allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         self.format_to_options = {
             format: (
@@ -254,7 +254,7 @@ class DocumentConverter:

         if not had_result and raises_on_error:
             raise ConversionError(
-
+                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )

     def _convert(
@@ -266,7 +266,7 @@ class DocumentConverter:
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
-            _log.info(
+            _log.info("Going to convert document batch...")

             # parallel processing only within input_batch
             # with ThreadPoolExecutor(
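With the change above, `DocumentConverter` falls back to `list(InputFormat)` when no `allowed_formats` argument is given. A short sketch of restricting the accepted formats explicitly; the input path is a placeholder. Converting a file whose format is not in the list raises the `ConversionError` shown in the diff when `raises_on_error` is left at its default.

```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter

# Only PDF and DOCX inputs are accepted; other formats trigger a ConversionError.
converter = DocumentConverter(allowed_formats=[InputFormat.PDF, InputFormat.DOCX])
result = converter.convert("contract.docx")  # placeholder input path
print(result.document.export_to_markdown())
```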
|