docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +66 -25
- docling/backend/md_backend.py +6 -8
- docling/backend/msexcel_backend.py +1 -7
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +5 -5
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +11 -14
- docling/backend/xml/uspto_backend.py +19 -23
- docling/cli/main.py +8 -8
- docling/cli/models.py +6 -3
- docling/datamodel/base_models.py +7 -5
- docling/datamodel/document.py +19 -10
- docling/datamodel/pipeline_options.py +0 -1
- docling/document_converter.py +8 -6
- docling/models/api_vlm_model.py +1 -2
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +2 -1
- docling/models/picture_description_base_model.py +2 -3
- docling/models/picture_description_vlm_model.py +6 -4
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +9 -24
- docling/models/table_structure_model.py +4 -8
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +9 -5
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/pipeline/vlm_pipeline.py +0 -3
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +31 -7
- docling/utils/utils.py +3 -3
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/METADATA +2 -1
- docling-2.31.1.dist-info/RECORD +86 -0
- docling-2.30.0.dist-info/RECORD +0 -86
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
158
158
|
def _get_level(self) -> int:
|
159
159
|
"""Return the first None index."""
|
160
160
|
for k, v in self.parents.items():
|
161
|
-
if k >= 0 and v
|
161
|
+
if k >= 0 and v is None:
|
162
162
|
return k
|
163
163
|
return 0
|
164
164
|
|
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
418
418
|
else prev_parent
|
419
419
|
)
|
420
420
|
|
421
|
-
def _handle_text_elements(
|
421
|
+
def _handle_text_elements( # noqa: C901
|
422
422
|
self,
|
423
423
|
element: BaseOxmlElement,
|
424
424
|
docx_obj: DocxDocument,
|
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
436
436
|
|
437
437
|
# Common styles for bullet and numbered lists.
|
438
438
|
# "List Bullet", "List Number", "List Paragraph"
|
439
|
-
# Identify
|
439
|
+
# Identify whether list is a numbered list or not
|
440
440
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
441
441
|
is_numbered = False
|
442
442
|
p_style_id, p_level = self._get_label_and_level(paragraph)
|
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
812
812
|
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
813
813
|
)
|
814
814
|
if cell is None or cell._tc in cell_set:
|
815
|
-
_log.debug(
|
815
|
+
_log.debug(" skipped since repeated content")
|
816
816
|
col_idx += cell.grid_span
|
817
817
|
continue
|
818
818
|
else:
|
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
879
879
|
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
880
880
|
caption=None,
|
881
881
|
)
|
882
|
-
except (UnidentifiedImageError, OSError)
|
882
|
+
except (UnidentifiedImageError, OSError):
|
883
883
|
_log.warning("Warning: image cannot be loaded by Pillow")
|
884
884
|
doc.add_picture(
|
885
885
|
parent=self.parents[level - 1],
|
docling/backend/pdf_backend.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
+
from collections.abc import Iterable
|
2
3
|
from io import BytesIO
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import
|
5
|
+
from typing import Optional, Set, Union
|
5
6
|
|
6
7
|
from docling_core.types.doc import BoundingBox, Size
|
7
8
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import random
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from io import BytesIO
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import TYPE_CHECKING,
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Union
|
6
7
|
|
7
8
|
import pypdfium2 as pdfium
|
8
9
|
import pypdfium2.raw as pdfium_c
|
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
29
30
|
self.valid = True # No better way to tell from pypdfium.
|
30
31
|
try:
|
31
32
|
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
32
|
-
except PdfiumError
|
33
|
+
except PdfiumError:
|
33
34
|
_log.info(
|
34
35
|
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
35
36
|
exc_info=True,
|
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
225
226
|
def get_page_image(
|
226
227
|
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
227
228
|
) -> Image.Image:
|
228
|
-
|
229
229
|
page_size = self.get_size()
|
230
230
|
|
231
231
|
if not cropbox:
|
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
91
91
|
super().__init__(in_doc, path_or_stream)
|
92
92
|
self.path_or_stream = path_or_stream
|
93
93
|
|
94
|
-
# Initialize the root of the document
|
94
|
+
# Initialize the root of the document hierarchy
|
95
95
|
self.root: Optional[NodeItem] = None
|
96
96
|
|
97
97
|
self.valid = False
|
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
102
102
|
|
103
103
|
doc_info: etree.DocInfo = self.tree.docinfo
|
104
104
|
if doc_info.system_url and any(
|
105
|
-
|
105
|
+
kwd in doc_info.system_url for kwd in JATS_DTD_URL
|
106
106
|
):
|
107
107
|
self.valid = True
|
108
108
|
return
|
109
109
|
for ent in doc_info.internalDTD.iterentities():
|
110
110
|
if ent.system_url and any(
|
111
|
-
|
111
|
+
kwd in ent.system_url for kwd in JATS_DTD_URL
|
112
112
|
):
|
113
113
|
self.valid = True
|
114
114
|
return
|
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
232
232
|
# TODO: once superscript is supported, add label with formatting
|
233
233
|
aff = aff.removeprefix(f"{label[0].text}, ")
|
234
234
|
affiliation_names.append(aff)
|
235
|
-
affiliation_ids_names =
|
236
|
-
id
|
237
|
-
|
238
|
-
}
|
235
|
+
affiliation_ids_names = dict(
|
236
|
+
zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
|
237
|
+
)
|
239
238
|
|
240
239
|
# Get author names and affiliation names
|
241
240
|
for author_node in meta.xpath(
|
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
300
299
|
def _add_abstract(
|
301
300
|
self, doc: DoclingDocument, xml_components: XMLComponents
|
302
301
|
) -> None:
|
303
|
-
|
304
302
|
for abstract in xml_components["abstract"]:
|
305
303
|
text: str = abstract["content"]
|
306
304
|
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
|
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
349
347
|
|
350
348
|
return
|
351
349
|
|
352
|
-
def _parse_element_citation(self, node: etree._Element) -> str:
|
350
|
+
def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
|
353
351
|
citation: Citation = {
|
354
352
|
"author_names": "",
|
355
353
|
"title": "",
|
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
440
438
|
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
|
441
439
|
if len(node.xpath("lpage")) > 0:
|
442
440
|
citation["page"] += (
|
443
|
-
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
|
441
|
+
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
|
444
442
|
)
|
445
443
|
|
446
444
|
# Flatten the citation to string
|
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
595
593
|
|
596
594
|
try:
|
597
595
|
self._add_table(doc, parent, table)
|
598
|
-
except Exception
|
599
|
-
_log.warning(f"Skipping unsupported table in {
|
600
|
-
pass
|
596
|
+
except Exception:
|
597
|
+
_log.warning(f"Skipping unsupported table in {self.file!s}")
|
601
598
|
|
602
599
|
return
|
603
600
|
|
@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
609
606
|
)
|
610
607
|
return
|
611
608
|
|
612
|
-
def _walk_linear(
|
609
|
+
def _walk_linear( # noqa: C901
|
613
610
|
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
|
614
611
|
) -> str:
|
615
612
|
skip_tags = ["term"]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
"""Backend to parse patents from the United States Patent Office (USPTO).
|
2
2
|
|
3
|
-
The parsers included in this module can handle patent grants
|
3
|
+
The parsers included in this module can handle patent grants published since 1976 and
|
4
4
|
patent applications since 2001.
|
5
5
|
The original files can be found in https://bulkdata.uspto.gov.
|
6
6
|
"""
|
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
|
122
122
|
|
123
123
|
@override
|
124
124
|
def convert(self) -> DoclingDocument:
|
125
|
-
|
126
125
|
if self.parser is not None:
|
127
126
|
doc = self.parser.parse(self.patent_content)
|
128
127
|
if doc is None:
|
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
|
|
163
162
|
Returns:
|
164
163
|
The patent parsed as a docling document.
|
165
164
|
"""
|
166
|
-
pass
|
167
165
|
|
168
166
|
|
169
167
|
class PatentUsptoIce(PatentUspto):
|
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
|
|
265
263
|
self.style_html = HtmlEntity()
|
266
264
|
|
267
265
|
@override
|
268
|
-
def startElement(self, tag, attributes):
|
266
|
+
def startElement(self, tag, attributes):
|
269
267
|
"""Signal the start of an element.
|
270
268
|
|
271
269
|
Args:
|
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
|
|
281
279
|
self._start_registered_elements(tag, attributes)
|
282
280
|
|
283
281
|
@override
|
284
|
-
def skippedEntity(self, name):
|
282
|
+
def skippedEntity(self, name):
|
285
283
|
"""Receive notification of a skipped entity.
|
286
284
|
|
287
285
|
HTML entities will be skipped by the parser. This method will unescape them
|
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
|
|
315
313
|
self.text += unescaped
|
316
314
|
|
317
315
|
@override
|
318
|
-
def endElement(self, tag):
|
316
|
+
def endElement(self, tag):
|
319
317
|
"""Signal the end of an element.
|
320
318
|
|
321
319
|
Args:
|
@@ -442,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
|
|
442
440
|
)
|
443
441
|
|
444
442
|
elif name == self.Element.PARAGRAPH.value and text:
|
445
|
-
#
|
443
|
+
# remove blank spaces added in paragraphs
|
446
444
|
text = re.sub("\\s+", " ", text)
|
447
445
|
if self.Element.ABSTRACT.value in self.property:
|
448
446
|
self.abstract = (
|
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
603
601
|
self.style_html = HtmlEntity()
|
604
602
|
|
605
603
|
@override
|
606
|
-
def startElement(self, tag, attributes):
|
604
|
+
def startElement(self, tag, attributes):
|
607
605
|
"""Signal the start of an element.
|
608
606
|
|
609
607
|
Args:
|
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
616
614
|
self._start_registered_elements(tag, attributes)
|
617
615
|
|
618
616
|
@override
|
619
|
-
def skippedEntity(self, name):
|
617
|
+
def skippedEntity(self, name):
|
620
618
|
"""Receive notification of a skipped entity.
|
621
619
|
|
622
620
|
HTML entities will be skipped by the parser. This method will unescape them
|
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
650
648
|
self.text += unescaped
|
651
649
|
|
652
650
|
@override
|
653
|
-
def endElement(self, tag):
|
651
|
+
def endElement(self, tag):
|
654
652
|
"""Signal the end of an element.
|
655
653
|
|
656
654
|
Args:
|
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
691
689
|
if tag in [member.value for member in self.Element]:
|
692
690
|
if (
|
693
691
|
tag == self.Element.HEADING.value
|
694
|
-
and
|
692
|
+
and self.Element.SDOCL.value not in self.property
|
695
693
|
):
|
696
694
|
level_attr: str = attributes.get("LVL", "")
|
697
695
|
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
743
741
|
# headers except claims statement
|
744
742
|
elif (
|
745
743
|
self.Element.HEADING.value in self.property
|
746
|
-
and
|
744
|
+
and self.Element.SDOCL.value not in self.property
|
747
745
|
and text.strip()
|
748
746
|
):
|
749
747
|
self.parents[self.level + 1] = self.doc.add_heading(
|
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1164
1162
|
self.style_html = HtmlEntity()
|
1165
1163
|
|
1166
1164
|
@override
|
1167
|
-
def startElement(self, tag, attributes):
|
1165
|
+
def startElement(self, tag, attributes):
|
1168
1166
|
"""Signal the start of an element.
|
1169
1167
|
|
1170
1168
|
Args:
|
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1177
1175
|
self._start_registered_elements(tag, attributes)
|
1178
1176
|
|
1179
1177
|
@override
|
1180
|
-
def skippedEntity(self, name):
|
1178
|
+
def skippedEntity(self, name):
|
1181
1179
|
"""Receive notification of a skipped entity.
|
1182
1180
|
|
1183
1181
|
HTML entities will be skipped by the parser. This method will unescape them
|
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1211
1209
|
self.text += unescaped
|
1212
1210
|
|
1213
1211
|
@override
|
1214
|
-
def endElement(self, tag):
|
1212
|
+
def endElement(self, tag):
|
1215
1213
|
"""Signal the end of an element.
|
1216
1214
|
|
1217
1215
|
Args:
|
@@ -1474,9 +1472,7 @@ class XmlTable:
|
|
1474
1472
|
if cw == 0:
|
1475
1473
|
offset_w0.append(col["offset"][ic])
|
1476
1474
|
|
1477
|
-
min_colinfo["offset"] = sorted(
|
1478
|
-
list(set(col["offset"] + min_colinfo["offset"]))
|
1479
|
-
)
|
1475
|
+
min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
|
1480
1476
|
|
1481
1477
|
# add back the 0 width cols to offset list
|
1482
1478
|
offset_w0 = list(set(offset_w0))
|
@@ -1527,7 +1523,7 @@ class XmlTable:
|
|
1527
1523
|
|
1528
1524
|
return ncols_max
|
1529
1525
|
|
1530
|
-
def _parse_table(self, table: Tag) -> TableData:
|
1526
|
+
def _parse_table(self, table: Tag) -> TableData: # noqa: C901
|
1531
1527
|
"""Parse the content of a table tag.
|
1532
1528
|
|
1533
1529
|
Args:
|
@@ -1701,7 +1697,7 @@ class XmlTable:
|
|
1701
1697
|
class HtmlEntity:
|
1702
1698
|
"""Provide utility functions to get the HTML entities of styled characters.
|
1703
1699
|
|
1704
|
-
This class has been
|
1700
|
+
This class has been developed from:
|
1705
1701
|
https://unicode-table.com/en/html-entities/
|
1706
1702
|
https://www.w3.org/TR/WD-math-970515/table03.html
|
1707
1703
|
"""
|
@@ -1722,7 +1718,7 @@ class HtmlEntity:
|
|
1722
1718
|
"0": "⁰",
|
1723
1719
|
"+": "⁺",
|
1724
1720
|
"-": "⁻",
|
1725
|
-
"−": "⁻",
|
1721
|
+
"−": "⁻", # noqa: RUF001
|
1726
1722
|
"=": "⁼",
|
1727
1723
|
"(": "⁽",
|
1728
1724
|
")": "⁾",
|
@@ -1746,7 +1742,7 @@ class HtmlEntity:
|
|
1746
1742
|
"0": "₀",
|
1747
1743
|
"+": "₊",
|
1748
1744
|
"-": "₋",
|
1749
|
-
"−": "₋",
|
1745
|
+
"−": "₋", # noqa: RUF001
|
1750
1746
|
"=": "₌",
|
1751
1747
|
"(": "₍",
|
1752
1748
|
")": "₎",
|
@@ -1900,7 +1896,7 @@ class HtmlEntity:
|
|
1900
1896
|
"""Get an HTML entity of a greek letter in ISO 8879.
|
1901
1897
|
|
1902
1898
|
Args:
|
1903
|
-
The text to transform, as an ISO 8879
|
1899
|
+
The text to transform, as an ISO 8879 entity.
|
1904
1900
|
|
1905
1901
|
Returns:
|
1906
1902
|
The HTML entity representing a greek letter. If the input text is not
|
docling/cli/main.py
CHANGED
@@ -6,14 +6,16 @@ import sys
|
|
6
6
|
import tempfile
|
7
7
|
import time
|
8
8
|
import warnings
|
9
|
+
from collections.abc import Iterable
|
9
10
|
from pathlib import Path
|
10
|
-
from typing import Annotated, Dict,
|
11
|
+
from typing import Annotated, Dict, List, Optional, Type
|
11
12
|
|
12
13
|
import rich.table
|
13
14
|
import typer
|
14
15
|
from docling_core.types.doc import ImageRefMode
|
15
16
|
from docling_core.utils.file import resolve_source_to_path
|
16
17
|
from pydantic import TypeAdapter
|
18
|
+
from rich.console import Console
|
17
19
|
|
18
20
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
19
21
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
@@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
|
|
53
55
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
54
56
|
|
55
57
|
_log = logging.getLogger(__name__)
|
56
|
-
from rich.console import Console
|
57
58
|
|
58
59
|
console = Console()
|
59
60
|
err_console = Console(stderr=True)
|
@@ -160,7 +161,6 @@ def export_documents(
|
|
160
161
|
export_doctags: bool,
|
161
162
|
image_export_mode: ImageRefMode,
|
162
163
|
):
|
163
|
-
|
164
164
|
success_count = 0
|
165
165
|
failure_count = 0
|
166
166
|
|
@@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
|
233
233
|
|
234
234
|
|
235
235
|
@app.command(no_args_is_help=True)
|
236
|
-
def convert(
|
236
|
+
def convert( # noqa: C901
|
237
237
|
input_sources: Annotated[
|
238
238
|
List[str],
|
239
239
|
typer.Argument(
|
@@ -289,7 +289,7 @@ def convert(
|
|
289
289
|
...,
|
290
290
|
help=(
|
291
291
|
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
|
292
|
-
f"{', '.join(
|
292
|
+
f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
|
293
293
|
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
294
294
|
),
|
295
295
|
),
|
@@ -421,7 +421,7 @@ def convert(
|
|
421
421
|
logging.basicConfig(level=logging.WARNING)
|
422
422
|
elif verbose == 1:
|
423
423
|
logging.basicConfig(level=logging.INFO)
|
424
|
-
|
424
|
+
else:
|
425
425
|
logging.basicConfig(level=logging.DEBUG)
|
426
426
|
|
427
427
|
settings.debug.visualize_cells = debug_visualize_cells
|
@@ -430,7 +430,7 @@ def convert(
|
|
430
430
|
settings.debug.visualize_ocr = debug_visualize_ocr
|
431
431
|
|
432
432
|
if from_formats is None:
|
433
|
-
from_formats =
|
433
|
+
from_formats = list(InputFormat)
|
434
434
|
|
435
435
|
parsed_headers: Optional[Dict[str, str]] = None
|
436
436
|
if headers is not None:
|
@@ -521,7 +521,7 @@ def convert(
|
|
521
521
|
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
522
522
|
pipeline_options.generate_page_images = True
|
523
523
|
pipeline_options.generate_picture_images = (
|
524
|
-
True # FIXME: to be deprecated in
|
524
|
+
True # FIXME: to be deprecated in version 3
|
525
525
|
)
|
526
526
|
pipeline_options.images_scale = 2
|
527
527
|
|
docling/cli/models.py
CHANGED
@@ -32,6 +32,8 @@ class _AvailableModels(str, Enum):
|
|
32
32
|
CODE_FORMULA = "code_formula"
|
33
33
|
PICTURE_CLASSIFIER = "picture_classifier"
|
34
34
|
SMOLVLM = "smolvlm"
|
35
|
+
SMOLDOCLING = "smoldocling"
|
36
|
+
SMOLDOCLING_MLX = "smoldocling_mlx"
|
35
37
|
GRANITE_VISION = "granite_vision"
|
36
38
|
EASYOCR = "easyocr"
|
37
39
|
|
@@ -62,7 +64,7 @@ def download(
|
|
62
64
|
models: Annotated[
|
63
65
|
Optional[list[_AvailableModels]],
|
64
66
|
typer.Argument(
|
65
|
-
help=
|
67
|
+
help="Models to download (default behavior: a predefined set of models will be downloaded).",
|
66
68
|
),
|
67
69
|
] = None,
|
68
70
|
all: Annotated[
|
@@ -89,14 +91,13 @@ def download(
|
|
89
91
|
"Cannot simultaneously set 'all' parameter and specify models to download."
|
90
92
|
)
|
91
93
|
if not quiet:
|
92
|
-
FORMAT = "%(message)s"
|
93
94
|
logging.basicConfig(
|
94
95
|
level=logging.INFO,
|
95
96
|
format="[blue]%(message)s[/blue]",
|
96
97
|
datefmt="[%X]",
|
97
98
|
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
98
99
|
)
|
99
|
-
to_download = models or (
|
100
|
+
to_download = models or (list(_AvailableModels) if all else _default_models)
|
100
101
|
output_dir = download_models(
|
101
102
|
output_dir=output_dir,
|
102
103
|
force=force,
|
@@ -106,6 +107,8 @@ def download(
|
|
106
107
|
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
|
107
108
|
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
|
108
109
|
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
|
110
|
+
with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
|
111
|
+
with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
|
109
112
|
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
|
110
113
|
with_easyocr=_AvailableModels.EASYOCR in to_download,
|
111
114
|
)
|
docling/datamodel/base_models.py
CHANGED
@@ -10,7 +10,9 @@ from docling_core.types.doc import (
|
|
10
10
|
TableCell,
|
11
11
|
)
|
12
12
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
13
|
-
|
13
|
+
|
14
|
+
# DO NOT REMOVE; explicitly exposed from this location
|
15
|
+
from docling_core.types.io import (
|
14
16
|
DocumentStream,
|
15
17
|
)
|
16
18
|
from PIL.Image import Image
|
@@ -233,9 +235,9 @@ class Page(BaseModel):
|
|
233
235
|
None # Internal PDF backend. By default it is cleared during assembling.
|
234
236
|
)
|
235
237
|
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
236
|
-
_image_cache: Dict[
|
237
|
-
|
238
|
-
|
238
|
+
_image_cache: Dict[
|
239
|
+
float, Image
|
240
|
+
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
239
241
|
|
240
242
|
def get_image(
|
241
243
|
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
@@ -243,7 +245,7 @@ class Page(BaseModel):
|
|
243
245
|
if self._backend is None:
|
244
246
|
return self._image_cache.get(scale, None)
|
245
247
|
|
246
|
-
if not
|
248
|
+
if scale not in self._image_cache:
|
247
249
|
if cropbox is None:
|
248
250
|
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
249
251
|
else:
|
docling/datamodel/document.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
import csv
|
2
2
|
import logging
|
3
3
|
import re
|
4
|
+
from collections.abc import Iterable
|
4
5
|
from enum import Enum
|
5
6
|
from io import BytesIO
|
6
7
|
from pathlib import Path, PurePath
|
7
8
|
from typing import (
|
8
9
|
TYPE_CHECKING,
|
9
10
|
Dict,
|
10
|
-
Iterable,
|
11
11
|
List,
|
12
12
|
Literal,
|
13
13
|
Optional,
|
@@ -17,6 +17,8 @@ from typing import (
|
|
17
17
|
)
|
18
18
|
|
19
19
|
import filetype
|
20
|
+
|
21
|
+
# DO NOT REMOVE; explicitly exposed from this location
|
20
22
|
from docling_core.types.doc import (
|
21
23
|
DocItem,
|
22
24
|
DocItemLabel,
|
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
|
|
35
37
|
PageReference,
|
36
38
|
Prov,
|
37
39
|
Ref,
|
40
|
+
Table as DsSchemaTable,
|
41
|
+
TableCell,
|
38
42
|
)
|
39
|
-
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
40
|
-
from docling_core.types.legacy_doc.base import TableCell
|
41
43
|
from docling_core.types.legacy_doc.document import (
|
42
44
|
CCSDocumentDescription as DsDocumentDescription,
|
45
|
+
CCSFileInfoObject as DsFileInfoObject,
|
46
|
+
ExportedCCSDocument as DsDocument,
|
43
47
|
)
|
44
|
-
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
45
|
-
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
46
48
|
from docling_core.utils.file import resolve_source_to_stream
|
47
49
|
from docling_core.utils.legacy import docling_document_to_legacy
|
48
50
|
from pydantic import BaseModel
|
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
|
|
65
67
|
)
|
66
68
|
from docling.datamodel.settings import DocumentLimits
|
67
69
|
from docling.utils.profiling import ProfilingItem
|
68
|
-
from docling.utils.utils import create_file_hash
|
70
|
+
from docling.utils.utils import create_file_hash
|
69
71
|
|
70
72
|
if TYPE_CHECKING:
|
71
73
|
from docling.document_converter import FormatOption
|
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
|
|
134
136
|
self._init_doc(backend, path_or_stream)
|
135
137
|
|
136
138
|
elif isinstance(path_or_stream, BytesIO):
|
137
|
-
assert (
|
138
|
-
|
139
|
-
)
|
139
|
+
assert filename is not None, (
|
140
|
+
"Can't construct InputDocument from stream without providing filename arg."
|
141
|
+
)
|
140
142
|
self.file = PurePath(filename)
|
141
143
|
self.filesize = path_or_stream.getbuffer().nbytes
|
142
144
|
|
@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
|
|
228
230
|
|
229
231
|
|
230
232
|
class _DocumentConversionInput(BaseModel):
|
231
|
-
|
232
233
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
233
234
|
headers: Optional[Dict[str, str]] = None
|
234
235
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
@@ -302,6 +303,14 @@ class _DocumentConversionInput(BaseModel):
|
|
302
303
|
else ""
|
303
304
|
)
|
304
305
|
mime = _DocumentConversionInput._mime_from_extension(ext)
|
306
|
+
if mime is not None and mime.lower() == "application/zip":
|
307
|
+
objname = obj.name.lower()
|
308
|
+
if objname.endswith(".xlsx"):
|
309
|
+
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
310
|
+
elif objname.endswith(".docx"):
|
311
|
+
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
312
|
+
elif objname.endswith(".pptx"):
|
313
|
+
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
305
314
|
|
306
315
|
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
307
316
|
mime = mime or _DocumentConversionInput._detect_csv(content)
|
docling/document_converter.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
import hashlib
|
2
2
|
import logging
|
3
|
-
import math
|
4
3
|
import sys
|
5
4
|
import time
|
5
|
+
from collections.abc import Iterable, Iterator
|
6
6
|
from functools import partial
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import Dict,
|
8
|
+
from typing import Dict, List, Optional, Tuple, Type, Union
|
9
9
|
|
10
10
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
11
11
|
|
@@ -172,7 +172,7 @@ class DocumentConverter:
|
|
172
172
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
173
173
|
):
|
174
174
|
self.allowed_formats = (
|
175
|
-
allowed_formats if allowed_formats is not None else
|
175
|
+
allowed_formats if allowed_formats is not None else list(InputFormat)
|
176
176
|
)
|
177
177
|
self.format_to_options = {
|
178
178
|
format: (
|
@@ -189,7 +189,9 @@ class DocumentConverter:
|
|
189
189
|
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
190
190
|
"""Generate a hash of pipeline options to use as part of the cache key."""
|
191
191
|
options_str = str(pipeline_options.model_dump())
|
192
|
-
return hashlib.md5(
|
192
|
+
return hashlib.md5(
|
193
|
+
options_str.encode("utf-8"), usedforsecurity=False
|
194
|
+
).hexdigest()
|
193
195
|
|
194
196
|
def initialize_pipeline(self, format: InputFormat):
|
195
197
|
"""Initialize the conversion pipeline for the selected format."""
|
@@ -254,7 +256,7 @@ class DocumentConverter:
|
|
254
256
|
|
255
257
|
if not had_result and raises_on_error:
|
256
258
|
raise ConversionError(
|
257
|
-
|
259
|
+
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
258
260
|
)
|
259
261
|
|
260
262
|
def _convert(
|
@@ -266,7 +268,7 @@ class DocumentConverter:
|
|
266
268
|
conv_input.docs(self.format_to_options),
|
267
269
|
settings.perf.doc_batch_size, # pass format_options
|
268
270
|
):
|
269
|
-
_log.info(
|
271
|
+
_log.info("Going to convert document batch...")
|
270
272
|
|
271
273
|
# parallel processing only within input_batch
|
272
274
|
# with ThreadPoolExecutor(
|
docling/models/api_vlm_model.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from
|
1
|
+
from collections.abc import Iterable
|
2
2
|
|
3
3
|
from docling.datamodel.base_models import Page, VlmPrediction
|
4
4
|
from docling.datamodel.document import ConversionResult
|
@@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
|
|
10
10
|
|
11
11
|
|
12
12
|
class ApiVlmModel(BasePageModel):
|
13
|
-
|
14
13
|
def __init__(
|
15
14
|
self,
|
16
15
|
enabled: bool,
|