docling 2.30.0__py3-none-any.whl → 2.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +26 -9
- docling/backend/md_backend.py +5 -7
- docling/backend/msexcel_backend.py +1 -7
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +4 -4
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +10 -13
- docling/backend/xml/uspto_backend.py +15 -19
- docling/cli/main.py +7 -7
- docling/cli/models.py +2 -3
- docling/datamodel/base_models.py +7 -5
- docling/datamodel/document.py +11 -10
- docling/datamodel/pipeline_options.py +0 -1
- docling/document_converter.py +5 -5
- docling/models/api_vlm_model.py +1 -2
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +2 -1
- docling/models/picture_description_base_model.py +2 -3
- docling/models/picture_description_vlm_model.py +2 -3
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +8 -23
- docling/models/table_structure_model.py +2 -6
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +8 -6
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/pipeline/vlm_pipeline.py +0 -3
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +7 -7
- docling/utils/utils.py +1 -1
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/METADATA +2 -1
- docling-2.31.0.dist-info/RECORD +86 -0
- docling-2.30.0.dist-info/RECORD +0 -86
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
- {docling-2.30.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
102
102
|
|
103
103
|
doc_info: etree.DocInfo = self.tree.docinfo
|
104
104
|
if doc_info.system_url and any(
|
105
|
-
|
105
|
+
kwd in doc_info.system_url for kwd in JATS_DTD_URL
|
106
106
|
):
|
107
107
|
self.valid = True
|
108
108
|
return
|
109
109
|
for ent in doc_info.internalDTD.iterentities():
|
110
110
|
if ent.system_url and any(
|
111
|
-
|
111
|
+
kwd in ent.system_url for kwd in JATS_DTD_URL
|
112
112
|
):
|
113
113
|
self.valid = True
|
114
114
|
return
|
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
232
232
|
# TODO: once superscript is supported, add label with formatting
|
233
233
|
aff = aff.removeprefix(f"{label[0].text}, ")
|
234
234
|
affiliation_names.append(aff)
|
235
|
-
affiliation_ids_names =
|
236
|
-
id
|
237
|
-
|
238
|
-
}
|
235
|
+
affiliation_ids_names = dict(
|
236
|
+
zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
|
237
|
+
)
|
239
238
|
|
240
239
|
# Get author names and affiliation names
|
241
240
|
for author_node in meta.xpath(
|
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
300
299
|
def _add_abstract(
|
301
300
|
self, doc: DoclingDocument, xml_components: XMLComponents
|
302
301
|
) -> None:
|
303
|
-
|
304
302
|
for abstract in xml_components["abstract"]:
|
305
303
|
text: str = abstract["content"]
|
306
304
|
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
|
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
349
347
|
|
350
348
|
return
|
351
349
|
|
352
|
-
def _parse_element_citation(self, node: etree._Element) -> str:
|
350
|
+
def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
|
353
351
|
citation: Citation = {
|
354
352
|
"author_names": "",
|
355
353
|
"title": "",
|
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
440
438
|
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
|
441
439
|
if len(node.xpath("lpage")) > 0:
|
442
440
|
citation["page"] += (
|
443
|
-
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
|
441
|
+
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
|
444
442
|
)
|
445
443
|
|
446
444
|
# Flatten the citation to string
|
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
595
593
|
|
596
594
|
try:
|
597
595
|
self._add_table(doc, parent, table)
|
598
|
-
except Exception
|
599
|
-
_log.warning(f"Skipping unsupported table in {
|
600
|
-
pass
|
596
|
+
except Exception:
|
597
|
+
_log.warning(f"Skipping unsupported table in {self.file!s}")
|
601
598
|
|
602
599
|
return
|
603
600
|
|
@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
609
606
|
)
|
610
607
|
return
|
611
608
|
|
612
|
-
def _walk_linear(
|
609
|
+
def _walk_linear( # noqa: C901
|
613
610
|
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
|
614
611
|
) -> str:
|
615
612
|
skip_tags = ["term"]
|
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
|
122
122
|
|
123
123
|
@override
|
124
124
|
def convert(self) -> DoclingDocument:
|
125
|
-
|
126
125
|
if self.parser is not None:
|
127
126
|
doc = self.parser.parse(self.patent_content)
|
128
127
|
if doc is None:
|
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
|
|
163
162
|
Returns:
|
164
163
|
The patent parsed as a docling document.
|
165
164
|
"""
|
166
|
-
pass
|
167
165
|
|
168
166
|
|
169
167
|
class PatentUsptoIce(PatentUspto):
|
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
|
|
265
263
|
self.style_html = HtmlEntity()
|
266
264
|
|
267
265
|
@override
|
268
|
-
def startElement(self, tag, attributes):
|
266
|
+
def startElement(self, tag, attributes):
|
269
267
|
"""Signal the start of an element.
|
270
268
|
|
271
269
|
Args:
|
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
|
|
281
279
|
self._start_registered_elements(tag, attributes)
|
282
280
|
|
283
281
|
@override
|
284
|
-
def skippedEntity(self, name):
|
282
|
+
def skippedEntity(self, name):
|
285
283
|
"""Receive notification of a skipped entity.
|
286
284
|
|
287
285
|
HTML entities will be skipped by the parser. This method will unescape them
|
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
|
|
315
313
|
self.text += unescaped
|
316
314
|
|
317
315
|
@override
|
318
|
-
def endElement(self, tag):
|
316
|
+
def endElement(self, tag):
|
319
317
|
"""Signal the end of an element.
|
320
318
|
|
321
319
|
Args:
|
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
603
601
|
self.style_html = HtmlEntity()
|
604
602
|
|
605
603
|
@override
|
606
|
-
def startElement(self, tag, attributes):
|
604
|
+
def startElement(self, tag, attributes):
|
607
605
|
"""Signal the start of an element.
|
608
606
|
|
609
607
|
Args:
|
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
616
614
|
self._start_registered_elements(tag, attributes)
|
617
615
|
|
618
616
|
@override
|
619
|
-
def skippedEntity(self, name):
|
617
|
+
def skippedEntity(self, name):
|
620
618
|
"""Receive notification of a skipped entity.
|
621
619
|
|
622
620
|
HTML entities will be skipped by the parser. This method will unescape them
|
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
650
648
|
self.text += unescaped
|
651
649
|
|
652
650
|
@override
|
653
|
-
def endElement(self, tag):
|
651
|
+
def endElement(self, tag):
|
654
652
|
"""Signal the end of an element.
|
655
653
|
|
656
654
|
Args:
|
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
691
689
|
if tag in [member.value for member in self.Element]:
|
692
690
|
if (
|
693
691
|
tag == self.Element.HEADING.value
|
694
|
-
and
|
692
|
+
and self.Element.SDOCL.value not in self.property
|
695
693
|
):
|
696
694
|
level_attr: str = attributes.get("LVL", "")
|
697
695
|
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
|
743
741
|
# headers except claims statement
|
744
742
|
elif (
|
745
743
|
self.Element.HEADING.value in self.property
|
746
|
-
and
|
744
|
+
and self.Element.SDOCL.value not in self.property
|
747
745
|
and text.strip()
|
748
746
|
):
|
749
747
|
self.parents[self.level + 1] = self.doc.add_heading(
|
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1164
1162
|
self.style_html = HtmlEntity()
|
1165
1163
|
|
1166
1164
|
@override
|
1167
|
-
def startElement(self, tag, attributes):
|
1165
|
+
def startElement(self, tag, attributes):
|
1168
1166
|
"""Signal the start of an element.
|
1169
1167
|
|
1170
1168
|
Args:
|
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1177
1175
|
self._start_registered_elements(tag, attributes)
|
1178
1176
|
|
1179
1177
|
@override
|
1180
|
-
def skippedEntity(self, name):
|
1178
|
+
def skippedEntity(self, name):
|
1181
1179
|
"""Receive notification of a skipped entity.
|
1182
1180
|
|
1183
1181
|
HTML entities will be skipped by the parser. This method will unescape them
|
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
|
|
1211
1209
|
self.text += unescaped
|
1212
1210
|
|
1213
1211
|
@override
|
1214
|
-
def endElement(self, tag):
|
1212
|
+
def endElement(self, tag):
|
1215
1213
|
"""Signal the end of an element.
|
1216
1214
|
|
1217
1215
|
Args:
|
@@ -1474,9 +1472,7 @@ class XmlTable:
|
|
1474
1472
|
if cw == 0:
|
1475
1473
|
offset_w0.append(col["offset"][ic])
|
1476
1474
|
|
1477
|
-
min_colinfo["offset"] = sorted(
|
1478
|
-
list(set(col["offset"] + min_colinfo["offset"]))
|
1479
|
-
)
|
1475
|
+
min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
|
1480
1476
|
|
1481
1477
|
# add back the 0 width cols to offset list
|
1482
1478
|
offset_w0 = list(set(offset_w0))
|
@@ -1527,7 +1523,7 @@ class XmlTable:
|
|
1527
1523
|
|
1528
1524
|
return ncols_max
|
1529
1525
|
|
1530
|
-
def _parse_table(self, table: Tag) -> TableData:
|
1526
|
+
def _parse_table(self, table: Tag) -> TableData: # noqa: C901
|
1531
1527
|
"""Parse the content of a table tag.
|
1532
1528
|
|
1533
1529
|
Args:
|
@@ -1722,7 +1718,7 @@ class HtmlEntity:
|
|
1722
1718
|
"0": "⁰",
|
1723
1719
|
"+": "⁺",
|
1724
1720
|
"-": "⁻",
|
1725
|
-
"−": "⁻",
|
1721
|
+
"−": "⁻", # noqa: RUF001
|
1726
1722
|
"=": "⁼",
|
1727
1723
|
"(": "⁽",
|
1728
1724
|
")": "⁾",
|
@@ -1746,7 +1742,7 @@ class HtmlEntity:
|
|
1746
1742
|
"0": "₀",
|
1747
1743
|
"+": "₊",
|
1748
1744
|
"-": "₋",
|
1749
|
-
"−": "₋",
|
1745
|
+
"−": "₋", # noqa: RUF001
|
1750
1746
|
"=": "₌",
|
1751
1747
|
"(": "₍",
|
1752
1748
|
")": "₎",
|
docling/cli/main.py
CHANGED
@@ -6,14 +6,16 @@ import sys
|
|
6
6
|
import tempfile
|
7
7
|
import time
|
8
8
|
import warnings
|
9
|
+
from collections.abc import Iterable
|
9
10
|
from pathlib import Path
|
10
|
-
from typing import Annotated, Dict,
|
11
|
+
from typing import Annotated, Dict, List, Optional, Type
|
11
12
|
|
12
13
|
import rich.table
|
13
14
|
import typer
|
14
15
|
from docling_core.types.doc import ImageRefMode
|
15
16
|
from docling_core.utils.file import resolve_source_to_path
|
16
17
|
from pydantic import TypeAdapter
|
18
|
+
from rich.console import Console
|
17
19
|
|
18
20
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
19
21
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
@@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
|
|
53
55
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
54
56
|
|
55
57
|
_log = logging.getLogger(__name__)
|
56
|
-
from rich.console import Console
|
57
58
|
|
58
59
|
console = Console()
|
59
60
|
err_console = Console(stderr=True)
|
@@ -160,7 +161,6 @@ def export_documents(
|
|
160
161
|
export_doctags: bool,
|
161
162
|
image_export_mode: ImageRefMode,
|
162
163
|
):
|
163
|
-
|
164
164
|
success_count = 0
|
165
165
|
failure_count = 0
|
166
166
|
|
@@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
|
233
233
|
|
234
234
|
|
235
235
|
@app.command(no_args_is_help=True)
|
236
|
-
def convert(
|
236
|
+
def convert( # noqa: C901
|
237
237
|
input_sources: Annotated[
|
238
238
|
List[str],
|
239
239
|
typer.Argument(
|
@@ -289,7 +289,7 @@ def convert(
|
|
289
289
|
...,
|
290
290
|
help=(
|
291
291
|
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
|
292
|
-
f"{', '.join(
|
292
|
+
f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
|
293
293
|
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
294
294
|
),
|
295
295
|
),
|
@@ -421,7 +421,7 @@ def convert(
|
|
421
421
|
logging.basicConfig(level=logging.WARNING)
|
422
422
|
elif verbose == 1:
|
423
423
|
logging.basicConfig(level=logging.INFO)
|
424
|
-
|
424
|
+
else:
|
425
425
|
logging.basicConfig(level=logging.DEBUG)
|
426
426
|
|
427
427
|
settings.debug.visualize_cells = debug_visualize_cells
|
@@ -430,7 +430,7 @@ def convert(
|
|
430
430
|
settings.debug.visualize_ocr = debug_visualize_ocr
|
431
431
|
|
432
432
|
if from_formats is None:
|
433
|
-
from_formats =
|
433
|
+
from_formats = list(InputFormat)
|
434
434
|
|
435
435
|
parsed_headers: Optional[Dict[str, str]] = None
|
436
436
|
if headers is not None:
|
docling/cli/models.py
CHANGED
@@ -62,7 +62,7 @@ def download(
|
|
62
62
|
models: Annotated[
|
63
63
|
Optional[list[_AvailableModels]],
|
64
64
|
typer.Argument(
|
65
|
-
help=
|
65
|
+
help="Models to download (default behavior: a predefined set of models will be downloaded).",
|
66
66
|
),
|
67
67
|
] = None,
|
68
68
|
all: Annotated[
|
@@ -89,14 +89,13 @@ def download(
|
|
89
89
|
"Cannot simultaneously set 'all' parameter and specify models to download."
|
90
90
|
)
|
91
91
|
if not quiet:
|
92
|
-
FORMAT = "%(message)s"
|
93
92
|
logging.basicConfig(
|
94
93
|
level=logging.INFO,
|
95
94
|
format="[blue]%(message)s[/blue]",
|
96
95
|
datefmt="[%X]",
|
97
96
|
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
98
97
|
)
|
99
|
-
to_download = models or (
|
98
|
+
to_download = models or (list(_AvailableModels) if all else _default_models)
|
100
99
|
output_dir = download_models(
|
101
100
|
output_dir=output_dir,
|
102
101
|
force=force,
|
docling/datamodel/base_models.py
CHANGED
@@ -10,7 +10,9 @@ from docling_core.types.doc import (
|
|
10
10
|
TableCell,
|
11
11
|
)
|
12
12
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
13
|
-
|
13
|
+
|
14
|
+
# DO NOT REMOVE; explicitly exposed from this location
|
15
|
+
from docling_core.types.io import (
|
14
16
|
DocumentStream,
|
15
17
|
)
|
16
18
|
from PIL.Image import Image
|
@@ -233,9 +235,9 @@ class Page(BaseModel):
|
|
233
235
|
None # Internal PDF backend. By default it is cleared during assembling.
|
234
236
|
)
|
235
237
|
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
236
|
-
_image_cache: Dict[
|
237
|
-
|
238
|
-
|
238
|
+
_image_cache: Dict[
|
239
|
+
float, Image
|
240
|
+
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
239
241
|
|
240
242
|
def get_image(
|
241
243
|
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
@@ -243,7 +245,7 @@ class Page(BaseModel):
|
|
243
245
|
if self._backend is None:
|
244
246
|
return self._image_cache.get(scale, None)
|
245
247
|
|
246
|
-
if not
|
248
|
+
if scale not in self._image_cache:
|
247
249
|
if cropbox is None:
|
248
250
|
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
249
251
|
else:
|
docling/datamodel/document.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
import csv
|
2
2
|
import logging
|
3
3
|
import re
|
4
|
+
from collections.abc import Iterable
|
4
5
|
from enum import Enum
|
5
6
|
from io import BytesIO
|
6
7
|
from pathlib import Path, PurePath
|
7
8
|
from typing import (
|
8
9
|
TYPE_CHECKING,
|
9
10
|
Dict,
|
10
|
-
Iterable,
|
11
11
|
List,
|
12
12
|
Literal,
|
13
13
|
Optional,
|
@@ -17,6 +17,8 @@ from typing import (
|
|
17
17
|
)
|
18
18
|
|
19
19
|
import filetype
|
20
|
+
|
21
|
+
# DO NOT REMOVE; explicitly exposed from this location
|
20
22
|
from docling_core.types.doc import (
|
21
23
|
DocItem,
|
22
24
|
DocItemLabel,
|
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
|
|
35
37
|
PageReference,
|
36
38
|
Prov,
|
37
39
|
Ref,
|
40
|
+
Table as DsSchemaTable,
|
41
|
+
TableCell,
|
38
42
|
)
|
39
|
-
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
40
|
-
from docling_core.types.legacy_doc.base import TableCell
|
41
43
|
from docling_core.types.legacy_doc.document import (
|
42
44
|
CCSDocumentDescription as DsDocumentDescription,
|
45
|
+
CCSFileInfoObject as DsFileInfoObject,
|
46
|
+
ExportedCCSDocument as DsDocument,
|
43
47
|
)
|
44
|
-
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
45
|
-
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
46
48
|
from docling_core.utils.file import resolve_source_to_stream
|
47
49
|
from docling_core.utils.legacy import docling_document_to_legacy
|
48
50
|
from pydantic import BaseModel
|
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
|
|
65
67
|
)
|
66
68
|
from docling.datamodel.settings import DocumentLimits
|
67
69
|
from docling.utils.profiling import ProfilingItem
|
68
|
-
from docling.utils.utils import create_file_hash
|
70
|
+
from docling.utils.utils import create_file_hash
|
69
71
|
|
70
72
|
if TYPE_CHECKING:
|
71
73
|
from docling.document_converter import FormatOption
|
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
|
|
134
136
|
self._init_doc(backend, path_or_stream)
|
135
137
|
|
136
138
|
elif isinstance(path_or_stream, BytesIO):
|
137
|
-
assert (
|
138
|
-
|
139
|
-
)
|
139
|
+
assert filename is not None, (
|
140
|
+
"Can't construct InputDocument from stream without providing filename arg."
|
141
|
+
)
|
140
142
|
self.file = PurePath(filename)
|
141
143
|
self.filesize = path_or_stream.getbuffer().nbytes
|
142
144
|
|
@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
|
|
228
230
|
|
229
231
|
|
230
232
|
class _DocumentConversionInput(BaseModel):
|
231
|
-
|
232
233
|
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
233
234
|
headers: Optional[Dict[str, str]] = None
|
234
235
|
limits: Optional[DocumentLimits] = DocumentLimits()
|
docling/document_converter.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
import hashlib
|
2
2
|
import logging
|
3
|
-
import math
|
4
3
|
import sys
|
5
4
|
import time
|
5
|
+
from collections.abc import Iterable, Iterator
|
6
6
|
from functools import partial
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import Dict,
|
8
|
+
from typing import Dict, List, Optional, Tuple, Type, Union
|
9
9
|
|
10
10
|
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
11
11
|
|
@@ -172,7 +172,7 @@ class DocumentConverter:
|
|
172
172
|
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
173
173
|
):
|
174
174
|
self.allowed_formats = (
|
175
|
-
allowed_formats if allowed_formats is not None else
|
175
|
+
allowed_formats if allowed_formats is not None else list(InputFormat)
|
176
176
|
)
|
177
177
|
self.format_to_options = {
|
178
178
|
format: (
|
@@ -254,7 +254,7 @@ class DocumentConverter:
|
|
254
254
|
|
255
255
|
if not had_result and raises_on_error:
|
256
256
|
raise ConversionError(
|
257
|
-
|
257
|
+
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
258
258
|
)
|
259
259
|
|
260
260
|
def _convert(
|
@@ -266,7 +266,7 @@ class DocumentConverter:
|
|
266
266
|
conv_input.docs(self.format_to_options),
|
267
267
|
settings.perf.doc_batch_size, # pass format_options
|
268
268
|
):
|
269
|
-
_log.info(
|
269
|
+
_log.info("Going to convert document batch...")
|
270
270
|
|
271
271
|
# parallel processing only within input_batch
|
272
272
|
# with ThreadPoolExecutor(
|
docling/models/api_vlm_model.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from
|
1
|
+
from collections.abc import Iterable
|
2
2
|
|
3
3
|
from docling.datamodel.base_models import Page, VlmPrediction
|
4
4
|
from docling.datamodel.document import ConversionResult
|
@@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
|
|
10
10
|
|
11
11
|
|
12
12
|
class ApiVlmModel(BasePageModel):
|
13
|
-
|
14
13
|
def __init__(
|
15
14
|
self,
|
16
15
|
enabled: bool,
|
docling/models/base_model.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
-
from
|
2
|
+
from collections.abc import Iterable
|
3
|
+
from typing import Generic, Optional, Protocol, Type
|
3
4
|
|
4
5
|
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
5
6
|
from typing_extensions import TypeVar
|
@@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
|
|
29
30
|
|
30
31
|
|
31
32
|
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
|
32
|
-
|
33
33
|
elements_batch_size: int = settings.perf.elements_batch_size
|
34
34
|
|
35
35
|
@abstractmethod
|
@@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
|
|
50
50
|
|
51
51
|
|
52
52
|
class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
|
53
|
-
|
54
53
|
def prepare_element(
|
55
54
|
self, conv_res: ConversionResult, element: NodeItem
|
56
55
|
) -> Optional[NodeItem]:
|
@@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
|
|
62
61
|
class BaseItemAndImageEnrichmentModel(
|
63
62
|
GenericEnrichmentModel[ItemAndImageEnrichmentElement]
|
64
63
|
):
|
65
|
-
|
66
64
|
images_scale: float
|
67
65
|
expansion_factor: float = 0.0
|
68
66
|
|
docling/models/base_ocr_model.py
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
3
|
from abc import abstractmethod
|
4
|
+
from collections.abc import Iterable
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
+
from typing import List, Optional, Type
|
6
7
|
|
7
8
|
import numpy as np
|
8
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
-
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
|
10
10
|
from PIL import Image, ImageDraw
|
11
11
|
from rtree import index
|
12
12
|
from scipy.ndimage import binary_dilation, find_objects, label
|
@@ -1,7 +1,8 @@
|
|
1
1
|
import re
|
2
2
|
from collections import Counter
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import
|
5
|
+
from typing import List, Literal, Optional, Tuple, Union
|
5
6
|
|
6
7
|
import numpy as np
|
7
8
|
from docling_core.types.doc import (
|
docling/models/easyocr_model.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import warnings
|
3
3
|
import zipfile
|
4
|
+
from collections.abc import Iterable
|
4
5
|
from pathlib import Path
|
5
|
-
from typing import
|
6
|
+
from typing import List, Optional, Type
|
6
7
|
|
7
8
|
import numpy
|
8
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
|
|
58
59
|
device = decide_device(accelerator_options.device)
|
59
60
|
# Enable easyocr GPU if running on CUDA, MPS
|
60
61
|
use_gpu = any(
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
AcceleratorDevice.MPS.value,
|
66
|
-
]
|
62
|
+
device.startswith(x)
|
63
|
+
for x in [
|
64
|
+
AcceleratorDevice.CUDA.value,
|
65
|
+
AcceleratorDevice.MPS.value,
|
67
66
|
]
|
68
67
|
)
|
69
68
|
else:
|
@@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
|
|
98
97
|
progress: bool = False,
|
99
98
|
) -> Path:
|
100
99
|
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
|
101
|
-
from easyocr.config import
|
102
|
-
|
100
|
+
from easyocr.config import (
|
101
|
+
detection_models as det_models_dict,
|
102
|
+
recognition_models as rec_models_dict,
|
103
|
+
)
|
103
104
|
|
104
105
|
if local_dir is None:
|
105
106
|
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
|
@@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
|
|
126
127
|
def __call__(
|
127
128
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
128
129
|
) -> Iterable[Page]:
|
129
|
-
|
130
130
|
if not self.enabled:
|
131
131
|
yield from page_batch
|
132
132
|
return
|
133
133
|
|
134
134
|
for page in page_batch:
|
135
|
-
|
136
135
|
assert page._backend is not None
|
137
136
|
if not page._backend.is_valid():
|
138
137
|
yield page
|
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
|
|
9
9
|
logger = logging.getLogger(__name__)
|
10
10
|
|
11
11
|
|
12
|
-
@lru_cache
|
12
|
+
@lru_cache
|
13
13
|
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
|
14
14
|
factory = OcrFactory()
|
15
15
|
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
|
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
|
|
17
17
|
return factory
|
18
18
|
|
19
19
|
|
20
|
-
@lru_cache
|
20
|
+
@lru_cache
|
21
21
|
def get_picture_description_factory(
|
22
22
|
allow_external_plugins: bool = False,
|
23
23
|
) -> PictureDescriptionFactory:
|
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
|
|
33
33
|
|
34
34
|
@property
|
35
35
|
def registered_kind(self) -> list[str]:
|
36
|
-
return
|
36
|
+
return [opt.kind for opt in self._classes.keys()]
|
37
37
|
|
38
38
|
def get_enum(self) -> enum.Enum:
|
39
39
|
return enum.Enum(
|