docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Iterable, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
7
|
+
from docling_core.types.doc.page import (
|
|
8
|
+
BoundingRectangle,
|
|
9
|
+
PdfPageBoundaryType,
|
|
10
|
+
PdfPageGeometry,
|
|
11
|
+
SegmentedPdfPage,
|
|
12
|
+
TextCell,
|
|
13
|
+
)
|
|
14
|
+
from PIL import Image
|
|
15
|
+
|
|
16
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
17
|
+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
|
18
|
+
from docling.datamodel.backend_options import PdfBackendOptions
|
|
19
|
+
from docling.datamodel.base_models import InputFormat, Size
|
|
20
|
+
from docling.datamodel.document import InputDocument
|
|
21
|
+
|
|
22
|
+
# Module-level logger, named after this module.
_log = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class _ImagePageBackend(PdfPageBackend):
    """Page backend wrapping a single in-memory PIL image.

    The page has no text layer: text queries return empty results and the
    whole page area is reported as one bitmap rectangle.
    """

    def __init__(self, image: Image.Image):
        self._image: Optional[Image.Image] = image
        # The validity flag is computed once, here, from the image passed in.
        self.valid: bool = self._image is not None

    def is_valid(self) -> bool:
        """Return the validity flag computed at construction time."""
        return self.valid

    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        """Return ``""``; a raw image has no extractable text without OCR."""
        return ""

    def _full_page_bbox(self, origin: CoordOrigin) -> BoundingBox:
        """Build a box spanning (0, 0)-(width, height) tagged with ``origin``."""
        assert self._image is not None
        size = self.get_size()
        return BoundingBox(
            l=0.0,
            t=0.0,
            r=float(size.width),
            b=float(size.height),
            coord_origin=origin,
        )

    def get_segmented_page(self) -> SegmentedPdfPage:
        """Return a segmented page with real dimensions and no text cells."""
        # NOTE(review): the box is tagged BOTTOMLEFT while t=0 and b=height;
        # confirm this orientation is what consumers of the geometry expect.
        page_bbox = self._full_page_bbox(CoordOrigin.BOTTOMLEFT)

        # Every PDF boundary box is the same full-page box for a raw image.
        boundary_boxes = dict.fromkeys(
            ("art_bbox", "bleed_bbox", "crop_bbox", "media_bbox", "trim_bbox"),
            page_bbox,
        )
        geometry = PdfPageGeometry(
            angle=0.0,
            rect=BoundingRectangle.from_bounding_box(page_bbox),
            boundary_type=PdfPageBoundaryType.CROP_BOX,
            **boundary_boxes,
        )
        return SegmentedPdfPage(
            dimension=geometry,
            char_cells=[],
            word_cells=[],
            textline_cells=[],
            has_chars=False,
            has_words=False,
            has_lines=False,
        )

    def get_text_cells(self) -> Iterable[TextCell]:
        """Return an empty list; raw images have no text cells."""
        no_cells: List[TextCell] = []
        return no_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        """Yield one TOPLEFT box covering the whole page, scaled by ``scale``."""
        region = self._full_page_bbox(CoordOrigin.TOPLEFT)
        yield region if scale == 1 else region.scaled(scale=scale)

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
        """Return the page image, optionally cropped and then resized.

        Args:
            scale: Resize factor applied after cropping; ``1`` skips resizing.
            cropbox: Optional crop region. A box that is not in TOPLEFT
                coordinates is converted using the image height.

        Returns:
            The stored image itself when no crop or resize applies, otherwise
            a new cropped and/or resized image.
        """
        assert self._image is not None
        result = self._image

        if cropbox is not None:
            if cropbox.coord_origin != CoordOrigin.TOPLEFT:
                cropbox = cropbox.to_top_left_origin(result.height)
            x0, y0, x1, y1 = cropbox.as_tuple()
            # Round to whole pixels and clamp the region to the image bounds.
            crop_region = (
                max(0, round(x0)),
                max(0, round(y0)),
                min(result.width, round(x1)),
                min(result.height, round(y1)),
            )
            result = result.crop(crop_region)

        if scale == 1:
            return result

        # Never let a dimension collapse below one pixel.
        target_size = (
            max(1, round(result.width * scale)),
            max(1, round(result.height * scale)),
        )
        return result.resize(target_size)

    def get_size(self) -> Size:
        """Return the pixel dimensions of the stored image."""
        assert self._image is not None
        width, height = self._image.size
        return Size(width=width, height=height)

    def unload(self):
        """Drop the reference to the image so its memory can be reclaimed."""
        self._image = None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class ImageDocumentBackend(PdfDocumentBackend):
    """Image-native backend that bypasses pypdfium2.

    Notes:
        - Subclasses PdfDocumentBackend to satisfy pipeline type checks.
        - Intentionally avoids calling PdfDocumentBackend.__init__ to skip
          the image→PDF conversion and any pypdfium2 usage.
        - Handles multi-page TIFF by extracting frames eagerly to separate
          Image objects to keep thread-safety when pages process in parallel.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: PdfBackendOptions = PdfBackendOptions(),
    ):
        """Decode every frame of the input image into its own RGB image.

        Args:
            in_doc: The input document this backend belongs to.
            path_or_stream: File path or in-memory stream holding the image.
            options: Backend options, stored on ``self.options``. The default
                instance is created once when the function is defined.

        Raises:
            RuntimeError: If the input format is not ``InputFormat.IMAGE`` or
                the image cannot be opened or decoded.
        """
        # Bypass PdfDocumentBackend.__init__ to avoid image→PDF conversion
        AbstractDocumentBackend.__init__(self, in_doc, path_or_stream, options)
        self.options: PdfBackendOptions = options

        if self.input_format not in {InputFormat.IMAGE}:
            raise RuntimeError(
                f"Incompatible file format {self.input_format} was passed to ImageDocumentBackend."
            )

        # Load frames eagerly for thread-safety across pages
        self._frames: List[Image.Image] = []
        try:
            # The context manager releases the file handle Pillow opens for a
            # path as soon as the frames are decoded; multi-frame files would
            # otherwise keep it open. A caller-supplied stream is left open.
            with Image.open(self.path_or_stream) as img:  # type: ignore[arg-type]
                # Handle multi-frame and single-frame images
                # - multiframe formats: TIFF, GIF, ICO
                # - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP, WEBP (unless animated), HEIC
                frame_count = getattr(img, "n_frames", 1)

                if frame_count > 1:
                    for i in range(frame_count):
                        img.seek(i)
                        # convert() already returns a new, independent image,
                        # so no extra copy() is needed.
                        self._frames.append(img.convert("RGB"))
                else:
                    self._frames.append(img.convert("RGB"))
        except Exception as e:
            raise RuntimeError(f"Could not load image for document {self.file}") from e

    def is_valid(self) -> bool:
        """Return True while at least one decoded frame is held."""
        return len(self._frames) > 0

    def page_count(self) -> int:
        """Return the number of decoded frames; each frame is one page."""
        return len(self._frames)

    def load_page(self, page_no: int) -> _ImagePageBackend:
        """Return a page backend for the zero-based frame index ``page_no``.

        Raises:
            IndexError: If ``page_no`` is outside ``[0, page_count())``.
        """
        if not (0 <= page_no < len(self._frames)):
            raise IndexError(f"Page index out of range: {page_no}")
        return _ImagePageBackend(self._frames[page_no])

    @classmethod
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats this backend accepts."""
        # Only IMAGE here; PDF handling remains in PDF-oriented backends
        return {InputFormat.IMAGE}

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return True; multi-frame images are exposed as multiple pages."""
        return True

    def unload(self):
        """Run the parent unload, then drop all decoded frames."""
        super().unload()
        self._frames = []
|
|
File without changes
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Union
|
|
4
|
+
|
|
5
|
+
from docling_core.types.doc import DoclingDocument
|
|
6
|
+
from typing_extensions import override
|
|
7
|
+
|
|
8
|
+
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
|
9
|
+
from docling.datamodel.base_models import InputFormat
|
|
10
|
+
from docling.datamodel.document import InputDocument
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DoclingJSONBackend(DeclarativeDocumentBackend):
    """Backend that loads a DoclingDocument from its JSON serialization."""

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        super().__init__(in_doc, path_or_stream)

        # Parsing happens right away. The attribute holds either the parsed
        # document or the exception that parsing raised, never both, so that
        # convert() can re-raise the original error later.
        self._doc_or_err = self._get_doc_or_err()

    @override
    def is_valid(self) -> bool:
        """Return True when parsing produced a document and not an error."""
        return isinstance(self._doc_or_err, DoclingDocument)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False; this backend does not paginate."""
        return False

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the single supported format, ``InputFormat.JSON_DOCLING``."""
        return {InputFormat.JSON_DOCLING}

    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
        """Read and validate the JSON input.

        Returns:
            The validated document, or the exception raised while reading or
            validating; nothing is propagated to the caller.
        """
        try:
            source = self.path_or_stream
            payload: Union[str, bytes]
            if isinstance(source, Path):
                payload = source.read_text(encoding="utf-8")
            elif isinstance(source, BytesIO):
                payload = source.getvalue()
            else:
                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
            return DoclingDocument.model_validate_json(json_data=payload)
        except Exception as e:
            return e

    @override
    def convert(self) -> DoclingDocument:
        """Return the parsed document, or raise the error captured earlier."""
        outcome = self._doc_or_err
        if not isinstance(outcome, DoclingDocument):
            raise outcome
        return outcome
|