docling-2.69.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2
|
|
6
|
+
from docling.datamodel.pipeline_options import (
|
|
7
|
+
LayoutOptions,
|
|
8
|
+
granite_picture_description,
|
|
9
|
+
smolvlm_picture_description,
|
|
10
|
+
)
|
|
11
|
+
from docling.datamodel.settings import settings
|
|
12
|
+
from docling.datamodel.vlm_model_specs import (
|
|
13
|
+
GRANITEDOCLING_MLX,
|
|
14
|
+
GRANITEDOCLING_TRANSFORMERS,
|
|
15
|
+
SMOLDOCLING_MLX,
|
|
16
|
+
SMOLDOCLING_TRANSFORMERS,
|
|
17
|
+
)
|
|
18
|
+
from docling.models.stages.code_formula.code_formula_model import CodeFormulaModel
|
|
19
|
+
from docling.models.stages.layout.layout_model import LayoutModel
|
|
20
|
+
from docling.models.stages.ocr.easyocr_model import EasyOcrModel
|
|
21
|
+
from docling.models.stages.ocr.rapid_ocr_model import RapidOcrModel
|
|
22
|
+
from docling.models.stages.picture_classifier.document_picture_classifier import (
|
|
23
|
+
DocumentPictureClassifier,
|
|
24
|
+
DocumentPictureClassifierOptions,
|
|
25
|
+
)
|
|
26
|
+
from docling.models.stages.picture_description.picture_description_vlm_model import (
|
|
27
|
+
PictureDescriptionVlmModel,
|
|
28
|
+
)
|
|
29
|
+
from docling.models.stages.table_structure.table_structure_model import (
|
|
30
|
+
TableStructureModel,
|
|
31
|
+
)
|
|
32
|
+
from docling.models.utils.hf_model_download import download_hf_model
|
|
33
|
+
|
|
34
|
+
# Module-level logger; download_models reports each download step through it.
_log = logging.getLogger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def download_models(
    output_dir: Optional[Path] = None,
    *,
    force: bool = False,
    progress: bool = False,
    with_layout: bool = True,
    with_tableformer: bool = True,
    with_code_formula: bool = True,
    with_picture_classifier: bool = True,
    with_smolvlm: bool = False,
    with_granitedocling: bool = False,
    with_granitedocling_mlx: bool = False,
    with_smoldocling: bool = False,
    with_smoldocling_mlx: bool = False,
    with_granite_vision: bool = False,
    with_rapidocr: bool = True,
    with_easyocr: bool = False,
):
    """Download the selected model artifacts into a local folder.

    Args:
        output_dir: Destination folder. When ``None``, the folder
            ``settings.cache_dir / "models"`` is used. The folder is created
            (including parents) when it does not exist yet.
        force: Passed through unchanged to every download call.
        progress: Passed through unchanged to every download call.
        with_layout: Download the layout model.
        with_tableformer: Download the tableformer model.
        with_code_formula: Download the code formula model.
        with_picture_classifier: Download the picture classifier model.
        with_smolvlm: Download the SmolVlm picture description model.
        with_granitedocling: Download the GraniteDocling model.
        with_granitedocling_mlx: Download the GraniteDocling MLX model.
        with_smoldocling: Download the SmolDocling model.
        with_smoldocling_mlx: Download the SmolDocling MLX model.
        with_granite_vision: Download the Granite Vision model.
        with_rapidocr: Download the rapidocr models for both the ``torch``
            and the ``onnxruntime`` backend.
        with_easyocr: Download the easyocr models.

    Returns:
        The folder the models were downloaded into.
    """
    target_dir = (settings.cache_dir / "models") if output_dir is None else output_dir

    # Make sure the folder exists
    target_dir.mkdir(parents=True, exist_ok=True)

    # Options shared by every download call below.
    shared = {"force": force, "progress": progress}

    if with_layout:
        _log.info("Downloading layout model...")
        layout_dir = target_dir / LayoutOptions().model_spec.model_repo_folder
        LayoutModel.download_models(local_dir=layout_dir, **shared)

    if with_tableformer:
        _log.info("Downloading tableformer model...")
        tableformer_dir = target_dir / TableStructureModel._model_repo_folder
        TableStructureModel.download_models(local_dir=tableformer_dir, **shared)

    if with_picture_classifier:
        _log.info("Downloading picture classifier model...")
        classifier_options = DocumentPictureClassifierOptions()
        DocumentPictureClassifier.download_models(
            repo_id=classifier_options.repo_id,
            revision=classifier_options.revision,
            local_dir=target_dir / classifier_options.repo_cache_folder,
            **shared,
        )

    if with_code_formula:
        _log.info("Downloading code formula model...")
        code_formula_dir = target_dir / CodeFormulaModel._model_repo_folder
        CodeFormulaModel.download_models(local_dir=code_formula_dir, **shared)

    # Models fetched straight from the Hugging Face hub share one call shape;
    # the tuple order is the download order.
    hf_models = (
        (with_smolvlm, "Downloading SmolVlm model...", smolvlm_picture_description),
        (
            with_granitedocling,
            "Downloading GraniteDocling model...",
            GRANITEDOCLING_TRANSFORMERS,
        ),
        (
            with_granitedocling_mlx,
            "Downloading GraniteDocling MLX model...",
            GRANITEDOCLING_MLX,
        ),
        (
            with_smoldocling,
            "Downloading SmolDocling model...",
            SMOLDOCLING_TRANSFORMERS,
        ),
        (
            with_smoldocling_mlx,
            "Downloading SmolDocling MLX model...",
            SMOLDOCLING_MLX,
        ),
        (
            with_granite_vision,
            "Downloading Granite Vision model...",
            granite_picture_description,
        ),
    )
    for enabled, message, spec in hf_models:
        if not enabled:
            continue
        _log.info(message)
        download_hf_model(
            repo_id=spec.repo_id,
            local_dir=target_dir / spec.repo_cache_folder,
            **shared,
        )

    if with_rapidocr:
        for backend in ("torch", "onnxruntime"):
            _log.info(f"Downloading rapidocr {backend} models...")
            RapidOcrModel.download_models(
                backend=backend,
                local_dir=target_dir / RapidOcrModel._model_repo_folder,
                **shared,
            )

    if with_easyocr:
        _log.info("Downloading easyocr models...")
        easyocr_dir = target_dir / EasyOcrModel._model_repo_folder
        EasyOcrModel.download_models(local_dir=easyocr_dir, **shared)

    return target_dir
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from typing import Optional, Tuple
|
|
2
|
+
|
|
3
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
4
|
+
from docling_core.types.doc.page import BoundingRectangle
|
|
5
|
+
|
|
6
|
+
from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def map_tesseract_script(script: str) -> str:
    """Replace a few script names by their substitute, pass others through.

    ``Katakana`` and ``Hiragana`` become ``Japanese``, ``Han`` becomes
    ``HanS`` and ``Korean`` becomes ``Hangul``. Any other value is returned
    unchanged.
    """
    # NOTE(review): presumably the substitutes are the names of Tesseract's
    # script models - confirm against the callers.
    substitutes = {
        "Katakana": "Japanese",
        "Hiragana": "Japanese",
        "Han": "HanS",
        "Korean": "Hangul",
    }
    return substitutes.get(script, script)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def parse_tesseract_orientation(orientation: str) -> int:
    """Convert a Tesseract orientation string into a counterclockwise angle.

    Tesseract orientation is [0, 90, 180, 270] clockwise, whereas bounding
    rectangle angles are [0, 360[ counterclockwise, so the parsed value is
    negated and wrapped back into [0, 360[.

    Raises:
        ValueError: if ``orientation`` cannot be parsed by ``int`` or the
            parsed value is not listed in ``CLIPPED_ORIENTATIONS``.
    """
    clockwise = int(orientation)
    if clockwise in CLIPPED_ORIENTATIONS:
        return (-clockwise) % 360
    msg = (
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
    raise ValueError(msg)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Turn a Tesseract box into a rotated, rescaled bounding rectangle.

    The box is rotated with ``rotate_bounding_box``, every corner coordinate
    is divided by ``scale``, and, when ``original_offset`` is given, its left
    and top values are added to the x and y coordinates respectively.

    Args:
        bbox: Box to convert.
        original_offset: Optional box whose ``l`` / ``t`` values are added to
            the result; it must use the TOPLEFT coordinate origin.
        scale: Divisor applied to all corner coordinates.
        orientation: Angle forwarded to ``rotate_bounding_box``.
        im_size: Image size forwarded to ``rotate_bounding_box``.

    Returns:
        The resulting rectangle, expressed with a TOPLEFT origin.

    Raises:
        ValueError: if ``original_offset`` does not use the TOPLEFT origin.
    """
    # NOTE(review): the original comment describes the box as "top, left,
    # height, width format, top left coordinates" - confirm against callers.
    x_fields = ("r_x0", "r_x1", "r_x2", "r_x3")
    y_fields = ("r_y0", "r_y1", "r_y2", "r_y3")

    rotated = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
    rect = BoundingRectangle(
        r_x0=rotated.r_x0 / scale,
        r_y0=rotated.r_y0 / scale,
        r_x1=rotated.r_x1 / scale,
        r_y1=rotated.r_y1 / scale,
        r_x2=rotated.r_x2 / scale,
        r_y2=rotated.r_y2 / scale,
        r_x3=rotated.r_x3 / scale,
        r_y3=rotated.r_y3 / scale,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    if original_offset is None:
        return rect

    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        msg = f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        raise ValueError(msg)

    # Shift all x coordinates first, then all y coordinates.
    for x_name in x_fields:
        setattr(rect, x_name, getattr(rect, x_name) + original_offset.l)
    for y_name in y_fields:
        setattr(rect, y_name, getattr(rect, y_name) + original_offset.t)
    return rect
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from typing import Tuple
|
|
2
|
+
|
|
3
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
4
|
+
from docling_core.types.doc.page import BoundingRectangle
|
|
5
|
+
|
|
6
|
+
# Angles, in degrees, listed as the expected values in the error raised by
# rotate_bounding_box; that function handles exactly these four angles.
CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Build the bounding rectangle of ``bbox`` for a right-angle rotation.

    Args:
        bbox: Box to rotate; it is first converted to the TOPLEFT origin
            using ``im_size[1]`` as the page height.
        angle: Rotation angle; it is reduced modulo 360 and must then be one
            of 0, 90, 180 or 270.
        im_size: ``(width, height)`` of the image.

    Returns:
        The rectangle in TOPLEFT coordinates. For angle 0 this is simply
        ``BoundingRectangle.from_bounding_box`` of the converted box.

    Raises:
        ValueError: if the reduced angle is not one of the supported values.
    """
    # The box is left top width height in TOPLEFT coordinates
    # Bounding rectangle start with r_0 at the bottom left whatever the
    # coordinate system. Then other corners are found rotating counterclockwise
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top, width, height = bbox.l, bbox.t, bbox.width, bbox.height
    im_w, im_h = im_size
    angle = angle % 360

    if angle == 0:
        return BoundingRectangle.from_bounding_box(bbox)

    # Each branch yields (x0, y0, x1, y1, x2, y2, x3, y3).
    if angle == 90:
        x_a = top + height
        y_a = im_w - left
        y_b = y_a - width
        x_b = x_a - height
        corners = (x_a, y_a, x_a, y_b, x_b, y_b, x_b, y_a)
    elif angle == 180:
        x_a = im_w - left
        y_a = im_h - (top + height)
        x_b = x_a - width
        y_b = y_a + height
        corners = (x_a, y_a, x_b, y_a, x_b, y_b, x_a, y_b)
    elif angle == 270:
        x_a = im_h - (top + height)
        y_a = left
        y_b = y_a + width
        x_b = x_a + height
        corners = (x_a, y_a, x_a, y_b, x_b, y_b, x_b, y_a)
    else:
        msg = (
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )
        raise ValueError(msg)

    r_x0, r_y0, r_x1, r_y1, r_x2, r_y2, r_x3, r_y3 = corners
    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from typing import TYPE_CHECKING, List
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from docling.datamodel.settings import settings
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from docling.datamodel.document import ConversionResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ProfilingScope(str, Enum):
    """Scope label of a profiling entry; members are also plain strings."""

    PAGE = "page"
    DOCUMENT = "document"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ProfilingItem(BaseModel):
    """Timing samples collected under one profiling key."""

    # Scope the samples belong to.
    scope: ProfilingScope
    # Number of recorded samples (incremented by TimeRecorder.__exit__).
    count: int = 0
    # Elapsed durations appended by TimeRecorder.__exit__.
    times: List[float] = []
    # Start times appended by TimeRecorder.__enter__.
    start_timestamps: List[datetime] = []

    def total(self) -> float:
        """Return the sum of the recorded durations (``np.sum``)."""
        summed = np.sum(self.times)
        return summed  # type: ignore

    def avg(self) -> float:
        """Return the average of the recorded durations (``np.average``)."""
        average = np.average(self.times)
        return average  # type: ignore

    def std(self) -> float:
        """Return the standard deviation of the durations (``np.std``)."""
        deviation = np.std(self.times)
        return deviation  # type: ignore

    def mean(self) -> float:
        """Return the mean of the recorded durations (``np.mean``)."""
        mean_value = np.mean(self.times)
        return mean_value  # type: ignore

    def percentile(self, perc: float) -> float:
        """Return the ``perc``-th percentile of the durations (``np.percentile``)."""
        value = np.percentile(self.times, perc)
        return value  # type: ignore
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class TimeRecorder:
    """Context manager that records how long the wrapped section takes.

    When ``settings.debug.profile_pipeline_timings`` is truthy at
    construction time, entering the context stores a start timestamp and
    leaving it appends the elapsed time to ``conv_res.timings[key]`` and
    increments its counter. Otherwise the context manager does nothing.

    Args:
        conv_res: Conversion result whose ``timings`` mapping is updated.
        key: Name of the entry in ``conv_res.timings``.
        scope: Scope used when the entry has to be created.
    """

    def __init__(
        self,
        conv_res: "ConversionResult",
        key: str,
        scope: ProfilingScope = ProfilingScope.PAGE,
    ):
        # Always keep the references so the instance is fully initialised
        # regardless of the profiling switch.
        self.conv_res = conv_res
        self.key = key
        # Read the switch once: __enter__ and __exit__ must agree with what
        # __init__ prepared, even if the setting changes in between.
        self._enabled = bool(settings.debug.profile_pipeline_timings)
        if self._enabled and key not in conv_res.timings:
            conv_res.timings[key] = ProfilingItem(scope=scope)

    def __enter__(self):
        if self._enabled:
            self.start = time.monotonic()
            # NOTE(review): datetime.utcnow() yields a naive timestamp and is
            # deprecated since Python 3.12; kept so stored values do not
            # change type for existing consumers.
            self.conv_res.timings[self.key].start_timestamps.append(datetime.utcnow())
        return self

    def __exit__(self, *args):
        if self._enabled:
            elapsed = time.monotonic() - self.start
            item = self.conv_res.timings[self.key]
            item.times.append(elapsed)
            item.count += 1
|
docling/utils/utils.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
from io import BytesIO
|
|
3
|
+
from itertools import islice
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import List, Union
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def chunkify(iterator, chunk_size):
    """Yield successive chunks of chunk_size from the iterable.

    Args:
        iterator: Any iterable (list, tuple, range, generator, ...).
        chunk_size: Maximum number of items per chunk.

    Yields:
        Lists of at most ``chunk_size`` consecutive items; only the last
        chunk may be shorter. Nothing is yielded for an empty input.
    """
    # Always work on a single shared iterator. Without this, re-iterable
    # inputs other than lists (tuples, ranges, ...) would be restarted by
    # every islice() call and produce wrong chunks. iter() on an iterator
    # returns the iterator itself, so this is safe for generators too.
    iterator = iter(iterator)
    for first in iterator:  # Take the first element from the iterator
        yield [first, *islice(iterator, chunk_size - 1)]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
    """Return the hex SHA-256 digest of a file path or in-memory stream.

    A ``Path`` is opened in binary mode and read completely. A ``BytesIO``
    is read from its current position to its end. The data is consumed in
    blocks of 65536 bytes. Any other argument type is not read at all, so
    the digest of empty input is returned.
    """
    chunk_bytes = 65536
    digest = hashlib.sha256(usedforsecurity=False)

    def _consume(source):
        # iter() with a sentinel keeps calling read() until it returns b"".
        for piece in iter(lambda: source.read(chunk_bytes), b""):
            digest.update(piece)

    if isinstance(path_or_stream, Path):
        with path_or_stream.open("rb") as handle:
            _consume(handle)
    elif isinstance(path_or_stream, BytesIO):
        _consume(path_or_stream)

    return digest.hexdigest()
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def create_hash(string: str):
    """Return the hex SHA-256 digest of the UTF-8 encoding of ``string``."""
    encoded = string.encode("utf-8")
    return hashlib.sha256(encoded, usedforsecurity=False).hexdigest()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
    """Download ``url`` into memory, optionally showing a progress bar.

    Args:
        url: Address to fetch; redirects are followed.
        progress: When true a ``tqdm`` progress bar is displayed.

    Returns:
        A ``BytesIO`` holding the response body, positioned at offset 0.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    buf = BytesIO()
    with requests.get(url, stream=True, allow_redirects=True) as response:
        # Fail loudly on HTTP errors instead of handing the error page back
        # to the caller as if it were the requested payload.
        response.raise_for_status()

        # content-length is optional; None tells tqdm the total is unknown
        # rather than pretending the download is zero bytes long.
        total_size = int(response.headers.get("content-length", 0)) or None

        # The with-block closes the bar even when streaming raises.
        with tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            disable=(not progress),
        ) as progress_bar:
            for chunk in response.iter_content(10 * 1024):
                buf.write(chunk)
                progress_bar.update(len(chunk))

    buf.seek(0)
    return buf
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from docling_core.types.doc import DocItemLabel
|
|
2
|
+
from PIL import Image, ImageDraw, ImageFont
|
|
3
|
+
from PIL.ImageFont import FreeTypeFont
|
|
4
|
+
|
|
5
|
+
from docling.datamodel.base_models import Cluster
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def draw_clusters(
    image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
) -> None:
    """
    Draw clusters on an image

    For every top-level cluster and each of its children, the cells are
    drawn first as translucent black rectangles, then the cluster box is
    drawn in the colour of its label, and finally a text tag with the label
    name and confidence is placed at the box's top-left corner.

    Args:
        image: Image to draw on; it is modified in place.
        clusters: Top-level clusters; their ``children`` are drawn as well.
        scale_x: Factor applied to all x coordinates.
        scale_y: Factor applied to all y coordinates.
    """
    draw = ImageDraw.Draw(image, "RGBA")
    # Create a smaller font for the labels
    font: ImageFont.ImageFont | FreeTypeFont
    try:
        font = ImageFont.truetype("arial.ttf", 12)
    except OSError:
        # Fallback to default font if arial is not available
        font = ImageFont.load_default()
    for c_tl in clusters:
        all_clusters = [c_tl, *c_tl.children]
        for c in all_clusters:
            # Draw cells first (underneath)
            cell_color = (0, 0, 0, 40)  # Transparent black for cells
            for tc in c.cells:
                cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
                cx0 *= scale_x
                cx1 *= scale_x
                # Both y coordinates use the vertical factor (the upper edge
                # was previously scaled with scale_x by mistake).
                cy0 *= scale_y
                cy1 *= scale_y

                draw.rectangle(
                    [(cx0, cy0), (cx1, cy1)],
                    outline=None,
                    fill=cell_color,
                )
            # Draw cluster rectangle
            x0, y0, x1, y1 = c.bbox.as_tuple()
            x0 *= scale_x
            x1 *= scale_x
            # Same fix as for the cells: y values scale with scale_y.
            y0 *= scale_y
            y1 *= scale_y

            # Normalise the corners so (x0, y0) is the upper-left one.
            if y1 <= y0:
                y1, y0 = y0, y1
            if x1 <= x0:
                x1, x0 = x0, x1

            # Look the label colour up once; only the alpha value differs
            # between fill and outline.
            label_color = list(DocItemLabel.get_color(c.label))
            cluster_fill_color = (*label_color, 70)
            cluster_outline_color = (*label_color, 255)
            draw.rectangle(
                [(x0, y0), (x1, y1)],
                outline=cluster_outline_color,
                fill=cluster_fill_color,
            )
            # Add label name and confidence
            label_text = f"{c.label.name} ({c.confidence:.2f})"
            # Create semi-transparent background for text
            text_bbox = draw.textbbox((x0, y0), label_text, font=font)
            text_bg_padding = 2
            draw.rectangle(
                [
                    (
                        text_bbox[0] - text_bg_padding,
                        text_bbox[1] - text_bg_padding,
                    ),
                    (
                        text_bbox[2] + text_bg_padding,
                        text_bbox[3] + text_bg_padding,
                    ),
                ],
                fill=(255, 255, 255, 180),  # Semi-transparent white
            )
            # Draw text
            draw.text(
                (x0, y0),
                label_text,
                fill=(0, 0, 0, 255),  # Solid black
                font=font,
            )
|