docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,168 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2
6
+ from docling.datamodel.pipeline_options import (
7
+ LayoutOptions,
8
+ granite_picture_description,
9
+ smolvlm_picture_description,
10
+ )
11
+ from docling.datamodel.settings import settings
12
+ from docling.datamodel.vlm_model_specs import (
13
+ GRANITEDOCLING_MLX,
14
+ GRANITEDOCLING_TRANSFORMERS,
15
+ SMOLDOCLING_MLX,
16
+ SMOLDOCLING_TRANSFORMERS,
17
+ )
18
+ from docling.models.stages.code_formula.code_formula_model import CodeFormulaModel
19
+ from docling.models.stages.layout.layout_model import LayoutModel
20
+ from docling.models.stages.ocr.easyocr_model import EasyOcrModel
21
+ from docling.models.stages.ocr.rapid_ocr_model import RapidOcrModel
22
+ from docling.models.stages.picture_classifier.document_picture_classifier import (
23
+ DocumentPictureClassifier,
24
+ DocumentPictureClassifierOptions,
25
+ )
26
+ from docling.models.stages.picture_description.picture_description_vlm_model import (
27
+ PictureDescriptionVlmModel,
28
+ )
29
+ from docling.models.stages.table_structure.table_structure_model import (
30
+ TableStructureModel,
31
+ )
32
+ from docling.models.utils.hf_model_download import download_hf_model
33
+
34
+ _log = logging.getLogger(__name__)
35
+
36
+
37
def download_models(
    output_dir: Optional[Path] = None,
    *,
    force: bool = False,
    progress: bool = False,
    with_layout: bool = True,
    with_tableformer: bool = True,
    with_code_formula: bool = True,
    with_picture_classifier: bool = True,
    with_smolvlm: bool = False,
    with_granitedocling: bool = False,
    with_granitedocling_mlx: bool = False,
    with_smoldocling: bool = False,
    with_smoldocling_mlx: bool = False,
    with_granite_vision: bool = False,
    with_rapidocr: bool = True,
    with_easyocr: bool = False,
):
    """Download the selected model artifacts below one directory.

    Each ``with_*`` flag switches one download on or off. Every download
    receives its own sub-folder of the target directory plus the ``force``
    and ``progress`` values given here.

    Args:
        output_dir: Directory that receives the models. When ``None``,
            ``settings.cache_dir / "models"`` is used. The directory (and
            missing parents) is created if necessary.
        force: Forwarded unchanged to every downloader.
        progress: Forwarded unchanged to every downloader.
        with_layout: Run ``LayoutModel.download_models``.
        with_tableformer: Run ``TableStructureModel.download_models``.
        with_code_formula: Run ``CodeFormulaModel.download_models``.
        with_picture_classifier: Run
            ``DocumentPictureClassifier.download_models`` with the repo id
            and revision of a default ``DocumentPictureClassifierOptions``.
        with_smolvlm: Fetch ``smolvlm_picture_description`` from its repo.
        with_granitedocling: Fetch ``GRANITEDOCLING_TRANSFORMERS``.
        with_granitedocling_mlx: Fetch ``GRANITEDOCLING_MLX``.
        with_smoldocling: Fetch ``SMOLDOCLING_TRANSFORMERS``.
        with_smoldocling_mlx: Fetch ``SMOLDOCLING_MLX``.
        with_granite_vision: Fetch ``granite_picture_description``.
        with_rapidocr: Run ``RapidOcrModel.download_models`` once for the
            ``"torch"`` backend and once for ``"onnxruntime"``.
        with_easyocr: Run ``EasyOcrModel.download_models``.

    Returns:
        The directory the downloads were directed to.
    """
    target_dir = settings.cache_dir / "models" if output_dir is None else output_dir

    # Make sure the folder exists
    target_dir.mkdir(parents=True, exist_ok=True)

    # Options every downloader receives verbatim.
    shared = {"force": force, "progress": progress}

    if with_layout:
        _log.info("Downloading layout model...")
        LayoutModel.download_models(
            local_dir=target_dir / LayoutOptions().model_spec.model_repo_folder,
            **shared,
        )

    if with_tableformer:
        _log.info("Downloading tableformer model...")
        TableStructureModel.download_models(
            local_dir=target_dir / TableStructureModel._model_repo_folder,
            **shared,
        )

    if with_picture_classifier:
        _log.info("Downloading picture classifier model...")
        classifier_options = DocumentPictureClassifierOptions()
        DocumentPictureClassifier.download_models(
            repo_id=classifier_options.repo_id,
            revision=classifier_options.revision,
            local_dir=target_dir / classifier_options.repo_cache_folder,
            **shared,
        )

    if with_code_formula:
        _log.info("Downloading code formula model...")
        CodeFormulaModel.download_models(
            local_dir=target_dir / CodeFormulaModel._model_repo_folder,
            **shared,
        )

    # Models that are fetched straight from their repository through
    # download_hf_model. The tuple order is the download order.
    hub_downloads = (
        (with_smolvlm, "Downloading SmolVlm model...", smolvlm_picture_description),
        (
            with_granitedocling,
            "Downloading GraniteDocling model...",
            GRANITEDOCLING_TRANSFORMERS,
        ),
        (
            with_granitedocling_mlx,
            "Downloading GraniteDocling MLX model...",
            GRANITEDOCLING_MLX,
        ),
        (
            with_smoldocling,
            "Downloading SmolDocling model...",
            SMOLDOCLING_TRANSFORMERS,
        ),
        (
            with_smoldocling_mlx,
            "Downloading SmolDocling MLX model...",
            SMOLDOCLING_MLX,
        ),
        (
            with_granite_vision,
            "Downloading Granite Vision model...",
            granite_picture_description,
        ),
    )
    for enabled, message, spec in hub_downloads:
        if not enabled:
            continue
        _log.info(message)
        download_hf_model(
            repo_id=spec.repo_id,
            local_dir=target_dir / spec.repo_cache_folder,
            **shared,
        )

    if with_rapidocr:
        for backend in ("torch", "onnxruntime"):
            _log.info(f"Downloading rapidocr {backend} models...")
            RapidOcrModel.download_models(
                backend=backend,
                local_dir=target_dir / RapidOcrModel._model_repo_folder,
                **shared,
            )

    if with_easyocr:
        _log.info("Downloading easyocr models...")
        EasyOcrModel.download_models(
            local_dir=target_dir / EasyOcrModel._model_repo_folder,
            **shared,
        )

    return target_dir
@@ -0,0 +1,69 @@
1
+ from typing import Optional, Tuple
2
+
3
+ from docling_core.types.doc import BoundingBox, CoordOrigin
4
+ from docling_core.types.doc.page import BoundingRectangle
5
+
6
+ from docling.utils.orientation import CLIPPED_ORIENTATIONS, rotate_bounding_box
7
+
8
+
9
def map_tesseract_script(script: str) -> str:
    """Replace a script name with its mapped name, when one is defined.

    ``Katakana`` and ``Hiragana`` map to ``Japanese``, ``Han`` maps to
    ``HanS`` and ``Korean`` maps to ``Hangul``. Any other value is returned
    unchanged.
    """
    if script in ("Katakana", "Hiragana"):
        return "Japanese"
    if script == "Han":
        return "HanS"
    if script == "Korean":
        return "Hangul"
    return script
18
+
19
+
20
def parse_tesseract_orientation(orientation: str) -> int:
    """Convert a Tesseract orientation string into a rectangle angle.

    Tesseract orientation is one of [0, 90, 180, 270] clockwise, while
    bounding rectangle angles are in [0, 360[ counterclockwise, so the value
    is negated and folded back into that range.

    Args:
        orientation: Text that ``int()`` can parse.

    Returns:
        The angle in the counterclockwise convention.

    Raises:
        ValueError: If the text is not an integer, or the integer is not
            listed in ``CLIPPED_ORIENTATIONS``.
    """
    clockwise = int(orientation)
    if clockwise in CLIPPED_ORIENTATIONS:
        # Negate to switch rotation direction; the modulo keeps the result
        # non-negative (e.g. 90 -> 270, 0 -> 0).
        return (-clockwise) % 360
    raise ValueError(
        f"invalid tesseract document orientation {orientation}, "
        f"expected orientation: {sorted(CLIPPED_ORIENTATIONS)}"
    )
33
+
34
+
35
def tesseract_box_to_bounding_rectangle(
    bbox: BoundingBox,
    *,
    original_offset: Optional[BoundingBox] = None,
    scale: float,
    orientation: int,
    im_size: Tuple[int, int],
) -> BoundingRectangle:
    """Rotate, rescale and optionally shift a box into a bounding rectangle.

    The box is first passed through ``rotate_bounding_box``, every corner is
    then divided by ``scale``, and finally the left/top of
    ``original_offset`` (if given) is added to every corner.

    Args:
        bbox: Box to convert.
        original_offset: Optional box whose ``l`` and ``t`` are added to the
            x and y coordinates of the result. Must use a top-left origin.
        scale: Divisor applied to every corner coordinate.
        orientation: Angle handed to ``rotate_bounding_box``.
        im_size: Image size handed to ``rotate_bounding_box``.

    Returns:
        The resulting rectangle, expressed with a top-left origin.

    Raises:
        ValueError: If ``original_offset`` does not use a top-left origin.
    """
    rotated = rotate_bounding_box(bbox, angle=orientation, im_size=im_size)
    rect = BoundingRectangle(
        r_x0=rotated.r_x0 / scale,
        r_y0=rotated.r_y0 / scale,
        r_x1=rotated.r_x1 / scale,
        r_y1=rotated.r_y1 / scale,
        r_x2=rotated.r_x2 / scale,
        r_y2=rotated.r_y2 / scale,
        r_x3=rotated.r_x3 / scale,
        r_y3=rotated.r_y3 / scale,
        coord_origin=CoordOrigin.TOPLEFT,
    )

    # Nothing to shift by: the scaled rectangle is the final answer.
    if original_offset is None:
        return rect

    if original_offset.coord_origin is not CoordOrigin.TOPLEFT:
        raise ValueError(
            f"expected coordinate origin to be {CoordOrigin.TOPLEFT.value}"
        )

    # Translate all four corners by the offset's left/top.
    rect.r_x0 += original_offset.l
    rect.r_x1 += original_offset.l
    rect.r_x2 += original_offset.l
    rect.r_x3 += original_offset.l
    rect.r_y0 += original_offset.t
    rect.r_y1 += original_offset.t
    rect.r_y2 += original_offset.t
    rect.r_y3 += original_offset.t
    return rect
@@ -0,0 +1,65 @@
1
+ from typing import Tuple
2
+
3
+ from docling_core.types.doc import BoundingBox, CoordOrigin
4
+ from docling_core.types.doc.page import BoundingRectangle
5
+
6
+ CLIPPED_ORIENTATIONS = [0, 90, 180, 270]
7
+
8
+
9
def rotate_bounding_box(
    bbox: BoundingBox, angle: int, im_size: Tuple[int, int]
) -> BoundingRectangle:
    """Build the bounding rectangle of a box for a quarter-turn rotation.

    The box is converted to a top-left origin and read as left, top, width
    and height. The rectangle starts with corner 0 at the bottom left,
    whatever the coordinate system, and the remaining corners follow
    counterclockwise.

    Args:
        bbox: Box to rotate.
        angle: Rotation in degrees; reduced modulo 360 before use.
        im_size: ``(width, height)`` of the image. The height is also used
            for the top-left origin conversion.

    Returns:
        The rotated rectangle, expressed with a top-left origin. For an
        angle of 0 this is ``BoundingRectangle.from_bounding_box`` of the
        converted box.

    Raises:
        ValueError: If the reduced angle is not 0, 90, 180 or 270.
    """
    bbox = bbox.to_top_left_origin(im_size[1])
    left, top = bbox.l, bbox.t
    width, height = bbox.width, bbox.height
    im_w, im_h = im_size
    angle = angle % 360

    if angle == 0:
        return BoundingRectangle.from_bounding_box(bbox)

    # Each branch fixes corner 0 and derives the next corners from it, one
    # (x, y) pair per line.
    if angle == 90:
        r_x0, r_y0 = top + height, im_w - left
        r_x1, r_y1 = r_x0, r_y0 - width
        r_x2, r_y2 = r_x1 - height, r_y1
        r_x3, r_y3 = r_x2, r_y0
    elif angle == 180:
        r_x0, r_y0 = im_w - left, im_h - (top + height)
        r_x1, r_y1 = r_x0 - width, r_y0
        r_x2, r_y2 = r_x1, r_y1 + height
        r_x3, r_y3 = r_x0, r_y2
    elif angle == 270:
        r_x0, r_y0 = im_h - (top + height), left
        r_x1, r_y1 = r_x0, r_y0 + width
        r_x2, r_y2 = r_x1 + height, r_y1
        r_x3, r_y3 = r_x2, r_y0
    else:
        raise ValueError(
            f"invalid orientation {angle}, expected values in:"
            f" {sorted(CLIPPED_ORIENTATIONS)}"
        )

    return BoundingRectangle(
        r_x0=r_x0,
        r_y0=r_y0,
        r_x1=r_x1,
        r_y1=r_y1,
        r_x2=r_x2,
        r_y2=r_y2,
        r_x3=r_x3,
        r_y3=r_y3,
        coord_origin=CoordOrigin.TOPLEFT,
    )
@@ -0,0 +1,65 @@
1
+ import time
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from typing import TYPE_CHECKING, List
5
+
6
+ import numpy as np
7
+ from pydantic import BaseModel
8
+
9
+ from docling.datamodel.settings import settings
10
+
11
+ if TYPE_CHECKING:
12
+ from docling.datamodel.document import ConversionResult
13
+
14
+
15
class ProfilingScope(str, Enum):
    """Level a profiling entry is attached to.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    # Entry recorded with the "page" scope.
    PAGE = "page"
    # Entry recorded with the "document" scope.
    DOCUMENT = "document"
18
+
19
+
20
class ProfilingItem(BaseModel):
    """Timing samples gathered under one profiling key.

    The statistics helpers below hand ``times`` straight to numpy and
    return whatever numpy computes for that list.
    """

    # Scope this entry was registered with.
    scope: ProfilingScope
    # Number of completed measurements.
    count: int = 0
    # Elapsed durations, one entry per measurement.
    times: List[float] = []
    # Wall-clock timestamp taken when each measurement started.
    start_timestamps: List[datetime] = []

    def total(self) -> float:
        """Return the sum of all recorded durations."""
        return np.sum(self.times)  # type: ignore

    def avg(self) -> float:
        """Return ``np.average`` of the recorded durations."""
        return np.average(self.times)  # type: ignore

    def std(self) -> float:
        """Return ``np.std`` of the recorded durations."""
        return np.std(self.times)  # type: ignore

    def mean(self) -> float:
        """Return ``np.mean`` of the recorded durations."""
        return np.mean(self.times)  # type: ignore

    def percentile(self, perc: float) -> float:
        """Return the ``perc``-th percentile of the recorded durations.

        Args:
            perc: Percentile to compute, passed to ``np.percentile``.
        """
        return np.percentile(self.times, perc)  # type: ignore
40
+
41
+
42
class TimeRecorder:
    """Context manager that records how long its ``with`` body takes.

    When ``settings.debug.profile_pipeline_timings`` is truthy at
    construction time, the elapsed monotonic time of the ``with`` body, the
    start timestamp and a run counter are stored in
    ``conv_res.timings[key]``. Otherwise entering and leaving the context
    does nothing.
    """

    def __init__(
        self,
        conv_res: "ConversionResult",
        key: str,
        scope: ProfilingScope = ProfilingScope.PAGE,
    ):
        """Prepare a recorder for one profiling key.

        Args:
            conv_res: Object whose ``timings`` mapping collects the results.
            key: Name of the entry inside ``conv_res.timings``.
            scope: Scope given to the entry if it has to be created.
        """
        # Read the switch once. Re-reading it in __enter__/__exit__ would
        # fail (missing attribute or missing timings entry) if the setting
        # changed after construction.
        self._enabled = bool(settings.debug.profile_pipeline_timings)
        self.conv_res = conv_res
        self.key = key
        if self._enabled and key not in conv_res.timings:
            conv_res.timings[key] = ProfilingItem(scope=scope)

    def __enter__(self):
        """Start the measurement and return the recorder itself."""
        if self._enabled:
            # Local import: only this method needs it.
            from datetime import timezone

            self.start = time.monotonic()
            # Same naive-UTC value datetime.utcnow() produced, without
            # calling the method deprecated since Python 3.12.
            started_at = datetime.now(timezone.utc).replace(tzinfo=None)
            self.conv_res.timings[self.key].start_timestamps.append(started_at)
        return self

    def __exit__(self, *args):
        """Store the elapsed time; exceptions from the body propagate."""
        if self._enabled:
            elapsed = time.monotonic() - self.start
            item = self.conv_res.timings[self.key]
            item.times.append(elapsed)
            item.count += 1
docling/utils/utils.py ADDED
@@ -0,0 +1,65 @@
1
+ import hashlib
2
+ from io import BytesIO
3
+ from itertools import islice
4
+ from pathlib import Path
5
+ from typing import List, Union
6
+
7
+ import requests
8
+ from tqdm import tqdm
9
+
10
+
11
def chunkify(iterator, chunk_size):
    """Yield successive lists of up to ``chunk_size`` items.

    Args:
        iterator: Any iterable (list, tuple, range, generator, ...).
        chunk_size: Maximum number of items per chunk; expected to be at
            least 1, since ``chunk_size - 1`` is handed to ``islice``.

    Yields:
        Lists holding consecutive items. Only the last list can be shorter
        than ``chunk_size``. Nothing is yielded for an empty input.
    """
    # Always work on a single shared iterator. Without this, a re-iterable
    # input that is not a list (tuple, range, ...) would make islice start
    # over from the first element for every chunk.
    items = iter(iterator)
    for first in items:  # Take the first element from the iterator
        yield [first, *islice(items, chunk_size - 1)]
17
+
18
+
19
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
    """Return the SHA-256 hex digest of a file's content.

    Args:
        path_or_stream: A ``Path`` that is opened in binary mode, or a
            ``BytesIO`` that is read from its current position to the end.

    Returns:
        The hexadecimal digest. An argument of any other type is not read
        at all, so the digest of empty input is returned for it.
    """
    block_size = 65536
    digest = hashlib.sha256(usedforsecurity=False)

    def _consume(binary_stream):
        # Feed the hasher block by block until read() returns no more bytes.
        while block := binary_stream.read(block_size):
            digest.update(block)

    if isinstance(path_or_stream, Path):
        with path_or_stream.open("rb") as handle:
            _consume(handle)
    elif isinstance(path_or_stream, BytesIO):
        _consume(path_or_stream)

    return digest.hexdigest()
38
+
39
+
40
def create_hash(string: str):
    """Return the SHA-256 hex digest of ``string`` encoded as UTF-8."""
    encoded = string.encode("utf-8")
    return hashlib.sha256(encoded, usedforsecurity=False).hexdigest()
45
+
46
+
47
def download_url_with_progress(
    url: str,
    progress: bool = False,
    *,
    timeout: Union[float, tuple, None] = None,
) -> BytesIO:
    """Download ``url`` into memory, optionally showing a progress bar.

    Args:
        url: Address to fetch with a streaming GET request; redirects are
            followed.
        progress: Show a tqdm progress bar when true.
        timeout: Passed to ``requests.get``. The default ``None`` keeps the
            previous behaviour of waiting without a limit.

    Returns:
        A ``BytesIO`` holding the response body, rewound to position 0.

    Raises:
        requests.RequestException: Propagated from ``requests`` on
            connection problems or timeouts.
    """
    buf = BytesIO()
    with requests.get(
        url, stream=True, allow_redirects=True, timeout=timeout
    ) as response:
        # NOTE(review): the HTTP status is not checked here, so the body of
        # an error response is returned like any other content.
        total_size = int(response.headers.get("content-length", 0))
        # The context manager closes the bar even if the transfer fails
        # half-way through.
        with tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            disable=(not progress),
        ) as progress_bar:
            for chunk in response.iter_content(10 * 1024):
                buf.write(chunk)
                progress_bar.update(len(chunk))

    buf.seek(0)
    return buf
@@ -0,0 +1,85 @@
1
+ from docling_core.types.doc import DocItemLabel
2
+ from PIL import Image, ImageDraw, ImageFont
3
+ from PIL.ImageFont import FreeTypeFont
4
+
5
+ from docling.datamodel.base_models import Cluster
6
+
7
+
8
def draw_clusters(
    image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
) -> None:
    """Draw clusters, their children and their cells onto ``image``.

    For every cluster and each of its children this draws, in order: the
    cells as translucent black boxes, the cluster box filled and outlined
    in the label colour, and a text tag with the label name and confidence
    on a translucent white background.

    Args:
        image: Image that is drawn on in place.
        clusters: Top-level clusters; each one's ``children`` are drawn too.
        scale_x: Factor applied to every x coordinate.
        scale_y: Factor applied to every y coordinate.
    """
    draw = ImageDraw.Draw(image, "RGBA")
    # Create a smaller font for the labels
    font: ImageFont.ImageFont | FreeTypeFont
    try:
        font = ImageFont.truetype("arial.ttf", 12)
    except OSError:
        # Fallback to default font if arial is not available
        font = ImageFont.load_default()

    cell_color = (0, 0, 0, 40)  # Transparent black for cells
    text_bg_padding = 2

    for c_tl in clusters:
        for c in [c_tl, *c_tl.children]:
            # Draw cells first (underneath)
            for tc in c.cells:
                cx0, cy0, cx1, cy1 = tc.rect.to_bounding_box().as_tuple()
                cx0 *= scale_x
                cx1 *= scale_x
                # Both y coordinates use the vertical factor.
                cy0 *= scale_y
                cy1 *= scale_y
                # Order the corners like the cluster box below, so that an
                # inverted box is still a valid rectangle for Pillow.
                cx0, cx1 = min(cx0, cx1), max(cx0, cx1)
                cy0, cy1 = min(cy0, cy1), max(cy0, cy1)

                draw.rectangle(
                    [(cx0, cy0), (cx1, cy1)],
                    outline=None,
                    fill=cell_color,
                )

            # Draw cluster rectangle
            x0, y0, x1, y1 = c.bbox.as_tuple()
            x0 *= scale_x
            x1 *= scale_x
            # Both y coordinates use the vertical factor.
            y0 *= scale_y
            y1 *= scale_y

            if y1 <= y0:
                y1, y0 = y0, y1
            if x1 <= x0:
                x1, x0 = x0, x1

            # Same label colour for fill and outline, different alpha.
            label_color = tuple(DocItemLabel.get_color(c.label))
            cluster_fill_color = (*label_color, 70)
            cluster_outline_color = (*label_color, 255)
            draw.rectangle(
                [(x0, y0), (x1, y1)],
                outline=cluster_outline_color,
                fill=cluster_fill_color,
            )

            # Add label name and confidence
            label_text = f"{c.label.name} ({c.confidence:.2f})"
            # Create semi-transparent background for text
            text_bbox = draw.textbbox((x0, y0), label_text, font=font)
            draw.rectangle(
                [
                    (
                        text_bbox[0] - text_bg_padding,
                        text_bbox[1] - text_bg_padding,
                    ),
                    (
                        text_bbox[2] + text_bg_padding,
                        text_bbox[3] + text_bg_padding,
                    ),
                ],
                fill=(255, 255, 255, 180),  # Semi-transparent white
            )
            # Draw text
            draw.text(
                (x0, y0),
                label_text,
                fill=(0, 0, 0, 255),  # Solid black
                font=font,
            )