docling-2.15.0-py3-none-any.whl → docling-2.16.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. docling/backend/abstract_backend.py +0 -1
  2. docling/backend/asciidoc_backend.py +0 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/html_backend.py +1 -1
  6. docling/backend/json/__init__.py +0 -0
  7. docling/backend/json/docling_json_backend.py +58 -0
  8. docling/backend/md_backend.py +44 -27
  9. docling/backend/msexcel_backend.py +50 -38
  10. docling/backend/msword_backend.py +0 -1
  11. docling/backend/pdf_backend.py +0 -2
  12. docling/backend/pypdfium2_backend.py +2 -2
  13. docling/datamodel/base_models.py +30 -3
  14. docling/datamodel/document.py +2 -0
  15. docling/datamodel/pipeline_options.py +7 -10
  16. docling/document_converter.py +4 -0
  17. docling/models/base_model.py +62 -6
  18. docling/models/base_ocr_model.py +15 -12
  19. docling/models/code_formula_model.py +245 -0
  20. docling/models/document_picture_classifier.py +187 -0
  21. docling/models/layout_model.py +10 -86
  22. docling/models/page_assemble_model.py +1 -33
  23. docling/models/tesseract_ocr_cli_model.py +0 -1
  24. docling/models/tesseract_ocr_model.py +63 -15
  25. docling/pipeline/base_pipeline.py +40 -17
  26. docling/pipeline/standard_pdf_pipeline.py +31 -2
  27. docling/utils/glm_utils.py +4 -1
  28. docling/utils/visualization.py +80 -0
  29. {docling-2.15.0.dist-info → docling-2.16.0.dist-info}/METADATA +7 -7
  30. docling-2.16.0.dist-info/RECORD +61 -0
  31. docling-2.15.0.dist-info/RECORD +0 -56
  32. {docling-2.15.0.dist-info → docling-2.16.0.dist-info}/LICENSE +0 -0
  33. {docling-2.15.0.dist-info → docling-2.16.0.dist-info}/WHEEL +0 -0
  34. {docling-2.15.0.dist-info → docling-2.16.0.dist-info}/entry_points.txt +0 -0
@@ -1,28 +1,21 @@
1
1
  import copy
2
2
  import logging
3
- import random
4
- import time
5
3
  from pathlib import Path
6
- from typing import Iterable, List
4
+ from typing import Iterable
7
5
 
8
- from docling_core.types.doc import CoordOrigin, DocItemLabel
6
+ from docling_core.types.doc import DocItemLabel
9
7
  from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
10
- from PIL import Image, ImageDraw, ImageFont
11
-
12
- from docling.datamodel.base_models import (
13
- BoundingBox,
14
- Cell,
15
- Cluster,
16
- LayoutPrediction,
17
- Page,
18
- )
8
+ from PIL import Image
9
+
10
+ from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
19
11
  from docling.datamodel.document import ConversionResult
20
- from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
12
+ from docling.datamodel.pipeline_options import AcceleratorOptions
21
13
  from docling.datamodel.settings import settings
22
14
  from docling.models.base_model import BasePageModel
23
15
  from docling.utils.accelerator_utils import decide_device
24
16
  from docling.utils.layout_postprocessor import LayoutPostprocessor
25
17
  from docling.utils.profiling import TimeRecorder
18
+ from docling.utils.visualization import draw_clusters
26
19
 
27
20
  _log = logging.getLogger(__name__)
28
21
 
@@ -40,7 +33,7 @@ class LayoutModel(BasePageModel):
40
33
  DocItemLabel.PAGE_FOOTER,
41
34
  DocItemLabel.CODE,
42
35
  DocItemLabel.LIST_ITEM,
43
- # "Formula",
36
+ DocItemLabel.FORMULA,
44
37
  ]
45
38
  PAGE_HEADER_LABELS = [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]
46
39
 
@@ -82,78 +75,9 @@ class LayoutModel(BasePageModel):
82
75
  left_image = copy.deepcopy(page.image)
83
76
  right_image = copy.deepcopy(page.image)
84
77
 
85
- # Function to draw clusters on an image
86
- def draw_clusters(image, clusters):
87
- draw = ImageDraw.Draw(image, "RGBA")
88
- # Create a smaller font for the labels
89
- try:
90
- font = ImageFont.truetype("arial.ttf", 12)
91
- except OSError:
92
- # Fallback to default font if arial is not available
93
- font = ImageFont.load_default()
94
- for c_tl in clusters:
95
- all_clusters = [c_tl, *c_tl.children]
96
- for c in all_clusters:
97
- # Draw cells first (underneath)
98
- cell_color = (0, 0, 0, 40) # Transparent black for cells
99
- for tc in c.cells:
100
- cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
101
- cx0 *= scale_x
102
- cx1 *= scale_x
103
- cy0 *= scale_x
104
- cy1 *= scale_y
105
-
106
- draw.rectangle(
107
- [(cx0, cy0), (cx1, cy1)],
108
- outline=None,
109
- fill=cell_color,
110
- )
111
- # Draw cluster rectangle
112
- x0, y0, x1, y1 = c.bbox.as_tuple()
113
- x0 *= scale_x
114
- x1 *= scale_x
115
- y0 *= scale_x
116
- y1 *= scale_y
117
-
118
- cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
119
- cluster_outline_color = (
120
- *list(DocItemLabel.get_color(c.label)),
121
- 255,
122
- )
123
- draw.rectangle(
124
- [(x0, y0), (x1, y1)],
125
- outline=cluster_outline_color,
126
- fill=cluster_fill_color,
127
- )
128
- # Add label name and confidence
129
- label_text = f"{c.label.name} ({c.confidence:.2f})"
130
- # Create semi-transparent background for text
131
- text_bbox = draw.textbbox((x0, y0), label_text, font=font)
132
- text_bg_padding = 2
133
- draw.rectangle(
134
- [
135
- (
136
- text_bbox[0] - text_bg_padding,
137
- text_bbox[1] - text_bg_padding,
138
- ),
139
- (
140
- text_bbox[2] + text_bg_padding,
141
- text_bbox[3] + text_bg_padding,
142
- ),
143
- ],
144
- fill=(255, 255, 255, 180), # Semi-transparent white
145
- )
146
- # Draw text
147
- draw.text(
148
- (x0, y0),
149
- label_text,
150
- fill=(0, 0, 0, 255), # Solid black
151
- font=font,
152
- )
153
-
154
78
  # Draw clusters on both images
155
- draw_clusters(left_image, left_clusters)
156
- draw_clusters(right_image, right_clusters)
79
+ draw_clusters(left_image, left_clusters, scale_x, scale_y)
80
+ draw_clusters(right_image, right_clusters, scale_x, scale_y)
157
81
  # Combine the images side by side
158
82
  combined_width = left_image.width * 2
159
83
  combined_height = left_image.height
@@ -22,7 +22,7 @@ _log = logging.getLogger(__name__)
22
22
 
23
23
 
24
24
  class PageAssembleOptions(BaseModel):
25
- keep_images: bool = False
25
+ pass
26
26
 
27
27
 
28
28
  class PageAssembleModel(BasePageModel):
@@ -135,31 +135,6 @@ class PageAssembleModel(BasePageModel):
135
135
  )
136
136
  elements.append(fig)
137
137
  body.append(fig)
138
- elif cluster.label == LayoutModel.FORMULA_LABEL:
139
- equation = None
140
- if page.predictions.equations_prediction:
141
- equation = page.predictions.equations_prediction.equation_map.get(
142
- cluster.id, None
143
- )
144
- if (
145
- not equation
146
- ): # fallback: add empty formula, if it isn't present
147
- text = self.sanitize_text(
148
- [
149
- cell.text.replace("\x02", "-").strip()
150
- for cell in cluster.cells
151
- if len(cell.text.strip()) > 0
152
- ]
153
- )
154
- equation = TextElement(
155
- label=cluster.label,
156
- id=cluster.id,
157
- cluster=cluster,
158
- page_no=page.page_no,
159
- text=text,
160
- )
161
- elements.append(equation)
162
- body.append(equation)
163
138
  elif cluster.label in LayoutModel.CONTAINER_LABELS:
164
139
  container_el = ContainerElement(
165
140
  label=cluster.label,
@@ -174,11 +149,4 @@ class PageAssembleModel(BasePageModel):
174
149
  elements=elements, headers=headers, body=body
175
150
  )
176
151
 
177
- # Remove page images (can be disabled)
178
- if not self.options.keep_images:
179
- page._image_cache = {}
180
-
181
- # Unload backend
182
- page._backend.unload()
183
-
184
152
  yield page
@@ -20,7 +20,6 @@ _log = logging.getLogger(__name__)
20
20
 
21
21
 
22
22
  class TesseractOcrCliModel(BaseOcrModel):
23
-
24
23
  def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
25
24
  super().__init__(enabled=enabled, options=options)
26
25
  self.options: TesseractCliOcrOptions
@@ -54,43 +54,56 @@ class TesseractOcrModel(BaseOcrModel):
54
54
  # Initialize the tesseractAPI
55
55
  _log.debug("Initializing TesserOCR: %s", tesseract_version)
56
56
  lang = "+".join(self.options.lang)
57
+
58
+ self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
59
+
60
+ if any([l.startswith("script/") for l in tesserocr_languages]):
61
+ self.script_prefix = "script/"
62
+ else:
63
+ self.script_prefix = ""
64
+
65
+ tesserocr_kwargs = {
66
+ "psm": tesserocr.PSM.AUTO,
67
+ "init": True,
68
+ "oem": tesserocr.OEM.DEFAULT,
69
+ }
70
+
57
71
  if self.options.path is not None:
72
+ tesserocr_kwargs["path"] = self.options.path
73
+
74
+ if lang == "auto":
58
75
  self.reader = tesserocr.PyTessBaseAPI(
59
- path=self.options.path,
60
- lang=lang,
61
- psm=tesserocr.PSM.AUTO,
62
- init=True,
63
- oem=tesserocr.OEM.DEFAULT,
76
+ **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
64
77
  )
65
78
  else:
66
79
  self.reader = tesserocr.PyTessBaseAPI(
67
- lang=lang,
68
- psm=tesserocr.PSM.AUTO,
69
- init=True,
70
- oem=tesserocr.OEM.DEFAULT,
80
+ **{"lang": lang} | tesserocr_kwargs,
71
81
  )
82
+
72
83
  self.reader_RIL = tesserocr.RIL
73
84
 
74
85
  def __del__(self):
75
86
  if self.reader is not None:
76
87
  # Finalize the tesseractAPI
77
88
  self.reader.End()
89
+ for script in self.script_readers:
90
+ self.script_readers[script].End()
78
91
 
79
92
  def __call__(
80
93
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
81
94
  ) -> Iterable[Page]:
82
-
83
95
  if not self.enabled:
84
96
  yield from page_batch
85
97
  return
86
98
 
99
+ import tesserocr
100
+
87
101
  for page in page_batch:
88
102
  assert page._backend is not None
89
103
  if not page._backend.is_valid():
90
104
  yield page
91
105
  else:
92
106
  with TimeRecorder(conv_res, "ocr"):
93
-
94
107
  assert self.reader is not None
95
108
 
96
109
  ocr_rects = self.get_ocr_rects(page)
@@ -106,20 +119,55 @@ class TesseractOcrModel(BaseOcrModel):
106
119
 
107
120
  # Retrieve text snippets with their bounding boxes
108
121
  self.reader.SetImage(high_res_image)
109
- boxes = self.reader.GetComponentImages(
122
+
123
+ if self.options.lang == ["auto"]:
124
+ osd = self.reader.DetectOrientationScript()
125
+
126
+ # No text, probably
127
+ if osd is None:
128
+ continue
129
+
130
+ script = osd["script_name"]
131
+
132
+ if script == "Katakana" or script == "Hiragana":
133
+ script = "Japanese"
134
+ elif script == "Han":
135
+ script = "HanS"
136
+ elif script == "Korean":
137
+ script = "Hangul"
138
+
139
+ _log.debug(
140
+ f'Using model for the detected script "{script}"'
141
+ )
142
+
143
+ if script not in self.script_readers:
144
+ self.script_readers[script] = tesserocr.PyTessBaseAPI(
145
+ path=self.reader.GetDatapath(),
146
+ lang=f"{self.script_prefix}{script}",
147
+ psm=tesserocr.PSM.AUTO,
148
+ init=True,
149
+ oem=tesserocr.OEM.DEFAULT,
150
+ )
151
+
152
+ local_reader = self.script_readers[script]
153
+ local_reader.SetImage(high_res_image)
154
+ else:
155
+ local_reader = self.reader
156
+
157
+ boxes = local_reader.GetComponentImages(
110
158
  self.reader_RIL.TEXTLINE, True
111
159
  )
112
160
 
113
161
  cells = []
114
162
  for ix, (im, box, _, _) in enumerate(boxes):
115
163
  # Set the area of interest. Tesseract uses Bottom-Left for the origin
116
- self.reader.SetRectangle(
164
+ local_reader.SetRectangle(
117
165
  box["x"], box["y"], box["w"], box["h"]
118
166
  )
119
167
 
120
168
  # Extract text within the bounding box
121
- text = self.reader.GetUTF8Text().strip()
122
- confidence = self.reader.MeanTextConf()
169
+ text = local_reader.GetUTF8Text().strip()
170
+ confidence = local_reader.MeanTextConf()
123
171
  left = box["x"] / self.scale
124
172
  bottom = box["y"] / self.scale
125
173
  right = (box["x"] + box["w"]) / self.scale
@@ -3,7 +3,7 @@ import logging
3
3
  import time
4
4
  import traceback
5
5
  from abc import ABC, abstractmethod
6
- from typing import Callable, Iterable, List
6
+ from typing import Any, Callable, Iterable, List
7
7
 
8
8
  from docling_core.types.doc import DoclingDocument, NodeItem
9
9
 
@@ -18,7 +18,7 @@ from docling.datamodel.base_models import (
18
18
  from docling.datamodel.document import ConversionResult, InputDocument
19
19
  from docling.datamodel.pipeline_options import PipelineOptions
20
20
  from docling.datamodel.settings import settings
21
- from docling.models.base_model import BaseEnrichmentModel
21
+ from docling.models.base_model import GenericEnrichmentModel
22
22
  from docling.utils.profiling import ProfilingScope, TimeRecorder
23
23
  from docling.utils.utils import chunkify
24
24
 
@@ -28,8 +28,9 @@ _log = logging.getLogger(__name__)
28
28
  class BasePipeline(ABC):
29
29
  def __init__(self, pipeline_options: PipelineOptions):
30
30
  self.pipeline_options = pipeline_options
31
+ self.keep_images = False
31
32
  self.build_pipe: List[Callable] = []
32
- self.enrichment_pipe: List[BaseEnrichmentModel] = []
33
+ self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
33
34
 
34
35
  def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
35
36
  conv_res = ConversionResult(input=in_doc)
@@ -40,7 +41,7 @@ class BasePipeline(ABC):
40
41
  conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
41
42
  ):
42
43
  # These steps are building and assembling the structure of the
43
- # output DoclingDocument
44
+ # output DoclingDocument.
44
45
  conv_res = self._build_document(conv_res)
45
46
  conv_res = self._assemble_document(conv_res)
46
47
  # From this stage, all operations should rely only on conv_res.output
@@ -50,6 +51,8 @@ class BasePipeline(ABC):
50
51
  conv_res.status = ConversionStatus.FAILURE
51
52
  if raises_on_error:
52
53
  raise e
54
+ finally:
55
+ self._unload(conv_res)
53
56
 
54
57
  return conv_res
55
58
 
@@ -62,21 +65,22 @@ class BasePipeline(ABC):
62
65
 
63
66
  def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
64
67
 
65
- def _filter_elements(
66
- doc: DoclingDocument, model: BaseEnrichmentModel
68
+ def _prepare_elements(
69
+ conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
67
70
  ) -> Iterable[NodeItem]:
68
- for element, _level in doc.iterate_items():
69
- if model.is_processable(doc=doc, element=element):
70
- yield element
71
+ for doc_element, _level in conv_res.document.iterate_items():
72
+ prepared_element = model.prepare_element(
73
+ conv_res=conv_res, element=doc_element
74
+ )
75
+ if prepared_element is not None:
76
+ yield prepared_element
71
77
 
72
78
  with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
73
79
  for model in self.enrichment_pipe:
74
80
  for element_batch in chunkify(
75
- _filter_elements(conv_res.document, model),
81
+ _prepare_elements(conv_res, model),
76
82
  settings.perf.elements_batch_size,
77
83
  ):
78
- # TODO: currently we assume the element itself is modified, because
79
- # we don't have an interface to save the element back to the document
80
84
  for element in model(
81
85
  doc=conv_res.document, element_batch=element_batch
82
86
  ): # Must exhaust!
@@ -88,6 +92,9 @@ class BasePipeline(ABC):
88
92
  def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
89
93
  pass
90
94
 
95
+ def _unload(self, conv_res: ConversionResult):
96
+ pass
97
+
91
98
  @classmethod
92
99
  @abstractmethod
93
100
  def get_default_options(cls) -> PipelineOptions:
@@ -107,6 +114,10 @@ class BasePipeline(ABC):
107
114
 
108
115
  class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
109
116
 
117
+ def __init__(self, pipeline_options: PipelineOptions):
118
+ super().__init__(pipeline_options)
119
+ self.keep_backend = False
120
+
110
121
  def _apply_on_pages(
111
122
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
112
123
  ) -> Iterable[Page]:
@@ -148,7 +159,14 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
148
159
  pipeline_pages = self._apply_on_pages(conv_res, init_pages)
149
160
 
150
161
  for p in pipeline_pages: # Must exhaust!
151
- pass
162
+
163
+ # Cleanup cached images
164
+ if not self.keep_images:
165
+ p._image_cache = {}
166
+
167
+ # Cleanup page backends
168
+ if not self.keep_backend and p._backend is not None:
169
+ p._backend.unload()
152
170
 
153
171
  end_batch_time = time.monotonic()
154
172
  total_elapsed_time += end_batch_time - start_batch_time
@@ -177,10 +195,15 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
177
195
  )
178
196
  raise e
179
197
 
180
- finally:
181
- # Always unload the PDF backend, even in case of failure
182
- if conv_res.input._backend:
183
- conv_res.input._backend.unload()
198
+ return conv_res
199
+
200
+ def _unload(self, conv_res: ConversionResult) -> ConversionResult:
201
+ for page in conv_res.pages:
202
+ if page._backend is not None:
203
+ page._backend.unload()
204
+
205
+ if conv_res.input._backend:
206
+ conv_res.input._backend.unload()
184
207
 
185
208
  return conv_res
186
209
 
@@ -18,6 +18,11 @@ from docling.datamodel.pipeline_options import (
18
18
  TesseractOcrOptions,
19
19
  )
20
20
  from docling.models.base_ocr_model import BaseOcrModel
21
+ from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
22
+ from docling.models.document_picture_classifier import (
23
+ DocumentPictureClassifier,
24
+ DocumentPictureClassifierOptions,
25
+ )
21
26
  from docling.models.ds_glm_model import GlmModel, GlmOptions
22
27
  from docling.models.easyocr_model import EasyOcrModel
23
28
  from docling.models.layout_model import LayoutModel
@@ -50,7 +55,7 @@ class StandardPdfPipeline(PaginatedPipeline):
50
55
  else:
51
56
  self.artifacts_path = Path(pipeline_options.artifacts_path)
52
57
 
53
- keep_images = (
58
+ self.keep_images = (
54
59
  self.pipeline_options.generate_page_images
55
60
  or self.pipeline_options.generate_picture_images
56
61
  or self.pipeline_options.generate_table_images
@@ -87,13 +92,37 @@ class StandardPdfPipeline(PaginatedPipeline):
87
92
  accelerator_options=pipeline_options.accelerator_options,
88
93
  ),
89
94
  # Page assemble
90
- PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
95
+ PageAssembleModel(options=PageAssembleOptions()),
91
96
  ]
92
97
 
93
98
  self.enrichment_pipe = [
94
99
  # Other models working on `NodeItem` elements in the DoclingDocument
100
+ # Code Formula Enrichment Model
101
+ CodeFormulaModel(
102
+ enabled=pipeline_options.do_code_enrichment
103
+ or pipeline_options.do_formula_enrichment,
104
+ artifacts_path=pipeline_options.artifacts_path,
105
+ options=CodeFormulaModelOptions(
106
+ do_code_enrichment=pipeline_options.do_code_enrichment,
107
+ do_formula_enrichment=pipeline_options.do_formula_enrichment,
108
+ ),
109
+ accelerator_options=pipeline_options.accelerator_options,
110
+ ),
111
+ # Document Picture Classifier
112
+ DocumentPictureClassifier(
113
+ enabled=pipeline_options.do_picture_classification,
114
+ artifacts_path=pipeline_options.artifacts_path,
115
+ options=DocumentPictureClassifierOptions(),
116
+ accelerator_options=pipeline_options.accelerator_options,
117
+ ),
95
118
  ]
96
119
 
120
+ if (
121
+ self.pipeline_options.do_formula_enrichment
122
+ or self.pipeline_options.do_code_enrichment
123
+ ):
124
+ self.keep_backend = True
125
+
97
126
  @staticmethod
98
127
  def download_models_hf(
99
128
  local_dir: Optional[Path] = None, force: bool = False
@@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
270
270
  container_el = doc.add_group(label=group_label)
271
271
 
272
272
  _add_child_elements(container_el, doc, obj, pelem)
273
-
274
273
  elif "text" in obj:
275
274
  text = obj["text"][span_i:span_j]
276
275
 
@@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
304
303
  current_list = None
305
304
 
306
305
  doc.add_heading(text=text, prov=prov)
306
+ elif label == DocItemLabel.CODE:
307
+ current_list = None
308
+
309
+ doc.add_code(text=text, prov=prov)
307
310
  else:
308
311
  current_list = None
309
312
 
@@ -0,0 +1,80 @@
1
+ from docling_core.types.doc import DocItemLabel
2
+ from PIL import Image, ImageDraw, ImageFont
3
+ from PIL.ImageFont import FreeTypeFont
4
+
5
+ from docling.datamodel.base_models import Cluster
6
+
7
+
8
def draw_clusters(
    image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
) -> None:
    """
    Draw clusters, their direct children and their cells onto ``image``.

    For every top-level cluster and each of its children, the cells are
    painted first as translucent black rectangles, then the cluster box is
    drawn in the colour ``DocItemLabel.get_color`` returns for its label, and
    finally a ``"<LABEL> (<confidence>)"`` caption is written at the box's
    first corner on a translucent white background.

    Args:
        image: Image to draw on; the drawing is applied to this object and
            nothing is returned.
        clusters: Top-level clusters; each is drawn together with the
            clusters in its ``children`` attribute.
        scale_x: Factor applied to the x coordinates of every bounding box.
        scale_y: Factor applied to the y coordinates of every bounding box.
    """
    draw = ImageDraw.Draw(image, "RGBA")

    # Create a smaller font for the labels
    font: ImageFont.ImageFont | FreeTypeFont
    try:
        font = ImageFont.truetype("arial.ttf", 12)
    except OSError:
        # Fallback to default font if arial is not available
        font = ImageFont.load_default()

    def _scaled(bbox) -> tuple[float, float, float, float]:
        """Return the bbox corners with x scaled by scale_x and y by scale_y."""
        bx0, by0, bx1, by1 = bbox.as_tuple()
        # Both y coordinates use scale_y. Scaling the first y with scale_x
        # skews the box whenever the two factors differ.
        return bx0 * scale_x, by0 * scale_y, bx1 * scale_x, by1 * scale_y

    cell_color = (0, 0, 0, 40)  # Transparent black for cells
    text_bg_color = (255, 255, 255, 180)  # Semi-transparent white
    text_color = (0, 0, 0, 255)  # Solid black
    text_bg_padding = 2

    for c_tl in clusters:
        for c in [c_tl, *c_tl.children]:
            # Draw cells first (underneath)
            for tc in c.cells:
                cx0, cy0, cx1, cy1 = _scaled(tc.bbox)
                draw.rectangle(
                    [(cx0, cy0), (cx1, cy1)],
                    outline=None,
                    fill=cell_color,
                )

            # Draw cluster rectangle: translucent fill, opaque outline
            x0, y0, x1, y1 = _scaled(c.bbox)
            label_color = tuple(DocItemLabel.get_color(c.label))
            draw.rectangle(
                [(x0, y0), (x1, y1)],
                outline=(*label_color, 255),
                fill=(*label_color, 70),
            )

            # Add label name and confidence
            label_text = f"{c.label.name} ({c.confidence:.2f})"

            # Create semi-transparent background for text
            text_bbox = draw.textbbox((x0, y0), label_text, font=font)
            draw.rectangle(
                [
                    (
                        text_bbox[0] - text_bg_padding,
                        text_bbox[1] - text_bg_padding,
                    ),
                    (
                        text_bbox[2] + text_bg_padding,
                        text_bbox[3] + text_bg_padding,
                    ),
                ],
                fill=text_bg_color,
            )

            # Draw text
            draw.text(
                (x0, y0),
                label_text,
                fill=text_color,
                font=font,
            )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.15.0
3
+ Version: 2.16.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
30
- Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
31
- Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
30
+ Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
+ Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
34
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,13 +39,14 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
39
39
  Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
40
40
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
41
41
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
42
+ Requires-Dist: pillow (>=10.0.0,<11.0.0)
42
43
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
43
44
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
44
45
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
45
46
  Requires-Dist: python-docx (>=1.1.2,<2.0.0)
46
47
  Requires-Dist: python-pptx (>=1.0.2,<2.0.0)
47
48
  Requires-Dist: rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; (python_version < "3.13") and (extra == "rapidocr")
48
- Requires-Dist: requests (>=2.32.3,<3.0.0)
49
+ Requires-Dist: requests (>=2.32.2,<3.0.0)
49
50
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
50
51
  Requires-Dist: scipy (>=1.6.0,<2.0.0)
51
52
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
@@ -84,7 +85,7 @@ Docling parses documents and exports them to the desired format with ease and sp
84
85
  * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
85
86
  * 📑 Advanced PDF document understanding including page layout, reading order & table structures
86
87
  * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
87
- * 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
88
+ * 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
88
89
  * 🔍 OCR support for scanned PDFs
89
90
  * 💻 Simple and convenient CLI
90
91
 
@@ -94,7 +95,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
94
95
 
95
96
  * ♾️ Equation & code extraction
96
97
  * 📝 Metadata extraction, including title, authors, references & language
97
- * 🦜🔗 Native LangChain extension
98
98
 
99
99
  ## Installation
100
100