docling 2.15.1__py3-none-any.whl → 2.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. docling/backend/abstract_backend.py +0 -1
  2. docling/backend/asciidoc_backend.py +0 -1
  3. docling/backend/docling_parse_backend.py +1 -1
  4. docling/backend/docling_parse_v2_backend.py +1 -1
  5. docling/backend/html_backend.py +4 -3
  6. docling/backend/json/__init__.py +0 -0
  7. docling/backend/json/docling_json_backend.py +58 -0
  8. docling/backend/md_backend.py +49 -36
  9. docling/backend/msexcel_backend.py +50 -38
  10. docling/backend/msword_backend.py +0 -1
  11. docling/backend/pdf_backend.py +0 -2
  12. docling/backend/pypdfium2_backend.py +1 -1
  13. docling/backend/xml/uspto_backend.py +25 -25
  14. docling/cli/main.py +18 -3
  15. docling/datamodel/base_models.py +30 -3
  16. docling/datamodel/document.py +4 -0
  17. docling/datamodel/pipeline_options.py +7 -9
  18. docling/document_converter.py +4 -0
  19. docling/models/base_model.py +62 -6
  20. docling/models/code_formula_model.py +245 -0
  21. docling/models/document_picture_classifier.py +187 -0
  22. docling/models/layout_model.py +10 -86
  23. docling/models/page_assemble_model.py +1 -33
  24. docling/models/rapid_ocr_model.py +1 -0
  25. docling/models/tesseract_ocr_cli_model.py +72 -5
  26. docling/models/tesseract_ocr_model.py +68 -20
  27. docling/pipeline/base_pipeline.py +40 -17
  28. docling/pipeline/standard_pdf_pipeline.py +31 -2
  29. docling/utils/glm_utils.py +4 -1
  30. docling/utils/ocr_utils.py +9 -0
  31. docling/utils/visualization.py +80 -0
  32. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/METADATA +17 -13
  33. docling-2.17.0.dist-info/RECORD +62 -0
  34. docling-2.15.1.dist-info/RECORD +0 -56
  35. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/LICENSE +0 -0
  36. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/WHEEL +0 -0
  37. {docling-2.15.1.dist-info → docling-2.17.0.dist-info}/entry_points.txt +0 -0
@@ -4,7 +4,7 @@ import logging
4
4
  import os
5
5
  import tempfile
6
6
  from subprocess import DEVNULL, PIPE, Popen
7
- from typing import Iterable, Optional, Tuple
7
+ from typing import Iterable, List, Optional, Tuple
8
8
 
9
9
  import pandas as pd
10
10
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -14,13 +14,13 @@ from docling.datamodel.document import ConversionResult
14
14
  from docling.datamodel.pipeline_options import TesseractCliOcrOptions
15
15
  from docling.datamodel.settings import settings
16
16
  from docling.models.base_ocr_model import BaseOcrModel
17
+ from docling.utils.ocr_utils import map_tesseract_script
17
18
  from docling.utils.profiling import TimeRecorder
18
19
 
19
20
  _log = logging.getLogger(__name__)
20
21
 
21
22
 
22
23
  class TesseractOcrCliModel(BaseOcrModel):
23
-
24
24
  def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
25
25
  super().__init__(enabled=enabled, options=options)
26
26
  self.options: TesseractCliOcrOptions
@@ -29,10 +29,13 @@ class TesseractOcrCliModel(BaseOcrModel):
29
29
 
30
30
  self._name: Optional[str] = None
31
31
  self._version: Optional[str] = None
32
+ self._tesseract_languages: Optional[List[str]] = None
33
+ self._script_prefix: Optional[str] = None
32
34
 
33
35
  if self.enabled:
34
36
  try:
35
37
  self._get_name_and_version()
38
+ self._set_languages_and_prefix()
36
39
 
37
40
  except Exception as exc:
38
41
  raise RuntimeError(
@@ -74,12 +77,20 @@ class TesseractOcrCliModel(BaseOcrModel):
74
77
  return name, version
75
78
 
76
79
  def _run_tesseract(self, ifilename: str):
77
-
80
+ r"""
81
+ Run tesseract CLI
82
+ """
78
83
  cmd = [self.options.tesseract_cmd]
79
84
 
80
- if self.options.lang is not None and len(self.options.lang) > 0:
85
+ if "auto" in self.options.lang:
86
+ lang = self._detect_language(ifilename)
87
+ if lang is not None:
88
+ cmd.append("-l")
89
+ cmd.append(lang)
90
+ elif self.options.lang is not None and len(self.options.lang) > 0:
81
91
  cmd.append("-l")
82
92
  cmd.append("+".join(self.options.lang))
93
+
83
94
  if self.options.path is not None:
84
95
  cmd.append("--tessdata-dir")
85
96
  cmd.append(self.options.path)
@@ -107,6 +118,63 @@ class TesseractOcrCliModel(BaseOcrModel):
107
118
 
108
119
  return df_filtered
109
120
 
121
def _detect_language(self, ifilename: str):
    r"""
    Run tesseract in PSM 0 mode to detect the language

    Executes ``<tesseract_cmd> --psm 0 -l osd <ifilename> stdout`` and looks
    for the ``Script`` entry among the ``key: value`` lines printed to stdout.
    The detected script is mapped with ``map_tesseract_script`` and prefixed
    with ``self._script_prefix`` to form the tesseract language name.

    Args:
        ifilename: Path of the image file handed to the tesseract CLI.

    Returns:
        The language name to pass to ``-l``, or ``None`` when no script was
        reported or the resulting language is not in
        ``self._tesseract_languages``.
    """
    assert self._tesseract_languages is not None

    cmd = [self.options.tesseract_cmd]
    cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
    _log.info("command: {}".format(" ".join(cmd)))
    proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
    output, _ = proc.communicate()
    decoded_data = output.decode("utf-8")

    # Parse the "key: value" lines by hand instead of through a CSV reader:
    # an empty stdout or a line with an unexpected number of ":" separators
    # must end up in the "no script detected" branch below rather than
    # raising a parser error.
    scripts = []
    for line in decoded_data.splitlines():
        key, sep, value = line.partition(":")
        if sep and key.strip() == "Script":
            scripts.append(value)

    if len(scripts) == 0:
        _log.warning("Tesseract cannot detect the script of the page")
        return None

    script = map_tesseract_script(scripts[0].strip())
    lang = f"{self._script_prefix}{script}"

    # Check if the detected language has been installed
    if lang not in self._tesseract_languages:
        msg = f"Tesseract detected the script '{script}' and language '{lang}'."
        msg += " However this language is not installed in your system and will be ignored."
        _log.warning(msg)
        return None

    _log.debug(
        f"Using tesseract model for the detected script '{script}' and language '{lang}'"
    )
    return lang
155
+
156
def _set_languages_and_prefix(self):
    r"""
    Query tesseract for its installed languages and derive the script prefix

    Runs ``<tesseract_cmd> --list-langs``, stores every entry after the first
    output row in ``self._tesseract_languages`` and sets
    ``self._script_prefix`` to ``"script/"`` when at least one entry starts
    with that prefix, otherwise to the empty string.
    """
    list_cmd = [self.options.tesseract_cmd, "--list-langs"]
    _log.info("command: {}".format(" ".join(list_cmd)))

    proc = Popen(list_cmd, stdout=PIPE, stderr=DEVNULL)
    stdout_bytes, _ = proc.communicate()
    listing = pd.read_csv(io.StringIO(stdout_bytes.decode("utf-8")), header=None)

    # The first row of the listing is dropped; the remaining rows are kept
    # as the installed language names.
    self._tesseract_languages = listing[0].tolist()[1:]

    # Script models are addressed as "script/<Name>" when the installation
    # lists them that way, and by their bare name otherwise.
    script_flags = [
        name.startswith("script/") for name in self._tesseract_languages
    ]
    self._script_prefix = "script/" if any(script_flags) else ""
177
+
110
178
  def __call__(
111
179
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
112
180
  ) -> Iterable[Page]:
@@ -121,7 +189,6 @@ class TesseractOcrCliModel(BaseOcrModel):
121
189
  yield page
122
190
  else:
123
191
  with TimeRecorder(conv_res, "ocr"):
124
-
125
192
  ocr_rects = self.get_ocr_rects(page)
126
193
 
127
194
  all_ocr_cells = []
@@ -8,6 +8,7 @@ from docling.datamodel.document import ConversionResult
8
8
  from docling.datamodel.pipeline_options import TesseractOcrOptions
9
9
  from docling.datamodel.settings import settings
10
10
  from docling.models.base_ocr_model import BaseOcrModel
11
+ from docling.utils.ocr_utils import map_tesseract_script
11
12
  from docling.utils.profiling import TimeRecorder
12
13
 
13
14
  _log = logging.getLogger(__name__)
@@ -20,6 +21,7 @@ class TesseractOcrModel(BaseOcrModel):
20
21
 
21
22
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
22
23
  self.reader = None
24
+ self.osd_reader = None
23
25
 
24
26
  if self.enabled:
25
27
  install_errmsg = (
@@ -47,27 +49,38 @@ class TesseractOcrModel(BaseOcrModel):
47
49
  except:
48
50
  raise ImportError(install_errmsg)
49
51
 
50
- _, tesserocr_languages = tesserocr.get_languages()
51
- if not tesserocr_languages:
52
+ _, self._tesserocr_languages = tesserocr.get_languages()
53
+ if not self._tesserocr_languages:
52
54
  raise ImportError(missing_langs_errmsg)
53
55
 
54
56
  # Initialize the tesseractAPI
55
57
  _log.debug("Initializing TesserOCR: %s", tesseract_version)
56
58
  lang = "+".join(self.options.lang)
59
+
60
+ self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
61
+
62
+ if any([l.startswith("script/") for l in self._tesserocr_languages]):
63
+ self.script_prefix = "script/"
64
+ else:
65
+ self.script_prefix = ""
66
+
67
+ tesserocr_kwargs = {
68
+ "psm": tesserocr.PSM.AUTO,
69
+ "init": True,
70
+ "oem": tesserocr.OEM.DEFAULT,
71
+ }
72
+
57
73
  if self.options.path is not None:
58
- self.reader = tesserocr.PyTessBaseAPI(
59
- path=self.options.path,
60
- lang=lang,
61
- psm=tesserocr.PSM.AUTO,
62
- init=True,
63
- oem=tesserocr.OEM.DEFAULT,
74
+ tesserocr_kwargs["path"] = self.options.path
75
+
76
+ if lang == "auto":
77
+ self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
78
+ self.osd_reader = tesserocr.PyTessBaseAPI(
79
+ **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
64
80
  )
65
81
  else:
66
82
  self.reader = tesserocr.PyTessBaseAPI(
67
- lang=lang,
68
- psm=tesserocr.PSM.AUTO,
69
- init=True,
70
- oem=tesserocr.OEM.DEFAULT,
83
+ **{"lang": lang} | tesserocr_kwargs,
71
84
  )
72
85
  self.reader_RIL = tesserocr.RIL
73
86
 
@@ -75,11 +88,12 @@ class TesseractOcrModel(BaseOcrModel):
75
88
  if self.reader is not None:
76
89
  # Finalize the tesseractAPI
77
90
  self.reader.End()
91
+ for script in self.script_readers:
92
+ self.script_readers[script].End()
78
93
 
79
94
  def __call__(
80
95
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
81
96
  ) -> Iterable[Page]:
82
-
83
97
  if not self.enabled:
84
98
  yield from page_batch
85
99
  return
@@ -90,8 +104,8 @@ class TesseractOcrModel(BaseOcrModel):
90
104
  yield page
91
105
  else:
92
106
  with TimeRecorder(conv_res, "ocr"):
93
-
94
107
  assert self.reader is not None
108
+ assert self._tesserocr_languages is not None
95
109
 
96
110
  ocr_rects = self.get_ocr_rects(page)
97
111
 
@@ -104,22 +118,56 @@ class TesseractOcrModel(BaseOcrModel):
104
118
  scale=self.scale, cropbox=ocr_rect
105
119
  )
106
120
 
107
- # Retrieve text snippets with their bounding boxes
108
- self.reader.SetImage(high_res_image)
109
- boxes = self.reader.GetComponentImages(
121
+ local_reader = self.reader
122
+ if "auto" in self.options.lang:
123
+ assert self.osd_reader is not None
124
+
125
+ self.osd_reader.SetImage(high_res_image)
126
+ osd = self.osd_reader.DetectOrientationScript()
127
+
128
+ # No text, probably
129
+ if osd is None:
130
+ continue
131
+
132
+ script = osd["script_name"]
133
+ script = map_tesseract_script(script)
134
+ lang = f"{self.script_prefix}{script}"
135
+
136
+ # Check if the detected language is present in the system
137
+ if lang not in self._tesserocr_languages:
138
+ msg = f"Tesseract detected the script '{script}' and language '{lang}'."
139
+ msg += " However this language is not installed in your system and will be ignored."
140
+ _log.warning(msg)
141
+ else:
142
+ if script not in self.script_readers:
143
+ import tesserocr
144
+
145
+ self.script_readers[script] = (
146
+ tesserocr.PyTessBaseAPI(
147
+ path=self.reader.GetDatapath(),
148
+ lang=lang,
149
+ psm=tesserocr.PSM.AUTO,
150
+ init=True,
151
+ oem=tesserocr.OEM.DEFAULT,
152
+ )
153
+ )
154
+ local_reader = self.script_readers[script]
155
+
156
+ local_reader.SetImage(high_res_image)
157
+ boxes = local_reader.GetComponentImages(
110
158
  self.reader_RIL.TEXTLINE, True
111
159
  )
112
160
 
113
161
  cells = []
114
162
  for ix, (im, box, _, _) in enumerate(boxes):
115
163
  # Set the area of interest. Tesseract uses Bottom-Left for the origin
116
- self.reader.SetRectangle(
164
+ local_reader.SetRectangle(
117
165
  box["x"], box["y"], box["w"], box["h"]
118
166
  )
119
167
 
120
168
  # Extract text within the bounding box
121
- text = self.reader.GetUTF8Text().strip()
122
- confidence = self.reader.MeanTextConf()
169
+ text = local_reader.GetUTF8Text().strip()
170
+ confidence = local_reader.MeanTextConf()
123
171
  left = box["x"] / self.scale
124
172
  bottom = box["y"] / self.scale
125
173
  right = (box["x"] + box["w"]) / self.scale
@@ -3,7 +3,7 @@ import logging
3
3
  import time
4
4
  import traceback
5
5
  from abc import ABC, abstractmethod
6
- from typing import Callable, Iterable, List
6
+ from typing import Any, Callable, Iterable, List
7
7
 
8
8
  from docling_core.types.doc import DoclingDocument, NodeItem
9
9
 
@@ -18,7 +18,7 @@ from docling.datamodel.base_models import (
18
18
  from docling.datamodel.document import ConversionResult, InputDocument
19
19
  from docling.datamodel.pipeline_options import PipelineOptions
20
20
  from docling.datamodel.settings import settings
21
- from docling.models.base_model import BaseEnrichmentModel
21
+ from docling.models.base_model import GenericEnrichmentModel
22
22
  from docling.utils.profiling import ProfilingScope, TimeRecorder
23
23
  from docling.utils.utils import chunkify
24
24
 
@@ -28,8 +28,9 @@ _log = logging.getLogger(__name__)
28
28
  class BasePipeline(ABC):
29
29
  def __init__(self, pipeline_options: PipelineOptions):
30
30
  self.pipeline_options = pipeline_options
31
+ self.keep_images = False
31
32
  self.build_pipe: List[Callable] = []
32
- self.enrichment_pipe: List[BaseEnrichmentModel] = []
33
+ self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
33
34
 
34
35
  def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
35
36
  conv_res = ConversionResult(input=in_doc)
@@ -40,7 +41,7 @@ class BasePipeline(ABC):
40
41
  conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT
41
42
  ):
42
43
  # These steps are building and assembling the structure of the
43
- # output DoclingDocument
44
+ # output DoclingDocument.
44
45
  conv_res = self._build_document(conv_res)
45
46
  conv_res = self._assemble_document(conv_res)
46
47
  # From this stage, all operations should rely only on conv_res.output
@@ -50,6 +51,8 @@ class BasePipeline(ABC):
50
51
  conv_res.status = ConversionStatus.FAILURE
51
52
  if raises_on_error:
52
53
  raise e
54
+ finally:
55
+ self._unload(conv_res)
53
56
 
54
57
  return conv_res
55
58
 
@@ -62,21 +65,22 @@ class BasePipeline(ABC):
62
65
 
63
66
  def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
64
67
 
65
- def _filter_elements(
66
- doc: DoclingDocument, model: BaseEnrichmentModel
68
+ def _prepare_elements(
69
+ conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
67
70
  ) -> Iterable[NodeItem]:
68
- for element, _level in doc.iterate_items():
69
- if model.is_processable(doc=doc, element=element):
70
- yield element
71
+ for doc_element, _level in conv_res.document.iterate_items():
72
+ prepared_element = model.prepare_element(
73
+ conv_res=conv_res, element=doc_element
74
+ )
75
+ if prepared_element is not None:
76
+ yield prepared_element
71
77
 
72
78
  with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT):
73
79
  for model in self.enrichment_pipe:
74
80
  for element_batch in chunkify(
75
- _filter_elements(conv_res.document, model),
81
+ _prepare_elements(conv_res, model),
76
82
  settings.perf.elements_batch_size,
77
83
  ):
78
- # TODO: currently we assume the element itself is modified, because
79
- # we don't have an interface to save the element back to the document
80
84
  for element in model(
81
85
  doc=conv_res.document, element_batch=element_batch
82
86
  ): # Must exhaust!
@@ -88,6 +92,9 @@ class BasePipeline(ABC):
88
92
  def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
89
93
  pass
90
94
 
95
def _unload(self, conv_res: ConversionResult) -> ConversionResult:
    """Release resources tied to ``conv_res`` after a pipeline run.

    ``execute`` calls this hook from its ``finally`` clause, so it runs on
    success and on failure alike. The base pipeline holds nothing to
    release; it hands ``conv_res`` back unchanged so that the hook has the
    same return contract as the ``PaginatedPipeline`` override.

    Args:
        conv_res: The conversion result of the run that just finished.

    Returns:
        The same ``conv_res`` object.
    """
    return conv_res
97
+
91
98
  @classmethod
92
99
  @abstractmethod
93
100
  def get_default_options(cls) -> PipelineOptions:
@@ -107,6 +114,10 @@ class BasePipeline(ABC):
107
114
 
108
115
  class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
109
116
 
117
def __init__(self, pipeline_options: PipelineOptions):
    """Initialise the base pipeline state and the page-backend retention flag."""
    super().__init__(pipeline_options=pipeline_options)
    # While this stays False, the page loop unloads each page backend as
    # soon as the page has been processed.
    self.keep_backend = False
120
+
110
121
  def _apply_on_pages(
111
122
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
112
123
  ) -> Iterable[Page]:
@@ -148,7 +159,14 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
148
159
  pipeline_pages = self._apply_on_pages(conv_res, init_pages)
149
160
 
150
161
  for p in pipeline_pages: # Must exhaust!
151
- pass
162
+
163
+ # Cleanup cached images
164
+ if not self.keep_images:
165
+ p._image_cache = {}
166
+
167
+ # Cleanup page backends
168
+ if not self.keep_backend and p._backend is not None:
169
+ p._backend.unload()
152
170
 
153
171
  end_batch_time = time.monotonic()
154
172
  total_elapsed_time += end_batch_time - start_batch_time
@@ -177,10 +195,15 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
177
195
  )
178
196
  raise e
179
197
 
180
- finally:
181
- # Always unload the PDF backend, even in case of failure
182
- if conv_res.input._backend:
183
- conv_res.input._backend.unload()
198
+ return conv_res
199
+
200
def _unload(self, conv_res: ConversionResult) -> ConversionResult:
    """Unload every page backend and then the input document backend.

    Args:
        conv_res: Conversion result whose pages and input document may still
            hold a backend.

    Returns:
        The same ``conv_res`` object.
    """
    # Page-level backends first.
    for page in conv_res.pages:
        page_backend = page._backend
        if page_backend is None:
            continue
        page_backend.unload()

    # Then the document-level backend, if one is attached.
    input_backend = conv_res.input._backend
    if input_backend:
        input_backend.unload()

    return conv_res
186
209
 
@@ -18,6 +18,11 @@ from docling.datamodel.pipeline_options import (
18
18
  TesseractOcrOptions,
19
19
  )
20
20
  from docling.models.base_ocr_model import BaseOcrModel
21
+ from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
22
+ from docling.models.document_picture_classifier import (
23
+ DocumentPictureClassifier,
24
+ DocumentPictureClassifierOptions,
25
+ )
21
26
  from docling.models.ds_glm_model import GlmModel, GlmOptions
22
27
  from docling.models.easyocr_model import EasyOcrModel
23
28
  from docling.models.layout_model import LayoutModel
@@ -50,7 +55,7 @@ class StandardPdfPipeline(PaginatedPipeline):
50
55
  else:
51
56
  self.artifacts_path = Path(pipeline_options.artifacts_path)
52
57
 
53
- keep_images = (
58
+ self.keep_images = (
54
59
  self.pipeline_options.generate_page_images
55
60
  or self.pipeline_options.generate_picture_images
56
61
  or self.pipeline_options.generate_table_images
@@ -87,13 +92,37 @@ class StandardPdfPipeline(PaginatedPipeline):
87
92
  accelerator_options=pipeline_options.accelerator_options,
88
93
  ),
89
94
  # Page assemble
90
- PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
95
+ PageAssembleModel(options=PageAssembleOptions()),
91
96
  ]
92
97
 
93
98
  self.enrichment_pipe = [
94
99
  # Other models working on `NodeItem` elements in the DoclingDocument
100
+ # Code Formula Enrichment Model
101
+ CodeFormulaModel(
102
+ enabled=pipeline_options.do_code_enrichment
103
+ or pipeline_options.do_formula_enrichment,
104
+ artifacts_path=pipeline_options.artifacts_path,
105
+ options=CodeFormulaModelOptions(
106
+ do_code_enrichment=pipeline_options.do_code_enrichment,
107
+ do_formula_enrichment=pipeline_options.do_formula_enrichment,
108
+ ),
109
+ accelerator_options=pipeline_options.accelerator_options,
110
+ ),
111
+ # Document Picture Classifier
112
+ DocumentPictureClassifier(
113
+ enabled=pipeline_options.do_picture_classification,
114
+ artifacts_path=pipeline_options.artifacts_path,
115
+ options=DocumentPictureClassifierOptions(),
116
+ accelerator_options=pipeline_options.accelerator_options,
117
+ ),
95
118
  ]
96
119
 
120
+ if (
121
+ self.pipeline_options.do_formula_enrichment
122
+ or self.pipeline_options.do_code_enrichment
123
+ ):
124
+ self.keep_backend = True
125
+
97
126
  @staticmethod
98
127
  def download_models_hf(
99
128
  local_dir: Optional[Path] = None, force: bool = False
@@ -270,7 +270,6 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
270
270
  container_el = doc.add_group(label=group_label)
271
271
 
272
272
  _add_child_elements(container_el, doc, obj, pelem)
273
-
274
273
  elif "text" in obj:
275
274
  text = obj["text"][span_i:span_j]
276
275
 
@@ -304,6 +303,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
304
303
  current_list = None
305
304
 
306
305
  doc.add_heading(text=text, prov=prov)
306
+ elif label == DocItemLabel.CODE:
307
+ current_list = None
308
+
309
+ doc.add_code(text=text, prov=prov)
307
310
  else:
308
311
  current_list = None
309
312
 
@@ -0,0 +1,9 @@
1
def map_tesseract_script(script: str) -> str:
    r"""
    Translate a detected script name into the name used for the language lookup

    ``Katakana`` and ``Hiragana`` become ``Japanese``, ``Han`` becomes
    ``HanS`` and ``Korean`` becomes ``Hangul``. Any other value is returned
    unchanged.
    """
    if script in ("Katakana", "Hiragana"):
        return "Japanese"
    if script == "Han":
        return "HanS"
    if script == "Korean":
        return "Hangul"
    return script
@@ -0,0 +1,80 @@
1
+ from docling_core.types.doc import DocItemLabel
2
+ from PIL import Image, ImageDraw, ImageFont
3
+ from PIL.ImageFont import FreeTypeFont
4
+
5
+ from docling.datamodel.base_models import Cluster
6
+
7
+
8
def draw_clusters(
    image: Image.Image, clusters: list[Cluster], scale_x: float, scale_y: float
) -> None:
    """
    Draw clusters on an image

    For every cluster and each of its children, the cells are painted as
    translucent rectangles first, then the cluster box is drawn on top with
    the colour of its label, followed by a text tag showing the label name
    and confidence. The image is modified in place.

    Args:
        image: Image to draw on.
        clusters: Top-level clusters; their ``children`` are drawn as well.
        scale_x: Factor applied to horizontal bounding-box coordinates.
        scale_y: Factor applied to vertical bounding-box coordinates.
    """
    draw = ImageDraw.Draw(image, "RGBA")
    # Create a smaller font for the labels
    font: ImageFont.ImageFont | FreeTypeFont
    try:
        font = ImageFont.truetype("arial.ttf", 12)
    except OSError:
        # Fallback to default font if arial is not available
        font = ImageFont.load_default()
    for c_tl in clusters:
        all_clusters = [c_tl, *c_tl.children]
        for c in all_clusters:
            # Draw cells first (underneath)
            cell_color = (0, 0, 0, 40)  # Transparent black for cells
            for tc in c.cells:
                cx0, cy0, cx1, cy1 = tc.bbox.as_tuple()
                cx0 *= scale_x
                cx1 *= scale_x
                # Both vertical coordinates use the vertical scale; scaling
                # the top edge with scale_x misplaces it when the two
                # factors differ.
                cy0 *= scale_y
                cy1 *= scale_y

                draw.rectangle(
                    [(cx0, cy0), (cx1, cy1)],
                    outline=None,
                    fill=cell_color,
                )
            # Draw cluster rectangle
            x0, y0, x1, y1 = c.bbox.as_tuple()
            x0 *= scale_x
            x1 *= scale_x
            y0 *= scale_y
            y1 *= scale_y

            label_color = list(DocItemLabel.get_color(c.label))
            cluster_fill_color = (*label_color, 70)
            cluster_outline_color = (*label_color, 255)
            draw.rectangle(
                [(x0, y0), (x1, y1)],
                outline=cluster_outline_color,
                fill=cluster_fill_color,
            )
            # Add label name and confidence
            label_text = f"{c.label.name} ({c.confidence:.2f})"
            # Create semi-transparent background for text
            text_bbox = draw.textbbox((x0, y0), label_text, font=font)
            text_bg_padding = 2
            draw.rectangle(
                [
                    (
                        text_bbox[0] - text_bg_padding,
                        text_bbox[1] - text_bg_padding,
                    ),
                    (
                        text_bbox[2] + text_bg_padding,
                        text_bbox[3] + text_bg_padding,
                    ),
                ],
                fill=(255, 255, 255, 180),  # Semi-transparent white
            )
            # Draw text
            draw.text(
                (x0, y0),
                label_text,
                fill=(0, 0, 0, 255),  # Solid black
                font=font,
            )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.15.1
3
+ Version: 2.17.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -26,9 +26,9 @@ Provides-Extra: tesserocr
26
26
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
27
27
  Requires-Dist: certifi (>=2024.7.4)
28
28
  Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
29
- Requires-Dist: docling-core[chunking] (>=2.13.1,<3.0.0)
30
- Requires-Dist: docling-ibm-models (>=3.1.0,<4.0.0)
31
- Requires-Dist: docling-parse (>=3.0.0,<4.0.0)
29
+ Requires-Dist: docling-core[chunking] (>=2.15.1,<3.0.0)
30
+ Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
31
+ Requires-Dist: docling-parse (>=3.1.0,<4.0.0)
32
32
  Requires-Dist: easyocr (>=1.7,<2.0)
33
33
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
34
34
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -39,6 +39,7 @@ Requires-Dist: onnxruntime (>=1.7.0,<1.20.0) ; (python_version < "3.10") and (ex
39
39
  Requires-Dist: onnxruntime (>=1.7.0,<2.0.0) ; (python_version >= "3.10") and (extra == "rapidocr")
40
40
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
41
41
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
42
+ Requires-Dist: pillow (>=10.0.0,<11.0.0)
42
43
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
43
44
  Requires-Dist: pydantic-settings (>=2.3.0,<3.0.0)
44
45
  Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
@@ -77,22 +78,21 @@ Description-Content-Type: text/markdown
77
78
  [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
78
79
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
79
80
 
80
- Docling parses documents and exports them to the desired format with ease and speed.
81
+ Docling simplifies document processing, parsing diverse formats including advanced PDF understanding and providing seamless integrations with the gen AI ecosystem.
81
82
 
82
83
  ## Features
83
84
 
84
- * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
85
- * 📑 Advanced PDF document understanding including page layout, reading order & table structures
86
- * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
87
- * 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
88
- * 🔍 OCR support for scanned PDFs
85
+ * 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
86
+ * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
87
+ * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
88
+ * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
89
+ * 🔒 Local execution capabilities for sensitive data and air-gapped environments
90
+ * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
91
+ * 🔍 Extensive OCR support for scanned PDFs and images
89
92
  * 💻 Simple and convenient CLI
90
93
 
91
- Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling!
92
-
93
94
  ### Coming soon
94
95
 
95
- * ♾️ Equation & code extraction
96
96
  * 📝 Metadata extraction, including title, authors, references & language
97
97
 
98
98
  ## Installation
@@ -176,3 +176,7 @@ For individual model usage, please refer to the model licenses found in the orig
176
176
 
177
177
  Docling has been brought to you by IBM.
178
178
 
179
+ [supported_formats]: https://ds4sd.github.io/docling/supported_formats/
180
+ [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
181
+ [integrations]: https://ds4sd.github.io/docling/integrations/
182
+