docling-2.29.0-py3-none-any.whl → docling-2.31.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60):
  1. docling/backend/asciidoc_backend.py +7 -15
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/docling_parse_v4_backend.py +3 -4
  6. docling/backend/docx/latex/latex_dict.py +0 -5
  7. docling/backend/docx/latex/omml.py +4 -7
  8. docling/backend/html_backend.py +26 -9
  9. docling/backend/md_backend.py +5 -7
  10. docling/backend/msexcel_backend.py +271 -95
  11. docling/backend/mspowerpoint_backend.py +4 -7
  12. docling/backend/msword_backend.py +23 -15
  13. docling/backend/pdf_backend.py +2 -1
  14. docling/backend/pypdfium2_backend.py +3 -3
  15. docling/backend/xml/jats_backend.py +10 -13
  16. docling/backend/xml/uspto_backend.py +15 -19
  17. docling/cli/main.py +27 -9
  18. docling/cli/models.py +2 -3
  19. docling/datamodel/base_models.py +40 -5
  20. docling/datamodel/document.py +18 -10
  21. docling/datamodel/pipeline_options.py +29 -4
  22. docling/document_converter.py +5 -5
  23. docling/models/api_vlm_model.py +66 -0
  24. docling/models/base_model.py +2 -4
  25. docling/models/base_ocr_model.py +2 -2
  26. docling/models/code_formula_model.py +2 -1
  27. docling/models/document_picture_classifier.py +2 -1
  28. docling/models/easyocr_model.py +10 -11
  29. docling/models/factories/__init__.py +2 -2
  30. docling/models/factories/base_factory.py +1 -1
  31. docling/models/hf_mlx_model.py +4 -6
  32. docling/models/hf_vlm_model.py +7 -5
  33. docling/models/layout_model.py +2 -2
  34. docling/models/ocr_mac_model.py +3 -4
  35. docling/models/page_assemble_model.py +7 -12
  36. docling/models/page_preprocessing_model.py +2 -1
  37. docling/models/picture_description_api_model.py +9 -75
  38. docling/models/picture_description_base_model.py +16 -5
  39. docling/models/picture_description_vlm_model.py +2 -3
  40. docling/models/rapid_ocr_model.py +2 -3
  41. docling/models/readingorder_model.py +8 -23
  42. docling/models/table_structure_model.py +2 -6
  43. docling/models/tesseract_ocr_cli_model.py +17 -16
  44. docling/models/tesseract_ocr_model.py +8 -6
  45. docling/pipeline/base_pipeline.py +4 -8
  46. docling/pipeline/simple_pipeline.py +0 -1
  47. docling/pipeline/standard_pdf_pipeline.py +6 -3
  48. docling/pipeline/vlm_pipeline.py +27 -20
  49. docling/utils/api_image_request.py +61 -0
  50. docling/utils/export.py +2 -4
  51. docling/utils/glm_utils.py +2 -2
  52. docling/utils/layout_postprocessor.py +4 -2
  53. docling/utils/model_downloader.py +7 -7
  54. docling/utils/utils.py +1 -1
  55. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
  56. docling-2.31.0.dist-info/RECORD +86 -0
  57. docling-2.29.0.dist-info/RECORD +0 -84
  58. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
  59. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
  60. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
@@ -1,12 +1,7 @@
1
- import copy
2
- import random
3
1
  from pathlib import Path
4
2
  from typing import Dict, List
5
3
 
6
4
  from docling_core.types.doc import (
7
- BoundingBox,
8
- CoordOrigin,
9
- DocItem,
10
5
  DocItemLabel,
11
6
  DoclingDocument,
12
7
  DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
17
12
  TableData,
18
13
  )
19
14
  from docling_core.types.doc.document import ContentLayer
20
- from docling_core.types.legacy_doc.base import Ref
21
- from docling_core.types.legacy_doc.document import BaseText
22
15
  from docling_ibm_models.reading_order.reading_order_rb import (
23
16
  PageElement as ReadingOrderPageElement,
17
+ ReadingOrderPredictor,
24
18
  )
25
- from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
26
- from PIL import ImageDraw
27
19
  from pydantic import BaseModel, ConfigDict
28
20
 
29
21
  from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
35
27
  TextElement,
36
28
  )
37
29
  from docling.datamodel.document import ConversionResult
38
- from docling.datamodel.settings import settings
39
30
  from docling.utils.profiling import ProfilingScope, TimeRecorder
40
31
 
41
32
 
@@ -53,12 +44,10 @@ class ReadingOrderModel:
53
44
  def _assembled_to_readingorder_elements(
54
45
  self, conv_res: ConversionResult
55
46
  ) -> List[ReadingOrderPageElement]:
56
-
57
47
  elements: List[ReadingOrderPageElement] = []
58
48
  page_no_to_pages = {p.page_no: p for p in conv_res.pages}
59
49
 
60
50
  for element in conv_res.assembled.elements:
61
-
62
51
  page_height = page_no_to_pages[element.page_no].size.height # type: ignore
63
52
  bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
64
53
  text = element.text or ""
@@ -84,7 +73,6 @@ class ReadingOrderModel:
84
73
  def _add_child_elements(
85
74
  self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
86
75
  ):
87
-
88
76
  child: Cluster
89
77
  for child in element.cluster.children:
90
78
  c_label = child.label
@@ -110,7 +98,7 @@ class ReadingOrderModel:
110
98
  else:
111
99
  doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
112
100
 
113
- def _readingorder_elements_to_docling_doc(
101
+ def _readingorder_elements_to_docling_doc( # noqa: C901
114
102
  self,
115
103
  conv_res: ConversionResult,
116
104
  ro_elements: List[ReadingOrderPageElement],
@@ -118,7 +106,6 @@ class ReadingOrderModel:
118
106
  el_to_footnotes_mapping: Dict[int, List[int]],
119
107
  el_merges_mapping: Dict[int, List[int]],
120
108
  ) -> DoclingDocument:
121
-
122
109
  id_to_elem = {
123
110
  RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
124
111
  for elem in conv_res.assembled.elements
@@ -192,7 +179,6 @@ class ReadingOrderModel:
192
179
 
193
180
  code_item.footnotes.append(new_footnote_item.get_ref())
194
181
  else:
195
-
196
182
  new_item, current_list = self._handle_text_element(
197
183
  element, out_doc, current_list, page_height
198
184
  )
@@ -206,7 +192,6 @@ class ReadingOrderModel:
206
192
  )
207
193
 
208
194
  elif isinstance(element, Table):
209
-
210
195
  tbl_data = TableData(
211
196
  num_rows=element.num_rows,
212
197
  num_cols=element.num_cols,
@@ -342,12 +327,12 @@ class ReadingOrderModel:
342
327
  return new_item, current_list
343
328
 
344
329
  def _merge_elements(self, element, merged_elem, new_item, page_height):
345
- assert isinstance(
346
- merged_elem, type(element)
347
- ), "Merged element must be of same type as element."
348
- assert (
349
- merged_elem.label == new_item.label
350
- ), "Labels of merged elements must match."
330
+ assert isinstance(merged_elem, type(element)), (
331
+ "Merged element must be of same type as element."
332
+ )
333
+ assert merged_elem.label == new_item.label, (
334
+ "Labels of merged elements must match."
335
+ )
351
336
  prov = ProvenanceItem(
352
337
  page_no=element.page_no + 1,
353
338
  charspan=(
@@ -1,13 +1,13 @@
1
1
  import copy
2
2
  import warnings
3
+ from collections.abc import Iterable
3
4
  from pathlib import Path
4
- from typing import Iterable, Optional, Union
5
+ from typing import Optional
5
6
 
6
7
  import numpy
7
8
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
8
9
  from docling_core.types.doc.page import (
9
10
  BoundingRectangle,
10
- SegmentedPdfPage,
11
11
  TextCellUnit,
12
12
  )
13
13
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
44
44
 
45
45
  self.enabled = enabled
46
46
  if self.enabled:
47
-
48
47
  if artifacts_path is None:
49
48
  artifacts_path = self.download_models() / self._model_path
50
49
  else:
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
175
174
  def __call__(
176
175
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
177
176
  ) -> Iterable[Page]:
178
-
179
177
  if not self.enabled:
180
178
  yield from page_batch
181
179
  return
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
186
184
  yield page
187
185
  else:
188
186
  with TimeRecorder(conv_res, "table_structure"):
189
-
190
187
  assert page.predictions.layout is not None
191
188
  assert page.size is not None
192
189
 
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
260
257
  table_out = tf_output[0]
261
258
  table_cells = []
262
259
  for element in table_out["tf_responses"]:
263
-
264
260
  if not self.do_cell_matching:
265
261
  the_bbox = BoundingBox.model_validate(
266
262
  element["bbox"]
@@ -3,9 +3,10 @@ import io
3
3
  import logging
4
4
  import os
5
5
  import tempfile
6
+ from collections.abc import Iterable
6
7
  from pathlib import Path
7
8
  from subprocess import DEVNULL, PIPE, Popen
8
- from typing import Iterable, List, Optional, Tuple, Type
9
+ from typing import List, Optional, Tuple, Type
9
10
 
10
11
  import pandas as pd
11
12
  from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
63
64
  )
64
65
 
65
66
  def _get_name_and_version(self) -> Tuple[str, str]:
66
-
67
- if self._name != None and self._version != None:
67
+ if self._name is not None and self._version is not None:
68
68
  return self._name, self._version # type: ignore
69
69
 
70
70
  cmd = [self.options.tesseract_cmd, "--version"]
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
125
125
  # _log.info(decoded_data)
126
126
 
127
127
  # Read the TSV file generated by Tesseract
128
- df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
128
+ df_result = pd.read_csv(
129
+ io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
130
+ )
129
131
 
130
132
  # Display the dataframe (optional)
131
133
  # _log.info("df: ", df.head())
132
134
 
133
135
  # Filter rows that contain actual text (ignore header or empty rows)
134
- df_filtered = df[
135
- df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
136
+ df_filtered = df_result[
137
+ df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
136
138
  ]
137
139
 
138
140
  return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
149
151
  proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
150
152
  output, _ = proc.communicate()
151
153
  decoded_data = output.decode("utf-8")
152
- df = pd.read_csv(
154
+ df_detected = pd.read_csv(
153
155
  io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
154
156
  )
155
- scripts = df.loc[df["key"] == "Script"].value.tolist()
157
+ scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
156
158
  if len(scripts) == 0:
157
159
  _log.warning("Tesseract cannot detect the script of the page")
158
160
  return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
183
185
  proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
184
186
  output, _ = proc.communicate()
185
187
  decoded_data = output.decode("utf-8")
186
- df = pd.read_csv(io.StringIO(decoded_data), header=None)
187
- self._tesseract_languages = df[0].tolist()[1:]
188
+ df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
189
+ self._tesseract_languages = df_list[0].tolist()[1:]
188
190
 
189
191
  # Decide the script prefix
190
- if any([l.startswith("script/") for l in self._tesseract_languages]):
192
+ if any(lang.startswith("script/") for lang in self._tesseract_languages):
191
193
  script_prefix = "script/"
192
194
  else:
193
195
  script_prefix = ""
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
197
199
  def __call__(
198
200
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
199
201
  ) -> Iterable[Page]:
200
-
201
202
  if not self.enabled:
202
203
  yield from page_batch
203
204
  return
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
225
226
  fname = image_file.name
226
227
  high_res_image.save(image_file)
227
228
 
228
- df = self._run_tesseract(fname)
229
+ df_result = self._run_tesseract(fname)
229
230
  finally:
230
231
  if os.path.exists(fname):
231
232
  os.remove(fname)
232
233
 
233
- # _log.info(df)
234
+ # _log.info(df_result)
234
235
 
235
236
  # Print relevant columns (bounding box and text)
236
- for ix, row in df.iterrows():
237
+ for ix, row in df_result.iterrows():
237
238
  text = row["text"]
238
239
  conf = row["conf"]
239
240
 
240
- l = float(row["left"])
241
+ l = float(row["left"]) # noqa: E741
241
242
  b = float(row["top"])
242
243
  w = float(row["width"])
243
244
  h = float(row["height"])
@@ -1,6 +1,7 @@
1
1
  import logging
2
+ from collections.abc import Iterable
2
3
  from pathlib import Path
3
- from typing import Iterable, Optional, Type
4
+ from typing import Optional, Type
4
5
 
5
6
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
7
  from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
37
38
  self.options: TesseractOcrOptions
38
39
 
39
40
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
40
- self.reader = None
41
- self.osd_reader = None
42
- self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
43
41
 
44
42
  if self.enabled:
45
43
  install_errmsg = (
@@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
64
62
  raise ImportError(install_errmsg)
65
63
  try:
66
64
  tesseract_version = tesserocr.tesseract_version()
67
- except:
65
+ except Exception:
68
66
  raise ImportError(install_errmsg)
69
67
 
70
68
  _, self._tesserocr_languages = tesserocr.get_languages()
@@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
75
73
  _log.debug("Initializing TesserOCR: %s", tesseract_version)
76
74
  lang = "+".join(self.options.lang)
77
75
 
78
- if any([l.startswith("script/") for l in self._tesserocr_languages]):
76
+ if any(lang.startswith("script/") for lang in self._tesserocr_languages):
79
77
  self.script_prefix = "script/"
80
78
  else:
81
79
  self.script_prefix = ""
@@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
86
84
  "oem": tesserocr.OEM.DEFAULT,
87
85
  }
88
86
 
87
+ self.reader = None
88
+ self.osd_reader = None
89
+ self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
90
+
89
91
  if self.options.path is not None:
90
92
  tesserocr_kwargs["path"] = self.options.path
91
93
 
@@ -3,9 +3,10 @@ import logging
3
3
  import time
4
4
  import traceback
5
5
  from abc import ABC, abstractmethod
6
- from typing import Any, Callable, Iterable, List
6
+ from collections.abc import Iterable
7
+ from typing import Any, Callable, List
7
8
 
8
- from docling_core.types.doc import DoclingDocument, NodeItem
9
+ from docling_core.types.doc import NodeItem
9
10
 
10
11
  from docling.backend.abstract_backend import AbstractDocumentBackend
11
12
  from docling.backend.pdf_backend import PdfDocumentBackend
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
64
65
  return conv_res
65
66
 
66
67
  def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
67
-
68
68
  def _prepare_elements(
69
69
  conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
70
70
  ) -> Iterable[NodeItem]:
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
113
113
 
114
114
 
115
115
  class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
116
-
117
116
  def __init__(self, pipeline_options: PipelineOptions):
118
117
  super().__init__(pipeline_options)
119
118
  self.keep_backend = False
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
127
126
  yield from page_batch
128
127
 
129
128
  def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
130
-
131
129
  if not isinstance(conv_res.input._backend, PdfDocumentBackend):
132
130
  raise RuntimeError(
133
131
  f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
139
137
 
140
138
  total_elapsed_time = 0.0
141
139
  with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
142
-
143
- for i in range(0, conv_res.input.page_count):
140
+ for i in range(conv_res.input.page_count):
144
141
  start_page, end_page = conv_res.input.limits.page_range
145
142
  if (start_page - 1) <= i <= (end_page - 1):
146
143
  conv_res.pages.append(Page(page_no=i))
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
161
158
  pipeline_pages = self._apply_on_pages(conv_res, init_pages)
162
159
 
163
160
  for p in pipeline_pages: # Must exhaust!
164
-
165
161
  # Cleanup cached images
166
162
  if not self.keep_images:
167
163
  p._image_cache = {}
@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
24
24
  super().__init__(pipeline_options)
25
25
 
26
26
  def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
27
-
28
27
  if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
29
28
  raise RuntimeError(
30
29
  f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
@@ -1,8 +1,7 @@
1
1
  import logging
2
- import sys
3
2
  import warnings
4
3
  from pathlib import Path
5
- from typing import Optional
4
+ from typing import Optional, cast
6
5
 
7
6
  from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
8
7
 
@@ -226,7 +225,11 @@ class StandardPdfPipeline(PaginatedPipeline):
226
225
  and self.pipeline_options.generate_table_images
227
226
  ):
228
227
  page_ix = element.prov[0].page_no - 1
229
- page = conv_res.pages[page_ix]
228
+ page = next(
229
+ (p for p in conv_res.pages if p.page_no == page_ix),
230
+ cast("Page", None),
231
+ )
232
+ assert page is not None
230
233
  assert page.size is not None
231
234
  assert page.image is not None
232
235
 
@@ -1,5 +1,4 @@
1
1
  import logging
2
- import warnings
3
2
  from io import BytesIO
4
3
  from pathlib import Path
5
4
  from typing import List, Optional, Union, cast
@@ -15,11 +14,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend
15
14
  from docling.datamodel.base_models import InputFormat, Page
16
15
  from docling.datamodel.document import ConversionResult, InputDocument
17
16
  from docling.datamodel.pipeline_options import (
17
+ ApiVlmOptions,
18
+ HuggingFaceVlmOptions,
18
19
  InferenceFramework,
19
20
  ResponseFormat,
20
21
  VlmPipelineOptions,
21
22
  )
22
23
  from docling.datamodel.settings import settings
24
+ from docling.models.api_vlm_model import ApiVlmModel
23
25
  from docling.models.hf_mlx_model import HuggingFaceMlxModel
24
26
  from docling.models.hf_vlm_model import HuggingFaceVlmModel
25
27
  from docling.pipeline.base_pipeline import PaginatedPipeline
@@ -29,7 +31,6 @@ _log = logging.getLogger(__name__)
29
31
 
30
32
 
31
33
  class VlmPipeline(PaginatedPipeline):
32
-
33
34
  def __init__(self, pipeline_options: VlmPipelineOptions):
34
35
  super().__init__(pipeline_options)
35
36
  self.keep_backend = True
@@ -57,27 +58,34 @@ class VlmPipeline(PaginatedPipeline):
57
58
 
58
59
  self.keep_images = self.pipeline_options.generate_page_images
59
60
 
60
- if (
61
- self.pipeline_options.vlm_options.inference_framework
62
- == InferenceFramework.MLX
63
- ):
61
+ if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
64
62
  self.build_pipe = [
65
- HuggingFaceMlxModel(
63
+ ApiVlmModel(
66
64
  enabled=True, # must be always enabled for this pipeline to make sense.
67
- artifacts_path=artifacts_path,
68
- accelerator_options=pipeline_options.accelerator_options,
69
- vlm_options=self.pipeline_options.vlm_options,
70
- ),
71
- ]
72
- else:
73
- self.build_pipe = [
74
- HuggingFaceVlmModel(
75
- enabled=True, # must be always enabled for this pipeline to make sense.
76
- artifacts_path=artifacts_path,
77
- accelerator_options=pipeline_options.accelerator_options,
78
- vlm_options=self.pipeline_options.vlm_options,
65
+ enable_remote_services=self.pipeline_options.enable_remote_services,
66
+ vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
79
67
  ),
80
68
  ]
69
+ elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
70
+ vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
71
+ if vlm_options.inference_framework == InferenceFramework.MLX:
72
+ self.build_pipe = [
73
+ HuggingFaceMlxModel(
74
+ enabled=True, # must be always enabled for this pipeline to make sense.
75
+ artifacts_path=artifacts_path,
76
+ accelerator_options=pipeline_options.accelerator_options,
77
+ vlm_options=vlm_options,
78
+ ),
79
+ ]
80
+ else:
81
+ self.build_pipe = [
82
+ HuggingFaceVlmModel(
83
+ enabled=True, # must be always enabled for this pipeline to make sense.
84
+ artifacts_path=artifacts_path,
85
+ accelerator_options=pipeline_options.accelerator_options,
86
+ vlm_options=vlm_options,
87
+ ),
88
+ ]
81
89
 
82
90
  self.enrichment_pipe = [
83
91
  # Other models working on `NodeItem` elements in the DoclingDocument
@@ -104,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
104
112
 
105
113
  def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
106
114
  with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
107
-
108
115
  if (
109
116
  self.pipeline_options.vlm_options.response_format
110
117
  == ResponseFormat.DOCTAGS
@@ -0,0 +1,61 @@
1
+ import base64
2
+ import logging
3
+ from io import BytesIO
4
+ from typing import Dict, Optional
5
+
6
+ import requests
7
+ from PIL import Image
8
+ from pydantic import AnyUrl
9
+
10
+ from docling.datamodel.base_models import OpenAiApiResponse
11
+
12
+ _log = logging.getLogger(__name__)
13
+
14
+
15
+ def api_image_request(
16
+ image: Image.Image,
17
+ prompt: str,
18
+ url: AnyUrl,
19
+ timeout: float = 20,
20
+ headers: Optional[Dict[str, str]] = None,
21
+ **params,
22
+ ) -> str:
23
+ img_io = BytesIO()
24
+ image.save(img_io, "PNG")
25
+ image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
26
+ messages = [
27
+ {
28
+ "role": "user",
29
+ "content": [
30
+ {
31
+ "type": "image_url",
32
+ "image_url": {"url": f"data:image/png;base64,{image_base64}"},
33
+ },
34
+ {
35
+ "type": "text",
36
+ "text": prompt,
37
+ },
38
+ ],
39
+ }
40
+ ]
41
+
42
+ payload = {
43
+ "messages": messages,
44
+ **params,
45
+ }
46
+
47
+ headers = headers or {}
48
+
49
+ r = requests.post(
50
+ str(url),
51
+ headers=headers,
52
+ json=payload,
53
+ timeout=timeout,
54
+ )
55
+ if not r.ok:
56
+ _log.error(f"Error calling the API. Response was {r.text}")
57
+ r.raise_for_status()
58
+
59
+ api_resp = OpenAiApiResponse.model_validate_json(r.text)
60
+ generated_text = api_resp.choices[0].message.content.strip()
61
+ return generated_text
docling/utils/export.py CHANGED
@@ -1,8 +1,8 @@
1
1
  import logging
2
- from typing import Any, Dict, Iterable, List, Tuple, Union
2
+ from collections.abc import Iterable
3
+ from typing import Any, Dict, List, Tuple, Union
3
4
 
4
5
  from docling_core.types.doc import BoundingBox, CoordOrigin
5
- from docling_core.types.doc.page import TextCell
6
6
  from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
7
7
 
8
8
  from docling.datamodel.document import ConversionResult, Page
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
13
13
  def generate_multimodal_pages(
14
14
  doc_result: ConversionResult,
15
15
  ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
16
-
17
16
  label_to_doclaynet = {
18
17
  "title": "title",
19
18
  "table-of-contents": "document_index",
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
122
121
  if doc.main_text is None:
123
122
  return
124
123
  for ix, orig_item in enumerate(doc.main_text):
125
-
126
124
  item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
127
125
  if item is None or item.prov is None or len(item.prov) == 0:
128
126
  _log.debug(f"Skipping item {orig_item}")
@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
29
29
 
30
30
  try:
31
31
  key = int(paths[0])
32
- except:
32
+ except Exception:
33
33
  key = paths[0]
34
34
 
35
35
  if len(paths) == 1:
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
67
67
  return unique_objects
68
68
 
69
69
 
70
- def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
70
+ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
71
71
  origin = DocumentOrigin(
72
72
  mimetype="application/pdf",
73
73
  filename=doc_glm["file-info"]["filename"],
@@ -18,7 +18,7 @@ class UnionFind:
18
18
 
19
19
  def __init__(self, elements):
20
20
  self.parent = {elem: elem for elem in elements}
21
- self.rank = {elem: 0 for elem in elements}
21
+ self.rank = dict.fromkeys(elements, 0)
22
22
 
23
23
  def find(self, x):
24
24
  if self.parent[x] != x:
@@ -484,7 +484,9 @@ class LayoutPostprocessor:
484
484
  spatial_index = (
485
485
  self.regular_index
486
486
  if cluster_type == "regular"
487
- else self.picture_index if cluster_type == "picture" else self.wrapper_index
487
+ else self.picture_index
488
+ if cluster_type == "picture"
489
+ else self.wrapper_index
488
490
  )
489
491
 
490
492
  # Map of currently valid clusters
@@ -37,7 +37,7 @@ def download_models(
37
37
  output_dir.mkdir(exist_ok=True, parents=True)
38
38
 
39
39
  if with_layout:
40
- _log.info(f"Downloading layout model...")
40
+ _log.info("Downloading layout model...")
41
41
  LayoutModel.download_models(
42
42
  local_dir=output_dir / LayoutModel._model_repo_folder,
43
43
  force=force,
@@ -45,7 +45,7 @@ def download_models(
45
45
  )
46
46
 
47
47
  if with_tableformer:
48
- _log.info(f"Downloading tableformer model...")
48
+ _log.info("Downloading tableformer model...")
49
49
  TableStructureModel.download_models(
50
50
  local_dir=output_dir / TableStructureModel._model_repo_folder,
51
51
  force=force,
@@ -53,7 +53,7 @@ def download_models(
53
53
  )
54
54
 
55
55
  if with_picture_classifier:
56
- _log.info(f"Downloading picture classifier model...")
56
+ _log.info("Downloading picture classifier model...")
57
57
  DocumentPictureClassifier.download_models(
58
58
  local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
59
59
  force=force,
@@ -61,7 +61,7 @@ def download_models(
61
61
  )
62
62
 
63
63
  if with_code_formula:
64
- _log.info(f"Downloading code formula model...")
64
+ _log.info("Downloading code formula model...")
65
65
  CodeFormulaModel.download_models(
66
66
  local_dir=output_dir / CodeFormulaModel._model_repo_folder,
67
67
  force=force,
@@ -69,7 +69,7 @@ def download_models(
69
69
  )
70
70
 
71
71
  if with_smolvlm:
72
- _log.info(f"Downloading SmolVlm model...")
72
+ _log.info("Downloading SmolVlm model...")
73
73
  PictureDescriptionVlmModel.download_models(
74
74
  repo_id=smolvlm_picture_description.repo_id,
75
75
  local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@@ -78,7 +78,7 @@ def download_models(
78
78
  )
79
79
 
80
80
  if with_granite_vision:
81
- _log.info(f"Downloading Granite Vision model...")
81
+ _log.info("Downloading Granite Vision model...")
82
82
  PictureDescriptionVlmModel.download_models(
83
83
  repo_id=granite_picture_description.repo_id,
84
84
  local_dir=output_dir / granite_picture_description.repo_cache_folder,
@@ -87,7 +87,7 @@ def download_models(
87
87
  )
88
88
 
89
89
  if with_easyocr:
90
- _log.info(f"Downloading easyocr models...")
90
+ _log.info("Downloading easyocr models...")
91
91
  EasyOcrModel.download_models(
92
92
  local_dir=output_dir / EasyOcrModel._model_repo_folder,
93
93
  force=force,
docling/utils/utils.py CHANGED
@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
13
13
  if isinstance(iterator, List):
14
14
  iterator = iter(iterator)
15
15
  for first in iterator: # Take the first element from the iterator
16
- yield [first] + list(islice(iterator, chunk_size - 1))
16
+ yield [first, *list(islice(iterator, chunk_size - 1))]
17
17
 
18
18
 
19
19
  def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str: