docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. docling/backend/asciidoc_backend.py +7 -15
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/docling_parse_v4_backend.py +3 -4
  6. docling/backend/docx/latex/latex_dict.py +0 -5
  7. docling/backend/docx/latex/omml.py +4 -7
  8. docling/backend/html_backend.py +26 -9
  9. docling/backend/md_backend.py +5 -7
  10. docling/backend/msexcel_backend.py +271 -95
  11. docling/backend/mspowerpoint_backend.py +4 -7
  12. docling/backend/msword_backend.py +23 -15
  13. docling/backend/pdf_backend.py +2 -1
  14. docling/backend/pypdfium2_backend.py +3 -3
  15. docling/backend/xml/jats_backend.py +10 -13
  16. docling/backend/xml/uspto_backend.py +15 -19
  17. docling/cli/main.py +27 -9
  18. docling/cli/models.py +2 -3
  19. docling/datamodel/base_models.py +40 -5
  20. docling/datamodel/document.py +18 -10
  21. docling/datamodel/pipeline_options.py +29 -4
  22. docling/document_converter.py +5 -5
  23. docling/models/api_vlm_model.py +66 -0
  24. docling/models/base_model.py +2 -4
  25. docling/models/base_ocr_model.py +2 -2
  26. docling/models/code_formula_model.py +2 -1
  27. docling/models/document_picture_classifier.py +2 -1
  28. docling/models/easyocr_model.py +10 -11
  29. docling/models/factories/__init__.py +2 -2
  30. docling/models/factories/base_factory.py +1 -1
  31. docling/models/hf_mlx_model.py +4 -6
  32. docling/models/hf_vlm_model.py +7 -5
  33. docling/models/layout_model.py +2 -2
  34. docling/models/ocr_mac_model.py +3 -4
  35. docling/models/page_assemble_model.py +7 -12
  36. docling/models/page_preprocessing_model.py +2 -1
  37. docling/models/picture_description_api_model.py +9 -75
  38. docling/models/picture_description_base_model.py +16 -5
  39. docling/models/picture_description_vlm_model.py +2 -3
  40. docling/models/rapid_ocr_model.py +2 -3
  41. docling/models/readingorder_model.py +8 -23
  42. docling/models/table_structure_model.py +2 -6
  43. docling/models/tesseract_ocr_cli_model.py +17 -16
  44. docling/models/tesseract_ocr_model.py +8 -6
  45. docling/pipeline/base_pipeline.py +4 -8
  46. docling/pipeline/simple_pipeline.py +0 -1
  47. docling/pipeline/standard_pdf_pipeline.py +6 -3
  48. docling/pipeline/vlm_pipeline.py +27 -20
  49. docling/utils/api_image_request.py +61 -0
  50. docling/utils/export.py +2 -4
  51. docling/utils/glm_utils.py +2 -2
  52. docling/utils/layout_postprocessor.py +4 -2
  53. docling/utils/model_downloader.py +7 -7
  54. docling/utils/utils.py +1 -1
  55. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
  56. docling-2.31.0.dist-info/RECORD +86 -0
  57. docling-2.29.0.dist-info/RECORD +0 -84
  58. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
  59. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
  60. {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
docling/backend/pypdfium2_backend.py CHANGED
@@ -1,8 +1,9 @@
  import logging
  import random
+ from collections.abc import Iterable
  from io import BytesIO
  from pathlib import Path
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union

  import pypdfium2 as pdfium
  import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
  self.valid = True # No better way to tell from pypdfium.
  try:
  self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
- except PdfiumError as e:
+ except PdfiumError:
  _log.info(
  f"An exception occurred when loading page {page_no} of document {document_hash}.",
  exc_info=True,
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
  def get_page_image(
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
  ) -> Image.Image:
-
  page_size = self.get_size()

  if not cropbox:
docling/backend/xml/jats_backend.py CHANGED
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

  doc_info: etree.DocInfo = self.tree.docinfo
  if doc_info.system_url and any(
- [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
+ kwd in doc_info.system_url for kwd in JATS_DTD_URL
  ):
  self.valid = True
  return
  for ent in doc_info.internalDTD.iterentities():
  if ent.system_url and any(
- [kwd in ent.system_url for kwd in JATS_DTD_URL]
+ kwd in ent.system_url for kwd in JATS_DTD_URL
  ):
  self.valid = True
  return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
  # TODO: once superscript is supported, add label with formatting
  aff = aff.removeprefix(f"{label[0].text}, ")
  affiliation_names.append(aff)
- affiliation_ids_names = {
- id: name
- for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
- }
+ affiliation_ids_names = dict(
+ zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+ )

  # Get author names and affiliation names
  for author_node in meta.xpath(
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
  def _add_abstract(
  self, doc: DoclingDocument, xml_components: XMLComponents
  ) -> None:
-
  for abstract in xml_components["abstract"]:
  text: str = abstract["content"]
  title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

  return

- def _parse_element_citation(self, node: etree._Element) -> str:
+ def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
  citation: Citation = {
  "author_names": "",
  "title": "",
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
  citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
  if len(node.xpath("lpage")) > 0:
  citation["page"] += (
- "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+ "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
  )

  # Flatten the citation to string
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

  try:
  self._add_table(doc, parent, table)
- except Exception as e:
- _log.warning(f"Skipping unsupported table in {str(self.file)}")
- pass
+ except Exception:
+ _log.warning(f"Skipping unsupported table in {self.file!s}")

  return

@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
  )
  return

- def _walk_linear(
+ def _walk_linear( # noqa: C901
  self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
  ) -> str:
  skip_tags = ["term"]
docling/backend/xml/uspto_backend.py CHANGED
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):

  @override
  def convert(self) -> DoclingDocument:
-
  if self.parser is not None:
  doc = self.parser.parse(self.patent_content)
  if doc is None:
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
  Returns:
  The patent parsed as a docling document.
  """
- pass


  class PatentUsptoIce(PatentUspto):
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
  self.style_html = HtmlEntity()

  @override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
  """Signal the start of an element.

  Args:
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
  self._start_registered_elements(tag, attributes)

  @override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
  """Receive notification of a skipped entity.

  HTML entities will be skipped by the parser. This method will unescape them
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
  self.text += unescaped

  @override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
  """Signal the end of an element.

  Args:
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
  self.style_html = HtmlEntity()

  @override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
  """Signal the start of an element.

  Args:
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
  self._start_registered_elements(tag, attributes)

  @override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
  """Receive notification of a skipped entity.

  HTML entities will be skipped by the parser. This method will unescape them
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
  self.text += unescaped

  @override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
  """Signal the end of an element.

  Args:
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
  if tag in [member.value for member in self.Element]:
  if (
  tag == self.Element.HEADING.value
- and not self.Element.SDOCL.value in self.property
+ and self.Element.SDOCL.value not in self.property
  ):
  level_attr: str = attributes.get("LVL", "")
  new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
  # headers except claims statement
  elif (
  self.Element.HEADING.value in self.property
- and not self.Element.SDOCL.value in self.property
+ and self.Element.SDOCL.value not in self.property
  and text.strip()
  ):
  self.parents[self.level + 1] = self.doc.add_heading(
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
  self.style_html = HtmlEntity()

  @override
- def startElement(self, tag, attributes): # noqa: N802
+ def startElement(self, tag, attributes):
  """Signal the start of an element.

  Args:
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
  self._start_registered_elements(tag, attributes)

  @override
- def skippedEntity(self, name): # noqa: N802
+ def skippedEntity(self, name):
  """Receive notification of a skipped entity.

  HTML entities will be skipped by the parser. This method will unescape them
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
  self.text += unescaped

  @override
- def endElement(self, tag): # noqa: N802
+ def endElement(self, tag):
  """Signal the end of an element.

  Args:
@@ -1474,9 +1472,7 @@ class XmlTable:
  if cw == 0:
  offset_w0.append(col["offset"][ic])

- min_colinfo["offset"] = sorted(
- list(set(col["offset"] + min_colinfo["offset"]))
- )
+ min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))

  # add back the 0 width cols to offset list
  offset_w0 = list(set(offset_w0))
@@ -1527,7 +1523,7 @@ class XmlTable:

  return ncols_max

- def _parse_table(self, table: Tag) -> TableData:
+ def _parse_table(self, table: Tag) -> TableData: # noqa: C901
  """Parse the content of a table tag.

  Args:
@@ -1722,7 +1718,7 @@ class HtmlEntity:
  "0": "⁰",
  "+": "⁺",
  "-": "⁻",
- "−": "⁻",
+ "−": "⁻", # noqa: RUF001
  "=": "⁼",
  "(": "⁽",
  ")": "⁾",
@@ -1746,7 +1742,7 @@ class HtmlEntity:
  "0": "₀",
  "+": "₊",
  "-": "₋",
- "−": "₋",
+ "−": "₋", # noqa: RUF001
  "=": "₌",
  "(": "₍",
  ")": "₎",
docling/cli/main.py CHANGED
@@ -6,14 +6,16 @@ import sys
  import tempfile
  import time
  import warnings
+ from collections.abc import Iterable
  from pathlib import Path
- from typing import Annotated, Dict, Iterable, List, Optional, Type
+ from typing import Annotated, Dict, List, Optional, Type

  import rich.table
  import typer
  from docling_core.types.doc import ImageRefMode
  from docling_core.utils.file import resolve_source_to_path
  from pydantic import TypeAdapter
+ from rich.console import Console

  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -40,6 +42,7 @@ from docling.datamodel.pipeline_options import (
  VlmModelType,
  VlmPipelineOptions,
  granite_vision_vlm_conversion_options,
+ granite_vision_vlm_ollama_conversion_options,
  smoldocling_vlm_conversion_options,
  smoldocling_vlm_mlx_conversion_options,
  )
@@ -52,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

  _log = logging.getLogger(__name__)
- from rich.console import Console

  console = Console()
  err_console = Console(stderr=True)
@@ -153,12 +155,12 @@ def export_documents(
  output_dir: Path,
  export_json: bool,
  export_html: bool,
+ export_html_split_page: bool,
  export_md: bool,
  export_txt: bool,
  export_doctags: bool,
  image_export_mode: ImageRefMode,
  ):
-
  success_count = 0
  failure_count = 0

@@ -180,7 +182,15 @@ def export_documents(
  fname = output_dir / f"{doc_filename}.html"
  _log.info(f"writing HTML output to {fname}")
  conv_res.document.save_as_html(
- filename=fname, image_mode=image_export_mode
+ filename=fname, image_mode=image_export_mode, split_page_view=False
+ )
+
+ # Export HTML format:
+ if export_html_split_page:
+ fname = output_dir / f"{doc_filename}.html"
+ _log.info(f"writing HTML output to {fname}")
+ conv_res.document.save_as_html(
+ filename=fname, image_mode=image_export_mode, split_page_view=True
  )

  # Export Text format:
@@ -223,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:


  @app.command(no_args_is_help=True)
- def convert(
+ def convert( # noqa: C901
  input_sources: Annotated[
  List[str],
  typer.Argument(
@@ -279,7 +289,7 @@ def convert(
  ...,
  help=(
  f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
- f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+ f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
  f"Use the option --show-external-plugins to see the options allowed with external plugins."
  ),
  ),
@@ -411,7 +421,7 @@ def convert(
  logging.basicConfig(level=logging.WARNING)
  elif verbose == 1:
  logging.basicConfig(level=logging.INFO)
- elif verbose == 2:
+ else:
  logging.basicConfig(level=logging.DEBUG)

  settings.debug.visualize_cells = debug_visualize_cells
@@ -420,7 +430,7 @@ def convert(
  settings.debug.visualize_ocr = debug_visualize_ocr

  if from_formats is None:
- from_formats = [e for e in InputFormat]
+ from_formats = list(InputFormat)

  parsed_headers: Optional[Dict[str, str]] = None
  if headers is not None:
@@ -471,6 +481,7 @@ def convert(

  export_json = OutputFormat.JSON in to_formats
  export_html = OutputFormat.HTML in to_formats
+ export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
  export_md = OutputFormat.MARKDOWN in to_formats
  export_txt = OutputFormat.TEXT in to_formats
  export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -531,10 +542,16 @@ def convert(
  backend=backend, # pdf_backend
  )
  elif pipeline == PdfPipeline.VLM:
- pipeline_options = VlmPipelineOptions()
+ pipeline_options = VlmPipelineOptions(
+ enable_remote_services=enable_remote_services,
+ )

  if vlm_model == VlmModelType.GRANITE_VISION:
  pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+ elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
+ pipeline_options.vlm_options = (
+ granite_vision_vlm_ollama_conversion_options
+ )
  elif vlm_model == VlmModelType.SMOLDOCLING:
  pipeline_options.vlm_options = smoldocling_vlm_conversion_options
  if sys.platform == "darwin":
@@ -578,6 +595,7 @@ def convert(
  output_dir=output,
  export_json=export_json,
  export_html=export_html,
+ export_html_split_page=export_html_split_page,
  export_md=export_md,
  export_txt=export_txt,
  export_doctags=export_doctags,
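
For reference, the split-page HTML export wired in above comes down to the new split_page_view keyword on save_as_html, which the CLI selects via the new html_split_page output format value. A minimal programmatic sketch, not code from the package (the input and output paths are made up for illustration):

    from pathlib import Path

    from docling.document_converter import DocumentConverter

    # Convert a (hypothetical) PDF and write both the regular and the new
    # split-page HTML views of the resulting DoclingDocument.
    conv_res = DocumentConverter().convert("report.pdf")
    conv_res.document.save_as_html(filename=Path("report.html"), split_page_view=False)
    conv_res.document.save_as_html(filename=Path("report.split.html"), split_page_view=True)
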
docling/cli/models.py CHANGED
@@ -62,7 +62,7 @@ def download(
  models: Annotated[
  Optional[list[_AvailableModels]],
  typer.Argument(
- help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+ help="Models to download (default behavior: a predefined set of models will be downloaded).",
  ),
  ] = None,
  all: Annotated[
@@ -89,14 +89,13 @@ def download(
  "Cannot simultaneously set 'all' parameter and specify models to download."
  )
  if not quiet:
- FORMAT = "%(message)s"
  logging.basicConfig(
  level=logging.INFO,
  format="[blue]%(message)s[/blue]",
  datefmt="[%X]",
  handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
  )
- to_download = models or ([m for m in _AvailableModels] if all else _default_models)
+ to_download = models or (list(_AvailableModels) if all else _default_models)
  output_dir = download_models(
  output_dir=output_dir,
  force=force,
docling/datamodel/base_models.py CHANGED
@@ -10,7 +10,9 @@ from docling_core.types.doc import (
  TableCell,
  )
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell
- from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
+
+ # DO NOT REMOVE; explicitly exposed from this location
+ from docling_core.types.io import (
  DocumentStream,
  )
  from PIL.Image import Image
@@ -50,6 +52,7 @@ class OutputFormat(str, Enum):
  MARKDOWN = "md"
  JSON = "json"
  HTML = "html"
+ HTML_SPLIT_PAGE = "html_split_page"
  TEXT = "text"
  DOCTAGS = "doctags"

@@ -232,9 +235,9 @@ class Page(BaseModel):
  None # Internal PDF backend. By default it is cleared during assembling.
  )
  _default_image_scale: float = 1.0 # Default image scale for external usage.
- _image_cache: Dict[float, Image] = (
- {}
- ) # Cache of images in different scales. By default it is cleared during assembling.
+ _image_cache: Dict[
+ float, Image
+ ] = {} # Cache of images in different scales. By default it is cleared during assembling.

  def get_image(
  self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
@@ -242,7 +245,7 @@ class Page(BaseModel):
  if self._backend is None:
  return self._image_cache.get(scale, None)

- if not scale in self._image_cache:
+ if scale not in self._image_cache:
  if cropbox is None:
  self._image_cache[scale] = self._backend.get_page_image(scale=scale)
  else:
@@ -262,3 +265,35 @@ class Page(BaseModel):
  @property
  def image(self) -> Optional[Image]:
  return self.get_image(scale=self._default_image_scale)
+
+
+ ## OpenAI API Request / Response Models ##
+
+
+ class OpenAiChatMessage(BaseModel):
+ role: str
+ content: str
+
+
+ class OpenAiResponseChoice(BaseModel):
+ index: int
+ message: OpenAiChatMessage
+ finish_reason: str
+
+
+ class OpenAiResponseUsage(BaseModel):
+ prompt_tokens: int
+ completion_tokens: int
+ total_tokens: int
+
+
+ class OpenAiApiResponse(BaseModel):
+ model_config = ConfigDict(
+ protected_namespaces=(),
+ )
+
+ id: str
+ model: Optional[str] = None # returned by openai
+ choices: List[OpenAiResponseChoice]
+ created: int
+ usage: OpenAiResponseUsage
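
The OpenAI-style response models added above are plain pydantic models, so an OpenAI-compatible chat-completions payload can be validated directly. A minimal sketch (the payload values are invented for illustration):

    from docling.datamodel.base_models import OpenAiApiResponse

    # Validate an OpenAI-compatible /v1/chat/completions response payload
    # and pull out the generated text of the first choice.
    payload = {
        "id": "chatcmpl-123",
        "model": "granite3.2-vision:2b",
        "created": 1714000000,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": "# Page 1\n..."},
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
    }
    response = OpenAiApiResponse.model_validate(payload)
    print(response.choices[0].message.content)
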
docling/datamodel/document.py CHANGED
@@ -1,13 +1,13 @@
  import csv
  import logging
  import re
+ from collections.abc import Iterable
  from enum import Enum
  from io import BytesIO
  from pathlib import Path, PurePath
  from typing import (
  TYPE_CHECKING,
  Dict,
- Iterable,
  List,
  Literal,
  Optional,
@@ -17,6 +17,8 @@ from typing import (
  )

  import filetype
+
+ # DO NOT REMOVE; explicitly exposed from this location
  from docling_core.types.doc import (
  DocItem,
  DocItemLabel,
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
  PageReference,
  Prov,
  Ref,
+ Table as DsSchemaTable,
+ TableCell,
  )
- from docling_core.types.legacy_doc.base import Table as DsSchemaTable
- from docling_core.types.legacy_doc.base import TableCell
  from docling_core.types.legacy_doc.document import (
  CCSDocumentDescription as DsDocumentDescription,
+ CCSFileInfoObject as DsFileInfoObject,
+ ExportedCCSDocument as DsDocument,
  )
- from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
- from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
  from docling_core.utils.file import resolve_source_to_stream
  from docling_core.utils.legacy import docling_document_to_legacy
  from pydantic import BaseModel
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
  )
  from docling.datamodel.settings import DocumentLimits
  from docling.utils.profiling import ProfilingItem
- from docling.utils.utils import create_file_hash, create_hash
+ from docling.utils.utils import create_file_hash

  if TYPE_CHECKING:
  from docling.document_converter import FormatOption
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
  self._init_doc(backend, path_or_stream)

  elif isinstance(path_or_stream, BytesIO):
- assert (
- filename is not None
- ), "Can't construct InputDocument from stream without providing filename arg."
+ assert filename is not None, (
+ "Can't construct InputDocument from stream without providing filename arg."
+ )
  self.file = PurePath(filename)
  self.filesize = path_or_stream.getbuffer().nbytes

@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):


  class _DocumentConversionInput(BaseModel):
-
  path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
  headers: Optional[Dict[str, str]] = None
  limits: Optional[DocumentLimits] = DocumentLimits()
@@ -283,6 +284,13 @@ class _DocumentConversionInput(BaseModel):
  if mime is None: # must guess from
  with obj.open("rb") as f:
  content = f.read(1024) # Read first 1KB
+ if mime is not None and mime.lower() == "application/zip":
+ if obj.suffixes[-1].lower() == ".xlsx":
+ mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+ elif obj.suffixes[-1].lower() == ".docx":
+ mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+ elif obj.suffixes[-1].lower() == ".pptx":
+ mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"

  elif isinstance(obj, DocumentStream):
  content = obj.stream.read(8192)
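
The new application/zip branch above exists because .xlsx, .docx and .pptx files are OOXML zip containers, so a content-based sniffer reports only the generic zip MIME type. A standalone sketch of the same idea (the helper name is hypothetical, not docling API):

    from pathlib import Path

    # OOXML documents are zip archives; map the generic zip MIME type to the
    # specific OOXML type based on the file suffix.
    _ZIP_TO_OOXML = {
        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    }

    def refine_zip_mime(path: Path, mime: str) -> str:
        if mime.lower() == "application/zip" and path.suffixes:
            return _ZIP_TO_OOXML.get(path.suffixes[-1].lower(), mime)
        return mime
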
docling/datamodel/pipeline_options.py CHANGED
@@ -213,8 +213,8 @@ class PictureDescriptionBaseOptions(BaseOptions):
  batch_size: int = 8
  scale: float = 2

- bitmap_area_threshold: float = (
- 0.2 # percentage of the area for a bitmap to processed with the models
+ picture_area_threshold: float = (
+ 0.05 # percentage of the area for a picture to processed with the models
  )


@@ -266,6 +266,7 @@ class ResponseFormat(str, Enum):
  class InferenceFramework(str, Enum):
  MLX = "mlx"
  TRANSFORMERS = "transformers"
+ OPENAI = "openai"


  class HuggingFaceVlmOptions(BaseVlmOptions):
@@ -284,6 +285,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
  return self.repo_id.replace("/", "--")


+ class ApiVlmOptions(BaseVlmOptions):
+ kind: Literal["api_model_options"] = "api_model_options"
+
+ url: AnyUrl = AnyUrl(
+ "http://localhost:11434/v1/chat/completions"
+ ) # Default to ollama
+ headers: Dict[str, str] = {}
+ params: Dict[str, Any] = {}
+ scale: float = 2.0
+ timeout: float = 60
+ response_format: ResponseFormat
+
+
  smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
  repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
  prompt="Convert this page to docling.",
@@ -307,10 +321,20 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
  inference_framework=InferenceFramework.TRANSFORMERS,
  )

+ granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
+ url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+ params={"model": "granite3.2-vision:2b"},
+ prompt="OCR the full page to markdown.",
+ scale=1.0,
+ timeout=120,
+ response_format=ResponseFormat.MARKDOWN,
+ )
+

  class VlmModelType(str, Enum):
  SMOLDOCLING = "smoldocling"
  GRANITE_VISION = "granite_vision"
+ GRANITE_VISION_OLLAMA = "granite_vision_ollama"


  # Define an enum for the backend options
@@ -356,13 +380,14 @@ class PaginatedPipelineOptions(PipelineOptions):


  class VlmPipelineOptions(PaginatedPipelineOptions):
-
  generate_page_images: bool = True
  force_backend_text: bool = (
  False # (To be used with vlms, or other generative models)
  )
  # If True, text from backend will be used instead of generated text
- vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
+ vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
+ smoldocling_vlm_conversion_options
+ )


  class PdfPipelineOptions(PaginatedPipelineOptions):
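
The new ApiVlmOptions and the granite_vision_vlm_ollama_conversion_options preset plug into the existing VLM pipeline. A minimal sketch of how they might be wired up, assuming the usual DocumentConverter / PdfFormatOption setup and a local Ollama server (the input file is hypothetical):

    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import (
        VlmPipelineOptions,
        granite_vision_vlm_ollama_conversion_options,
    )
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.pipeline.vlm_pipeline import VlmPipeline

    # Route page images to an OpenAI-compatible endpoint (Ollama by default);
    # remote services must be enabled explicitly for API-based models.
    pipeline_options = VlmPipelineOptions(enable_remote_services=True)
    pipeline_options.vlm_options = granite_vision_vlm_ollama_conversion_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        }
    )
    result = converter.convert("report.pdf")
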
docling/document_converter.py CHANGED
@@ -1,11 +1,11 @@
  import hashlib
  import logging
- import math
  import sys
  import time
+ from collections.abc import Iterable, Iterator
  from functools import partial
  from pathlib import Path
- from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+ from typing import Dict, List, Optional, Tuple, Type, Union

  from pydantic import BaseModel, ConfigDict, model_validator, validate_call

@@ -172,7 +172,7 @@ class DocumentConverter:
  format_options: Optional[Dict[InputFormat, FormatOption]] = None,
  ):
  self.allowed_formats = (
- allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+ allowed_formats if allowed_formats is not None else list(InputFormat)
  )
  self.format_to_options = {
  format: (
@@ -254,7 +254,7 @@ class DocumentConverter:

  if not had_result and raises_on_error:
  raise ConversionError(
- f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+ "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
  )

  def _convert(
@@ -266,7 +266,7 @@ class DocumentConverter:
  conv_input.docs(self.format_to_options),
  settings.perf.doc_batch_size, # pass format_options
  ):
- _log.info(f"Going to convert document batch...")
+ _log.info("Going to convert document batch...")

  # parallel processing only within input_batch
  # with ThreadPoolExecutor(