docling-2.30.0-py3-none-any.whl → docling-2.31.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59):
  1. docling/backend/asciidoc_backend.py +7 -15
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +2 -2
  4. docling/backend/docling_parse_v2_backend.py +2 -2
  5. docling/backend/docling_parse_v4_backend.py +3 -4
  6. docling/backend/docx/latex/latex_dict.py +0 -5
  7. docling/backend/docx/latex/omml.py +4 -7
  8. docling/backend/html_backend.py +66 -25
  9. docling/backend/md_backend.py +6 -8
  10. docling/backend/msexcel_backend.py +1 -7
  11. docling/backend/mspowerpoint_backend.py +4 -7
  12. docling/backend/msword_backend.py +5 -5
  13. docling/backend/pdf_backend.py +2 -1
  14. docling/backend/pypdfium2_backend.py +3 -3
  15. docling/backend/xml/jats_backend.py +11 -14
  16. docling/backend/xml/uspto_backend.py +19 -23
  17. docling/cli/main.py +8 -8
  18. docling/cli/models.py +6 -3
  19. docling/datamodel/base_models.py +7 -5
  20. docling/datamodel/document.py +19 -10
  21. docling/datamodel/pipeline_options.py +0 -1
  22. docling/document_converter.py +8 -6
  23. docling/models/api_vlm_model.py +1 -2
  24. docling/models/base_model.py +2 -4
  25. docling/models/base_ocr_model.py +2 -2
  26. docling/models/code_formula_model.py +2 -1
  27. docling/models/document_picture_classifier.py +2 -1
  28. docling/models/easyocr_model.py +10 -11
  29. docling/models/factories/__init__.py +2 -2
  30. docling/models/factories/base_factory.py +1 -1
  31. docling/models/hf_mlx_model.py +4 -6
  32. docling/models/hf_vlm_model.py +7 -5
  33. docling/models/layout_model.py +2 -2
  34. docling/models/ocr_mac_model.py +3 -4
  35. docling/models/page_assemble_model.py +7 -12
  36. docling/models/page_preprocessing_model.py +2 -1
  37. docling/models/picture_description_api_model.py +2 -1
  38. docling/models/picture_description_base_model.py +2 -3
  39. docling/models/picture_description_vlm_model.py +6 -4
  40. docling/models/rapid_ocr_model.py +2 -3
  41. docling/models/readingorder_model.py +9 -24
  42. docling/models/table_structure_model.py +4 -8
  43. docling/models/tesseract_ocr_cli_model.py +17 -16
  44. docling/models/tesseract_ocr_model.py +9 -5
  45. docling/pipeline/base_pipeline.py +4 -8
  46. docling/pipeline/simple_pipeline.py +0 -1
  47. docling/pipeline/standard_pdf_pipeline.py +0 -1
  48. docling/pipeline/vlm_pipeline.py +0 -3
  49. docling/utils/export.py +2 -4
  50. docling/utils/glm_utils.py +2 -2
  51. docling/utils/layout_postprocessor.py +4 -2
  52. docling/utils/model_downloader.py +31 -7
  53. docling/utils/utils.py +3 -3
  54. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/METADATA +2 -1
  55. docling-2.31.1.dist-info/RECORD +86 -0
  56. docling-2.30.0.dist-info/RECORD +0 -86
  57. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
  58. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
  59. {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
158
158
  def _get_level(self) -> int:
159
159
  """Return the first None index."""
160
160
  for k, v in self.parents.items():
161
- if k >= 0 and v == None:
161
+ if k >= 0 and v is None:
162
162
  return k
163
163
  return 0
164
164
 
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
418
418
  else prev_parent
419
419
  )
420
420
 
421
- def _handle_text_elements(
421
+ def _handle_text_elements( # noqa: C901
422
422
  self,
423
423
  element: BaseOxmlElement,
424
424
  docx_obj: DocxDocument,
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
436
436
 
437
437
  # Common styles for bullet and numbered lists.
438
438
  # "List Bullet", "List Number", "List Paragraph"
439
- # Identify wether list is a numbered list or not
439
+ # Identify whether list is a numbered list or not
440
440
  # is_numbered = "List Bullet" not in paragraph.style.name
441
441
  is_numbered = False
442
442
  p_style_id, p_level = self._get_label_and_level(paragraph)
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
812
812
  f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
813
813
  )
814
814
  if cell is None or cell._tc in cell_set:
815
- _log.debug(f" skipped since repeated content")
815
+ _log.debug(" skipped since repeated content")
816
816
  col_idx += cell.grid_span
817
817
  continue
818
818
  else:
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
879
879
  image=ImageRef.from_pil(image=pil_image, dpi=72),
880
880
  caption=None,
881
881
  )
882
- except (UnidentifiedImageError, OSError) as e:
882
+ except (UnidentifiedImageError, OSError):
883
883
  _log.warning("Warning: image cannot be loaded by Pillow")
884
884
  doc.add_picture(
885
885
  parent=self.parents[level - 1],
@@ -1,7 +1,8 @@
1
1
  from abc import ABC, abstractmethod
2
+ from collections.abc import Iterable
2
3
  from io import BytesIO
3
4
  from pathlib import Path
4
- from typing import Iterable, Optional, Set, Union
5
+ from typing import Optional, Set, Union
5
6
 
6
7
  from docling_core.types.doc import BoundingBox, Size
7
8
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell
@@ -1,8 +1,9 @@
1
1
  import logging
2
2
  import random
3
+ from collections.abc import Iterable
3
4
  from io import BytesIO
4
5
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
6
+ from typing import TYPE_CHECKING, List, Optional, Union
6
7
 
7
8
  import pypdfium2 as pdfium
8
9
  import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
29
30
  self.valid = True # No better way to tell from pypdfium.
30
31
  try:
31
32
  self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
32
- except PdfiumError as e:
33
+ except PdfiumError:
33
34
  _log.info(
34
35
  f"An exception occurred when loading page {page_no} of document {document_hash}.",
35
36
  exc_info=True,
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
225
226
  def get_page_image(
226
227
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None
227
228
  ) -> Image.Image:
228
-
229
229
  page_size = self.get_size()
230
230
 
231
231
  if not cropbox:
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
91
91
  super().__init__(in_doc, path_or_stream)
92
92
  self.path_or_stream = path_or_stream
93
93
 
94
- # Initialize the root of the document hiearchy
94
+ # Initialize the root of the document hierarchy
95
95
  self.root: Optional[NodeItem] = None
96
96
 
97
97
  self.valid = False
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
102
102
 
103
103
  doc_info: etree.DocInfo = self.tree.docinfo
104
104
  if doc_info.system_url and any(
105
- [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
105
+ kwd in doc_info.system_url for kwd in JATS_DTD_URL
106
106
  ):
107
107
  self.valid = True
108
108
  return
109
109
  for ent in doc_info.internalDTD.iterentities():
110
110
  if ent.system_url and any(
111
- [kwd in ent.system_url for kwd in JATS_DTD_URL]
111
+ kwd in ent.system_url for kwd in JATS_DTD_URL
112
112
  ):
113
113
  self.valid = True
114
114
  return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
232
232
  # TODO: once superscript is supported, add label with formatting
233
233
  aff = aff.removeprefix(f"{label[0].text}, ")
234
234
  affiliation_names.append(aff)
235
- affiliation_ids_names = {
236
- id: name
237
- for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
238
- }
235
+ affiliation_ids_names = dict(
236
+ zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
237
+ )
239
238
 
240
239
  # Get author names and affiliation names
241
240
  for author_node in meta.xpath(
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
300
299
  def _add_abstract(
301
300
  self, doc: DoclingDocument, xml_components: XMLComponents
302
301
  ) -> None:
303
-
304
302
  for abstract in xml_components["abstract"]:
305
303
  text: str = abstract["content"]
306
304
  title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
349
347
 
350
348
  return
351
349
 
352
- def _parse_element_citation(self, node: etree._Element) -> str:
350
+ def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
353
351
  citation: Citation = {
354
352
  "author_names": "",
355
353
  "title": "",
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
440
438
  citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
441
439
  if len(node.xpath("lpage")) > 0:
442
440
  citation["page"] += (
443
- "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
441
+ "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
444
442
  )
445
443
 
446
444
  # Flatten the citation to string
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
595
593
 
596
594
  try:
597
595
  self._add_table(doc, parent, table)
598
- except Exception as e:
599
- _log.warning(f"Skipping unsupported table in {str(self.file)}")
600
- pass
596
+ except Exception:
597
+ _log.warning(f"Skipping unsupported table in {self.file!s}")
601
598
 
602
599
  return
603
600
 
@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
609
606
  )
610
607
  return
611
608
 
612
- def _walk_linear(
609
+ def _walk_linear( # noqa: C901
613
610
  self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
614
611
  ) -> str:
615
612
  skip_tags = ["term"]
@@ -1,6 +1,6 @@
1
1
  """Backend to parse patents from the United States Patent Office (USPTO).
2
2
 
3
- The parsers included in this module can handle patent grants pubished since 1976 and
3
+ The parsers included in this module can handle patent grants published since 1976 and
4
4
  patent applications since 2001.
5
5
  The original files can be found in https://bulkdata.uspto.gov.
6
6
  """
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
122
122
 
123
123
  @override
124
124
  def convert(self) -> DoclingDocument:
125
-
126
125
  if self.parser is not None:
127
126
  doc = self.parser.parse(self.patent_content)
128
127
  if doc is None:
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
163
162
  Returns:
164
163
  The patent parsed as a docling document.
165
164
  """
166
- pass
167
165
 
168
166
 
169
167
  class PatentUsptoIce(PatentUspto):
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
265
263
  self.style_html = HtmlEntity()
266
264
 
267
265
  @override
268
- def startElement(self, tag, attributes): # noqa: N802
266
+ def startElement(self, tag, attributes):
269
267
  """Signal the start of an element.
270
268
 
271
269
  Args:
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
281
279
  self._start_registered_elements(tag, attributes)
282
280
 
283
281
  @override
284
- def skippedEntity(self, name): # noqa: N802
282
+ def skippedEntity(self, name):
285
283
  """Receive notification of a skipped entity.
286
284
 
287
285
  HTML entities will be skipped by the parser. This method will unescape them
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
315
313
  self.text += unescaped
316
314
 
317
315
  @override
318
- def endElement(self, tag): # noqa: N802
316
+ def endElement(self, tag):
319
317
  """Signal the end of an element.
320
318
 
321
319
  Args:
@@ -442,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
442
440
  )
443
441
 
444
442
  elif name == self.Element.PARAGRAPH.value and text:
445
- # remmove blank spaces added in paragraphs
443
+ # remove blank spaces added in paragraphs
446
444
  text = re.sub("\\s+", " ", text)
447
445
  if self.Element.ABSTRACT.value in self.property:
448
446
  self.abstract = (
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
603
601
  self.style_html = HtmlEntity()
604
602
 
605
603
  @override
606
- def startElement(self, tag, attributes): # noqa: N802
604
+ def startElement(self, tag, attributes):
607
605
  """Signal the start of an element.
608
606
 
609
607
  Args:
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
616
614
  self._start_registered_elements(tag, attributes)
617
615
 
618
616
  @override
619
- def skippedEntity(self, name): # noqa: N802
617
+ def skippedEntity(self, name):
620
618
  """Receive notification of a skipped entity.
621
619
 
622
620
  HTML entities will be skipped by the parser. This method will unescape them
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
650
648
  self.text += unescaped
651
649
 
652
650
  @override
653
- def endElement(self, tag): # noqa: N802
651
+ def endElement(self, tag):
654
652
  """Signal the end of an element.
655
653
 
656
654
  Args:
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
691
689
  if tag in [member.value for member in self.Element]:
692
690
  if (
693
691
  tag == self.Element.HEADING.value
694
- and not self.Element.SDOCL.value in self.property
692
+ and self.Element.SDOCL.value not in self.property
695
693
  ):
696
694
  level_attr: str = attributes.get("LVL", "")
697
695
  new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
743
741
  # headers except claims statement
744
742
  elif (
745
743
  self.Element.HEADING.value in self.property
746
- and not self.Element.SDOCL.value in self.property
744
+ and self.Element.SDOCL.value not in self.property
747
745
  and text.strip()
748
746
  ):
749
747
  self.parents[self.level + 1] = self.doc.add_heading(
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
1164
1162
  self.style_html = HtmlEntity()
1165
1163
 
1166
1164
  @override
1167
- def startElement(self, tag, attributes): # noqa: N802
1165
+ def startElement(self, tag, attributes):
1168
1166
  """Signal the start of an element.
1169
1167
 
1170
1168
  Args:
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
1177
1175
  self._start_registered_elements(tag, attributes)
1178
1176
 
1179
1177
  @override
1180
- def skippedEntity(self, name): # noqa: N802
1178
+ def skippedEntity(self, name):
1181
1179
  """Receive notification of a skipped entity.
1182
1180
 
1183
1181
  HTML entities will be skipped by the parser. This method will unescape them
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
1211
1209
  self.text += unescaped
1212
1210
 
1213
1211
  @override
1214
- def endElement(self, tag): # noqa: N802
1212
+ def endElement(self, tag):
1215
1213
  """Signal the end of an element.
1216
1214
 
1217
1215
  Args:
@@ -1474,9 +1472,7 @@ class XmlTable:
1474
1472
  if cw == 0:
1475
1473
  offset_w0.append(col["offset"][ic])
1476
1474
 
1477
- min_colinfo["offset"] = sorted(
1478
- list(set(col["offset"] + min_colinfo["offset"]))
1479
- )
1475
+ min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
1480
1476
 
1481
1477
  # add back the 0 width cols to offset list
1482
1478
  offset_w0 = list(set(offset_w0))
@@ -1527,7 +1523,7 @@ class XmlTable:
1527
1523
 
1528
1524
  return ncols_max
1529
1525
 
1530
- def _parse_table(self, table: Tag) -> TableData:
1526
+ def _parse_table(self, table: Tag) -> TableData: # noqa: C901
1531
1527
  """Parse the content of a table tag.
1532
1528
 
1533
1529
  Args:
@@ -1701,7 +1697,7 @@ class XmlTable:
1701
1697
  class HtmlEntity:
1702
1698
  """Provide utility functions to get the HTML entities of styled characters.
1703
1699
 
1704
- This class has been developped from:
1700
+ This class has been developed from:
1705
1701
  https://unicode-table.com/en/html-entities/
1706
1702
  https://www.w3.org/TR/WD-math-970515/table03.html
1707
1703
  """
@@ -1722,7 +1718,7 @@ class HtmlEntity:
1722
1718
  "0": "⁰",
1723
1719
  "+": "⁺",
1724
1720
  "-": "⁻",
1725
- "−": "⁻",
1721
+ "−": "⁻", # noqa: RUF001
1726
1722
  "=": "⁼",
1727
1723
  "(": "⁽",
1728
1724
  ")": "⁾",
@@ -1746,7 +1742,7 @@ class HtmlEntity:
1746
1742
  "0": "₀",
1747
1743
  "+": "₊",
1748
1744
  "-": "₋",
1749
- "−": "₋",
1745
+ "−": "₋", # noqa: RUF001
1750
1746
  "=": "₌",
1751
1747
  "(": "₍",
1752
1748
  ")": "₎",
@@ -1900,7 +1896,7 @@ class HtmlEntity:
1900
1896
  """Get an HTML entity of a greek letter in ISO 8879.
1901
1897
 
1902
1898
  Args:
1903
- The text to transform, as an ISO 8879 entitiy.
1899
+ The text to transform, as an ISO 8879 entity.
1904
1900
 
1905
1901
  Returns:
1906
1902
  The HTML entity representing a greek letter. If the input text is not
docling/cli/main.py CHANGED
@@ -6,14 +6,16 @@ import sys
6
6
  import tempfile
7
7
  import time
8
8
  import warnings
9
+ from collections.abc import Iterable
9
10
  from pathlib import Path
10
- from typing import Annotated, Dict, Iterable, List, Optional, Type
11
+ from typing import Annotated, Dict, List, Optional, Type
11
12
 
12
13
  import rich.table
13
14
  import typer
14
15
  from docling_core.types.doc import ImageRefMode
15
16
  from docling_core.utils.file import resolve_source_to_path
16
17
  from pydantic import TypeAdapter
18
+ from rich.console import Console
17
19
 
18
20
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
19
21
  from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
53
55
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
54
56
 
55
57
  _log = logging.getLogger(__name__)
56
- from rich.console import Console
57
58
 
58
59
  console = Console()
59
60
  err_console = Console(stderr=True)
@@ -160,7 +161,6 @@ def export_documents(
160
161
  export_doctags: bool,
161
162
  image_export_mode: ImageRefMode,
162
163
  ):
163
-
164
164
  success_count = 0
165
165
  failure_count = 0
166
166
 
@@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
233
233
 
234
234
 
235
235
  @app.command(no_args_is_help=True)
236
- def convert(
236
+ def convert( # noqa: C901
237
237
  input_sources: Annotated[
238
238
  List[str],
239
239
  typer.Argument(
@@ -289,7 +289,7 @@ def convert(
289
289
  ...,
290
290
  help=(
291
291
  f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
292
- f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
292
+ f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
293
293
  f"Use the option --show-external-plugins to see the options allowed with external plugins."
294
294
  ),
295
295
  ),
@@ -421,7 +421,7 @@ def convert(
421
421
  logging.basicConfig(level=logging.WARNING)
422
422
  elif verbose == 1:
423
423
  logging.basicConfig(level=logging.INFO)
424
- elif verbose == 2:
424
+ else:
425
425
  logging.basicConfig(level=logging.DEBUG)
426
426
 
427
427
  settings.debug.visualize_cells = debug_visualize_cells
@@ -430,7 +430,7 @@ def convert(
430
430
  settings.debug.visualize_ocr = debug_visualize_ocr
431
431
 
432
432
  if from_formats is None:
433
- from_formats = [e for e in InputFormat]
433
+ from_formats = list(InputFormat)
434
434
 
435
435
  parsed_headers: Optional[Dict[str, str]] = None
436
436
  if headers is not None:
@@ -521,7 +521,7 @@ def convert(
521
521
  if image_export_mode != ImageRefMode.PLACEHOLDER:
522
522
  pipeline_options.generate_page_images = True
523
523
  pipeline_options.generate_picture_images = (
524
- True # FIXME: to be deprecated in verson 3
524
+ True # FIXME: to be deprecated in version 3
525
525
  )
526
526
  pipeline_options.images_scale = 2
527
527
 
docling/cli/models.py CHANGED
@@ -32,6 +32,8 @@ class _AvailableModels(str, Enum):
32
32
  CODE_FORMULA = "code_formula"
33
33
  PICTURE_CLASSIFIER = "picture_classifier"
34
34
  SMOLVLM = "smolvlm"
35
+ SMOLDOCLING = "smoldocling"
36
+ SMOLDOCLING_MLX = "smoldocling_mlx"
35
37
  GRANITE_VISION = "granite_vision"
36
38
  EASYOCR = "easyocr"
37
39
 
@@ -62,7 +64,7 @@ def download(
62
64
  models: Annotated[
63
65
  Optional[list[_AvailableModels]],
64
66
  typer.Argument(
65
- help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
67
+ help="Models to download (default behavior: a predefined set of models will be downloaded).",
66
68
  ),
67
69
  ] = None,
68
70
  all: Annotated[
@@ -89,14 +91,13 @@ def download(
89
91
  "Cannot simultaneously set 'all' parameter and specify models to download."
90
92
  )
91
93
  if not quiet:
92
- FORMAT = "%(message)s"
93
94
  logging.basicConfig(
94
95
  level=logging.INFO,
95
96
  format="[blue]%(message)s[/blue]",
96
97
  datefmt="[%X]",
97
98
  handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
98
99
  )
99
- to_download = models or ([m for m in _AvailableModels] if all else _default_models)
100
+ to_download = models or (list(_AvailableModels) if all else _default_models)
100
101
  output_dir = download_models(
101
102
  output_dir=output_dir,
102
103
  force=force,
@@ -106,6 +107,8 @@ def download(
106
107
  with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
107
108
  with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
108
109
  with_smolvlm=_AvailableModels.SMOLVLM in to_download,
110
+ with_smoldocling=_AvailableModels.SMOLDOCLING in to_download,
111
+ with_smoldocling_mlx=_AvailableModels.SMOLDOCLING_MLX in to_download,
109
112
  with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
110
113
  with_easyocr=_AvailableModels.EASYOCR in to_download,
111
114
  )
@@ -10,7 +10,9 @@ from docling_core.types.doc import (
10
10
  TableCell,
11
11
  )
12
12
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell
13
- from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
13
+
14
+ # DO NOT REMOVE; explicitly exposed from this location
15
+ from docling_core.types.io import (
14
16
  DocumentStream,
15
17
  )
16
18
  from PIL.Image import Image
@@ -233,9 +235,9 @@ class Page(BaseModel):
233
235
  None # Internal PDF backend. By default it is cleared during assembling.
234
236
  )
235
237
  _default_image_scale: float = 1.0 # Default image scale for external usage.
236
- _image_cache: Dict[float, Image] = (
237
- {}
238
- ) # Cache of images in different scales. By default it is cleared during assembling.
238
+ _image_cache: Dict[
239
+ float, Image
240
+ ] = {} # Cache of images in different scales. By default it is cleared during assembling.
239
241
 
240
242
  def get_image(
241
243
  self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
@@ -243,7 +245,7 @@ class Page(BaseModel):
243
245
  if self._backend is None:
244
246
  return self._image_cache.get(scale, None)
245
247
 
246
- if not scale in self._image_cache:
248
+ if scale not in self._image_cache:
247
249
  if cropbox is None:
248
250
  self._image_cache[scale] = self._backend.get_page_image(scale=scale)
249
251
  else:
@@ -1,13 +1,13 @@
1
1
  import csv
2
2
  import logging
3
3
  import re
4
+ from collections.abc import Iterable
4
5
  from enum import Enum
5
6
  from io import BytesIO
6
7
  from pathlib import Path, PurePath
7
8
  from typing import (
8
9
  TYPE_CHECKING,
9
10
  Dict,
10
- Iterable,
11
11
  List,
12
12
  Literal,
13
13
  Optional,
@@ -17,6 +17,8 @@ from typing import (
17
17
  )
18
18
 
19
19
  import filetype
20
+
21
+ # DO NOT REMOVE; explicitly exposed from this location
20
22
  from docling_core.types.doc import (
21
23
  DocItem,
22
24
  DocItemLabel,
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
35
37
  PageReference,
36
38
  Prov,
37
39
  Ref,
40
+ Table as DsSchemaTable,
41
+ TableCell,
38
42
  )
39
- from docling_core.types.legacy_doc.base import Table as DsSchemaTable
40
- from docling_core.types.legacy_doc.base import TableCell
41
43
  from docling_core.types.legacy_doc.document import (
42
44
  CCSDocumentDescription as DsDocumentDescription,
45
+ CCSFileInfoObject as DsFileInfoObject,
46
+ ExportedCCSDocument as DsDocument,
43
47
  )
44
- from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
45
- from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
46
48
  from docling_core.utils.file import resolve_source_to_stream
47
49
  from docling_core.utils.legacy import docling_document_to_legacy
48
50
  from pydantic import BaseModel
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
65
67
  )
66
68
  from docling.datamodel.settings import DocumentLimits
67
69
  from docling.utils.profiling import ProfilingItem
68
- from docling.utils.utils import create_file_hash, create_hash
70
+ from docling.utils.utils import create_file_hash
69
71
 
70
72
  if TYPE_CHECKING:
71
73
  from docling.document_converter import FormatOption
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
134
136
  self._init_doc(backend, path_or_stream)
135
137
 
136
138
  elif isinstance(path_or_stream, BytesIO):
137
- assert (
138
- filename is not None
139
- ), "Can't construct InputDocument from stream without providing filename arg."
139
+ assert filename is not None, (
140
+ "Can't construct InputDocument from stream without providing filename arg."
141
+ )
140
142
  self.file = PurePath(filename)
141
143
  self.filesize = path_or_stream.getbuffer().nbytes
142
144
 
@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
228
230
 
229
231
 
230
232
  class _DocumentConversionInput(BaseModel):
231
-
232
233
  path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
233
234
  headers: Optional[Dict[str, str]] = None
234
235
  limits: Optional[DocumentLimits] = DocumentLimits()
@@ -302,6 +303,14 @@ class _DocumentConversionInput(BaseModel):
302
303
  else ""
303
304
  )
304
305
  mime = _DocumentConversionInput._mime_from_extension(ext)
306
+ if mime is not None and mime.lower() == "application/zip":
307
+ objname = obj.name.lower()
308
+ if objname.endswith(".xlsx"):
309
+ mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
310
+ elif objname.endswith(".docx"):
311
+ mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
312
+ elif objname.endswith(".pptx"):
313
+ mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
305
314
 
306
315
  mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
307
316
  mime = mime or _DocumentConversionInput._detect_csv(content)
@@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions):
380
380
 
381
381
 
382
382
  class VlmPipelineOptions(PaginatedPipelineOptions):
383
-
384
383
  generate_page_images: bool = True
385
384
  force_backend_text: bool = (
386
385
  False # (To be used with vlms, or other generative models)
@@ -1,11 +1,11 @@
1
1
  import hashlib
2
2
  import logging
3
- import math
4
3
  import sys
5
4
  import time
5
+ from collections.abc import Iterable, Iterator
6
6
  from functools import partial
7
7
  from pathlib import Path
8
- from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
8
+ from typing import Dict, List, Optional, Tuple, Type, Union
9
9
 
10
10
  from pydantic import BaseModel, ConfigDict, model_validator, validate_call
11
11
 
@@ -172,7 +172,7 @@ class DocumentConverter:
172
172
  format_options: Optional[Dict[InputFormat, FormatOption]] = None,
173
173
  ):
174
174
  self.allowed_formats = (
175
- allowed_formats if allowed_formats is not None else [e for e in InputFormat]
175
+ allowed_formats if allowed_formats is not None else list(InputFormat)
176
176
  )
177
177
  self.format_to_options = {
178
178
  format: (
@@ -189,7 +189,9 @@ class DocumentConverter:
189
189
  def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
190
190
  """Generate a hash of pipeline options to use as part of the cache key."""
191
191
  options_str = str(pipeline_options.model_dump())
192
- return hashlib.md5(options_str.encode("utf-8")).hexdigest()
192
+ return hashlib.md5(
193
+ options_str.encode("utf-8"), usedforsecurity=False
194
+ ).hexdigest()
193
195
 
194
196
  def initialize_pipeline(self, format: InputFormat):
195
197
  """Initialize the conversion pipeline for the selected format."""
@@ -254,7 +256,7 @@ class DocumentConverter:
254
256
 
255
257
  if not had_result and raises_on_error:
256
258
  raise ConversionError(
257
- f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
259
+ "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
258
260
  )
259
261
 
260
262
  def _convert(
@@ -266,7 +268,7 @@ class DocumentConverter:
266
268
  conv_input.docs(self.format_to_options),
267
269
  settings.perf.doc_batch_size, # pass format_options
268
270
  ):
269
- _log.info(f"Going to convert document batch...")
271
+ _log.info("Going to convert document batch...")
270
272
 
271
273
  # parallel processing only within input_batch
272
274
  # with ThreadPoolExecutor(
@@ -1,4 +1,4 @@
1
- from typing import Iterable
1
+ from collections.abc import Iterable
2
2
 
3
3
  from docling.datamodel.base_models import Page, VlmPrediction
4
4
  from docling.datamodel.document import ConversionResult
@@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
10
10
 
11
11
 
12
12
  class ApiVlmModel(BasePageModel):
13
-
14
13
  def __init__(
15
14
  self,
16
15
  enabled: bool,