docling 2.3.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {docling-2.3.0 → docling-2.4.0}/PKG-INFO +6 -10
  2. {docling-2.3.0 → docling-2.4.0}/docling/cli/main.py +38 -8
  3. {docling-2.3.0 → docling-2.4.0}/docling/datamodel/pipeline_options.py +3 -3
  4. {docling-2.3.0 → docling-2.4.0}/docling/document_converter.py +8 -4
  5. {docling-2.3.0 → docling-2.4.0}/pyproject.toml +20 -24
  6. {docling-2.3.0 → docling-2.4.0}/LICENSE +0 -0
  7. {docling-2.3.0 → docling-2.4.0}/README.md +0 -0
  8. {docling-2.3.0 → docling-2.4.0}/docling/__init__.py +0 -0
  9. {docling-2.3.0 → docling-2.4.0}/docling/backend/__init__.py +0 -0
  10. {docling-2.3.0 → docling-2.4.0}/docling/backend/abstract_backend.py +0 -0
  11. {docling-2.3.0 → docling-2.4.0}/docling/backend/asciidoc_backend.py +0 -0
  12. {docling-2.3.0 → docling-2.4.0}/docling/backend/docling_parse_backend.py +0 -0
  13. {docling-2.3.0 → docling-2.4.0}/docling/backend/docling_parse_v2_backend.py +0 -0
  14. {docling-2.3.0 → docling-2.4.0}/docling/backend/html_backend.py +0 -0
  15. {docling-2.3.0 → docling-2.4.0}/docling/backend/md_backend.py +0 -0
  16. {docling-2.3.0 → docling-2.4.0}/docling/backend/mspowerpoint_backend.py +0 -0
  17. {docling-2.3.0 → docling-2.4.0}/docling/backend/msword_backend.py +0 -0
  18. {docling-2.3.0 → docling-2.4.0}/docling/backend/pdf_backend.py +0 -0
  19. {docling-2.3.0 → docling-2.4.0}/docling/backend/pypdfium2_backend.py +0 -0
  20. {docling-2.3.0 → docling-2.4.0}/docling/cli/__init__.py +0 -0
  21. {docling-2.3.0 → docling-2.4.0}/docling/datamodel/__init__.py +0 -0
  22. {docling-2.3.0 → docling-2.4.0}/docling/datamodel/base_models.py +0 -0
  23. {docling-2.3.0 → docling-2.4.0}/docling/datamodel/document.py +0 -0
  24. {docling-2.3.0 → docling-2.4.0}/docling/datamodel/settings.py +0 -0
  25. {docling-2.3.0 → docling-2.4.0}/docling/models/__init__.py +0 -0
  26. {docling-2.3.0 → docling-2.4.0}/docling/models/base_model.py +0 -0
  27. {docling-2.3.0 → docling-2.4.0}/docling/models/base_ocr_model.py +0 -0
  28. {docling-2.3.0 → docling-2.4.0}/docling/models/ds_glm_model.py +0 -0
  29. {docling-2.3.0 → docling-2.4.0}/docling/models/easyocr_model.py +0 -0
  30. {docling-2.3.0 → docling-2.4.0}/docling/models/layout_model.py +0 -0
  31. {docling-2.3.0 → docling-2.4.0}/docling/models/page_assemble_model.py +0 -0
  32. {docling-2.3.0 → docling-2.4.0}/docling/models/page_preprocessing_model.py +0 -0
  33. {docling-2.3.0 → docling-2.4.0}/docling/models/table_structure_model.py +0 -0
  34. {docling-2.3.0 → docling-2.4.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
  35. {docling-2.3.0 → docling-2.4.0}/docling/models/tesseract_ocr_model.py +0 -0
  36. {docling-2.3.0 → docling-2.4.0}/docling/pipeline/__init__.py +0 -0
  37. {docling-2.3.0 → docling-2.4.0}/docling/pipeline/base_pipeline.py +0 -0
  38. {docling-2.3.0 → docling-2.4.0}/docling/pipeline/simple_pipeline.py +0 -0
  39. {docling-2.3.0 → docling-2.4.0}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  40. {docling-2.3.0 → docling-2.4.0}/docling/utils/__init__.py +0 -0
  41. {docling-2.3.0 → docling-2.4.0}/docling/utils/export.py +0 -0
  42. {docling-2.3.0 → docling-2.4.0}/docling/utils/layout_utils.py +0 -0
  43. {docling-2.3.0 → docling-2.4.0}/docling/utils/profiling.py +0 -0
  44. {docling-2.3.0 → docling-2.4.0}/docling/utils/utils.py +0 -0
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.3.0
4
- Summary: Docling PDF conversion package
3
+ Version: 2.4.0
4
+ Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
7
- Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
7
+ Keywords: docling,convert,document,pdf,docx,html,markdown,layout model,segmentation,table structure,table former
8
8
  Author: Christoph Auer
9
9
  Author-email: cau@zurich.ibm.com
10
10
  Requires-Python: >=3.10,<4.0
@@ -23,9 +23,9 @@ Provides-Extra: tesserocr
23
23
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
- Requires-Dist: docling-core (>=2.2.3,<3.0.0)
27
- Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
- Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
26
+ Requires-Dist: docling-core (>=2.3.0,<3.0.0)
27
+ Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
28
+ Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0)
30
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -41,10 +41,6 @@ Requires-Dist: requests (>=2.32.3,<3.0.0)
41
41
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
42
42
  Requires-Dist: scipy (>=1.14.1,<2.0.0)
43
43
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
44
- Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
45
- Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
46
- Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
47
- Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
48
44
  Requires-Dist: typer (>=0.12.5,<0.13.0)
49
45
  Project-URL: Repository, https://github.com/DS4SD/docling
50
46
  Description-Content-Type: text/markdown
@@ -5,12 +5,15 @@ import time
5
5
  import warnings
6
6
  from enum import Enum
7
7
  from pathlib import Path
8
- from typing import Annotated, Dict, Iterable, List, Optional
8
+ from typing import Annotated, Dict, Iterable, List, Optional, Type
9
9
 
10
10
  import typer
11
11
  from docling_core.utils.file import resolve_file_source
12
12
 
13
13
  from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
+ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
15
+ from docling.backend.pdf_backend import PdfDocumentBackend
16
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
14
17
  from docling.datamodel.base_models import (
15
18
  ConversionStatus,
16
19
  FormatToExtensions,
@@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
22
25
  EasyOcrOptions,
23
26
  OcrOptions,
24
27
  PdfPipelineOptions,
28
+ TableFormerMode,
25
29
  TesseractCliOcrOptions,
26
30
  TesseractOcrOptions,
27
31
  )
@@ -58,9 +62,10 @@ def version_callback(value: bool):
58
62
 
59
63
 
60
64
  # Define an enum for the backend options
61
- class Backend(str, Enum):
65
+ class PdfBackend(str, Enum):
62
66
  PYPDFIUM2 = "pypdfium2"
63
- DOCLING = "docling"
67
+ DLPARSE_V1 = "dlparse_v1"
68
+ DLPARSE_V2 = "dlparse_v2"
64
69
 
65
70
 
66
71
  # Define an enum for the ocr engines
@@ -90,28 +95,28 @@ def export_documents(
90
95
  # Export Deep Search document JSON format:
91
96
  if export_json:
92
97
  fname = output_dir / f"{doc_filename}.json"
93
- with fname.open("w") as fp:
98
+ with fname.open("w", encoding="utf8") as fp:
94
99
  _log.info(f"writing JSON output to {fname}")
95
100
  fp.write(json.dumps(conv_res.document.export_to_dict()))
96
101
 
97
102
  # Export Text format:
98
103
  if export_txt:
99
104
  fname = output_dir / f"{doc_filename}.txt"
100
- with fname.open("w") as fp:
105
+ with fname.open("w", encoding="utf8") as fp:
101
106
  _log.info(f"writing Text output to {fname}")
102
107
  fp.write(conv_res.document.export_to_markdown(strict_text=True))
103
108
 
104
109
  # Export Markdown format:
105
110
  if export_md:
106
111
  fname = output_dir / f"{doc_filename}.md"
107
- with fname.open("w") as fp:
112
+ with fname.open("w", encoding="utf8") as fp:
108
113
  _log.info(f"writing Markdown output to {fname}")
109
114
  fp.write(conv_res.document.export_to_markdown())
110
115
 
111
116
  # Export Document Tags format:
112
117
  if export_doctags:
113
118
  fname = output_dir / f"{doc_filename}.doctags"
114
- with fname.open("w") as fp:
119
+ with fname.open("w", encoding="utf8") as fp:
115
120
  _log.info(f"writing Doc Tags output to {fname}")
116
121
  fp.write(conv_res.document.export_to_document_tokens())
117
122
 
@@ -151,6 +156,17 @@ def convert(
151
156
  ocr_engine: Annotated[
152
157
  OcrEngine, typer.Option(..., help="The OCR engine to use.")
153
158
  ] = OcrEngine.EASYOCR,
159
+ pdf_backend: Annotated[
160
+ PdfBackend, typer.Option(..., help="The PDF backend to use.")
161
+ ] = PdfBackend.DLPARSE_V1,
162
+ table_mode: Annotated[
163
+ TableFormerMode,
164
+ typer.Option(..., help="The mode to use in the table structure model."),
165
+ ] = TableFormerMode.FAST,
166
+ artifacts_path: Annotated[
167
+ Optional[Path],
168
+ typer.Option(..., help="If provided, the location of the model artifacts."),
169
+ ] = None,
154
170
  abort_on_error: Annotated[
155
171
  bool,
156
172
  typer.Option(
@@ -217,11 +233,25 @@ def convert(
217
233
  do_table_structure=True,
218
234
  )
219
235
  pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
236
+ pipeline_options.table_structure_options.mode = table_mode
237
+
238
+ if artifacts_path is not None:
239
+ pipeline_options.artifacts_path = artifacts_path
240
+
241
+ match pdf_backend:
242
+ case PdfBackend.DLPARSE_V1:
243
+ backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
244
+ case PdfBackend.DLPARSE_V2:
245
+ backend = DoclingParseV2DocumentBackend
246
+ case PdfBackend.PYPDFIUM2:
247
+ backend = PyPdfiumDocumentBackend
248
+ case _:
249
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
220
250
 
221
251
  format_options: Dict[InputFormat, FormatOption] = {
222
252
  InputFormat.PDF: PdfFormatOption(
223
253
  pipeline_options=pipeline_options,
224
- backend=DoclingParseDocumentBackend, # pdf_backend
254
+ backend=backend, # pdf_backend
225
255
  )
226
256
  }
227
257
  doc_converter = DocumentConverter(
@@ -1,4 +1,4 @@
1
- from enum import Enum, auto
1
+ from enum import Enum
2
2
  from pathlib import Path
3
3
  from typing import List, Literal, Optional, Union
4
4
 
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
6
6
 
7
7
 
8
8
  class TableFormerMode(str, Enum):
9
- FAST = auto()
10
- ACCURATE = auto()
9
+ FAST = "fast"
10
+ ACCURATE = "accurate"
11
11
 
12
12
 
13
13
  class TableStructureOptions(BaseModel):
@@ -139,6 +139,10 @@ class DocumentConverter:
139
139
 
140
140
  self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
141
141
 
142
+ def initialize_pipeline(self, format: InputFormat):
143
+ """Initialize the conversion pipeline for the selected format."""
144
+ self._get_pipeline(doc_format=format)
145
+
142
146
  @validate_call(config=ConfigDict(strict=True))
143
147
  def convert(
144
148
  self,
@@ -219,13 +223,13 @@ class DocumentConverter:
219
223
  else:
220
224
  _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
221
225
 
222
- def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
226
+ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
223
227
  assert self.format_to_options is not None
224
228
 
225
- fopt = self.format_to_options.get(doc.format)
229
+ fopt = self.format_to_options.get(doc_format)
226
230
 
227
231
  if fopt is None:
228
- raise RuntimeError(f"Could not get pipeline for document {doc.file}")
232
+ raise RuntimeError(f"Could not get pipeline for {doc_format}")
229
233
  else:
230
234
  pipeline_class = fopt.pipeline_cls
231
235
  pipeline_options = fopt.pipeline_options
@@ -256,7 +260,7 @@ class DocumentConverter:
256
260
  self, in_doc: InputDocument, raises_on_error: bool
257
261
  ) -> ConversionResult:
258
262
  if in_doc.valid:
259
- pipeline = self._get_pipeline(in_doc)
263
+ pipeline = self._get_pipeline(in_doc.format)
260
264
  if pipeline is None: # Can't find a default pipeline. Should this raise?
261
265
  if raises_on_error:
262
266
  raise RuntimeError(
@@ -1,13 +1,13 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.3.0" # DO NOT EDIT, updated automatically
4
- description = "Docling PDF conversion package"
5
- authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
3
+ version = "2.4.0" # DO NOT EDIT, updated automatically
4
+ description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
+ authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
7
7
  readme = "README.md"
8
8
  repository = "https://github.com/DS4SD/docling"
9
9
  homepage = "https://github.com/DS4SD/docling"
10
- keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentation", "table structure", "table former"]
10
+ keywords= ["docling", "convert", "document", "pdf", "docx", "html", "markdown", "layout model", "segmentation", "table structure", "table former"]
11
11
  classifiers = [
12
12
  "License :: OSI Approved :: MIT License",
13
13
  "Operating System :: MacOS :: MacOS X",
@@ -21,24 +21,13 @@ keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentatio
21
21
  packages = [{include = "docling"}]
22
22
 
23
23
  [tool.poetry.dependencies]
24
- ##############
25
- # constraints:
26
- ##############
27
- torch = [
28
- {version = "^2.2.2", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
29
- {version = "~2.2.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
30
- ]
31
- torchvision = [
32
- {version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
33
- {version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
34
- ]
35
24
  ######################
36
25
  # actual dependencies:
37
26
  ######################
38
27
  python = "^3.10"
39
28
  pydantic = "^2.0.0"
40
- docling-core = "^2.2.3"
41
- docling-ibm-models = "^2.0.1"
29
+ docling-core = "^2.3.0"
30
+ docling-ibm-models = "^2.0.3"
42
31
  deepsearch-glm = "^0.26.1"
43
32
  filetype = "^1.2.0"
44
33
  pypdfium2 = "^4.30.0"
@@ -47,7 +36,7 @@ huggingface_hub = ">=0.23,<1"
47
36
  requests = "^2.32.3"
48
37
  easyocr = "^1.7"
49
38
  tesserocr = { version = "^2.7.1", optional = true }
50
- docling-parse = "^2.0.0"
39
+ docling-parse = "^2.0.2"
51
40
  certifi = ">=2024.7.4"
52
41
  rtree = "^1.3.0"
53
42
  scipy = "^1.14.1"
@@ -84,16 +73,23 @@ mkdocs-jupyter = "^0.25.0"
84
73
  [tool.poetry.group.examples.dependencies]
85
74
  datasets = "^2.21.0"
86
75
  python-dotenv = "^1.0.1"
87
- # llama-index-readers-docling = { version = "^0.1.0", markers = 'python_version < "3.13"' }
88
- # llama-index-node-parser-docling = { version = "^0.1.0", markers = 'python_version < "3.13"' }
89
- # llama-index-readers-file = { version = "^0.2.2", markers = 'python_version < "3.13"' }
90
- # llama-index-embeddings-huggingface = { version = "^0.3.1", markers = 'python_version < "3.13"' }
91
- # llama-index-llms-huggingface-api = { version = "^0.2.0", markers = 'python_version < "3.13"' }
92
- # llama-index-vector-stores-milvus ={ version = "^0.2.1", markers = 'python_version < "3.13"' }
93
76
  langchain-huggingface = "^0.0.3"
94
77
  langchain-milvus = "^0.1.4"
95
78
  langchain-text-splitters = "^0.2.4"
96
79
 
80
+ [tool.poetry.group.mac_intel]
81
+ optional = true
82
+
83
+ [tool.poetry.group.mac_intel.dependencies]
84
+ torch = [
85
+ {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2"},
86
+ {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2"}
87
+ ]
88
+ torchvision = [
89
+ {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"},
90
+ {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"}
91
+ ]
92
+
97
93
  [tool.poetry.extras]
98
94
  tesserocr = ["tesserocr"]
99
95
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes