docling 2.3.0__tar.gz → 2.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {docling-2.3.0 → docling-2.3.1}/PKG-INFO +4 -8
  2. {docling-2.3.0 → docling-2.3.1}/docling/document_converter.py +8 -4
  3. {docling-2.3.0 → docling-2.3.1}/pyproject.toml +17 -15
  4. {docling-2.3.0 → docling-2.3.1}/LICENSE +0 -0
  5. {docling-2.3.0 → docling-2.3.1}/README.md +0 -0
  6. {docling-2.3.0 → docling-2.3.1}/docling/__init__.py +0 -0
  7. {docling-2.3.0 → docling-2.3.1}/docling/backend/__init__.py +0 -0
  8. {docling-2.3.0 → docling-2.3.1}/docling/backend/abstract_backend.py +0 -0
  9. {docling-2.3.0 → docling-2.3.1}/docling/backend/asciidoc_backend.py +0 -0
  10. {docling-2.3.0 → docling-2.3.1}/docling/backend/docling_parse_backend.py +0 -0
  11. {docling-2.3.0 → docling-2.3.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  12. {docling-2.3.0 → docling-2.3.1}/docling/backend/html_backend.py +0 -0
  13. {docling-2.3.0 → docling-2.3.1}/docling/backend/md_backend.py +0 -0
  14. {docling-2.3.0 → docling-2.3.1}/docling/backend/mspowerpoint_backend.py +0 -0
  15. {docling-2.3.0 → docling-2.3.1}/docling/backend/msword_backend.py +0 -0
  16. {docling-2.3.0 → docling-2.3.1}/docling/backend/pdf_backend.py +0 -0
  17. {docling-2.3.0 → docling-2.3.1}/docling/backend/pypdfium2_backend.py +0 -0
  18. {docling-2.3.0 → docling-2.3.1}/docling/cli/__init__.py +0 -0
  19. {docling-2.3.0 → docling-2.3.1}/docling/cli/main.py +0 -0
  20. {docling-2.3.0 → docling-2.3.1}/docling/datamodel/__init__.py +0 -0
  21. {docling-2.3.0 → docling-2.3.1}/docling/datamodel/base_models.py +0 -0
  22. {docling-2.3.0 → docling-2.3.1}/docling/datamodel/document.py +0 -0
  23. {docling-2.3.0 → docling-2.3.1}/docling/datamodel/pipeline_options.py +0 -0
  24. {docling-2.3.0 → docling-2.3.1}/docling/datamodel/settings.py +0 -0
  25. {docling-2.3.0 → docling-2.3.1}/docling/models/__init__.py +0 -0
  26. {docling-2.3.0 → docling-2.3.1}/docling/models/base_model.py +0 -0
  27. {docling-2.3.0 → docling-2.3.1}/docling/models/base_ocr_model.py +0 -0
  28. {docling-2.3.0 → docling-2.3.1}/docling/models/ds_glm_model.py +0 -0
  29. {docling-2.3.0 → docling-2.3.1}/docling/models/easyocr_model.py +0 -0
  30. {docling-2.3.0 → docling-2.3.1}/docling/models/layout_model.py +0 -0
  31. {docling-2.3.0 → docling-2.3.1}/docling/models/page_assemble_model.py +0 -0
  32. {docling-2.3.0 → docling-2.3.1}/docling/models/page_preprocessing_model.py +0 -0
  33. {docling-2.3.0 → docling-2.3.1}/docling/models/table_structure_model.py +0 -0
  34. {docling-2.3.0 → docling-2.3.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  35. {docling-2.3.0 → docling-2.3.1}/docling/models/tesseract_ocr_model.py +0 -0
  36. {docling-2.3.0 → docling-2.3.1}/docling/pipeline/__init__.py +0 -0
  37. {docling-2.3.0 → docling-2.3.1}/docling/pipeline/base_pipeline.py +0 -0
  38. {docling-2.3.0 → docling-2.3.1}/docling/pipeline/simple_pipeline.py +0 -0
  39. {docling-2.3.0 → docling-2.3.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  40. {docling-2.3.0 → docling-2.3.1}/docling/utils/__init__.py +0 -0
  41. {docling-2.3.0 → docling-2.3.1}/docling/utils/export.py +0 -0
  42. {docling-2.3.0 → docling-2.3.1}/docling/utils/layout_utils.py +0 -0
  43. {docling-2.3.0 → docling-2.3.1}/docling/utils/profiling.py +0 -0
  44. {docling-2.3.0 → docling-2.3.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.3.0
3
+ Version: 2.3.1
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -23,9 +23,9 @@ Provides-Extra: tesserocr
23
23
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
24
24
  Requires-Dist: certifi (>=2024.7.4)
25
25
  Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
26
- Requires-Dist: docling-core (>=2.2.3,<3.0.0)
27
- Requires-Dist: docling-ibm-models (>=2.0.1,<3.0.0)
28
- Requires-Dist: docling-parse (>=2.0.0,<3.0.0)
26
+ Requires-Dist: docling-core (>=2.3.0,<3.0.0)
27
+ Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
28
+ Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
29
29
  Requires-Dist: easyocr (>=1.7,<2.0)
30
30
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
31
31
  Requires-Dist: huggingface_hub (>=0.23,<1)
@@ -41,10 +41,6 @@ Requires-Dist: requests (>=2.32.3,<3.0.0)
41
41
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
42
42
  Requires-Dist: scipy (>=1.14.1,<2.0.0)
43
43
  Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
44
- Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
45
- Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
46
- Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
47
- Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
48
44
  Requires-Dist: typer (>=0.12.5,<0.13.0)
49
45
  Project-URL: Repository, https://github.com/DS4SD/docling
50
46
  Description-Content-Type: text/markdown
@@ -139,6 +139,10 @@ class DocumentConverter:
139
139
 
140
140
  self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
141
141
 
142
+ def initialize_pipeline(self, format: InputFormat):
143
+ """Initialize the conversion pipeline for the selected format."""
144
+ self._get_pipeline(doc_format=format)
145
+
142
146
  @validate_call(config=ConfigDict(strict=True))
143
147
  def convert(
144
148
  self,
@@ -219,13 +223,13 @@ class DocumentConverter:
219
223
  else:
220
224
  _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
221
225
 
222
- def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
226
+ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
223
227
  assert self.format_to_options is not None
224
228
 
225
- fopt = self.format_to_options.get(doc.format)
229
+ fopt = self.format_to_options.get(doc_format)
226
230
 
227
231
  if fopt is None:
228
- raise RuntimeError(f"Could not get pipeline for document {doc.file}")
232
+ raise RuntimeError(f"Could not get pipeline for {doc_format}")
229
233
  else:
230
234
  pipeline_class = fopt.pipeline_cls
231
235
  pipeline_options = fopt.pipeline_options
@@ -256,7 +260,7 @@ class DocumentConverter:
256
260
  self, in_doc: InputDocument, raises_on_error: bool
257
261
  ) -> ConversionResult:
258
262
  if in_doc.valid:
259
- pipeline = self._get_pipeline(in_doc)
263
+ pipeline = self._get_pipeline(in_doc.format)
260
264
  if pipeline is None: # Can't find a default pipeline. Should this raise?
261
265
  if raises_on_error:
262
266
  raise RuntimeError(
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.3.0" # DO NOT EDIT, updated automatically
3
+ version = "2.3.1" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -21,24 +21,13 @@ keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentatio
21
21
  packages = [{include = "docling"}]
22
22
 
23
23
  [tool.poetry.dependencies]
24
- ##############
25
- # constraints:
26
- ##############
27
- torch = [
28
- {version = "^2.2.2", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
29
- {version = "~2.2.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
30
- ]
31
- torchvision = [
32
- {version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
33
- {version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
34
- ]
35
24
  ######################
36
25
  # actual dependencies:
37
26
  ######################
38
27
  python = "^3.10"
39
28
  pydantic = "^2.0.0"
40
- docling-core = "^2.2.3"
41
- docling-ibm-models = "^2.0.1"
29
+ docling-core = "^2.3.0"
30
+ docling-ibm-models = "^2.0.3"
42
31
  deepsearch-glm = "^0.26.1"
43
32
  filetype = "^1.2.0"
44
33
  pypdfium2 = "^4.30.0"
@@ -47,7 +36,7 @@ huggingface_hub = ">=0.23,<1"
47
36
  requests = "^2.32.3"
48
37
  easyocr = "^1.7"
49
38
  tesserocr = { version = "^2.7.1", optional = true }
50
- docling-parse = "^2.0.0"
39
+ docling-parse = "^2.0.2"
51
40
  certifi = ">=2024.7.4"
52
41
  rtree = "^1.3.0"
53
42
  scipy = "^1.14.1"
@@ -94,6 +83,19 @@ langchain-huggingface = "^0.0.3"
94
83
  langchain-milvus = "^0.1.4"
95
84
  langchain-text-splitters = "^0.2.4"
96
85
 
86
+ [tool.poetry.group.mac_intel]
87
+ optional = true
88
+
89
+ [tool.poetry.group.mac_intel.dependencies]
90
+ torch = [
91
+ {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2"},
92
+ {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2"}
93
+ ]
94
+ torchvision = [
95
+ {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"},
96
+ {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"}
97
+ ]
98
+
97
99
  [tool.poetry.extras]
98
100
  tesserocr = ["tesserocr"]
99
101
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes