docling 0.4.0__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {docling-0.4.0 → docling-1.0.0}/PKG-INFO +14 -13
  2. {docling-0.4.0 → docling-1.0.0}/README.md +9 -8
  3. {docling-0.4.0 → docling-1.0.0}/docling/datamodel/document.py +1 -4
  4. {docling-0.4.0 → docling-1.0.0}/pyproject.toml +6 -7
  5. {docling-0.4.0 → docling-1.0.0}/LICENSE +0 -0
  6. {docling-0.4.0 → docling-1.0.0}/docling/__init__.py +0 -0
  7. {docling-0.4.0 → docling-1.0.0}/docling/backend/__init__.py +0 -0
  8. {docling-0.4.0 → docling-1.0.0}/docling/backend/abstract_backend.py +0 -0
  9. {docling-0.4.0 → docling-1.0.0}/docling/backend/pypdfium2_backend.py +0 -0
  10. {docling-0.4.0 → docling-1.0.0}/docling/datamodel/__init__.py +0 -0
  11. {docling-0.4.0 → docling-1.0.0}/docling/datamodel/base_models.py +0 -0
  12. {docling-0.4.0 → docling-1.0.0}/docling/datamodel/settings.py +0 -0
  13. {docling-0.4.0 → docling-1.0.0}/docling/document_converter.py +0 -0
  14. {docling-0.4.0 → docling-1.0.0}/docling/models/__init__.py +0 -0
  15. {docling-0.4.0 → docling-1.0.0}/docling/models/ds_glm_model.py +0 -0
  16. {docling-0.4.0 → docling-1.0.0}/docling/models/easyocr_model.py +0 -0
  17. {docling-0.4.0 → docling-1.0.0}/docling/models/layout_model.py +0 -0
  18. {docling-0.4.0 → docling-1.0.0}/docling/models/page_assemble_model.py +0 -0
  19. {docling-0.4.0 → docling-1.0.0}/docling/models/table_structure_model.py +0 -0
  20. {docling-0.4.0 → docling-1.0.0}/docling/pipeline/__init__.py +0 -0
  21. {docling-0.4.0 → docling-1.0.0}/docling/pipeline/base_model_pipeline.py +0 -0
  22. {docling-0.4.0 → docling-1.0.0}/docling/pipeline/standard_model_pipeline.py +0 -0
  23. {docling-0.4.0 → docling-1.0.0}/docling/utils/__init__.py +0 -0
  24. {docling-0.4.0 → docling-1.0.0}/docling/utils/layout_utils.py +0 -0
  25. {docling-0.4.0 → docling-1.0.0}/docling/utils/utils.py +0 -0
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 0.4.0
3
+ Version: 1.0.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
7
7
  Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
8
8
  Author: Christoph Auer
9
9
  Author-email: cau@zurich.ibm.com
10
- Requires-Python: >=3.11,<4.0
10
+ Requires-Python: >=3.10,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Intended Audience :: Science/Research
@@ -15,13 +15,13 @@ Classifier: License :: OSI Approved :: MIT License
15
15
  Classifier: Operating System :: MacOS :: MacOS X
16
16
  Classifier: Operating System :: POSIX :: Linux
17
17
  Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
18
19
  Classifier: Programming Language :: Python :: 3.11
19
20
  Classifier: Programming Language :: Python :: 3.12
20
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
22
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
22
- Requires-Dist: deepsearch-toolkit (>=0.47.0,<1)
23
- Requires-Dist: docling-core (>=0.2.0,<0.3.0)
24
- Requires-Dist: docling-ibm-models (>=0.2.0,<0.3.0)
23
+ Requires-Dist: docling-core (>=1.1.0,<2.0.0)
24
+ Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
25
25
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
26
26
  Requires-Dist: huggingface_hub (>=0.23,<1)
27
27
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -31,19 +31,21 @@ Project-URL: Repository, https://github.com/DS4SD/docling
31
31
  Description-Content-Type: text/markdown
32
32
 
33
33
  <p align="center">
34
- <a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
34
+ <a href="https://github.com/ds4sd/docling">
35
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
36
+ </a>
35
37
  </p>
36
38
 
37
39
  # Docling
38
40
 
39
41
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
40
- ![Python](https://img.shields.io/badge/python-3.11%20%7C%203.12-blue)
42
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
41
43
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
42
44
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
43
45
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
44
46
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
45
47
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
46
- [![License MIT](https://img.shields.io/github/license/ds4sd/deepsearch-toolkit)](https://opensource.org/licenses/MIT)
48
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
47
49
 
48
50
  Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
49
51
 
@@ -65,7 +67,7 @@ pip install docling
65
67
 
66
68
  ### Development setup
67
69
 
68
- To develop for Docling, you need Python 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
70
+ To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
69
71
  ```bash
70
72
  poetry install
71
73
  ```
@@ -81,7 +83,7 @@ The output of the above command will be written to `./scratch`.
81
83
 
82
84
  ### Adjust pipeline features
83
85
 
84
- **Control pipeline options**
86
+ #### Control pipeline options
85
87
 
86
88
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
87
89
  ```python
@@ -94,16 +96,15 @@ doc_converter = DocumentConverter(
94
96
  )
95
97
  ```
96
98
 
97
- **Control table extraction options**
99
+ #### Control table extraction options
98
100
 
99
101
  You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
100
102
  This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
101
103
 
102
104
 
103
105
  ```python
104
-
105
106
  pipeline_options = PipelineOptions(do_table_structure=True)
106
- pipeline_options.table_structure_options.do_cell_matching = False # Uses text cells predicted from table structure model
107
+ pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
107
108
 
108
109
  doc_converter = DocumentConverter(
109
110
  artifacts_path=artifacts_path,
@@ -1,17 +1,19 @@
1
1
  <p align="center">
2
- <a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
2
+ <a href="https://github.com/ds4sd/docling">
3
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
4
+ </a>
3
5
  </p>
4
6
 
5
7
  # Docling
6
8
 
7
9
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
8
- ![Python](https://img.shields.io/badge/python-3.11%20%7C%203.12-blue)
10
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
9
11
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
10
12
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
11
13
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
12
14
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
13
15
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
14
- [![License MIT](https://img.shields.io/github/license/ds4sd/deepsearch-toolkit)](https://opensource.org/licenses/MIT)
16
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
15
17
 
16
18
  Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
17
19
 
@@ -33,7 +35,7 @@ pip install docling
33
35
 
34
36
  ### Development setup
35
37
 
36
- To develop for Docling, you need Python 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
38
+ To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
37
39
  ```bash
38
40
  poetry install
39
41
  ```
@@ -49,7 +51,7 @@ The output of the above command will be written to `./scratch`.
49
51
 
50
52
  ### Adjust pipeline features
51
53
 
52
- **Control pipeline options**
54
+ #### Control pipeline options
53
55
 
54
56
  You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
55
57
  ```python
@@ -62,16 +64,15 @@ doc_converter = DocumentConverter(
62
64
  )
63
65
  ```
64
66
 
65
- **Control table extraction options**
67
+ #### Control table extraction options
66
68
 
67
69
  You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
68
70
  This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
69
71
 
70
72
 
71
73
  ```python
72
-
73
74
  pipeline_options = PipelineOptions(do_table_structure=True)
74
- pipeline_options.table_structure_options.do_cell_matching = False # Uses text cells predicted from table structure model
75
+ pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
75
76
 
76
77
  doc_converter = DocumentConverter(
77
78
  artifacts_path=artifacts_path,
@@ -3,7 +3,6 @@ from io import BytesIO
3
3
  from pathlib import Path, PurePath
4
4
  from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
5
5
 
6
- from deepsearch.documents.core.export import export_to_markdown
7
6
  from docling_core.types import BaseCell, BaseText
8
7
  from docling_core.types import BoundingBox as DsBoundingBox
9
8
  from docling_core.types import Document as DsDocument
@@ -299,9 +298,7 @@ class ConvertedDocument(BaseModel):
299
298
 
300
299
  def render_as_markdown(self):
301
300
  if self.output:
302
- return export_to_markdown(
303
- self.output.model_dump(by_alias=True, exclude_none=True)
304
- )
301
+ return self.output.export_to_markdown()
305
302
  else:
306
303
  return ""
307
304
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "0.4.0" # DO NOT EDIT, updated automatically
3
+ version = "1.0.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -21,12 +21,11 @@ keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentatio
21
21
  packages = [{include = "docling"}]
22
22
 
23
23
  [tool.poetry.dependencies]
24
- python = "^3.11"
24
+ python = "^3.10"
25
25
  pydantic = "^2.0.0"
26
- docling-core = "^0.2.0"
27
- docling-ibm-models = "^0.2.0"
26
+ docling-core = "^1.1.0"
27
+ docling-ibm-models = "^1.1.0"
28
28
  deepsearch-glm = ">=0.19.0,<1"
29
- deepsearch-toolkit = ">=0.47.0,<1"
30
29
  filetype = "^1.2.0"
31
30
  pypdfium2 = "^4.30.0"
32
31
  pydantic-settings = "^2.3.0"
@@ -55,7 +54,7 @@ build-backend = "poetry.core.masonry.api"
55
54
 
56
55
  [tool.black]
57
56
  line-length = 88
58
- target-version = ["py311"]
57
+ target-version = ["py310"]
59
58
  include = '\.pyi?$'
60
59
 
61
60
  [tool.isort]
@@ -67,7 +66,7 @@ py_version=311
67
66
  pretty = true
68
67
  # strict = true
69
68
  no_implicit_optional = true
70
- python_version = "3.11"
69
+ python_version = "3.10"
71
70
 
72
71
  [tool.flake8]
73
72
  max-line-length = 88
File without changes
File without changes
File without changes