docling 0.3.1__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {docling-0.3.1 → docling-1.0.0}/PKG-INFO +30 -10
  2. {docling-0.3.1 → docling-1.0.0}/README.md +25 -5
  3. {docling-0.3.1 → docling-1.0.0}/docling/datamodel/base_models.py +23 -2
  4. {docling-0.3.1 → docling-1.0.0}/docling/datamodel/document.py +1 -4
  5. {docling-0.3.1 → docling-1.0.0}/docling/models/page_assemble_model.py +0 -12
  6. {docling-0.3.1 → docling-1.0.0}/docling/models/table_structure_model.py +43 -11
  7. {docling-0.3.1 → docling-1.0.0}/docling/pipeline/standard_model_pipeline.py +1 -1
  8. {docling-0.3.1 → docling-1.0.0}/pyproject.toml +6 -7
  9. {docling-0.3.1 → docling-1.0.0}/LICENSE +0 -0
  10. {docling-0.3.1 → docling-1.0.0}/docling/__init__.py +0 -0
  11. {docling-0.3.1 → docling-1.0.0}/docling/backend/__init__.py +0 -0
  12. {docling-0.3.1 → docling-1.0.0}/docling/backend/abstract_backend.py +0 -0
  13. {docling-0.3.1 → docling-1.0.0}/docling/backend/pypdfium2_backend.py +0 -0
  14. {docling-0.3.1 → docling-1.0.0}/docling/datamodel/__init__.py +0 -0
  15. {docling-0.3.1 → docling-1.0.0}/docling/datamodel/settings.py +0 -0
  16. {docling-0.3.1 → docling-1.0.0}/docling/document_converter.py +0 -0
  17. {docling-0.3.1 → docling-1.0.0}/docling/models/__init__.py +0 -0
  18. {docling-0.3.1 → docling-1.0.0}/docling/models/ds_glm_model.py +0 -0
  19. {docling-0.3.1 → docling-1.0.0}/docling/models/easyocr_model.py +0 -0
  20. {docling-0.3.1 → docling-1.0.0}/docling/models/layout_model.py +0 -0
  21. {docling-0.3.1 → docling-1.0.0}/docling/pipeline/__init__.py +0 -0
  22. {docling-0.3.1 → docling-1.0.0}/docling/pipeline/base_model_pipeline.py +0 -0
  23. {docling-0.3.1 → docling-1.0.0}/docling/utils/__init__.py +0 -0
  24. {docling-0.3.1 → docling-1.0.0}/docling/utils/layout_utils.py +0 -0
  25. {docling-0.3.1 → docling-1.0.0}/docling/utils/utils.py +0 -0
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 0.3.1
3
+ Version: 1.0.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
7
7
  Keywords: docling,convert,document,pdf,layout model,segmentation,table structure,table former
8
8
  Author: Christoph Auer
9
9
  Author-email: cau@zurich.ibm.com
10
- Requires-Python: >=3.11,<4.0
10
+ Requires-Python: >=3.10,<4.0
11
11
  Classifier: Development Status :: 5 - Production/Stable
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Intended Audience :: Science/Research
@@ -15,13 +15,13 @@ Classifier: License :: OSI Approved :: MIT License
15
15
  Classifier: Operating System :: MacOS :: MacOS X
16
16
  Classifier: Operating System :: POSIX :: Linux
17
17
  Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
18
19
  Classifier: Programming Language :: Python :: 3.11
19
20
  Classifier: Programming Language :: Python :: 3.12
20
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
22
  Requires-Dist: deepsearch-glm (>=0.19.0,<1)
22
- Requires-Dist: deepsearch-toolkit (>=0.47.0,<1)
23
- Requires-Dist: docling-core (>=0.2.0,<0.3.0)
24
- Requires-Dist: docling-ibm-models (>=0.2.0,<0.3.0)
23
+ Requires-Dist: docling-core (>=1.1.0,<2.0.0)
24
+ Requires-Dist: docling-ibm-models (>=1.1.0,<2.0.0)
25
25
  Requires-Dist: filetype (>=1.2.0,<2.0.0)
26
26
  Requires-Dist: huggingface_hub (>=0.23,<1)
27
27
  Requires-Dist: pydantic (>=2.0.0,<3.0.0)
@@ -31,19 +31,21 @@ Project-URL: Repository, https://github.com/DS4SD/docling
31
31
  Description-Content-Type: text/markdown
32
32
 
33
33
  <p align="center">
34
- <a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
34
+ <a href="https://github.com/ds4sd/docling">
35
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
36
+ </a>
35
37
  </p>
36
38
 
37
39
  # Docling
38
40
 
39
41
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
40
- ![Python](https://img.shields.io/badge/python-3.11%20%7C%203.12-blue)
42
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
41
43
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
42
44
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
43
45
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
44
46
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
45
47
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
46
- [![License MIT](https://img.shields.io/github/license/ds4sd/deepsearch-toolkit)](https://opensource.org/licenses/MIT)
48
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
47
49
 
48
50
  Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
49
51
 
@@ -65,7 +67,7 @@ pip install docling
65
67
 
66
68
  ### Development setup
67
69
 
68
- To develop for Docling, you need Python 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
70
+ To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
69
71
  ```bash
70
72
  poetry install
71
73
  ```
@@ -79,7 +81,9 @@ python examples/convert.py
79
81
  ```
80
82
  The output of the above command will be written to `./scratch`.
81
83
 
82
- ### Enable or disable pipeline features
84
+ ### Adjust pipeline features
85
+
86
+ #### Control pipeline options
83
87
 
84
88
  You can control whether table structure recognition or OCR is performed via arguments passed to `DocumentConverter`:
85
89
  ```python
@@ -92,6 +96,22 @@ doc_converter = DocumentConverter(
92
96
  )
93
97
  ```
94
98
 
99
+ #### Control table extraction options
100
+
101
+ You can control whether table structure recognition maps the recognized structure back to PDF cells (default) or uses text cells from the structure prediction itself.
102
+ This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
103
+
104
+
105
+ ```python
106
+ pipeline_options = PipelineOptions(do_table_structure=True)
107
+ pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
108
+
109
+ doc_converter = DocumentConverter(
110
+ artifacts_path=artifacts_path,
111
+ pipeline_options=pipeline_options,
112
+ )
113
+ ```
114
+
95
115
  ### Impose limits on the document size
96
116
 
97
117
  You can limit the file size and the number of pages allowed to be processed per document:
@@ -1,17 +1,19 @@
1
1
  <p align="center">
2
- <a href="https://github.com/ds4sd/docling"> <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
2
+ <a href="https://github.com/ds4sd/docling">
3
+ <img loading="lazy" alt="Docling" src="https://github.com/DS4SD/docling/raw/main/logo.png" width="150" />
4
+ </a>
3
5
  </p>
4
6
 
5
7
  # Docling
6
8
 
7
9
  [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
8
- ![Python](https://img.shields.io/badge/python-3.11%20%7C%203.12-blue)
10
+ ![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
9
11
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
10
12
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
11
13
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
12
14
  [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
13
15
  [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
14
- [![License MIT](https://img.shields.io/github/license/ds4sd/deepsearch-toolkit)](https://opensource.org/licenses/MIT)
16
+ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT)
15
17
 
16
18
  Docling bundles PDF document conversion to JSON and Markdown in an easy, self-contained package.
17
19
 
@@ -33,7 +35,7 @@ pip install docling
33
35
 
34
36
  ### Development setup
35
37
 
36
- To develop for Docling, you need Python 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
38
+ To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
37
39
  ```bash
38
40
  poetry install
39
41
  ```
@@ -47,7 +49,9 @@ python examples/convert.py
47
49
  ```
48
50
  The output of the above command will be written to `./scratch`.
49
51
 
50
- ### Enable or disable pipeline features
52
+ ### Adjust pipeline features
53
+
54
+ #### Control pipeline options
51
55
 
52
56
  You can control whether table structure recognition or OCR is performed via arguments passed to `DocumentConverter`:
53
57
  ```python
@@ -60,6 +64,22 @@ doc_converter = DocumentConverter(
60
64
  )
61
65
  ```
62
66
 
67
+ #### Control table extraction options
68
+
69
+ You can control whether table structure recognition maps the recognized structure back to PDF cells (default) or uses text cells from the structure prediction itself.
70
+ This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
71
+
72
+
73
+ ```python
74
+ pipeline_options = PipelineOptions(do_table_structure=True)
75
+ pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model
76
+
77
+ doc_converter = DocumentConverter(
78
+ artifacts_path=artifacts_path,
79
+ pipeline_options=pipeline_options,
80
+ )
81
+ ```
82
+
63
83
  ### Impose limits on the document size
64
84
 
65
85
  You can limit the file size and the number of pages allowed to be processed per document:
@@ -1,3 +1,4 @@
1
+ import copy
1
2
  from enum import Enum, auto
2
3
  from io import BytesIO
3
4
  from typing import Any, Dict, List, Optional, Tuple, Union
@@ -47,6 +48,15 @@ class BoundingBox(BaseModel):
47
48
  def height(self):
48
49
  return abs(self.t - self.b)
49
50
 
51
+ def scaled(self, scale: float) -> "BoundingBox":
52
+ out_bbox = copy.deepcopy(self)
53
+ out_bbox.l *= scale
54
+ out_bbox.r *= scale
55
+ out_bbox.t *= scale
56
+ out_bbox.b *= scale
57
+
58
+ return out_bbox
59
+
50
60
  def as_tuple(self):
51
61
  if self.coord_origin == CoordOrigin.TOPLEFT:
52
62
  return (self.l, self.t, self.r, self.b)
@@ -241,6 +251,17 @@ class DocumentStream(BaseModel):
241
251
  stream: BytesIO
242
252
 
243
253
 
254
+ class TableStructureOptions(BaseModel):
255
+ do_cell_matching: bool = (
256
+ True
257
+ # True: Matches predictions back to PDF cells. Can break table output if PDF cells
258
+ # are merged across table columns.
259
+ # False: Let table structure model define the text cells, ignore PDF cells.
260
+ )
261
+
262
+
244
263
  class PipelineOptions(BaseModel):
245
- do_table_structure: bool = True
246
- do_ocr: bool = False
264
+ do_table_structure: bool = True # True: perform table structure extraction
265
+ do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
266
+
267
+ table_structure_options: TableStructureOptions = TableStructureOptions()
@@ -3,7 +3,6 @@ from io import BytesIO
3
3
  from pathlib import Path, PurePath
4
4
  from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
5
5
 
6
- from deepsearch.documents.core.export import export_to_markdown
7
6
  from docling_core.types import BaseCell, BaseText
8
7
  from docling_core.types import BoundingBox as DsBoundingBox
9
8
  from docling_core.types import Document as DsDocument
@@ -299,9 +298,7 @@ class ConvertedDocument(BaseModel):
299
298
 
300
299
  def render_as_markdown(self):
301
300
  if self.output:
302
- return export_to_markdown(
303
- self.output.model_dump(by_alias=True, exclude_none=True)
304
- )
301
+ return self.output.export_to_markdown()
305
302
  else:
306
303
  return ""
307
304
 
@@ -19,18 +19,6 @@ class PageAssembleModel:
19
19
  def __init__(self, config):
20
20
  self.config = config
21
21
 
22
- # self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
23
-
24
- # def sanitize_text_poor(self, lines):
25
- # text = '\n'.join(lines)
26
- #
27
- # # treat line wraps.
28
- # sanitized_text = self.line_wrap_pattern.sub('', text)
29
- #
30
- # sanitized_text = sanitized_text.replace('\n', ' ')
31
- #
32
- # return sanitized_text
33
-
34
22
  def sanitize_text(self, lines):
35
23
  if len(lines) <= 1:
36
24
  return " ".join(lines)
@@ -1,7 +1,10 @@
1
- from typing import Iterable
1
+ import copy
2
+ import random
3
+ from typing import Iterable, List
2
4
 
3
5
  import numpy
4
6
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
7
+ from PIL import ImageDraw
5
8
 
6
9
  from docling.datamodel.base_models import (
7
10
  BoundingBox,
@@ -28,6 +31,21 @@ class TableStructureModel:
28
31
  self.tm_model_type = self.tm_config["model"]["type"]
29
32
 
30
33
  self.tf_predictor = TFPredictor(self.tm_config)
34
+ self.scale = 2.0 # Scale up table input images to 144 dpi
35
+
36
+ def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
37
+ image = page._backend.get_page_image()
38
+ draw = ImageDraw.Draw(image)
39
+
40
+ for table_element in tbl_list:
41
+ x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
42
+ draw.rectangle([(x0, y0), (x1, y1)], outline="red")
43
+
44
+ for tc in table_element.table_cells:
45
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
46
+ draw.rectangle([(x0, y0), (x1, y1)], outline="blue")
47
+
48
+ image.show()
31
49
 
32
50
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
33
51
 
@@ -36,16 +54,17 @@ class TableStructureModel:
36
54
  return
37
55
 
38
56
  for page in page_batch:
57
+
39
58
  page.predictions.tablestructure = TableStructurePrediction() # dummy
40
59
 
41
60
  in_tables = [
42
61
  (
43
62
  cluster,
44
63
  [
45
- round(cluster.bbox.l),
46
- round(cluster.bbox.t),
47
- round(cluster.bbox.r),
48
- round(cluster.bbox.b),
64
+ round(cluster.bbox.l) * self.scale,
65
+ round(cluster.bbox.t) * self.scale,
66
+ round(cluster.bbox.r) * self.scale,
67
+ round(cluster.bbox.b) * self.scale,
49
68
  ],
50
69
  )
51
70
  for cluster in page.predictions.layout.clusters
@@ -65,20 +84,29 @@ class TableStructureModel:
65
84
  ):
66
85
  # Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
67
86
  if len(c.text.strip()) > 0:
68
- tokens.append(c.model_dump())
87
+ new_cell = copy.deepcopy(c)
88
+ new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
89
+
90
+ tokens.append(new_cell.model_dump())
69
91
 
70
- iocr_page = {
71
- "image": numpy.asarray(page.image),
92
+ page_input = {
72
93
  "tokens": tokens,
73
- "width": page.size.width,
74
- "height": page.size.height,
94
+ "width": page.size.width * self.scale,
95
+ "height": page.size.height * self.scale,
75
96
  }
97
+ # add image to page input.
98
+ if self.scale == 1.0:
99
+ page_input["image"] = numpy.asarray(page.image)
100
+ else: # render new page image on the fly at desired scale
101
+ page_input["image"] = numpy.asarray(
102
+ page._backend.get_page_image(scale=self.scale)
103
+ )
76
104
 
77
105
  table_clusters, table_bboxes = zip(*in_tables)
78
106
 
79
107
  if len(table_bboxes):
80
108
  tf_output = self.tf_predictor.multi_table_predict(
81
- iocr_page, table_bboxes, do_matching=self.do_cell_matching
109
+ page_input, table_bboxes, do_matching=self.do_cell_matching
82
110
  )
83
111
 
84
112
  for table_cluster, table_out in zip(table_clusters, tf_output):
@@ -91,6 +119,7 @@ class TableStructureModel:
91
119
  element["bbox"]["token"] = text_piece
92
120
 
93
121
  tc = TableCell.model_validate(element)
122
+ tc.bbox = tc.bbox.scaled(1 / self.scale)
94
123
  table_cells.append(tc)
95
124
 
96
125
  # Retrieving cols/rows, after post processing:
@@ -111,4 +140,7 @@ class TableStructureModel:
111
140
 
112
141
  page.predictions.tablestructure.table_map[table_cluster.id] = tbl
113
142
 
143
+ # For debugging purposes:
144
+ # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
145
+
114
146
  yield page
@@ -34,7 +34,7 @@ class StandardModelPipeline(BaseModelPipeline):
34
34
  "artifacts_path": artifacts_path
35
35
  / StandardModelPipeline._table_model_path,
36
36
  "enabled": pipeline_options.do_table_structure,
37
- "do_cell_matching": False,
37
+ "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
38
38
  }
39
39
  ),
40
40
  ]
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "0.3.1" # DO NOT EDIT, updated automatically
3
+ version = "1.0.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -21,12 +21,11 @@ keywords= ["docling", "convert", "document", "pdf", "layout model", "segmentatio
21
21
  packages = [{include = "docling"}]
22
22
 
23
23
  [tool.poetry.dependencies]
24
- python = "^3.11"
24
+ python = "^3.10"
25
25
  pydantic = "^2.0.0"
26
- docling-core = "^0.2.0"
27
- docling-ibm-models = "^0.2.0"
26
+ docling-core = "^1.1.0"
27
+ docling-ibm-models = "^1.1.0"
28
28
  deepsearch-glm = ">=0.19.0,<1"
29
- deepsearch-toolkit = ">=0.47.0,<1"
30
29
  filetype = "^1.2.0"
31
30
  pypdfium2 = "^4.30.0"
32
31
  pydantic-settings = "^2.3.0"
@@ -55,7 +54,7 @@ build-backend = "poetry.core.masonry.api"
55
54
 
56
55
  [tool.black]
57
56
  line-length = 88
58
- target-version = ["py311"]
57
+ target-version = ["py310"]
59
58
  include = '\.pyi?$'
60
59
 
61
60
  [tool.isort]
@@ -67,7 +66,7 @@ py_version=311
67
66
  pretty = true
68
67
  # strict = true
69
68
  no_implicit_optional = true
70
- python_version = "3.11"
69
+ python_version = "3.10"
71
70
 
72
71
  [tool.flake8]
73
72
  max-line-length = 88
File without changes
File without changes
File without changes