docling 0.3.1__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {docling-0.3.1 → docling-0.4.0}/PKG-INFO +21 -2
  2. {docling-0.3.1 → docling-0.4.0}/README.md +20 -1
  3. {docling-0.3.1 → docling-0.4.0}/docling/datamodel/base_models.py +23 -2
  4. {docling-0.3.1 → docling-0.4.0}/docling/models/page_assemble_model.py +0 -12
  5. {docling-0.3.1 → docling-0.4.0}/docling/models/table_structure_model.py +43 -11
  6. {docling-0.3.1 → docling-0.4.0}/docling/pipeline/standard_model_pipeline.py +1 -1
  7. {docling-0.3.1 → docling-0.4.0}/pyproject.toml +1 -1
  8. {docling-0.3.1 → docling-0.4.0}/LICENSE +0 -0
  9. {docling-0.3.1 → docling-0.4.0}/docling/__init__.py +0 -0
  10. {docling-0.3.1 → docling-0.4.0}/docling/backend/__init__.py +0 -0
  11. {docling-0.3.1 → docling-0.4.0}/docling/backend/abstract_backend.py +0 -0
  12. {docling-0.3.1 → docling-0.4.0}/docling/backend/pypdfium2_backend.py +0 -0
  13. {docling-0.3.1 → docling-0.4.0}/docling/datamodel/__init__.py +0 -0
  14. {docling-0.3.1 → docling-0.4.0}/docling/datamodel/document.py +0 -0
  15. {docling-0.3.1 → docling-0.4.0}/docling/datamodel/settings.py +0 -0
  16. {docling-0.3.1 → docling-0.4.0}/docling/document_converter.py +0 -0
  17. {docling-0.3.1 → docling-0.4.0}/docling/models/__init__.py +0 -0
  18. {docling-0.3.1 → docling-0.4.0}/docling/models/ds_glm_model.py +0 -0
  19. {docling-0.3.1 → docling-0.4.0}/docling/models/easyocr_model.py +0 -0
  20. {docling-0.3.1 → docling-0.4.0}/docling/models/layout_model.py +0 -0
  21. {docling-0.3.1 → docling-0.4.0}/docling/pipeline/__init__.py +0 -0
  22. {docling-0.3.1 → docling-0.4.0}/docling/pipeline/base_model_pipeline.py +0 -0
  23. {docling-0.3.1 → docling-0.4.0}/docling/utils/__init__.py +0 -0
  24. {docling-0.3.1 → docling-0.4.0}/docling/utils/layout_utils.py +0 -0
  25. {docling-0.3.1 → docling-0.4.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 0.3.1
3
+ Version: 0.4.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -79,7 +79,9 @@ python examples/convert.py
79
79
  ```
80
80
  The output of the above command will be written to `./scratch`.
81
81
 
82
- ### Enable or disable pipeline features
82
+ ### Adjust pipeline features
83
+
84
+ **Control pipeline options**
83
85
 
84
86
  You can control whether table structure recognition or OCR should be performed through arguments passed to `DocumentConverter`:
85
87
  ```python
@@ -92,6 +94,23 @@ doc_converter = DocumentConverter(
92
94
  )
93
95
  ```
94
96
 
97
+ **Control table extraction options**
98
+
99
+ You can control whether table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
100
+ This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
101
+
102
+
103
+ ```python
104
+
105
+ pipeline_options = PipelineOptions(do_table_structure=True)
106
+ pipeline_options.table_structure_options.do_cell_matching = False # Uses text cells predicted from table structure model
107
+
108
+ doc_converter = DocumentConverter(
109
+ artifacts_path=artifacts_path,
110
+ pipeline_options=pipeline_options,
111
+ )
112
+ ```
113
+
95
114
  ### Impose limits on the document size
96
115
 
97
116
  You can limit the file size and the number of pages that are allowed to be processed per document:
@@ -47,7 +47,9 @@ python examples/convert.py
47
47
  ```
48
48
  The output of the above command will be written to `./scratch`.
49
49
 
50
- ### Enable or disable pipeline features
50
+ ### Adjust pipeline features
51
+
52
+ **Control pipeline options**
51
53
 
52
54
  You can control whether table structure recognition or OCR should be performed through arguments passed to `DocumentConverter`:
53
55
  ```python
@@ -60,6 +62,23 @@ doc_converter = DocumentConverter(
60
62
  )
61
63
  ```
62
64
 
65
+ **Control table extraction options**
66
+
67
+ You can control whether table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
68
+ This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
69
+
70
+
71
+ ```python
72
+
73
+ pipeline_options = PipelineOptions(do_table_structure=True)
74
+ pipeline_options.table_structure_options.do_cell_matching = False # Uses text cells predicted from table structure model
75
+
76
+ doc_converter = DocumentConverter(
77
+ artifacts_path=artifacts_path,
78
+ pipeline_options=pipeline_options,
79
+ )
80
+ ```
81
+
63
82
  ### Impose limits on the document size
64
83
 
65
84
  You can limit the file size and the number of pages that are allowed to be processed per document:
@@ -1,3 +1,4 @@
1
+ import copy
1
2
  from enum import Enum, auto
2
3
  from io import BytesIO
3
4
  from typing import Any, Dict, List, Optional, Tuple, Union
@@ -47,6 +48,15 @@ class BoundingBox(BaseModel):
47
48
  def height(self):
48
49
  return abs(self.t - self.b)
49
50
 
51
+ def scaled(self, scale: float) -> "BoundingBox":
52
+ out_bbox = copy.deepcopy(self)
53
+ out_bbox.l *= scale
54
+ out_bbox.r *= scale
55
+ out_bbox.t *= scale
56
+ out_bbox.b *= scale
57
+
58
+ return out_bbox
59
+
50
60
  def as_tuple(self):
51
61
  if self.coord_origin == CoordOrigin.TOPLEFT:
52
62
  return (self.l, self.t, self.r, self.b)
@@ -241,6 +251,17 @@ class DocumentStream(BaseModel):
241
251
  stream: BytesIO
242
252
 
243
253
 
254
+ class TableStructureOptions(BaseModel):
255
+ do_cell_matching: bool = (
256
+ True
257
+ # True: Matches predictions back to PDF cells. Can break table output if PDF cells
258
+ # are merged across table columns.
259
+ # False: Let table structure model define the text cells, ignore PDF cells.
260
+ )
261
+
262
+
244
263
  class PipelineOptions(BaseModel):
245
- do_table_structure: bool = True
246
- do_ocr: bool = False
264
+ do_table_structure: bool = True # True: perform table structure extraction
265
+ do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
266
+
267
+ table_structure_options: TableStructureOptions = TableStructureOptions()
@@ -19,18 +19,6 @@ class PageAssembleModel:
19
19
  def __init__(self, config):
20
20
  self.config = config
21
21
 
22
- # self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
23
-
24
- # def sanitize_text_poor(self, lines):
25
- # text = '\n'.join(lines)
26
- #
27
- # # treat line wraps.
28
- # sanitized_text = self.line_wrap_pattern.sub('', text)
29
- #
30
- # sanitized_text = sanitized_text.replace('\n', ' ')
31
- #
32
- # return sanitized_text
33
-
34
22
  def sanitize_text(self, lines):
35
23
  if len(lines) <= 1:
36
24
  return " ".join(lines)
@@ -1,7 +1,10 @@
1
- from typing import Iterable
1
+ import copy
2
+ import random
3
+ from typing import Iterable, List
2
4
 
3
5
  import numpy
4
6
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
7
+ from PIL import ImageDraw
5
8
 
6
9
  from docling.datamodel.base_models import (
7
10
  BoundingBox,
@@ -28,6 +31,21 @@ class TableStructureModel:
28
31
  self.tm_model_type = self.tm_config["model"]["type"]
29
32
 
30
33
  self.tf_predictor = TFPredictor(self.tm_config)
34
+ self.scale = 2.0 # Scale up table input images to 144 dpi
35
+
36
+ def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
37
+ image = page._backend.get_page_image()
38
+ draw = ImageDraw.Draw(image)
39
+
40
+ for table_element in tbl_list:
41
+ x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
42
+ draw.rectangle([(x0, y0), (x1, y1)], outline="red")
43
+
44
+ for tc in table_element.table_cells:
45
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
46
+ draw.rectangle([(x0, y0), (x1, y1)], outline="blue")
47
+
48
+ image.show()
31
49
 
32
50
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
33
51
 
@@ -36,16 +54,17 @@ class TableStructureModel:
36
54
  return
37
55
 
38
56
  for page in page_batch:
57
+
39
58
  page.predictions.tablestructure = TableStructurePrediction() # dummy
40
59
 
41
60
  in_tables = [
42
61
  (
43
62
  cluster,
44
63
  [
45
- round(cluster.bbox.l),
46
- round(cluster.bbox.t),
47
- round(cluster.bbox.r),
48
- round(cluster.bbox.b),
64
+ round(cluster.bbox.l) * self.scale,
65
+ round(cluster.bbox.t) * self.scale,
66
+ round(cluster.bbox.r) * self.scale,
67
+ round(cluster.bbox.b) * self.scale,
49
68
  ],
50
69
  )
51
70
  for cluster in page.predictions.layout.clusters
@@ -65,20 +84,29 @@ class TableStructureModel:
65
84
  ):
66
85
  # Only allow non-empty strings (not just spaces) into the cells of a table
67
86
  if len(c.text.strip()) > 0:
68
- tokens.append(c.model_dump())
87
+ new_cell = copy.deepcopy(c)
88
+ new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
89
+
90
+ tokens.append(new_cell.model_dump())
69
91
 
70
- iocr_page = {
71
- "image": numpy.asarray(page.image),
92
+ page_input = {
72
93
  "tokens": tokens,
73
- "width": page.size.width,
74
- "height": page.size.height,
94
+ "width": page.size.width * self.scale,
95
+ "height": page.size.height * self.scale,
75
96
  }
97
+ # add image to page input.
98
+ if self.scale == 1.0:
99
+ page_input["image"] = numpy.asarray(page.image)
100
+ else: # render new page image on the fly at desired scale
101
+ page_input["image"] = numpy.asarray(
102
+ page._backend.get_page_image(scale=self.scale)
103
+ )
76
104
 
77
105
  table_clusters, table_bboxes = zip(*in_tables)
78
106
 
79
107
  if len(table_bboxes):
80
108
  tf_output = self.tf_predictor.multi_table_predict(
81
- iocr_page, table_bboxes, do_matching=self.do_cell_matching
109
+ page_input, table_bboxes, do_matching=self.do_cell_matching
82
110
  )
83
111
 
84
112
  for table_cluster, table_out in zip(table_clusters, tf_output):
@@ -91,6 +119,7 @@ class TableStructureModel:
91
119
  element["bbox"]["token"] = text_piece
92
120
 
93
121
  tc = TableCell.model_validate(element)
122
+ tc.bbox = tc.bbox.scaled(1 / self.scale)
94
123
  table_cells.append(tc)
95
124
 
96
125
  # Retrieving cols/rows, after post processing:
@@ -111,4 +140,7 @@ class TableStructureModel:
111
140
 
112
141
  page.predictions.tablestructure.table_map[table_cluster.id] = tbl
113
142
 
143
+ # For debugging purposes:
144
+ # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
145
+
114
146
  yield page
@@ -34,7 +34,7 @@ class StandardModelPipeline(BaseModelPipeline):
34
34
  "artifacts_path": artifacts_path
35
35
  / StandardModelPipeline._table_model_path,
36
36
  "enabled": pipeline_options.do_table_structure,
37
- "do_cell_matching": False,
37
+ "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
38
38
  }
39
39
  ),
40
40
  ]
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "0.3.1" # DO NOT EDIT, updated automatically
3
+ version = "0.4.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes