docling 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ import copy
1
2
  from enum import Enum, auto
2
3
  from io import BytesIO
3
4
  from typing import Any, Dict, List, Optional, Tuple, Union
@@ -47,6 +48,15 @@ class BoundingBox(BaseModel):
47
48
  def height(self):
48
49
  return abs(self.t - self.b)
49
50
 
51
+ def scaled(self, scale: float) -> "BoundingBox":
52
+ out_bbox = copy.deepcopy(self)
53
+ out_bbox.l *= scale
54
+ out_bbox.r *= scale
55
+ out_bbox.t *= scale
56
+ out_bbox.b *= scale
57
+
58
+ return out_bbox
59
+
50
60
  def as_tuple(self):
51
61
  if self.coord_origin == CoordOrigin.TOPLEFT:
52
62
  return (self.l, self.t, self.r, self.b)
@@ -241,6 +251,17 @@ class DocumentStream(BaseModel):
241
251
  stream: BytesIO
242
252
 
243
253
 
254
+ class TableStructureOptions(BaseModel):
255
+ do_cell_matching: bool = (
256
+ True
257
+ # True: Matches predictions back to PDF cells. Can break table output if PDF cells
258
+ # are merged across table columns.
259
+ # False: Let table structure model define the text cells, ignore PDF cells.
260
+ )
261
+
262
+
244
263
  class PipelineOptions(BaseModel):
245
- do_table_structure: bool = True
246
- do_ocr: bool = False
264
+ do_table_structure: bool = True # True: perform table structure extraction
265
+ do_ocr: bool = False # True: perform OCR, replace programmatic PDF text
266
+
267
+ table_structure_options: TableStructureOptions = TableStructureOptions()
@@ -19,18 +19,6 @@ class PageAssembleModel:
19
19
  def __init__(self, config):
20
20
  self.config = config
21
21
 
22
- # self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
23
-
24
- # def sanitize_text_poor(self, lines):
25
- # text = '\n'.join(lines)
26
- #
27
- # # treat line wraps.
28
- # sanitized_text = self.line_wrap_pattern.sub('', text)
29
- #
30
- # sanitized_text = sanitized_text.replace('\n', ' ')
31
- #
32
- # return sanitized_text
33
-
34
22
  def sanitize_text(self, lines):
35
23
  if len(lines) <= 1:
36
24
  return " ".join(lines)
@@ -1,7 +1,10 @@
1
- from typing import Iterable
1
+ import copy
2
+ import random
3
+ from typing import Iterable, List
2
4
 
3
5
  import numpy
4
6
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
7
+ from PIL import ImageDraw
5
8
 
6
9
  from docling.datamodel.base_models import (
7
10
  BoundingBox,
@@ -28,6 +31,21 @@ class TableStructureModel:
28
31
  self.tm_model_type = self.tm_config["model"]["type"]
29
32
 
30
33
  self.tf_predictor = TFPredictor(self.tm_config)
34
+ self.scale = 2.0 # Scale up table input images to 144 dpi
35
+
36
+ def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
37
+ image = page._backend.get_page_image()
38
+ draw = ImageDraw.Draw(image)
39
+
40
+ for table_element in tbl_list:
41
+ x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
42
+ draw.rectangle([(x0, y0), (x1, y1)], outline="red")
43
+
44
+ for tc in table_element.table_cells:
45
+ x0, y0, x1, y1 = tc.bbox.as_tuple()
46
+ draw.rectangle([(x0, y0), (x1, y1)], outline="blue")
47
+
48
+ image.show()
31
49
 
32
50
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
33
51
 
@@ -36,16 +54,17 @@ class TableStructureModel:
36
54
  return
37
55
 
38
56
  for page in page_batch:
57
+
39
58
  page.predictions.tablestructure = TableStructurePrediction() # dummy
40
59
 
41
60
  in_tables = [
42
61
  (
43
62
  cluster,
44
63
  [
45
- round(cluster.bbox.l),
46
- round(cluster.bbox.t),
47
- round(cluster.bbox.r),
48
- round(cluster.bbox.b),
64
+ round(cluster.bbox.l) * self.scale,
65
+ round(cluster.bbox.t) * self.scale,
66
+ round(cluster.bbox.r) * self.scale,
67
+ round(cluster.bbox.b) * self.scale,
49
68
  ],
50
69
  )
51
70
  for cluster in page.predictions.layout.clusters
@@ -65,20 +84,29 @@ class TableStructureModel:
65
84
  ):
66
85
  # Only allow non-empty strings (not whitespace-only) into the cells of a table
67
86
  if len(c.text.strip()) > 0:
68
- tokens.append(c.model_dump())
87
+ new_cell = copy.deepcopy(c)
88
+ new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
89
+
90
+ tokens.append(new_cell.model_dump())
69
91
 
70
- iocr_page = {
71
- "image": numpy.asarray(page.image),
92
+ page_input = {
72
93
  "tokens": tokens,
73
- "width": page.size.width,
74
- "height": page.size.height,
94
+ "width": page.size.width * self.scale,
95
+ "height": page.size.height * self.scale,
75
96
  }
97
+ # add image to page input.
98
+ if self.scale == 1.0:
99
+ page_input["image"] = numpy.asarray(page.image)
100
+ else: # render new page image on the fly at desired scale
101
+ page_input["image"] = numpy.asarray(
102
+ page._backend.get_page_image(scale=self.scale)
103
+ )
76
104
 
77
105
  table_clusters, table_bboxes = zip(*in_tables)
78
106
 
79
107
  if len(table_bboxes):
80
108
  tf_output = self.tf_predictor.multi_table_predict(
81
- iocr_page, table_bboxes, do_matching=self.do_cell_matching
109
+ page_input, table_bboxes, do_matching=self.do_cell_matching
82
110
  )
83
111
 
84
112
  for table_cluster, table_out in zip(table_clusters, tf_output):
@@ -91,6 +119,7 @@ class TableStructureModel:
91
119
  element["bbox"]["token"] = text_piece
92
120
 
93
121
  tc = TableCell.model_validate(element)
122
+ tc.bbox = tc.bbox.scaled(1 / self.scale)
94
123
  table_cells.append(tc)
95
124
 
96
125
  # Retrieving cols/rows, after post processing:
@@ -111,4 +140,7 @@ class TableStructureModel:
111
140
 
112
141
  page.predictions.tablestructure.table_map[table_cluster.id] = tbl
113
142
 
143
+ # For debugging purposes:
144
+ # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
145
+
114
146
  yield page
@@ -34,7 +34,7 @@ class StandardModelPipeline(BaseModelPipeline):
34
34
  "artifacts_path": artifacts_path
35
35
  / StandardModelPipeline._table_model_path,
36
36
  "enabled": pipeline_options.do_table_structure,
37
- "do_cell_matching": False,
37
+ "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
38
38
  }
39
39
  ),
40
40
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 0.3.1
3
+ Version: 0.4.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -79,7 +79,9 @@ python examples/convert.py
79
79
  ```
80
80
  The output of the above command will be written to `./scratch`.
81
81
 
82
- ### Enable or disable pipeline features
82
+ ### Adjust pipeline features
83
+
84
+ **Control pipeline options**
83
85
 
84
86
  You can control whether table structure recognition or OCR is performed via arguments passed to `DocumentConverter`:
85
87
  ```python
@@ -92,6 +94,23 @@ doc_converter = DocumentConverter(
92
94
  )
93
95
  ```
94
96
 
97
+ **Control table extraction options**
98
+
99
+ You can control whether table structure recognition maps the recognized structure back to PDF cells (default) or uses text cells from the structure prediction itself.
100
+ Disabling the mapping back to PDF cells (`do_cell_matching = False`) can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one.
101
+
102
+
103
+ ```python
104
+
105
+ pipeline_options = PipelineOptions(do_table_structure=True)
106
+ pipeline_options.table_structure_options.do_cell_matching = False # Uses text cells predicted from table structure model
107
+
108
+ doc_converter = DocumentConverter(
109
+ artifacts_path=artifacts_path,
110
+ pipeline_options=pipeline_options,
111
+ )
112
+ ```
113
+
95
114
  ### Impose limits on the document size
96
115
 
97
116
  You can limit the file size and the number of pages allowed to be processed per document:
@@ -3,7 +3,7 @@ docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  docling/backend/abstract_backend.py,sha256=dINr8oTax9Fq31Y1AR0CGWNZtAHN5aqB_M7TAPkJNVQ,1122
4
4
  docling/backend/pypdfium2_backend.py,sha256=sJMoActFyc3qdKB6RFly3auHXuXM4noQAG0ypUlj26o,7647
5
5
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- docling/datamodel/base_models.py,sha256=GKeRryRuCS6mPWJf0IPJ5manXwiuS0v8wFOnVXF38b0,6128
6
+ docling/datamodel/base_models.py,sha256=k7gLFPnq3ArEMAFz6qUcp5qemlYzVhOmR9qtBTkAiX4,6862
7
7
  docling/datamodel/document.py,sha256=S4USz13mqLS9WUwTgEkoocykcmY6B3cC3f4JlfTSYcM,12635
8
8
  docling/datamodel/settings.py,sha256=t5g6wrEJnPa9gBzMMl8ppgBRUYz-8xgopEtfMS0ZH28,733
9
9
  docling/document_converter.py,sha256=MZw23oPlRmRi1ggzoD1PukUnqo-6boO3RZB06dZ5Xt0,7305
@@ -11,15 +11,15 @@ docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  docling/models/ds_glm_model.py,sha256=wmb--2JKFQby-kvidw6PyM8wURPXYPQ_Z_eKKCBAdYQ,3192
12
12
  docling/models/easyocr_model.py,sha256=NaHVs8IN0eW9KB076E2Kae1s-bq74_4IMWueze9QqtE,2290
13
13
  docling/models/layout_model.py,sha256=4AfPFiu6pXc8wIQ1sQlEZnHRt7SnBmfzDdctiRveOWw,10944
14
- docling/models/page_assemble_model.py,sha256=jhjQt0NOkVi-dWBaovJ2KsBim5FF6e47y21uZ8EWfBg,5906
15
- docling/models/table_structure_model.py,sha256=uvkK2NPvltk9-zScbORUA05JbvymkGX6Dfsal4wLwsI,4103
14
+ docling/models/page_assemble_model.py,sha256=8eoG2WiFxPxq9TPvM-wkngb2gkr0tdtCRVXg1JcTETo,5550
15
+ docling/models/table_structure_model.py,sha256=ryZrmkNkCbw5SCpgdQabkmcRAEi_4VqOMv2VGdpvGZo,5499
16
16
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  docling/pipeline/base_model_pipeline.py,sha256=ozHdJak0yQAxQf7pQN_C480vI35A2e5KL5Qq1xSkq5c,560
18
- docling/pipeline/standard_model_pipeline.py,sha256=pDbgVO0oOJry7Q-3KYdMuaypXCQOdoVikR80veizo9o,1489
18
+ docling/pipeline/standard_model_pipeline.py,sha256=UTwodKUKrisLoVcntbNUBDhjzRyFvpdUvyVw-gNmBlM,1541
19
19
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  docling/utils/layout_utils.py,sha256=FOFbL0hKzUoWXdZaeUvEtFqKv0IkPifIr4sdGW4suKs,31804
21
21
  docling/utils/utils.py,sha256=llhXSbIDNZ1MHOwBEfLHBAoJIAYI7QlPIonlI1jLUJ0,1208
22
- docling-0.3.1.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
23
- docling-0.3.1.dist-info/METADATA,sha256=5OpesJEMNC_jdf88GO7drN0XHZkmpmk0J13mM1E50rk,5390
24
- docling-0.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
25
- docling-0.3.1.dist-info/RECORD,,
22
+ docling-0.4.0.dist-info/LICENSE,sha256=ACwmltkrXIz5VsEQcrqljq-fat6ZXAMepjXGoe40KtE,1069
23
+ docling-0.4.0.dist-info/METADATA,sha256=aWx7RrxtFIXHkPqEBrzO_gJfPjgWcpgENqx02cdBQys,6044
24
+ docling-0.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
25
+ docling-0.4.0.dist-info/RECORD,,