docling 2.2.1__py3-none-any.whl → 2.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,8 +12,10 @@ from docling.datamodel.base_models import (
12
12
  Table,
13
13
  TextElement,
14
14
  )
15
+ from docling.datamodel.document import ConversionResult
15
16
  from docling.models.base_model import BasePageModel
16
17
  from docling.models.layout_model import LayoutModel
18
+ from docling.utils.profiling import TimeRecorder
17
19
 
18
20
  _log = logging.getLogger(__name__)
19
21
 
@@ -51,122 +53,122 @@ class PageAssembleModel(BasePageModel):
51
53
 
52
54
  return sanitized_text.strip() # Strip any leading or trailing whitespace
53
55
 
54
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
56
+ def __call__(
57
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
58
+ ) -> Iterable[Page]:
55
59
  for page in page_batch:
56
60
  assert page._backend is not None
57
61
  if not page._backend.is_valid():
58
62
  yield page
59
63
  else:
60
- assert page.predictions.layout is not None
61
-
62
- # assembles some JSON output page by page.
63
-
64
- elements: List[PageElement] = []
65
- headers: List[PageElement] = []
66
- body: List[PageElement] = []
67
-
68
- for cluster in page.predictions.layout.clusters:
69
- # _log.info("Cluster label seen:", cluster.label)
70
- if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
71
-
72
- textlines = [
73
- cell.text.replace("\x02", "-").strip()
74
- for cell in cluster.cells
75
- if len(cell.text.strip()) > 0
76
- ]
77
- text = self.sanitize_text(textlines)
78
- text_el = TextElement(
79
- label=cluster.label,
80
- id=cluster.id,
81
- text=text,
82
- page_no=page.page_no,
83
- cluster=cluster,
84
- )
85
- elements.append(text_el)
86
-
87
- if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
88
- headers.append(text_el)
89
- else:
90
- body.append(text_el)
91
- elif cluster.label == LayoutModel.TABLE_LABEL:
92
- tbl = None
93
- if page.predictions.tablestructure:
94
- tbl = page.predictions.tablestructure.table_map.get(
95
- cluster.id, None
96
- )
97
- if (
98
- not tbl
99
- ): # fallback: add table without structure, if it isn't present
100
- tbl = Table(
64
+ with TimeRecorder(conv_res, "page_assemble"):
65
+
66
+ assert page.predictions.layout is not None
67
+
68
+ # assembles some JSON output page by page.
69
+
70
+ elements: List[PageElement] = []
71
+ headers: List[PageElement] = []
72
+ body: List[PageElement] = []
73
+
74
+ for cluster in page.predictions.layout.clusters:
75
+ # _log.info("Cluster label seen:", cluster.label)
76
+ if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
77
+
78
+ textlines = [
79
+ cell.text.replace("\x02", "-").strip()
80
+ for cell in cluster.cells
81
+ if len(cell.text.strip()) > 0
82
+ ]
83
+ text = self.sanitize_text(textlines)
84
+ text_el = TextElement(
101
85
  label=cluster.label,
102
86
  id=cluster.id,
103
- text="",
104
- otsl_seq=[],
105
- table_cells=[],
106
- cluster=cluster,
87
+ text=text,
107
88
  page_no=page.page_no,
89
+ cluster=cluster,
108
90
  )
91
+ elements.append(text_el)
92
+
93
+ if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
94
+ headers.append(text_el)
95
+ else:
96
+ body.append(text_el)
97
+ elif cluster.label == LayoutModel.TABLE_LABEL:
98
+ tbl = None
99
+ if page.predictions.tablestructure:
100
+ tbl = page.predictions.tablestructure.table_map.get(
101
+ cluster.id, None
102
+ )
103
+ if (
104
+ not tbl
105
+ ): # fallback: add table without structure, if it isn't present
106
+ tbl = Table(
107
+ label=cluster.label,
108
+ id=cluster.id,
109
+ text="",
110
+ otsl_seq=[],
111
+ table_cells=[],
112
+ cluster=cluster,
113
+ page_no=page.page_no,
114
+ )
109
115
 
110
- elements.append(tbl)
111
- body.append(tbl)
112
- elif cluster.label == LayoutModel.FIGURE_LABEL:
113
- fig = None
114
- if page.predictions.figures_classification:
115
- fig = (
116
- page.predictions.figures_classification.figure_map.get(
116
+ elements.append(tbl)
117
+ body.append(tbl)
118
+ elif cluster.label == LayoutModel.FIGURE_LABEL:
119
+ fig = None
120
+ if page.predictions.figures_classification:
121
+ fig = page.predictions.figures_classification.figure_map.get(
117
122
  cluster.id, None
118
123
  )
119
- )
120
- if (
121
- not fig
122
- ): # fallback: add figure without classification, if it isn't present
123
- fig = FigureElement(
124
- label=cluster.label,
125
- id=cluster.id,
126
- text="",
127
- data=None,
128
- cluster=cluster,
129
- page_no=page.page_no,
130
- )
131
- elements.append(fig)
132
- body.append(fig)
133
- elif cluster.label == LayoutModel.FORMULA_LABEL:
134
- equation = None
135
- if page.predictions.equations_prediction:
136
- equation = (
137
- page.predictions.equations_prediction.equation_map.get(
124
+ if (
125
+ not fig
126
+ ): # fallback: add figure without classification, if it isn't present
127
+ fig = FigureElement(
128
+ label=cluster.label,
129
+ id=cluster.id,
130
+ text="",
131
+ data=None,
132
+ cluster=cluster,
133
+ page_no=page.page_no,
134
+ )
135
+ elements.append(fig)
136
+ body.append(fig)
137
+ elif cluster.label == LayoutModel.FORMULA_LABEL:
138
+ equation = None
139
+ if page.predictions.equations_prediction:
140
+ equation = page.predictions.equations_prediction.equation_map.get(
138
141
  cluster.id, None
139
142
  )
140
- )
141
- if (
142
- not equation
143
- ): # fallback: add empty formula, if it isn't present
144
- text = self.sanitize_text(
145
- [
146
- cell.text.replace("\x02", "-").strip()
147
- for cell in cluster.cells
148
- if len(cell.text.strip()) > 0
149
- ]
150
- )
151
- equation = TextElement(
152
- label=cluster.label,
153
- id=cluster.id,
154
- cluster=cluster,
155
- page_no=page.page_no,
156
- text=text,
157
- )
158
- elements.append(equation)
159
- body.append(equation)
143
+ if (
144
+ not equation
145
+ ): # fallback: add empty formula, if it isn't present
146
+ text = self.sanitize_text(
147
+ [
148
+ cell.text.replace("\x02", "-").strip()
149
+ for cell in cluster.cells
150
+ if len(cell.text.strip()) > 0
151
+ ]
152
+ )
153
+ equation = TextElement(
154
+ label=cluster.label,
155
+ id=cluster.id,
156
+ cluster=cluster,
157
+ page_no=page.page_no,
158
+ text=text,
159
+ )
160
+ elements.append(equation)
161
+ body.append(equation)
160
162
 
161
- page.assembled = AssembledUnit(
162
- elements=elements, headers=headers, body=body
163
- )
163
+ page.assembled = AssembledUnit(
164
+ elements=elements, headers=headers, body=body
165
+ )
164
166
 
165
- # Remove page images (can be disabled)
166
- if not self.options.keep_images:
167
- page._image_cache = {}
167
+ # Remove page images (can be disabled)
168
+ if not self.options.keep_images:
169
+ page._image_cache = {}
168
170
 
169
- # Unload backend
170
- page._backend.unload()
171
+ # Unload backend
172
+ page._backend.unload()
171
173
 
172
174
  yield page
@@ -1,10 +1,14 @@
1
+ from pathlib import Path
1
2
  from typing import Iterable, Optional
2
3
 
3
4
  from PIL import ImageDraw
4
5
  from pydantic import BaseModel
5
6
 
6
7
  from docling.datamodel.base_models import Page
8
+ from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.settings import settings
7
10
  from docling.models.base_model import BasePageModel
11
+ from docling.utils.profiling import TimeRecorder
8
12
 
9
13
 
10
14
  class PagePreprocessingOptions(BaseModel):
@@ -15,14 +19,17 @@ class PagePreprocessingModel(BasePageModel):
15
19
  def __init__(self, options: PagePreprocessingOptions):
16
20
  self.options = options
17
21
 
18
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
22
+ def __call__(
23
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
24
+ ) -> Iterable[Page]:
19
25
  for page in page_batch:
20
26
  assert page._backend is not None
21
27
  if not page._backend.is_valid():
22
28
  yield page
23
29
  else:
24
- page = self._populate_page_images(page)
25
- page = self._parse_page_cells(page)
30
+ with TimeRecorder(conv_res, "page_parse"):
31
+ page = self._populate_page_images(page)
32
+ page = self._parse_page_cells(conv_res, page)
26
33
  yield page
27
34
 
28
35
  # Generate the page image and store it in the page object
@@ -43,19 +50,30 @@ class PagePreprocessingModel(BasePageModel):
43
50
  return page
44
51
 
45
52
  # Extract and populate the page cells and store it in the page object
46
- def _parse_page_cells(self, page: Page) -> Page:
53
+ def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
47
54
  assert page._backend is not None
48
55
 
49
56
  page.cells = list(page._backend.get_text_cells())
50
57
 
51
58
  # DEBUG code:
52
- def draw_text_boxes(image, cells):
59
+ def draw_text_boxes(image, cells, show: bool = False):
53
60
  draw = ImageDraw.Draw(image)
54
61
  for c in cells:
55
62
  x0, y0, x1, y1 = c.bbox.as_tuple()
56
63
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
57
- image.show()
64
+ if show:
65
+ image.show()
66
+ else:
67
+ out_path: Path = (
68
+ Path(settings.debug.debug_output_path)
69
+ / f"debug_{conv_res.input.file.stem}"
70
+ )
71
+ out_path.mkdir(parents=True, exist_ok=True)
72
+
73
+ out_file = out_path / f"cells_page_{page.page_no:05}.png"
74
+ image.save(str(out_file), format="png")
58
75
 
59
- # draw_text_boxes(page.get_image(scale=1.0), cells)
76
+ if settings.debug.visualize_cells:
77
+ draw_text_boxes(page.get_image(scale=1.0), page.cells)
60
78
 
61
79
  return page
@@ -1,6 +1,6 @@
1
1
  import copy
2
2
  from pathlib import Path
3
- from typing import Iterable, List
3
+ from typing import Iterable
4
4
 
5
5
  import numpy
6
6
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -8,8 +8,11 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
8
8
  from PIL import ImageDraw
9
9
 
10
10
  from docling.datamodel.base_models import Page, Table, TableStructurePrediction
11
+ from docling.datamodel.document import ConversionResult
11
12
  from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
13
+ from docling.datamodel.settings import settings
12
14
  from docling.models.base_model import BasePageModel
15
+ from docling.utils.profiling import TimeRecorder
13
16
 
14
17
 
15
18
  class TableStructureModel(BasePageModel):
@@ -35,7 +38,13 @@ class TableStructureModel(BasePageModel):
35
38
  self.tf_predictor = TFPredictor(self.tm_config)
36
39
  self.scale = 2.0 # Scale up table input images to 144 dpi
37
40
 
38
- def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
41
+ def draw_table_and_cells(
42
+ self,
43
+ conv_res: ConversionResult,
44
+ page: Page,
45
+ tbl_list: Iterable[Table],
46
+ show: bool = False,
47
+ ):
39
48
  assert page._backend is not None
40
49
 
41
50
  image = (
@@ -61,9 +70,21 @@ class TableStructureModel(BasePageModel):
61
70
  fill="black",
62
71
  )
63
72
 
64
- image.show()
73
+ if show:
74
+ image.show()
75
+ else:
76
+ out_path: Path = (
77
+ Path(settings.debug.debug_output_path)
78
+ / f"debug_{conv_res.input.file.stem}"
79
+ )
80
+ out_path.mkdir(parents=True, exist_ok=True)
81
+
82
+ out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
83
+ image.save(str(out_file), format="png")
65
84
 
66
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
85
+ def __call__(
86
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
87
+ ) -> Iterable[Page]:
67
88
 
68
89
  if not self.enabled:
69
90
  yield from page_batch
@@ -74,98 +95,112 @@ class TableStructureModel(BasePageModel):
74
95
  if not page._backend.is_valid():
75
96
  yield page
76
97
  else:
77
-
78
- assert page.predictions.layout is not None
79
- assert page.size is not None
80
-
81
- page.predictions.tablestructure = TableStructurePrediction() # dummy
82
-
83
- in_tables = [
84
- (
85
- cluster,
86
- [
87
- round(cluster.bbox.l) * self.scale,
88
- round(cluster.bbox.t) * self.scale,
89
- round(cluster.bbox.r) * self.scale,
90
- round(cluster.bbox.b) * self.scale,
91
- ],
98
+ with TimeRecorder(conv_res, "table_structure"):
99
+
100
+ assert page.predictions.layout is not None
101
+ assert page.size is not None
102
+
103
+ page.predictions.tablestructure = (
104
+ TableStructurePrediction()
105
+ ) # dummy
106
+
107
+ in_tables = [
108
+ (
109
+ cluster,
110
+ [
111
+ round(cluster.bbox.l) * self.scale,
112
+ round(cluster.bbox.t) * self.scale,
113
+ round(cluster.bbox.r) * self.scale,
114
+ round(cluster.bbox.b) * self.scale,
115
+ ],
116
+ )
117
+ for cluster in page.predictions.layout.clusters
118
+ if cluster.label == DocItemLabel.TABLE
119
+ ]
120
+ if not len(in_tables):
121
+ yield page
122
+ continue
123
+
124
+ tokens = []
125
+ for c in page.cells:
126
+ for cluster, _ in in_tables:
127
+ if c.bbox.area() > 0:
128
+ if (
129
+ c.bbox.intersection_area_with(cluster.bbox)
130
+ / c.bbox.area()
131
+ > 0.2
132
+ ):
133
+ # Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
134
+ if len(c.text.strip()) > 0:
135
+ new_cell = copy.deepcopy(c)
136
+ new_cell.bbox = new_cell.bbox.scaled(
137
+ scale=self.scale
138
+ )
139
+
140
+ tokens.append(new_cell.model_dump())
141
+
142
+ page_input = {
143
+ "tokens": tokens,
144
+ "width": page.size.width * self.scale,
145
+ "height": page.size.height * self.scale,
146
+ }
147
+ page_input["image"] = numpy.asarray(
148
+ page.get_image(scale=self.scale)
92
149
  )
93
- for cluster in page.predictions.layout.clusters
94
- if cluster.label == DocItemLabel.TABLE
95
- ]
96
- if not len(in_tables):
97
- yield page
98
- continue
99
-
100
- tokens = []
101
- for c in page.cells:
102
- for cluster, _ in in_tables:
103
- if c.bbox.area() > 0:
104
- if (
105
- c.bbox.intersection_area_with(cluster.bbox)
106
- / c.bbox.area()
107
- > 0.2
108
- ):
109
- # Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
110
- if len(c.text.strip()) > 0:
111
- new_cell = copy.deepcopy(c)
112
- new_cell.bbox = new_cell.bbox.scaled(
113
- scale=self.scale
114
- )
115
-
116
- tokens.append(new_cell.model_dump())
117
150
 
118
- page_input = {
119
- "tokens": tokens,
120
- "width": page.size.width * self.scale,
121
- "height": page.size.height * self.scale,
122
- }
123
- page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
151
+ table_clusters, table_bboxes = zip(*in_tables)
124
152
 
125
- table_clusters, table_bboxes = zip(*in_tables)
126
-
127
- if len(table_bboxes):
128
- tf_output = self.tf_predictor.multi_table_predict(
129
- page_input, table_bboxes, do_matching=self.do_cell_matching
130
- )
131
-
132
- for table_cluster, table_out in zip(table_clusters, tf_output):
133
- table_cells = []
134
- for element in table_out["tf_responses"]:
135
-
136
- if not self.do_cell_matching:
137
- the_bbox = BoundingBox.model_validate(
138
- element["bbox"]
139
- ).scaled(1 / self.scale)
140
- text_piece = page._backend.get_text_in_rect(the_bbox)
141
- element["bbox"]["token"] = text_piece
142
-
143
- tc = TableCell.model_validate(element)
144
- if self.do_cell_matching and tc.bbox is not None:
145
- tc.bbox = tc.bbox.scaled(1 / self.scale)
146
- table_cells.append(tc)
147
-
148
- # Retrieving cols/rows, after post processing:
149
- num_rows = table_out["predict_details"]["num_rows"]
150
- num_cols = table_out["predict_details"]["num_cols"]
151
- otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
152
-
153
- tbl = Table(
154
- otsl_seq=otsl_seq,
155
- table_cells=table_cells,
156
- num_rows=num_rows,
157
- num_cols=num_cols,
158
- id=table_cluster.id,
159
- page_no=page.page_no,
160
- cluster=table_cluster,
161
- label=DocItemLabel.TABLE,
153
+ if len(table_bboxes):
154
+ tf_output = self.tf_predictor.multi_table_predict(
155
+ page_input, table_bboxes, do_matching=self.do_cell_matching
162
156
  )
163
157
 
164
- page.predictions.tablestructure.table_map[table_cluster.id] = (
165
- tbl
166
- )
158
+ for table_cluster, table_out in zip(table_clusters, tf_output):
159
+ table_cells = []
160
+ for element in table_out["tf_responses"]:
161
+
162
+ if not self.do_cell_matching:
163
+ the_bbox = BoundingBox.model_validate(
164
+ element["bbox"]
165
+ ).scaled(1 / self.scale)
166
+ text_piece = page._backend.get_text_in_rect(
167
+ the_bbox
168
+ )
169
+ element["bbox"]["token"] = text_piece
170
+
171
+ tc = TableCell.model_validate(element)
172
+ if self.do_cell_matching and tc.bbox is not None:
173
+ tc.bbox = tc.bbox.scaled(1 / self.scale)
174
+ table_cells.append(tc)
175
+
176
+ # Retrieving cols/rows, after post processing:
177
+ num_rows = table_out["predict_details"]["num_rows"]
178
+ num_cols = table_out["predict_details"]["num_cols"]
179
+ otsl_seq = table_out["predict_details"]["prediction"][
180
+ "rs_seq"
181
+ ]
182
+
183
+ tbl = Table(
184
+ otsl_seq=otsl_seq,
185
+ table_cells=table_cells,
186
+ num_rows=num_rows,
187
+ num_cols=num_cols,
188
+ id=table_cluster.id,
189
+ page_no=page.page_no,
190
+ cluster=table_cluster,
191
+ label=DocItemLabel.TABLE,
192
+ )
193
+
194
+ page.predictions.tablestructure.table_map[
195
+ table_cluster.id
196
+ ] = tbl
167
197
 
168
198
  # For debugging purposes:
169
- # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
199
+ if settings.debug.visualize_tables:
200
+ self.draw_table_and_cells(
201
+ conv_res,
202
+ page,
203
+ page.predictions.tablestructure.table_map.values(),
204
+ )
170
205
 
171
206
  yield page