docling 2.1.0__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. docling/backend/abstract_backend.py +1 -0
  2. docling/backend/asciidoc_backend.py +431 -0
  3. docling/backend/docling_parse_backend.py +4 -4
  4. docling/backend/docling_parse_v2_backend.py +12 -4
  5. docling/backend/html_backend.py +61 -57
  6. docling/backend/md_backend.py +346 -0
  7. docling/backend/mspowerpoint_backend.py +62 -39
  8. docling/backend/msword_backend.py +12 -25
  9. docling/backend/pypdfium2_backend.py +1 -1
  10. docling/cli/main.py +38 -8
  11. docling/datamodel/base_models.py +16 -10
  12. docling/datamodel/document.py +36 -6
  13. docling/datamodel/pipeline_options.py +3 -3
  14. docling/datamodel/settings.py +15 -1
  15. docling/document_converter.py +38 -12
  16. docling/models/base_model.py +4 -1
  17. docling/models/base_ocr_model.py +21 -4
  18. docling/models/ds_glm_model.py +27 -11
  19. docling/models/easyocr_model.py +49 -39
  20. docling/models/layout_model.py +87 -61
  21. docling/models/page_assemble_model.py +102 -100
  22. docling/models/page_preprocessing_model.py +25 -7
  23. docling/models/table_structure_model.py +125 -90
  24. docling/models/tesseract_ocr_cli_model.py +62 -52
  25. docling/models/tesseract_ocr_model.py +76 -52
  26. docling/pipeline/base_pipeline.py +68 -69
  27. docling/pipeline/simple_pipeline.py +8 -11
  28. docling/pipeline/standard_pdf_pipeline.py +59 -56
  29. docling/utils/profiling.py +62 -0
  30. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/METADATA +27 -22
  31. docling-2.4.1.dist-info/RECORD +45 -0
  32. docling-2.1.0.dist-info/RECORD +0 -42
  33. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
  34. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
  35. {docling-2.1.0.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/models/table_structure_model.py +125 -90
@@ -1,6 +1,6 @@
1
1
  import copy
2
2
  from pathlib import Path
3
- from typing import Iterable, List
3
+ from typing import Iterable
4
4
 
5
5
  import numpy
6
6
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
@@ -8,8 +8,11 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
8
8
  from PIL import ImageDraw
9
9
 
10
10
  from docling.datamodel.base_models import Page, Table, TableStructurePrediction
11
+ from docling.datamodel.document import ConversionResult
11
12
  from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
13
+ from docling.datamodel.settings import settings
12
14
  from docling.models.base_model import BasePageModel
15
+ from docling.utils.profiling import TimeRecorder
13
16
 
14
17
 
15
18
  class TableStructureModel(BasePageModel):
@@ -35,7 +38,13 @@ class TableStructureModel(BasePageModel):
35
38
  self.tf_predictor = TFPredictor(self.tm_config)
36
39
  self.scale = 2.0 # Scale up table input images to 144 dpi
37
40
 
38
- def draw_table_and_cells(self, page: Page, tbl_list: List[Table]):
41
+ def draw_table_and_cells(
42
+ self,
43
+ conv_res: ConversionResult,
44
+ page: Page,
45
+ tbl_list: Iterable[Table],
46
+ show: bool = False,
47
+ ):
39
48
  assert page._backend is not None
40
49
 
41
50
  image = (
@@ -61,9 +70,21 @@ class TableStructureModel(BasePageModel):
61
70
  fill="black",
62
71
  )
63
72
 
64
- image.show()
73
+ if show:
74
+ image.show()
75
+ else:
76
+ out_path: Path = (
77
+ Path(settings.debug.debug_output_path)
78
+ / f"debug_{conv_res.input.file.stem}"
79
+ )
80
+ out_path.mkdir(parents=True, exist_ok=True)
81
+
82
+ out_file = out_path / f"table_struct_page_{page.page_no:05}.png"
83
+ image.save(str(out_file), format="png")
65
84
 
66
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
85
+ def __call__(
86
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
87
+ ) -> Iterable[Page]:
67
88
 
68
89
  if not self.enabled:
69
90
  yield from page_batch
@@ -74,98 +95,112 @@ class TableStructureModel(BasePageModel):
74
95
  if not page._backend.is_valid():
75
96
  yield page
76
97
  else:
77
-
78
- assert page.predictions.layout is not None
79
- assert page.size is not None
80
-
81
- page.predictions.tablestructure = TableStructurePrediction() # dummy
82
-
83
- in_tables = [
84
- (
85
- cluster,
86
- [
87
- round(cluster.bbox.l) * self.scale,
88
- round(cluster.bbox.t) * self.scale,
89
- round(cluster.bbox.r) * self.scale,
90
- round(cluster.bbox.b) * self.scale,
91
- ],
98
+ with TimeRecorder(conv_res, "table_structure"):
99
+
100
+ assert page.predictions.layout is not None
101
+ assert page.size is not None
102
+
103
+ page.predictions.tablestructure = (
104
+ TableStructurePrediction()
105
+ ) # dummy
106
+
107
+ in_tables = [
108
+ (
109
+ cluster,
110
+ [
111
+ round(cluster.bbox.l) * self.scale,
112
+ round(cluster.bbox.t) * self.scale,
113
+ round(cluster.bbox.r) * self.scale,
114
+ round(cluster.bbox.b) * self.scale,
115
+ ],
116
+ )
117
+ for cluster in page.predictions.layout.clusters
118
+ if cluster.label == DocItemLabel.TABLE
119
+ ]
120
+ if not len(in_tables):
121
+ yield page
122
+ continue
123
+
124
+ tokens = []
125
+ for c in page.cells:
126
+ for cluster, _ in in_tables:
127
+ if c.bbox.area() > 0:
128
+ if (
129
+ c.bbox.intersection_area_with(cluster.bbox)
130
+ / c.bbox.area()
131
+ > 0.2
132
+ ):
133
+ # Only allow non empty stings (spaces) into the cells of a table
134
+ if len(c.text.strip()) > 0:
135
+ new_cell = copy.deepcopy(c)
136
+ new_cell.bbox = new_cell.bbox.scaled(
137
+ scale=self.scale
138
+ )
139
+
140
+ tokens.append(new_cell.model_dump())
141
+
142
+ page_input = {
143
+ "tokens": tokens,
144
+ "width": page.size.width * self.scale,
145
+ "height": page.size.height * self.scale,
146
+ }
147
+ page_input["image"] = numpy.asarray(
148
+ page.get_image(scale=self.scale)
92
149
  )
93
- for cluster in page.predictions.layout.clusters
94
- if cluster.label == DocItemLabel.TABLE
95
- ]
96
- if not len(in_tables):
97
- yield page
98
- continue
99
-
100
- tokens = []
101
- for c in page.cells:
102
- for cluster, _ in in_tables:
103
- if c.bbox.area() > 0:
104
- if (
105
- c.bbox.intersection_area_with(cluster.bbox)
106
- / c.bbox.area()
107
- > 0.2
108
- ):
109
- # Only allow non empty stings (spaces) into the cells of a table
110
- if len(c.text.strip()) > 0:
111
- new_cell = copy.deepcopy(c)
112
- new_cell.bbox = new_cell.bbox.scaled(
113
- scale=self.scale
114
- )
115
-
116
- tokens.append(new_cell.model_dump())
117
150
 
118
- page_input = {
119
- "tokens": tokens,
120
- "width": page.size.width * self.scale,
121
- "height": page.size.height * self.scale,
122
- }
123
- page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
151
+ table_clusters, table_bboxes = zip(*in_tables)
124
152
 
125
- table_clusters, table_bboxes = zip(*in_tables)
126
-
127
- if len(table_bboxes):
128
- tf_output = self.tf_predictor.multi_table_predict(
129
- page_input, table_bboxes, do_matching=self.do_cell_matching
130
- )
131
-
132
- for table_cluster, table_out in zip(table_clusters, tf_output):
133
- table_cells = []
134
- for element in table_out["tf_responses"]:
135
-
136
- if not self.do_cell_matching:
137
- the_bbox = BoundingBox.model_validate(
138
- element["bbox"]
139
- ).scaled(1 / self.scale)
140
- text_piece = page._backend.get_text_in_rect(the_bbox)
141
- element["bbox"]["token"] = text_piece
142
-
143
- tc = TableCell.model_validate(element)
144
- if self.do_cell_matching and tc.bbox is not None:
145
- tc.bbox = tc.bbox.scaled(1 / self.scale)
146
- table_cells.append(tc)
147
-
148
- # Retrieving cols/rows, after post processing:
149
- num_rows = table_out["predict_details"]["num_rows"]
150
- num_cols = table_out["predict_details"]["num_cols"]
151
- otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
152
-
153
- tbl = Table(
154
- otsl_seq=otsl_seq,
155
- table_cells=table_cells,
156
- num_rows=num_rows,
157
- num_cols=num_cols,
158
- id=table_cluster.id,
159
- page_no=page.page_no,
160
- cluster=table_cluster,
161
- label=DocItemLabel.TABLE,
153
+ if len(table_bboxes):
154
+ tf_output = self.tf_predictor.multi_table_predict(
155
+ page_input, table_bboxes, do_matching=self.do_cell_matching
162
156
  )
163
157
 
164
- page.predictions.tablestructure.table_map[table_cluster.id] = (
165
- tbl
166
- )
158
+ for table_cluster, table_out in zip(table_clusters, tf_output):
159
+ table_cells = []
160
+ for element in table_out["tf_responses"]:
161
+
162
+ if not self.do_cell_matching:
163
+ the_bbox = BoundingBox.model_validate(
164
+ element["bbox"]
165
+ ).scaled(1 / self.scale)
166
+ text_piece = page._backend.get_text_in_rect(
167
+ the_bbox
168
+ )
169
+ element["bbox"]["token"] = text_piece
170
+
171
+ tc = TableCell.model_validate(element)
172
+ if self.do_cell_matching and tc.bbox is not None:
173
+ tc.bbox = tc.bbox.scaled(1 / self.scale)
174
+ table_cells.append(tc)
175
+
176
+ # Retrieving cols/rows, after post processing:
177
+ num_rows = table_out["predict_details"]["num_rows"]
178
+ num_cols = table_out["predict_details"]["num_cols"]
179
+ otsl_seq = table_out["predict_details"]["prediction"][
180
+ "rs_seq"
181
+ ]
182
+
183
+ tbl = Table(
184
+ otsl_seq=otsl_seq,
185
+ table_cells=table_cells,
186
+ num_rows=num_rows,
187
+ num_cols=num_cols,
188
+ id=table_cluster.id,
189
+ page_no=page.page_no,
190
+ cluster=table_cluster,
191
+ label=DocItemLabel.TABLE,
192
+ )
193
+
194
+ page.predictions.tablestructure.table_map[
195
+ table_cluster.id
196
+ ] = tbl
167
197
 
168
198
  # For debugging purposes:
169
- # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
199
+ if settings.debug.visualize_tables:
200
+ self.draw_table_and_cells(
201
+ conv_res,
202
+ page,
203
+ page.predictions.tablestructure.table_map.values(),
204
+ )
170
205
 
171
206
  yield page
docling/models/tesseract_ocr_cli_model.py +62 -52
@@ -8,8 +8,11 @@ import pandas as pd
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
9
 
10
10
  from docling.datamodel.base_models import OcrCell, Page
11
+ from docling.datamodel.document import ConversionResult
11
12
  from docling.datamodel.pipeline_options import TesseractCliOcrOptions
13
+ from docling.datamodel.settings import settings
12
14
  from docling.models.base_ocr_model import BaseOcrModel
15
+ from docling.utils.profiling import TimeRecorder
13
16
 
14
17
  _log = logging.getLogger(__name__)
15
18
 
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
102
105
 
103
106
  return df_filtered
104
107
 
105
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
108
+ def __call__(
109
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
110
+ ) -> Iterable[Page]:
106
111
 
107
112
  if not self.enabled:
108
113
  yield from page_batch
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
113
118
  if not page._backend.is_valid():
114
119
  yield page
115
120
  else:
116
- ocr_rects = self.get_ocr_rects(page)
117
-
118
- all_ocr_cells = []
119
- for ocr_rect in ocr_rects:
120
- # Skip zero area boxes
121
- if ocr_rect.area() == 0:
122
- continue
123
- high_res_image = page._backend.get_page_image(
124
- scale=self.scale, cropbox=ocr_rect
125
- )
121
+ with TimeRecorder(conv_res, "ocr"):
126
122
 
127
- with tempfile.NamedTemporaryFile(
128
- suffix=".png", mode="w"
129
- ) as image_file:
130
- fname = image_file.name
131
- high_res_image.save(fname)
132
-
133
- df = self._run_tesseract(fname)
134
-
135
- # _log.info(df)
136
-
137
- # Print relevant columns (bounding box and text)
138
- for ix, row in df.iterrows():
139
- text = row["text"]
140
- conf = row["conf"]
141
-
142
- l = float(row["left"])
143
- b = float(row["top"])
144
- w = float(row["width"])
145
- h = float(row["height"])
146
-
147
- t = b + h
148
- r = l + w
149
-
150
- cell = OcrCell(
151
- id=ix,
152
- text=text,
153
- confidence=conf / 100.0,
154
- bbox=BoundingBox.from_tuple(
155
- coord=(
156
- (l / self.scale) + ocr_rect.l,
157
- (b / self.scale) + ocr_rect.t,
158
- (r / self.scale) + ocr_rect.l,
159
- (t / self.scale) + ocr_rect.t,
160
- ),
161
- origin=CoordOrigin.TOPLEFT,
162
- ),
123
+ ocr_rects = self.get_ocr_rects(page)
124
+
125
+ all_ocr_cells = []
126
+ for ocr_rect in ocr_rects:
127
+ # Skip zero area boxes
128
+ if ocr_rect.area() == 0:
129
+ continue
130
+ high_res_image = page._backend.get_page_image(
131
+ scale=self.scale, cropbox=ocr_rect
163
132
  )
164
- all_ocr_cells.append(cell)
165
133
 
166
- ## Remove OCR cells which overlap with programmatic cells.
167
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
134
+ with tempfile.NamedTemporaryFile(
135
+ suffix=".png", mode="w"
136
+ ) as image_file:
137
+ fname = image_file.name
138
+ high_res_image.save(fname)
139
+
140
+ df = self._run_tesseract(fname)
141
+
142
+ # _log.info(df)
143
+
144
+ # Print relevant columns (bounding box and text)
145
+ for ix, row in df.iterrows():
146
+ text = row["text"]
147
+ conf = row["conf"]
148
+
149
+ l = float(row["left"])
150
+ b = float(row["top"])
151
+ w = float(row["width"])
152
+ h = float(row["height"])
153
+
154
+ t = b + h
155
+ r = l + w
156
+
157
+ cell = OcrCell(
158
+ id=ix,
159
+ text=text,
160
+ confidence=conf / 100.0,
161
+ bbox=BoundingBox.from_tuple(
162
+ coord=(
163
+ (l / self.scale) + ocr_rect.l,
164
+ (b / self.scale) + ocr_rect.t,
165
+ (r / self.scale) + ocr_rect.l,
166
+ (t / self.scale) + ocr_rect.t,
167
+ ),
168
+ origin=CoordOrigin.TOPLEFT,
169
+ ),
170
+ )
171
+ all_ocr_cells.append(cell)
172
+
173
+ ## Remove OCR cells which overlap with programmatic cells.
174
+ filtered_ocr_cells = self.filter_ocr_cells(
175
+ all_ocr_cells, page.cells
176
+ )
168
177
 
169
- page.cells.extend(filtered_ocr_cells)
178
+ page.cells.extend(filtered_ocr_cells)
170
179
 
171
180
  # DEBUG code:
172
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
181
+ if settings.debug.visualize_ocr:
182
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
173
183
 
174
184
  yield page
docling/models/tesseract_ocr_model.py +76 -52
@@ -4,8 +4,11 @@ from typing import Iterable
4
4
  from docling_core.types.doc import BoundingBox, CoordOrigin
5
5
 
6
6
  from docling.datamodel.base_models import OcrCell, Page
7
+ from docling.datamodel.document import ConversionResult
7
8
  from docling.datamodel.pipeline_options import TesseractOcrOptions
9
+ from docling.datamodel.settings import settings
8
10
  from docling.models.base_ocr_model import BaseOcrModel
11
+ from docling.utils.profiling import TimeRecorder
9
12
 
10
13
  _log = logging.getLogger(__name__)
11
14
 
@@ -19,25 +22,37 @@ class TesseractOcrModel(BaseOcrModel):
19
22
  self.reader = None
20
23
 
21
24
  if self.enabled:
22
- setup_errmsg = (
25
+ install_errmsg = (
23
26
  "tesserocr is not correctly installed. "
24
27
  "Please install it via `pip install tesserocr` to use this OCR engine. "
25
- "Note that tesserocr might have to be manually compiled for working with"
28
+ "Note that tesserocr might have to be manually compiled for working with "
26
29
  "your Tesseract installation. The Docling documentation provides examples for it. "
27
- "Alternatively, Docling has support for other OCR engines. See the documentation."
30
+ "Alternatively, Docling has support for other OCR engines. See the documentation: "
31
+ "https://ds4sd.github.io/docling/installation/"
28
32
  )
33
+ missing_langs_errmsg = (
34
+ "tesserocr is not correctly configured. No language models have been detected. "
35
+ "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
36
+ "You can find more information how to setup other OCR engines in Docling "
37
+ "documentation: "
38
+ "https://ds4sd.github.io/docling/installation/"
39
+ )
40
+
29
41
  try:
30
42
  import tesserocr
31
43
  except ImportError:
32
- raise ImportError(setup_errmsg)
33
-
44
+ raise ImportError(install_errmsg)
34
45
  try:
35
46
  tesseract_version = tesserocr.tesseract_version()
36
- _log.debug("Initializing TesserOCR: %s", tesseract_version)
37
47
  except:
38
- raise ImportError(setup_errmsg)
48
+ raise ImportError(install_errmsg)
49
+
50
+ _, tesserocr_languages = tesserocr.get_languages()
51
+ if not tesserocr_languages:
52
+ raise ImportError(missing_langs_errmsg)
39
53
 
40
54
  # Initialize the tesseractAPI
55
+ _log.debug("Initializing TesserOCR: %s", tesseract_version)
41
56
  lang = "+".join(self.options.lang)
42
57
  if self.options.path is not None:
43
58
  self.reader = tesserocr.PyTessBaseAPI(
@@ -61,7 +76,9 @@ class TesseractOcrModel(BaseOcrModel):
61
76
  # Finalize the tesseractAPI
62
77
  self.reader.End()
63
78
 
64
- def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
79
+ def __call__(
80
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
81
+ ) -> Iterable[Page]:
65
82
 
66
83
  if not self.enabled:
67
84
  yield from page_batch
@@ -72,59 +89,66 @@ class TesseractOcrModel(BaseOcrModel):
72
89
  if not page._backend.is_valid():
73
90
  yield page
74
91
  else:
75
- assert self.reader is not None
92
+ with TimeRecorder(conv_res, "ocr"):
76
93
 
77
- ocr_rects = self.get_ocr_rects(page)
94
+ assert self.reader is not None
78
95
 
79
- all_ocr_cells = []
80
- for ocr_rect in ocr_rects:
81
- # Skip zero area boxes
82
- if ocr_rect.area() == 0:
83
- continue
84
- high_res_image = page._backend.get_page_image(
85
- scale=self.scale, cropbox=ocr_rect
86
- )
96
+ ocr_rects = self.get_ocr_rects(page)
87
97
 
88
- # Retrieve text snippets with their bounding boxes
89
- self.reader.SetImage(high_res_image)
90
- boxes = self.reader.GetComponentImages(
91
- self.reader_RIL.TEXTLINE, True
92
- )
98
+ all_ocr_cells = []
99
+ for ocr_rect in ocr_rects:
100
+ # Skip zero area boxes
101
+ if ocr_rect.area() == 0:
102
+ continue
103
+ high_res_image = page._backend.get_page_image(
104
+ scale=self.scale, cropbox=ocr_rect
105
+ )
93
106
 
94
- cells = []
95
- for ix, (im, box, _, _) in enumerate(boxes):
96
- # Set the area of interest. Tesseract uses Bottom-Left for the origin
97
- self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
98
-
99
- # Extract text within the bounding box
100
- text = self.reader.GetUTF8Text().strip()
101
- confidence = self.reader.MeanTextConf()
102
- left = box["x"] / self.scale
103
- bottom = box["y"] / self.scale
104
- right = (box["x"] + box["w"]) / self.scale
105
- top = (box["y"] + box["h"]) / self.scale
106
-
107
- cells.append(
108
- OcrCell(
109
- id=ix,
110
- text=text,
111
- confidence=confidence,
112
- bbox=BoundingBox.from_tuple(
113
- coord=(left, top, right, bottom),
114
- origin=CoordOrigin.TOPLEFT,
115
- ),
116
- )
107
+ # Retrieve text snippets with their bounding boxes
108
+ self.reader.SetImage(high_res_image)
109
+ boxes = self.reader.GetComponentImages(
110
+ self.reader_RIL.TEXTLINE, True
117
111
  )
118
112
 
119
- # del high_res_image
120
- all_ocr_cells.extend(cells)
113
+ cells = []
114
+ for ix, (im, box, _, _) in enumerate(boxes):
115
+ # Set the area of interest. Tesseract uses Bottom-Left for the origin
116
+ self.reader.SetRectangle(
117
+ box["x"], box["y"], box["w"], box["h"]
118
+ )
119
+
120
+ # Extract text within the bounding box
121
+ text = self.reader.GetUTF8Text().strip()
122
+ confidence = self.reader.MeanTextConf()
123
+ left = box["x"] / self.scale
124
+ bottom = box["y"] / self.scale
125
+ right = (box["x"] + box["w"]) / self.scale
126
+ top = (box["y"] + box["h"]) / self.scale
127
+
128
+ cells.append(
129
+ OcrCell(
130
+ id=ix,
131
+ text=text,
132
+ confidence=confidence,
133
+ bbox=BoundingBox.from_tuple(
134
+ coord=(left, top, right, bottom),
135
+ origin=CoordOrigin.TOPLEFT,
136
+ ),
137
+ )
138
+ )
139
+
140
+ # del high_res_image
141
+ all_ocr_cells.extend(cells)
121
142
 
122
- ## Remove OCR cells which overlap with programmatic cells.
123
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
143
+ ## Remove OCR cells which overlap with programmatic cells.
144
+ filtered_ocr_cells = self.filter_ocr_cells(
145
+ all_ocr_cells, page.cells
146
+ )
124
147
 
125
- page.cells.extend(filtered_ocr_cells)
148
+ page.cells.extend(filtered_ocr_cells)
126
149
 
127
150
  # DEBUG code:
128
- # self.draw_ocr_rects_and_cells(page, ocr_rects)
151
+ if settings.debug.visualize_ocr:
152
+ self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
129
153
 
130
154
  yield page