docling 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,351 @@
+ import logging
+ from io import BytesIO
+ from pathlib import Path, PurePath
+ from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
+
+ from deepsearch.documents.core.export import export_to_markdown
+ from docling_core.types import BaseCell, BaseText
+ from docling_core.types import BoundingBox as DsBoundingBox
+ from docling_core.types import Document as DsDocument
+ from docling_core.types import DocumentDescription as DsDocumentDescription
+ from docling_core.types import FileInfoObject as DsFileInfoObject
+ from docling_core.types import PageDimensions, PageReference, Prov, Ref
+ from docling_core.types import Table as DsSchemaTable
+ from docling_core.types import TableCell
+ from pydantic import BaseModel
+
+ from docling.backend.abstract_backend import PdfDocumentBackend
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+ from docling.datamodel.base_models import (
+     AssembledUnit,
+     ConversionStatus,
+     DocumentStream,
+     FigureElement,
+     Page,
+     TableElement,
+     TextElement,
+ )
+ from docling.datamodel.settings import DocumentLimits
+ from docling.utils.utils import create_file_hash
+
+ _log = logging.getLogger(__name__)
+
+ layout_label_to_ds_type = {
+     "Title": "title",
+     "Document Index": "table-of-contents",
+     "Section-header": "subtitle-level-1",
+     "Checkbox-Selected": "checkbox-selected",
+     "Checkbox-Unselected": "checkbox-unselected",
+     "Caption": "caption",
+     "Page-header": "page-header",
+     "Page-footer": "page-footer",
+     "Footnote": "footnote",
+     "Table": "table",
+     "Formula": "equation",
+     "List-item": "paragraph",
+     "Code": "paragraph",
+     "Picture": "figure",
+     "Text": "paragraph",
+ }
+
+
+ class InputDocument(BaseModel):
+     file: Optional[PurePath] = None
+     document_hash: Optional[str] = None
+     valid: bool = False
+     limits: DocumentLimits = DocumentLimits()
+
+     filesize: Optional[int] = None
+     page_count: Optional[int] = None
+
+     _backend: Optional[PdfDocumentBackend] = None  # Internal PDF backend used
+
+     def __init__(
+         self,
+         path_or_stream: Union[BytesIO, Path],
+         filename: Optional[str] = None,
+         limits: Optional[DocumentLimits] = None,
+         pdf_backend=PyPdfiumDocumentBackend,
+     ):
+         super().__init__()
+
+         self.limits = limits or DocumentLimits()
+
+         try:
+             if isinstance(path_or_stream, Path):
+                 self.file = path_or_stream
+                 self.filesize = path_or_stream.stat().st_size
+                 if self.filesize > self.limits.max_file_size:
+                     self.valid = False
+                 else:
+                     self.document_hash = create_file_hash(path_or_stream)
+                     self._backend = pdf_backend(path_or_stream=path_or_stream)
+
+             elif isinstance(path_or_stream, BytesIO):
+                 self.file = PurePath(filename)
+                 self.filesize = path_or_stream.getbuffer().nbytes
+
+                 if self.filesize > self.limits.max_file_size:
+                     self.valid = False
+                 else:
+                     self.document_hash = create_file_hash(path_or_stream)
+                     self._backend = pdf_backend(path_or_stream=path_or_stream)
+
+             if self.document_hash and self._backend.page_count() > 0:
+                 self.page_count = self._backend.page_count()
+
+                 if self.page_count <= self.limits.max_num_pages:
+                     self.valid = True
+
+         except (FileNotFoundError, OSError) as e:
+             _log.exception(
+                 f"File {self.file.name} not found or cannot be opened.", exc_info=e
+             )
+             # raise
+         except RuntimeError as e:
+             _log.exception(
+                 f"An unexpected error occurred while opening the document {self.file.name}",
+                 exc_info=e,
+             )
+             # raise
+
+
+ class ConvertedDocument(BaseModel):
+     input: InputDocument
+
+     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
+     errors: List[Dict] = []  # structure to keep errors
+
+     pages: List[Page] = []
+     assembled: Optional[AssembledUnit] = None
+
+     output: Optional[DsDocument] = None
+
+     def to_ds_document(self) -> DsDocument:
+         title = ""
+         desc = DsDocumentDescription(logs=[])
+
+         page_hashes = [
+             PageReference(hash=p.page_hash, page=p.page_no, model="default")
+             for p in self.pages
+         ]
+
+         file_info = DsFileInfoObject(
+             filename=self.input.file.name,
+             document_hash=self.input.document_hash,
+             num_pages=self.input.page_count,
+             page_hashes=page_hashes,
+         )
+
+         main_text = []
+         tables = []
+         figures = []
+
+         page_no_to_page = {p.page_no: p for p in self.pages}
+
+         for element in self.assembled.elements:
+             # Convert bboxes to lower-left origin.
+             target_bbox = DsBoundingBox(
+                 element.cluster.bbox.to_bottom_left_origin(
+                     page_no_to_page[element.page_no].size.height
+                 ).as_tuple()
+             )
+
+             if isinstance(element, TextElement):
+                 main_text.append(
+                     BaseText(
+                         text=element.text,
+                         obj_type=layout_label_to_ds_type.get(element.label),
+                         name=element.label,
+                         prov=[
+                             Prov(
+                                 bbox=target_bbox,
+                                 page=element.page_no,
+                                 span=[0, len(element.text)],
+                             )
+                         ],
+                     )
+                 )
+             elif isinstance(element, TableElement):
+                 index = len(tables)
+                 ref_str = f"#/tables/{index}"
+                 main_text.append(
+                     Ref(
+                         name=element.label,
+                         obj_type=layout_label_to_ds_type.get(element.label),
+                         ref=ref_str,
+                     ),
+                 )
+
+                 # Initialise empty table data grid (only empty cells)
+                 table_data = [
+                     [
+                         TableCell(
+                             text="",
+                             # bbox=[0,0,0,0],
+                             spans=[[i, j]],
+                             obj_type="body",
+                         )
+                         for j in range(element.num_cols)
+                     ]
+                     for i in range(element.num_rows)
+                 ]
+
+                 # Overwrite cells in table data for which there is actual cell content.
+                 for cell in element.table_cells:
+                     for i in range(
+                         min(cell.start_row_offset_idx, element.num_rows),
+                         min(cell.end_row_offset_idx, element.num_rows),
+                     ):
+                         for j in range(
+                             min(cell.start_col_offset_idx, element.num_cols),
+                             min(cell.end_col_offset_idx, element.num_cols),
+                         ):
+                             celltype = "body"
+                             if cell.column_header:
+                                 celltype = "col_header"
+                             elif cell.row_header:
+                                 celltype = "row_header"
+
+                             def make_spans(cell):
+                                 for rspan in range(
+                                     min(cell.start_row_offset_idx, element.num_rows),
+                                     min(cell.end_row_offset_idx, element.num_rows),
+                                 ):
+                                     for cspan in range(
+                                         min(
+                                             cell.start_col_offset_idx, element.num_cols
+                                         ),
+                                         min(cell.end_col_offset_idx, element.num_cols),
+                                     ):
+                                         yield [rspan, cspan]
+
+                             spans = list(make_spans(cell))
+                             table_data[i][j] = TableCell(
+                                 text=cell.text,
+                                 bbox=cell.bbox.to_bottom_left_origin(
+                                     page_no_to_page[element.page_no].size.height
+                                 ).as_tuple(),
+                                 # col=j,
+                                 # row=i,
+                                 spans=spans,
+                                 obj_type=celltype,
+                                 # col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
+                                 # row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
+                             )
+
+                 tables.append(
+                     DsSchemaTable(
+                         num_cols=element.num_cols,
+                         num_rows=element.num_rows,
+                         obj_type=layout_label_to_ds_type.get(element.label),
+                         data=table_data,
+                         prov=[
+                             Prov(
+                                 bbox=target_bbox,
+                                 page=element.page_no,
+                                 span=[0, 0],
+                             )
+                         ],
+                     )
+                 )
+
+             elif isinstance(element, FigureElement):
+                 index = len(figures)
+                 ref_str = f"#/figures/{index}"
+                 main_text.append(
+                     Ref(
+                         name=element.label,
+                         obj_type=layout_label_to_ds_type.get(element.label),
+                         ref=ref_str,
+                     ),
+                 )
+                 figures.append(
+                     BaseCell(
+                         prov=[
+                             Prov(
+                                 bbox=target_bbox,
+                                 page=element.page_no,
+                                 span=[0, 0],
+                             )
+                         ],
+                         obj_type=layout_label_to_ds_type.get(element.label),
+                         # data=[[]],
+                     )
+                 )
+
+         page_dimensions = [
+             PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
+             for p in self.pages
+         ]
+
+         ds_doc = DsDocument(
+             name=title,
+             description=desc,
+             file_info=file_info,
+             main_text=main_text,
+             tables=tables,
+             figures=figures,
+             page_dimensions=page_dimensions,
+         )
+
+         return ds_doc
+
+     def render_as_dict(self):
+         if self.output:
+             return self.output.model_dump(by_alias=True, exclude_none=True)
+         else:
+             return {}
+
+     def render_as_markdown(self):
+         if self.output:
+             return export_to_markdown(
+                 self.output.model_dump(by_alias=True, exclude_none=True)
+             )
+         else:
+             return ""
+
+
+ class DocumentConversionInput(BaseModel):
+
+     _path_or_stream_iterator: Optional[Iterable[Union[Path, DocumentStream]]] = None
+     limits: Optional[DocumentLimits] = DocumentLimits()
+
+     DEFAULT_BACKEND: ClassVar = PyPdfiumDocumentBackend
+
+     def docs(
+         self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
+     ) -> Iterable[InputDocument]:
+
+         pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
+
+         for obj in self._path_or_stream_iterator:
+             if isinstance(obj, Path):
+                 yield InputDocument(
+                     path_or_stream=obj, limits=self.limits, pdf_backend=pdf_backend
+                 )
+             elif isinstance(obj, DocumentStream):
+                 yield InputDocument(
+                     path_or_stream=obj.stream,
+                     filename=obj.filename,
+                     limits=self.limits,
+                     pdf_backend=pdf_backend,
+                 )
+
+     @classmethod
+     def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
+         paths = [Path(p) for p in paths]
+
+         doc_input = cls(limits=limits)
+         doc_input._path_or_stream_iterator = paths
+
+         return doc_input
+
+     @classmethod
+     def from_streams(
+         cls, streams: Iterable[DocumentStream], limits: Optional[DocumentLimits] = None
+     ):
+         doc_input = cls(limits=limits)
+         doc_input._path_or_stream_iterator = streams
+
+         return doc_input
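A minimal usage sketch of the input model above (the file name is hypothetical; assumes pypdfium2 is installed, since docs() instantiates the default backend):

    from pathlib import Path

    from docling.datamodel.document import DocumentConversionInput

    doc_input = DocumentConversionInput.from_paths([Path("report.pdf")])  # hypothetical file
    for in_doc in doc_input.docs():
        print(in_doc.file.name, in_doc.valid, in_doc.page_count)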
@@ -0,0 +1,32 @@
+ import sys
+
+ from pydantic import BaseModel
+ from pydantic_settings import BaseSettings
+
+
+ class DocumentLimits(BaseModel):
+     max_num_pages: int = sys.maxsize
+     max_file_size: int = sys.maxsize
+
+
+ class BatchConcurrencySettings(BaseModel):
+     doc_batch_size: int = 2
+     doc_batch_concurrency: int = 2
+     page_batch_size: int = 4
+     page_batch_concurrency: int = 2
+
+     # doc_batch_size: int = 1
+     # doc_batch_concurrency: int = 1
+     # page_batch_size: int = 1
+     # page_batch_concurrency: int = 1
+
+     # model_concurrency: int = 2
+
+     # To force models into single core: export OMP_NUM_THREADS=1
+
+
+ class AppSettings(BaseSettings):
+     perf: BatchConcurrencySettings
+
+
+ settings = AppSettings(perf=BatchConcurrencySettings())
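Since settings is a plain module-level object, the batch sizes above can presumably be tuned before starting a conversion; a small sketch:

    from docling.datamodel.settings import settings

    settings.perf.page_batch_size = 8  # process pages in larger batches
    settings.perf.doc_batch_size = 1   # convert one document at a time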
@@ -0,0 +1,207 @@
+ import functools
+ import logging
+ import time
+ import traceback
+ from pathlib import Path
+ from typing import Iterable, Optional, Type, Union
+
+ from PIL import ImageDraw
+
+ from docling.backend.abstract_backend import PdfDocumentBackend
+ from docling.datamodel.base_models import (
+     AssembledUnit,
+     ConversionStatus,
+     Page,
+     PipelineOptions,
+ )
+ from docling.datamodel.document import (
+     ConvertedDocument,
+     DocumentConversionInput,
+     InputDocument,
+ )
+ from docling.datamodel.settings import settings
+ from docling.models.ds_glm_model import GlmModel
+ from docling.models.page_assemble_model import PageAssembleModel
+ from docling.pipeline.base_model_pipeline import BaseModelPipeline
+ from docling.pipeline.standard_model_pipeline import StandardModelPipeline
+ from docling.utils.utils import chunkify, create_hash
+
+ _log = logging.getLogger(__name__)
+
+
+ class DocumentConverter:
+     _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
+     _table_model_path = "model_artifacts/tableformer"
+
+     def __init__(
+         self,
+         artifacts_path: Optional[Union[Path, str]] = None,
+         pipeline_options: PipelineOptions = PipelineOptions(),
+         pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
+         pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
+     ):
+         if not artifacts_path:
+             artifacts_path = self.download_models_hf()
+
+         artifacts_path = Path(artifacts_path)
+
+         self.model_pipeline = pipeline_cls(
+             artifacts_path=artifacts_path, pipeline_options=pipeline_options
+         )
+
+         self.page_assemble_model = PageAssembleModel(config={})
+         self.glm_model = GlmModel(config={})
+         self.pdf_backend = pdf_backend
+
+     @staticmethod
+     def download_models_hf(
+         local_dir: Optional[Path] = None, force: bool = False
+     ) -> Path:
+         from huggingface_hub import snapshot_download
+
+         download_path = snapshot_download(
+             repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
+         )
+
+         return Path(download_path)
+
+     def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument]:
+
+         for input_batch in chunkify(
+             input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
+         ):
+             _log.info("Going to convert document batch...")
+             # parallel processing only within input_batch
+             # with ThreadPoolExecutor(
+             #     max_workers=settings.perf.doc_batch_concurrency
+             # ) as pool:
+             #     yield from pool.map(self.process_document, input_batch)
+
+             # Note: the pdfium backend is not thread-safe; thread pool usage was disabled.
+             yield from map(self.process_document, input_batch)
+
+     def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
+         start_doc_time = time.time()
+         converted_doc = ConvertedDocument(input=in_doc)
+
+         if not in_doc.valid:
+             converted_doc.status = ConversionStatus.FAILURE
+             return converted_doc
+
+         for i in range(0, in_doc.page_count):
+             converted_doc.pages.append(Page(page_no=i))
+
+         all_assembled_pages = []
+
+         try:
+             # Iterate batches of pages (page_batch_size) in the doc
+             for page_batch in chunkify(
+                 converted_doc.pages, settings.perf.page_batch_size
+             ):
+
+                 start_pb_time = time.time()
+                 # Pipeline
+
+                 # 1. Initialise the page resources
+                 init_pages = map(
+                     functools.partial(self.initialize_page, in_doc), page_batch
+                 )
+
+                 # 2. Populate page image
+                 pages_with_images = map(
+                     functools.partial(self.populate_page_images, in_doc), init_pages
+                 )
+
+                 # 3. Populate programmatic page cells
+                 pages_with_cells = map(
+                     functools.partial(self.parse_page_cells, in_doc),
+                     pages_with_images,
+                 )
+
+                 pipeline_pages = self.model_pipeline.apply(pages_with_cells)
+
+                 # 7. Assemble page elements (per page)
+                 assembled_pages = self.page_assemble_model(pipeline_pages)
+
+                 # exhaust assembled_pages
+                 for assembled_page in assembled_pages:
+                     # Free up mem resources before moving on with next batch
+                     assembled_page.image = (
+                         None  # Comment this if you want to visualize page images
+                     )
+                     assembled_page._backend.unload()
+
+                     all_assembled_pages.append(assembled_page)
+
+                 end_pb_time = time.time() - start_pb_time
+                 _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+
+             # Free up mem resources of PDF backend
+             in_doc._backend.unload()
+
+             converted_doc.pages = all_assembled_pages
+             self.assemble_doc(converted_doc)
+
+             converted_doc.status = ConversionStatus.SUCCESS
+
+         except Exception as e:
+             converted_doc.status = ConversionStatus.FAILURE
+             trace = "\n".join(traceback.format_exception(e))
+             _log.info(f"Encountered an error during conversion: {trace}")
+
+         end_doc_time = time.time() - start_doc_time
+         _log.info(
+             f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
+         )
+
+         return converted_doc
+
+     # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
+     def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+         page._backend = doc._backend.load_page(page.page_no)
+         page.size = page._backend.get_size()
+         page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
+
+         return page
+
+     # Generate the page image and store it in the page object
+     def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
+         page.image = page._backend.get_page_image()
+
+         return page
+
+     # Extract and populate the page cells and store it in the page object
+     def parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
+         page.cells = page._backend.get_text_cells()
+
+         # DEBUG code:
+         def draw_text_boxes(image, cells):
+             draw = ImageDraw.Draw(image)
+             for c in cells:
+                 x0, y0, x1, y1 = c.bbox.as_tuple()
+                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
+             image.show()
+
+         # draw_text_boxes(page.image, page.cells)
+
+         return page
+
+     def assemble_doc(self, converted_doc: ConvertedDocument):
+         all_elements = []
+         all_headers = []
+         all_body = []
+
+         for p in converted_doc.pages:
+
+             for el in p.assembled.body:
+                 all_body.append(el)
+             for el in p.assembled.headers:
+                 all_headers.append(el)
+             for el in p.assembled.elements:
+                 all_elements.append(el)
+
+         converted_doc.assembled = AssembledUnit(
+             elements=all_elements, headers=all_headers, body=all_body
+         )
+
+         converted_doc.output = self.glm_model(converted_doc)
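Putting the pieces together, an end-to-end sketch (the input file is hypothetical, and the module path docling.document_converter for DocumentConverter is an assumption; the other imports are taken from the files above):

    from pathlib import Path

    from docling.datamodel.base_models import ConversionStatus
    from docling.datamodel.document import DocumentConversionInput
    from docling.document_converter import DocumentConverter  # assumed module path

    converter = DocumentConverter()  # fetches model artifacts from Hugging Face on first use
    doc_input = DocumentConversionInput.from_paths([Path("report.pdf")])  # hypothetical file
    for converted in converter.convert(doc_input):
        if converted.status == ConversionStatus.SUCCESS:
            print(converted.render_as_markdown())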
@@ -0,0 +1,82 @@
+ import copy
+ import random
+
+ from deepsearch_glm.nlp_utils import init_nlp_model
+ from deepsearch_glm.utils.ds_utils import to_legacy_document_format
+ from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+ from docling_core.types import BaseText
+ from docling_core.types import Document as DsDocument
+ from docling_core.types import Ref
+ from PIL import ImageDraw
+
+ from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin
+ from docling.datamodel.document import ConvertedDocument
+
+
+ class GlmModel:
+     def __init__(self, config):
+         self.config = config
+         load_pretrained_nlp_models()
+         model = init_nlp_model(model_names="language;term;reference")
+         self.model = model
+
+     def __call__(self, document: ConvertedDocument) -> DsDocument:
+         ds_doc = document.to_ds_document()
+         ds_doc_dict = ds_doc.model_dump(by_alias=True)
+
+         glm_doc = self.model.apply_on_doc(ds_doc_dict)
+         ds_doc_dict = to_legacy_document_format(
+             glm_doc, ds_doc_dict, update_name_label=True
+         )
+
+         exported_doc = DsDocument.model_validate(ds_doc_dict)
+
+         # DEBUG code:
+         def draw_clusters_and_cells(ds_document, page_no):
+             clusters_to_draw = []
+             image = copy.deepcopy(document.pages[page_no].image)
+             for ix, elem in enumerate(ds_document.main_text):
+                 if isinstance(elem, BaseText):
+                     prov = elem.prov[0]
+                 elif isinstance(elem, Ref):
+                     _, arr, index = elem.ref.split("/")
+                     index = int(index)
+                     if arr == "tables":
+                         prov = ds_document.tables[index].prov[0]
+                     elif arr == "figures":
+                         prov = ds_document.figures[index].prov[0]
+                     else:
+                         prov = None
+
+                 if prov and prov.page == page_no:
+                     clusters_to_draw.append(
+                         Cluster(
+                             id=ix,
+                             label=elem.name,
+                             bbox=BoundingBox.from_tuple(
+                                 coord=prov.bbox,
+                                 origin=CoordOrigin.BOTTOMLEFT,
+                             ).to_top_left_origin(document.pages[page_no].size.height),
+                         )
+                     )
+
+             draw = ImageDraw.Draw(image)
+             for c in clusters_to_draw:
+                 x0, y0, x1, y1 = c.bbox.as_tuple()
+                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
+                 draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
+
+                 cell_color = (
+                     random.randint(30, 140),
+                     random.randint(30, 140),
+                     random.randint(30, 140),
+                 )
+                 for tc in c.cells:  # [:1]:
+                     x0, y0, x1, y1 = tc.bbox.as_tuple()
+                     draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+             image.show()
+
+         # draw_clusters_and_cells(ds_doc, 0)
+         # draw_clusters_and_cells(exported_doc, 0)
+
+         return exported_doc
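Both this file and the document model above lean on the coordinate-origin flips in docling.datamodel.base_models; a short sketch of the round trip (from_tuple and the conversion methods are taken from the usage above, the page height is hypothetical):

    from docling.datamodel.base_models import BoundingBox, CoordOrigin

    bbox = BoundingBox.from_tuple(coord=(10, 20, 110, 70), origin=CoordOrigin.TOPLEFT)
    flipped = bbox.to_bottom_left_origin(800)   # assumed page height of 800
    restored = flipped.to_top_left_origin(800)  # back to top-left origin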