docling 2.23.0__py3-none-any.whl → 2.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,386 +0,0 @@
1
- import copy
2
- import random
3
- from pathlib import Path
4
- from typing import List, Union
5
-
6
- from deepsearch_glm.andromeda_nlp import nlp_model
7
- from docling_core.types.doc import (
8
- BoundingBox,
9
- CoordOrigin,
10
- DocItemLabel,
11
- DoclingDocument,
12
- )
13
- from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
14
- from docling_core.types.legacy_doc.base import (
15
- Figure,
16
- PageDimensions,
17
- PageReference,
18
- Prov,
19
- Ref,
20
- )
21
- from docling_core.types.legacy_doc.base import Table as DsSchemaTable
22
- from docling_core.types.legacy_doc.base import TableCell
23
- from docling_core.types.legacy_doc.document import BaseText
24
- from docling_core.types.legacy_doc.document import (
25
- CCSDocumentDescription as DsDocumentDescription,
26
- )
27
- from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
28
- from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
29
- from PIL import ImageDraw
30
- from pydantic import BaseModel, ConfigDict, TypeAdapter
31
-
32
- from docling.datamodel.base_models import (
33
- Cluster,
34
- ContainerElement,
35
- FigureElement,
36
- Table,
37
- TextElement,
38
- )
39
- from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
40
- from docling.datamodel.settings import settings
41
- from docling.utils.glm_utils import to_docling_document
42
- from docling.utils.profiling import ProfilingScope, TimeRecorder
43
- from docling.utils.utils import create_hash
44
-
45
-
46
class GlmOptions(BaseModel):
    """Configuration options accepted by ``GlmModel``."""

    # Clear pydantic's protected namespaces so that the ``model_names`` field
    # below (which starts with the reserved ``model_`` prefix) is accepted
    # without a namespace-conflict warning.
    model_config = ConfigDict(protected_namespaces=())

    # NOTE(review): presumably a ";"-separated list of GLM model names; the
    # value is stored on the model but not read anywhere in this file.
    model_names: str = ""  # e.g. "language;term;reference"
50
-
51
-
52
class GlmModel:
    """Pass an assembled conversion result through the GLM NLP model.

    The assembled page elements are first converted into a legacy document
    (``DsDocument``), the GLM model is applied to its dictionary form, and the
    model output is converted into a ``DoclingDocument``.
    """

    def __init__(self, options: GlmOptions):
        """Store *options* and instantiate the underlying ``nlp_model``."""
        self.options = options

        self.model = nlp_model(loglevel="error", text_ordering=True)

    @staticmethod
    def _bottom_left_bbox(element, page_no_to_page):
        """Return the element's cluster bbox converted to a lower-left origin."""
        page_height = page_no_to_page[element.page_no].size.height
        return DsBoundingBox(
            element.cluster.bbox.to_bottom_left_origin(page_height).as_tuple()
        )

    @staticmethod
    def _make_prov(bbox, page_no: int, span_end: int = 0) -> Prov:
        """Build a provenance entry; legacy page numbers are ``page_no + 1``."""
        return Prov(bbox=bbox, page=page_no + 1, span=[0, span_end])

    @staticmethod
    def _children_payload(element) -> dict:
        """Serialize the element's child clusters into a payload dictionary."""
        # hack to channel child clusters through GLM
        return {
            "children": TypeAdapter(List[Cluster]).dump_python(
                element.cluster.children
            )
        }

    @staticmethod
    def _make_ref(element, ref_str: str) -> Ref:
        """Build a main-text reference to an item stored in a side list."""
        return Ref(
            name=element.label,
            obj_type=layout_label_to_ds_type.get(element.label),
            ref=ref_str,
        )

    @staticmethod
    def _build_table_data(element, page_height) -> List[List[TableCell]]:
        """Build the ``num_rows x num_cols`` grid of legacy table cells.

        The grid starts out filled with empty "body" cells; every grid position
        covered by a detected cell is then overwritten with that cell's
        content. Offsets are clamped to the table dimensions.
        """
        num_rows = element.num_rows
        num_cols = element.num_cols

        # Initialise empty table data grid (only empty cells)
        table_data = [
            [
                TableCell(text="", spans=[[i, j]], obj_type="body")
                for j in range(num_cols)
            ]
            for i in range(num_rows)
        ]

        # Overwrite cells in table data for which there is actual cell content.
        for cell in element.table_cells:
            row_range = range(
                min(cell.start_row_offset_idx, num_rows),
                min(cell.end_row_offset_idx, num_rows),
            )
            col_range = range(
                min(cell.start_col_offset_idx, num_cols),
                min(cell.end_col_offset_idx, num_cols),
            )
            if not row_range or not col_range:
                # The cell covers no grid position; nothing to overwrite.
                continue

            # Cell type, spans and bbox depend only on the cell, so they are
            # computed once instead of once per covered grid position.
            if cell.column_header:
                celltype = "col_header"
            elif cell.row_header:
                celltype = "row_header"
            elif cell.row_section:
                celltype = "row_section"
            else:
                celltype = "body"

            spans = [[rspan, cspan] for rspan in row_range for cspan in col_range]

            if cell.bbox is not None:
                bbox = cell.bbox.to_bottom_left_origin(page_height).as_tuple()
            else:
                bbox = None

            for i in row_range:
                for j in col_range:
                    table_data[i][j] = TableCell(
                        text=cell.text,
                        bbox=bbox,
                        spans=spans,
                        obj_type=celltype,
                    )

        return table_data

    def _to_legacy_document(self, conv_res) -> DsDocument:
        """Convert the assembled elements of *conv_res* into a legacy document.

        Body elements are appended to ``main_text`` in order: text and
        container elements inline, tables and figures as references into the
        ``tables`` / ``figures`` lists. Page headers and footers are appended
        afterwards as references into ``page_headers`` / ``page_footers``.

        Args:
            conv_res: Conversion result providing ``input``, ``pages`` and
                ``assembled``.

        Returns:
            The populated legacy document.
        """
        title = ""
        desc: DsDocumentDescription = DsDocumentDescription(logs=[])

        page_hashes = [
            PageReference(
                hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
                page=p.page_no + 1,
                model="default",
            )
            for p in conv_res.pages
        ]

        file_info = DsFileInfoObject(
            filename=conv_res.input.file.name,
            document_hash=conv_res.input.document_hash,
            num_pages=conv_res.input.page_count,
            page_hashes=page_hashes,
        )

        main_text: List[Union[Ref, BaseText]] = []
        page_headers: List[Union[Ref, BaseText]] = []
        page_footers: List[Union[Ref, BaseText]] = []

        tables: List[DsSchemaTable] = []
        figures: List[Figure] = []

        page_no_to_page = {p.page_no: p for p in conv_res.pages}

        for element in conv_res.assembled.body:
            # Convert bboxes to lower-left origin.
            target_bbox = self._bottom_left_bbox(element, page_no_to_page)

            if isinstance(element, TextElement):
                main_text.append(
                    BaseText(
                        text=element.text,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        name=element.label,
                        prov=[
                            self._make_prov(
                                target_bbox, element.page_no, len(element.text)
                            )
                        ],
                    )
                )
            elif isinstance(element, Table):
                main_text.append(self._make_ref(element, f"#/tables/{len(tables)}"))
                tables.append(
                    DsSchemaTable(
                        num_cols=element.num_cols,
                        num_rows=element.num_rows,
                        obj_type=layout_label_to_ds_type.get(element.label),
                        data=self._build_table_data(
                            element, page_no_to_page[element.page_no].size.height
                        ),
                        prov=[self._make_prov(target_bbox, element.page_no)],
                    )
                )
            elif isinstance(element, FigureElement):
                main_text.append(self._make_ref(element, f"#/figures/{len(figures)}"))
                figures.append(
                    Figure(
                        prov=[self._make_prov(target_bbox, element.page_no)],
                        obj_type=layout_label_to_ds_type.get(element.label),
                        payload=self._children_payload(element),
                    )
                )
            elif isinstance(element, ContainerElement):
                main_text.append(
                    BaseText(
                        text="",
                        payload=self._children_payload(element),
                        obj_type=layout_label_to_ds_type.get(element.label),
                        name=element.label,
                        prov=[self._make_prov(target_bbox, element.page_no)],
                    )
                )

        # We can throw in headers and footers at the end of the legacy doc
        # since the reading-order will re-sort it later.
        for element in conv_res.assembled.headers:
            # Convert bboxes to lower-left origin.
            target_bbox = self._bottom_left_bbox(element, page_no_to_page)

            if not isinstance(element, TextElement):
                continue

            tel = BaseText(
                text=element.text,
                obj_type=layout_label_to_ds_type.get(element.label),
                name=element.label,
                prov=[
                    self._make_prov(target_bbox, element.page_no, len(element.text))
                ],
            )
            if element.label == DocItemLabel.PAGE_HEADER:
                collection, ref_prefix = page_headers, "#/page-headers"
            elif element.label == DocItemLabel.PAGE_FOOTER:
                collection, ref_prefix = page_footers, "#/page-footers"
            else:
                # Other labels are not carried over into the legacy document.
                continue

            main_text.append(self._make_ref(element, f"{ref_prefix}/{len(collection)}"))
            collection.append(tel)

        page_dimensions = [
            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
            for p in conv_res.pages
            if p.size is not None
        ]

        ds_doc: DsDocument = DsDocument(
            name=title,
            description=desc,
            file_info=file_info,
            main_text=main_text,
            tables=tables,
            figures=figures,
            page_dimensions=page_dimensions,
            page_headers=page_headers,
            page_footers=page_footers,
        )

        return ds_doc

    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
        """Apply the GLM model to *conv_res* and return the resulting document.

        The legacy-document conversion, the model call and the conversion to a
        ``DoclingDocument`` are all timed under the "glm" profiling key.
        """
        with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
            ds_doc = self._to_legacy_document(conv_res)
            ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)

            glm_doc = self.model.apply_on_doc(ds_doc_dict)

            docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental

        # DEBUG code:
        def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
            """Draw the legacy-document clusters of one page onto its image.

            *page_no* is the 1-based page number used by the legacy document's
            provenance entries and page dimensions. The image is shown when
            *show* is true, otherwise saved below the debug output path.
            """
            # Provenance pages are ``page.page_no + 1``, so resolve the page by
            # number instead of indexing ``conv_res.pages`` with it.
            page = next(
                (p for p in conv_res.pages if p.page_no + 1 == page_no), None
            )
            if page is None:
                raise ValueError(f"No page with 1-based page number {page_no}")

            clusters_to_draw = []
            image = copy.deepcopy(page.image)
            for ix, elem in enumerate(ds_document.main_text):
                prov = None
                if isinstance(elem, BaseText):
                    prov = elem.prov[0]  # type: ignore
                elif isinstance(elem, Ref):
                    _, arr, index = elem.ref.split("/")
                    if arr == "tables":
                        prov = ds_document.tables[int(index)].prov[0]
                    elif arr == "figures":
                        # The legacy document stores figures under ``figures``.
                        prov = ds_document.figures[int(index)].prov[0]

                if prov and prov.page == page_no:
                    clusters_to_draw.append(
                        Cluster(
                            id=ix,
                            label=elem.name,
                            bbox=BoundingBox.from_tuple(
                                coord=prov.bbox,  # type: ignore
                                origin=CoordOrigin.BOTTOMLEFT,
                            ).to_top_left_origin(page.size.height),
                        )
                    )

            draw = ImageDraw.Draw(image)
            for c in clusters_to_draw:
                x0, y0, x1, y1 = c.bbox.as_tuple()
                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
                draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))

                cell_color = (
                    random.randint(30, 140),
                    random.randint(30, 140),
                    random.randint(30, 140),
                )
                for tc in c.cells:  # [:1]:
                    x0, y0, x1, y1 = tc.bbox.as_tuple()
                    draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)

            if show:
                image.show()
            else:
                out_path: Path = (
                    Path(settings.debug.debug_output_path)
                    / f"debug_{conv_res.input.file.stem}"
                )
                out_path.mkdir(parents=True, exist_ok=True)

                out_file = out_path / f"doc_page_{page_no:05}.png"
                image.save(str(out_file), format="png")

        # for item in ds_doc.page_dimensions:
        #     page_no = item.page
        #     draw_clusters_and_cells(ds_doc, page_no)

        return docling_doc