docling-core 2.23.3__py3-none-any.whl → 2.24.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -0,0 +1,492 @@
1
+ """Define classes for Doctags serialization."""
2
+
3
+ from enum import Enum
4
+ from typing import Dict, List, Optional, Union
5
+
6
+ from pydantic import BaseModel
7
+ from typing_extensions import override
8
+
9
+ from docling_core.experimental.serializer.base import (
10
+ BaseDocSerializer,
11
+ BaseFallbackSerializer,
12
+ BaseFormSerializer,
13
+ BaseInlineSerializer,
14
+ BaseKeyValueSerializer,
15
+ BaseListSerializer,
16
+ BasePictureSerializer,
17
+ BaseTableSerializer,
18
+ BaseTextSerializer,
19
+ SerializationResult,
20
+ )
21
+ from docling_core.experimental.serializer.common import CommonParams, DocSerializer
22
+ from docling_core.types.doc.document import (
23
+ CodeItem,
24
+ DocItem,
25
+ DoclingDocument,
26
+ FloatingItem,
27
+ FormItem,
28
+ InlineGroup,
29
+ KeyValueItem,
30
+ ListItem,
31
+ NodeItem,
32
+ OrderedList,
33
+ PictureClassificationData,
34
+ PictureItem,
35
+ PictureMoleculeData,
36
+ TableItem,
37
+ TextItem,
38
+ UnorderedList,
39
+ )
40
+ from docling_core.types.doc.tokens import DocumentToken
41
+
42
+
43
+ def _wrap(text: str, wrap_tag: str) -> str:
44
+ return f"<{wrap_tag}>{text}</{wrap_tag}>"
45
+
46
+
47
class DocTagsParams(CommonParams):
    """Serialization parameters specific to the DocTags format."""

    class Mode(str, Enum):
        """Layout mode of the DocTags output (selects the delimiter used)."""

        MINIFIED = "minified"
        HUMAN_FRIENDLY = "human_friendly"

    # Grid size forwarded as `xsize` / `ysize` to the location-token helpers.
    xsize: int = 500
    ysize: int = 500
    # Toggles for the individual pieces of an item's serialization.
    add_location: bool = True
    add_caption: bool = True
    add_content: bool = True
    # Forwarded to the table's OTSL export as `add_cell_location` / `add_cell_text`.
    add_table_cell_location: bool = False
    add_table_cell_text: bool = True
    # Whether a page-break token is placed between pages of the document.
    add_page_break: bool = True

    mode: Mode = Mode.HUMAN_FRIENDLY
66
+
67
+
68
def _get_delim(params: DocTagsParams) -> str:
    """Return the delimiter matching ``params.mode``.

    Human-friendly mode yields a newline, minified mode an empty string.

    Raises:
        RuntimeError: if the mode is neither of the two known values.
    """
    mode = params.mode
    if mode == DocTagsParams.Mode.HUMAN_FRIENDLY:
        return "\n"
    if mode == DocTagsParams.Mode.MINIFIED:
        return ""
    raise RuntimeError(f"Unknown DocTags mode: {params.mode}")
76
+
77
+
78
class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
    """DocTags-specific text item serializer."""

    @override
    def serialize(
        self,
        *,
        item: TextItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a text item into DocTags markup.

        The result concatenates, in this order and each only if enabled in the
        ``DocTagsParams`` built from ``kwargs``: the item's location tokens,
        its post-processed text, and (for floating items) its captions. The
        concatenation is wrapped in the tag derived from the item's label,
        except for list items, which are returned unwrapped.

        Args:
            item: the text item to serialize.
            doc_serializer: document serializer used for text post-processing
                and caption serialization.
            doc: the document the item belongs to.
            **kwargs: values used to build ``DocTagsParams``; also forwarded to
                ``serialize_captions``.

        Returns:
            The serialization result holding the DocTags text.
        """
        from docling_core.types.doc.document import SectionHeaderItem

        params = DocTagsParams(**kwargs)
        wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label(
            label=item.label,
            **({"level": item.level} if isinstance(item, SectionHeaderItem) else {}),
        )
        if isinstance(item, ListItem):
            # Deferring list item tags to list handling. This is decided
            # independently of `add_content`; otherwise a list item serialized
            # without content would get the list-item tag here and once more
            # from the list serializer.
            wrap_tag = None
        parts: list[str] = []

        if params.add_location:
            location = item.get_location_tokens(
                doc=doc,
                xsize=params.xsize,
                ysize=params.ysize,
            )
            if location:
                parts.append(location)

        if params.add_content:
            text_part = doc_serializer.post_process(
                text=item.text,
                formatting=item.formatting,
                hyperlink=item.hyperlink,
            )

            if isinstance(item, CodeItem):
                # Code text is prefixed with its language token and is not stripped.
                language_token = DocumentToken.get_code_language_token(
                    code_language=item.code_language,
                )
                text_part = f"{language_token}{text_part}"
            else:
                text_part = text_part.strip()

            if text_part:
                parts.append(text_part)

        if params.add_caption and isinstance(item, FloatingItem):
            cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
            if cap_text:
                parts.append(cap_text)

        text_res = "".join(parts)
        if wrap_tag is not None:
            text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
        return SerializationResult(text=text_res)
139
+
140
+
141
class DocTagsTableSerializer(BaseTableSerializer):
    """Serializer producing the DocTags representation of table items."""

    @override
    def serialize(
        self,
        *,
        item: TableItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a table as location tokens, OTSL content and captions.

        The table's own location and OTSL export are emitted only when the
        table's reference is not among the excluded refs; captions are handled
        separately. Any non-empty output is wrapped in the OTSL tag.
        """
        params = DocTagsParams(**kwargs)
        chunks: list[str] = []

        is_excluded = item.self_ref in doc_serializer.get_excluded_refs(**kwargs)
        if not is_excluded:
            if params.add_location:
                chunks.append(
                    item.get_location_tokens(
                        doc=doc,
                        xsize=params.xsize,
                        ysize=params.ysize,
                    )
                )
            chunks.append(
                item.export_to_otsl(
                    doc=doc,
                    add_cell_location=params.add_table_cell_location,
                    add_cell_text=params.add_table_cell_text,
                    xsize=params.xsize,
                    ysize=params.ysize,
                )
            )

        if params.add_caption:
            caption = doc_serializer.serialize_captions(item=item, **kwargs).text
            if caption:
                chunks.append(caption)

        joined = "".join(chunks)
        if not joined:
            # Nothing to emit: return the empty text without a wrapping tag.
            return SerializationResult(text=joined)
        return SerializationResult(
            text=_wrap(text=joined, wrap_tag=DocumentToken.OTSL.value)
        )
186
+
187
+
188
class DocTagsPictureSerializer(BasePictureSerializer):
    """DocTags-specific picture item serializer."""

    @override
    def serialize(
        self,
        *,
        item: PictureItem,
        doc_serializer: BaseDocSerializer,
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a picture item into DocTags markup.

        Unless the picture's reference is excluded, the body consists of the
        location tokens (if enabled), a classification token for the first
        predicted class of the first classification annotation (if any), and
        the SMILES string of the first molecule annotation (if any). Captions
        are appended when enabled. Non-empty output is wrapped in the tag
        derived from the item's label.

        Args:
            item: the picture item to serialize.
            doc_serializer: document serializer used for exclusion lookup and
                caption serialization.
            doc: the document the item belongs to.
            **kwargs: values used to build ``DocTagsParams``; also forwarded to
                the document serializer.

        Returns:
            The serialization result holding the DocTags text.
        """
        params = DocTagsParams(**kwargs)
        parts: list[str] = []

        if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
            body = ""
            if params.add_location:
                body += item.get_location_tokens(
                    doc=doc,
                    xsize=params.xsize,
                    ysize=params.ysize,
                )

            classifications = [
                ann
                for ann in item.annotations
                if isinstance(ann, PictureClassificationData)
            ]
            # A classification annotation without any predicted class is
            # skipped instead of failing with an IndexError.
            if classifications and classifications[0].predicted_classes:
                predicted_class = classifications[0].predicted_classes[0].class_name
                body += DocumentToken.get_picture_classification_token(predicted_class)

            smiles_annotations = [
                ann for ann in item.annotations if isinstance(ann, PictureMoleculeData)
            ]
            if smiles_annotations:
                body += _wrap(
                    text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value
                )
            parts.append(body)

        if params.add_caption:
            cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
            if cap_text:
                parts.append(cap_text)

        text_res = "".join(parts)
        if text_res:
            token = DocumentToken.create_token_name_from_doc_item_label(
                label=item.label
            )
            text_res = _wrap(text=text_res, wrap_tag=token)
        return SerializationResult(text=text_res)
243
+
244
+
245
class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
    """Serializer producing the DocTags representation of key-value items."""

    @override
    def serialize(
        self,
        *,
        item: KeyValueItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Serialize a key-value region, one tagged element per graph cell.

        Each cell element holds the cell's location (when it has provenance
        and the document has pages), its stripped text (if content is
        enabled) and one link token per outgoing link. The whole region is
        always wrapped in the key-value-region tag.
        """
        params = DocTagsParams(**kwargs)

        # Page whose size is used for cell locations: the page of the item's
        # first provenance entry, or page 1 when the item has none.
        page_no = item.prov[0].page_no if len(item.prov) > 0 else 1

        segments: list[str] = []
        if params.add_location:
            segments.append(
                item.get_location_tokens(
                    doc=doc,
                    xsize=params.xsize,
                    ysize=params.ysize,
                )
            )

        # mapping from source_cell_id to a list of target_cell_ids
        targets_by_source: Dict[int, List[int]] = {}
        for link in item.graph.links:
            if link.source_cell_id not in targets_by_source:
                targets_by_source[link.source_cell_id] = []
            targets_by_source[link.source_cell_id].append(link.target_cell_id)

        for cell in item.graph.cells:
            pieces: list[str] = []
            if cell.prov is not None and doc.pages:
                page_w, page_h = doc.pages[page_no].size.as_tuple()
                pieces.append(
                    DocumentToken.get_location(
                        bbox=cell.prov.bbox.to_top_left_origin(page_h).as_tuple(),
                        page_w=page_w,
                        page_h=page_h,
                        xsize=params.xsize,
                        ysize=params.ysize,
                    )
                )
            if params.add_content:
                pieces.append(cell.text.strip())

            # TODO centralize token creation
            pieces.extend(
                f"<link_{target}>"
                for target in targets_by_source.get(cell.cell_id, [])
            )

            # TODO centralize token creation
            cell_tag = f"{cell.label.value}_{cell.cell_id}"
            segments.append(_wrap(text="".join(pieces), wrap_tag=cell_tag))

        if params.add_caption:
            caption = doc_serializer.serialize_captions(item=item, **kwargs).text
            if caption:
                segments.append(caption)

        region = _wrap("".join(segments), DocumentToken.KEY_VALUE_REGION.value)
        return SerializationResult(text=region)
313
+
314
+
315
class DocTagsFormSerializer(BaseFormSerializer):
    """Serializer for form items in the DocTags format (placeholder)."""

    @override
    def serialize(
        self,
        *,
        item: FormItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Return an empty serialization result for the passed form item."""
        # TODO add actual implementation
        return SerializationResult(text="")
331
+
332
+
333
class DocTagsListSerializer(BaseModel, BaseListSerializer):
    """DocTags-specific list serializer."""

    indent: int = 4

    @override
    def serialize(
        self,
        *,
        item: Union[UnorderedList, OrderedList],
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        is_inline_scope: bool = False,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Serialize a list group into DocTags markup.

        Every part obtained from the document serializer is wrapped in the
        list-item tag; the parts are joined with the mode's delimiter and the
        whole is wrapped in the ordered- or unordered-list tag. A list without
        parts serializes to an empty string.

        Args:
            item: the list group to serialize.
            doc_serializer: document serializer providing the list's parts.
            doc: the document the item belongs to.
            list_level: nesting level of this list; children are requested
                with ``list_level + 1``.
            is_inline_scope: forwarded unchanged to ``get_parts``.
            visited: refs of visited items; a new set is used only when
                ``None`` is passed.
            **kwargs: values used to build ``DocTagsParams``; also forwarded to
                ``get_parts``.

        Returns:
            The serialization result holding the DocTags text.
        """
        # `visited or set()` would replace a caller-supplied *empty* set with a
        # fresh one (empty sets are falsy), detaching it from the caller.
        my_visited = visited if visited is not None else set()
        params = DocTagsParams(**kwargs)
        parts = doc_serializer.get_parts(
            item=item,
            list_level=list_level + 1,
            is_inline_scope=is_inline_scope,
            visited=my_visited,
            **kwargs,
        )
        delim = _get_delim(params=params)
        if parts:
            text_res = delim.join(
                [
                    _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value)
                    for p in parts
                ]
            )
            text_res = f"{text_res}{delim}"
            wrap_tag = (
                DocumentToken.ORDERED_LIST.value
                if isinstance(item, OrderedList)
                else DocumentToken.UNORDERED_LIST.value
            )
            text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
        else:
            text_res = ""
        return SerializationResult(text=text_res)
378
+
379
+
380
class DocTagsInlineSerializer(BaseInlineSerializer):
    """DocTags-specific inline group serializer."""

    @override
    def serialize(
        self,
        *,
        item: InlineGroup,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        list_level: int = 0,
        visited: Optional[set[str]] = None,  # refs of visited items
        **kwargs,
    ) -> SerializationResult:
        """Serialize an inline group into DocTags markup.

        The non-empty parts obtained from the document serializer (requested
        with ``is_inline_scope=True``) are joined with the mode's delimiter
        and, if anything remains, wrapped in the inline tag.

        Args:
            item: the inline group to serialize.
            doc_serializer: document serializer providing the group's parts.
            doc: the document the item belongs to.
            list_level: forwarded unchanged to ``get_parts``.
            visited: refs of visited items; a new set is used only when
                ``None`` is passed.
            **kwargs: values used to build ``DocTagsParams``; also forwarded to
                ``get_parts``.

        Returns:
            The serialization result holding the DocTags text.
        """
        # `visited or set()` would replace a caller-supplied *empty* set with a
        # fresh one (empty sets are falsy), detaching it from the caller.
        my_visited = visited if visited is not None else set()
        params = DocTagsParams(**kwargs)
        parts = doc_serializer.get_parts(
            item=item,
            list_level=list_level,
            is_inline_scope=True,
            visited=my_visited,
            **kwargs,
        )
        wrap_tag = DocumentToken.INLINE.value
        delim = _get_delim(params=params)
        text_res = delim.join([p.text for p in parts if p.text])
        if text_res:
            text_res = f"{text_res}{delim}"
            text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
        return SerializationResult(text=text_res)
411
+
412
+
413
class DocTagsFallbackSerializer(BaseFallbackSerializer):
    """Fallback serializer of the DocTags format."""

    @override
    def serialize(
        self,
        *,
        item: NodeItem,
        doc_serializer: "BaseDocSerializer",
        doc: DoclingDocument,
        **kwargs,
    ) -> SerializationResult:
        """Return an empty serialization result for the passed item."""
        return SerializationResult(text="")
428
+
429
+
430
class DocTagsDocSerializer(DocSerializer):
    """DocTags-specific document serializer."""

    # Per-item-type serializers, defaulting to the DocTags implementations.
    text_serializer: BaseTextSerializer = DocTagsTextSerializer()
    table_serializer: BaseTableSerializer = DocTagsTableSerializer()
    picture_serializer: BasePictureSerializer = DocTagsPictureSerializer()
    key_value_serializer: BaseKeyValueSerializer = DocTagsKeyValueSerializer()
    form_serializer: BaseFormSerializer = DocTagsFormSerializer()
    fallback_serializer: BaseFallbackSerializer = DocTagsFallbackSerializer()

    list_serializer: BaseListSerializer = DocTagsListSerializer()
    inline_serializer: BaseInlineSerializer = DocTagsInlineSerializer()

    params: DocTagsParams = DocTagsParams()

    @override
    def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
        """Serialize a page by joining its parts with the mode's delimiter."""
        delim = _get_delim(params=self.params)
        text_res = delim.join([p.text for p in parts])
        return SerializationResult(text=text_res)

    @override
    def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
        """Serialize a document out of its pages.

        With page breaks enabled, the non-empty pages are joined by a
        page-break token surrounded by the mode's delimiter; otherwise all
        pages are joined the same way the parts of a page are. The content is
        wrapped in the document tag, with a delimiter before the closing tag.
        """
        delim = _get_delim(params=self.params)
        if self.params.add_page_break:
            page_sep = f"{delim}<{DocumentToken.PAGE_BREAK.value}>{delim}"
            content = page_sep.join([p.text for p in pages if p.text])
        else:
            content = self.serialize_page(parts=pages).text
        wrap_tag = DocumentToken.DOCUMENT.value
        text_res = f"<{wrap_tag}>{content}{delim}</{wrap_tag}>"
        return SerializationResult(text=text_res)

    @override
    def serialize_captions(
        self,
        item: FloatingItem,
        **kwargs,
    ) -> SerializationResult:
        """Serialize the item's captions.

        The caption text comes from the parent implementation. When it is
        non-empty and locations are enabled, the location tokens of every
        non-excluded caption that resolves to a ``DocItem`` are placed before
        the text. Non-empty output is wrapped in the caption tag.

        Args:
            item: the floating item whose captions are serialized.
            **kwargs: values used to build ``DocTagsParams``; also forwarded to
                the parent implementation and to ``get_excluded_refs``.

        Returns:
            The serialization result holding the DocTags text (possibly empty).
        """
        params = DocTagsParams(**kwargs)
        parts: list[str] = []

        if item.captions:
            cap_text = super().serialize_captions(item, **kwargs).text
            if cap_text:
                if params.add_location:
                    # The excluded refs depend only on kwargs, so they are
                    # looked up once rather than once per caption.
                    excluded_refs = self.get_excluded_refs(**kwargs)
                    for caption in item.captions:
                        if caption.cref in excluded_refs:
                            continue
                        if isinstance(cap := caption.resolve(self.doc), DocItem):
                            loc_txt = cap.get_location_tokens(
                                doc=self.doc,
                                xsize=params.xsize,
                                ysize=params.ysize,
                            )
                            parts.append(loc_txt)
                parts.append(cap_text)
        text_res = "".join(parts)
        if text_res:
            text_res = _wrap(text=text_res, wrap_tag=DocumentToken.CAPTION.value)
        return SerializationResult(text=text_res)