docling-core 2.25.0__py3-none-any.whl → 2.26.1__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +29 -3
- docling_core/experimental/serializer/common.py +157 -71
- docling_core/experimental/serializer/doctags.py +88 -54
- docling_core/experimental/serializer/html.py +941 -0
- docling_core/experimental/serializer/html_styles.py +212 -0
- docling_core/experimental/serializer/markdown.py +105 -63
- docling_core/transforms/chunker/base.py +8 -2
- docling_core/transforms/chunker/hierarchical_chunker.py +130 -109
- docling_core/transforms/chunker/hybrid_chunker.py +54 -12
- docling_core/types/doc/document.py +702 -482
- docling_core/types/doc/labels.py +2 -0
- docling_core/types/doc/page.py +12 -17
- docling_core/types/doc/tokens.py +3 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/METADATA +1 -1
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/RECORD +18 -16
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/LICENSE +0 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/WHEEL +0 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.1.dist-info}/entry_points.txt +0 -0
|
@@ -18,7 +18,11 @@ from docling_core.experimental.serializer.base import (
|
|
|
18
18
|
BaseTextSerializer,
|
|
19
19
|
SerializationResult,
|
|
20
20
|
)
|
|
21
|
-
from docling_core.experimental.serializer.common import
|
|
21
|
+
from docling_core.experimental.serializer.common import (
|
|
22
|
+
CommonParams,
|
|
23
|
+
DocSerializer,
|
|
24
|
+
create_ser_result,
|
|
25
|
+
)
|
|
22
26
|
from docling_core.types.doc.document import (
|
|
23
27
|
CodeItem,
|
|
24
28
|
DocItem,
|
|
@@ -33,10 +37,12 @@ from docling_core.types.doc.document import (
|
|
|
33
37
|
PictureClassificationData,
|
|
34
38
|
PictureItem,
|
|
35
39
|
PictureMoleculeData,
|
|
40
|
+
PictureTabularChartData,
|
|
36
41
|
TableItem,
|
|
37
42
|
TextItem,
|
|
38
43
|
UnorderedList,
|
|
39
44
|
)
|
|
45
|
+
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
|
|
40
46
|
from docling_core.types.doc.tokens import DocumentToken
|
|
41
47
|
|
|
42
48
|
|
|
@@ -135,7 +141,7 @@ class DocTagsTextSerializer(BaseModel, BaseTextSerializer):
|
|
|
135
141
|
text_res = "".join(parts)
|
|
136
142
|
if wrap_tag is not None:
|
|
137
143
|
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
|
|
138
|
-
return
|
|
144
|
+
return create_ser_result(text=text_res, span_source=item)
|
|
139
145
|
|
|
140
146
|
|
|
141
147
|
class DocTagsTableSerializer(BaseTableSerializer):
|
|
@@ -153,7 +159,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
153
159
|
"""Serializes the passed item."""
|
|
154
160
|
params = DocTagsParams(**kwargs)
|
|
155
161
|
|
|
156
|
-
|
|
162
|
+
res_parts: list[SerializationResult] = []
|
|
157
163
|
|
|
158
164
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
159
165
|
if params.add_location:
|
|
@@ -162,7 +168,7 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
162
168
|
xsize=params.xsize,
|
|
163
169
|
ysize=params.ysize,
|
|
164
170
|
)
|
|
165
|
-
|
|
171
|
+
res_parts.append(create_ser_result(text=loc_text, span_source=item))
|
|
166
172
|
|
|
167
173
|
otsl_text = item.export_to_otsl(
|
|
168
174
|
doc=doc,
|
|
@@ -171,18 +177,18 @@ class DocTagsTableSerializer(BaseTableSerializer):
|
|
|
171
177
|
xsize=params.xsize,
|
|
172
178
|
ysize=params.ysize,
|
|
173
179
|
)
|
|
174
|
-
|
|
180
|
+
res_parts.append(create_ser_result(text=otsl_text, span_source=item))
|
|
175
181
|
|
|
176
182
|
if params.add_caption:
|
|
177
|
-
|
|
178
|
-
if
|
|
179
|
-
|
|
183
|
+
cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
|
|
184
|
+
if cap_res.text:
|
|
185
|
+
res_parts.append(cap_res)
|
|
180
186
|
|
|
181
|
-
text_res = "".join(
|
|
187
|
+
text_res = "".join([r.text for r in res_parts])
|
|
182
188
|
if text_res:
|
|
183
189
|
text_res = _wrap(text=text_res, wrap_tag=DocumentToken.OTSL.value)
|
|
184
190
|
|
|
185
|
-
return
|
|
191
|
+
return create_ser_result(text=text_res, span_source=res_parts)
|
|
186
192
|
|
|
187
193
|
|
|
188
194
|
class DocTagsPictureSerializer(BasePictureSerializer):
|
|
@@ -199,7 +205,8 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
199
205
|
) -> SerializationResult:
|
|
200
206
|
"""Serializes the passed item."""
|
|
201
207
|
params = DocTagsParams(**kwargs)
|
|
202
|
-
|
|
208
|
+
res_parts: list[SerializationResult] = []
|
|
209
|
+
is_chart = False
|
|
203
210
|
|
|
204
211
|
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
|
|
205
212
|
body = ""
|
|
@@ -217,6 +224,16 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
217
224
|
]
|
|
218
225
|
if len(classifications) > 0:
|
|
219
226
|
predicted_class = classifications[0].predicted_classes[0].class_name
|
|
227
|
+
if predicted_class in [
|
|
228
|
+
PictureClassificationLabel.PIE_CHART,
|
|
229
|
+
PictureClassificationLabel.BAR_CHART,
|
|
230
|
+
PictureClassificationLabel.STACKED_BAR_CHART,
|
|
231
|
+
PictureClassificationLabel.LINE_CHART,
|
|
232
|
+
PictureClassificationLabel.FLOW_CHART,
|
|
233
|
+
PictureClassificationLabel.SCATTER_CHART,
|
|
234
|
+
PictureClassificationLabel.HEATMAP,
|
|
235
|
+
]:
|
|
236
|
+
is_chart = True
|
|
220
237
|
body += DocumentToken.get_picture_classification_token(predicted_class)
|
|
221
238
|
|
|
222
239
|
smiles_annotations = [
|
|
@@ -226,20 +243,35 @@ class DocTagsPictureSerializer(BasePictureSerializer):
|
|
|
226
243
|
body += _wrap(
|
|
227
244
|
text=smiles_annotations[0].smi, wrap_tag=DocumentToken.SMILES.value
|
|
228
245
|
)
|
|
229
|
-
|
|
246
|
+
|
|
247
|
+
tabular_chart_annotations = [
|
|
248
|
+
ann
|
|
249
|
+
for ann in item.annotations
|
|
250
|
+
if isinstance(ann, PictureTabularChartData)
|
|
251
|
+
]
|
|
252
|
+
if len(tabular_chart_annotations) > 0:
|
|
253
|
+
temp_doc = DoclingDocument(name="temp")
|
|
254
|
+
temp_table = temp_doc.add_table(
|
|
255
|
+
data=tabular_chart_annotations[0].chart_data
|
|
256
|
+
)
|
|
257
|
+
otsl_content = temp_table.export_to_otsl(
|
|
258
|
+
temp_doc, add_cell_location=False
|
|
259
|
+
)
|
|
260
|
+
body += otsl_content
|
|
261
|
+
res_parts.append(create_ser_result(text=body, span_source=item))
|
|
230
262
|
|
|
231
263
|
if params.add_caption:
|
|
232
|
-
|
|
233
|
-
if
|
|
234
|
-
|
|
264
|
+
cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
|
|
265
|
+
if cap_res.text:
|
|
266
|
+
res_parts.append(cap_res)
|
|
235
267
|
|
|
236
|
-
text_res = "".join(
|
|
268
|
+
text_res = "".join([r.text for r in res_parts])
|
|
237
269
|
if text_res:
|
|
238
270
|
token = DocumentToken.create_token_name_from_doc_item_label(
|
|
239
|
-
label=
|
|
271
|
+
label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE,
|
|
240
272
|
)
|
|
241
273
|
text_res = _wrap(text=text_res, wrap_tag=token)
|
|
242
|
-
return
|
|
274
|
+
return create_ser_result(text=text_res, span_source=res_parts)
|
|
243
275
|
|
|
244
276
|
|
|
245
277
|
class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
|
|
@@ -256,8 +288,8 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
256
288
|
) -> SerializationResult:
|
|
257
289
|
"""Serializes the passed item."""
|
|
258
290
|
params = DocTagsParams(**kwargs)
|
|
259
|
-
|
|
260
291
|
body = ""
|
|
292
|
+
results: list[SerializationResult] = []
|
|
261
293
|
|
|
262
294
|
page_no = 1
|
|
263
295
|
if len(item.prov) > 0:
|
|
@@ -302,14 +334,16 @@ class DocTagsKeyValueSerializer(BaseKeyValueSerializer):
|
|
|
302
334
|
tok = f"{cell.label.value}_{cell.cell_id}"
|
|
303
335
|
cell_txt = _wrap(text=cell_txt, wrap_tag=tok)
|
|
304
336
|
body += cell_txt
|
|
337
|
+
results.append(create_ser_result(text=body, span_source=item))
|
|
305
338
|
|
|
306
339
|
if params.add_caption:
|
|
307
|
-
|
|
308
|
-
if
|
|
309
|
-
|
|
340
|
+
cap_res = doc_serializer.serialize_captions(item=item, **kwargs)
|
|
341
|
+
if cap_res.text:
|
|
342
|
+
results.append(cap_res)
|
|
310
343
|
|
|
344
|
+
body = "".join([r.text for r in results])
|
|
311
345
|
body = _wrap(body, DocumentToken.KEY_VALUE_REGION.value)
|
|
312
|
-
return
|
|
346
|
+
return create_ser_result(text=body, span_source=results)
|
|
313
347
|
|
|
314
348
|
|
|
315
349
|
class DocTagsFormSerializer(BaseFormSerializer):
|
|
@@ -326,8 +360,7 @@ class DocTagsFormSerializer(BaseFormSerializer):
|
|
|
326
360
|
) -> SerializationResult:
|
|
327
361
|
"""Serializes the passed item."""
|
|
328
362
|
# TODO add actual implementation
|
|
329
|
-
|
|
330
|
-
return SerializationResult(text=text_res)
|
|
363
|
+
return create_ser_result()
|
|
331
364
|
|
|
332
365
|
|
|
333
366
|
class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
@@ -348,7 +381,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
348
381
|
**kwargs,
|
|
349
382
|
) -> SerializationResult:
|
|
350
383
|
"""Serializes the passed item."""
|
|
351
|
-
my_visited = visited
|
|
384
|
+
my_visited = visited if visited is not None else set()
|
|
352
385
|
params = DocTagsParams(**kwargs)
|
|
353
386
|
parts = doc_serializer.get_parts(
|
|
354
387
|
item=item,
|
|
@@ -361,8 +394,9 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
361
394
|
if parts:
|
|
362
395
|
text_res = delim.join(
|
|
363
396
|
[
|
|
364
|
-
|
|
397
|
+
t
|
|
365
398
|
for p in parts
|
|
399
|
+
if (t := _wrap(text=p.text, wrap_tag=DocumentToken.LIST_ITEM.value))
|
|
366
400
|
]
|
|
367
401
|
)
|
|
368
402
|
text_res = f"{text_res}{delim}"
|
|
@@ -374,7 +408,7 @@ class DocTagsListSerializer(BaseModel, BaseListSerializer):
|
|
|
374
408
|
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
|
|
375
409
|
else:
|
|
376
410
|
text_res = ""
|
|
377
|
-
return
|
|
411
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
378
412
|
|
|
379
413
|
|
|
380
414
|
class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
@@ -392,7 +426,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
392
426
|
**kwargs,
|
|
393
427
|
) -> SerializationResult:
|
|
394
428
|
"""Serializes the passed item."""
|
|
395
|
-
my_visited = visited
|
|
429
|
+
my_visited = visited if visited is not None else set()
|
|
396
430
|
params = DocTagsParams(**kwargs)
|
|
397
431
|
parts = doc_serializer.get_parts(
|
|
398
432
|
item=item,
|
|
@@ -407,7 +441,7 @@ class DocTagsInlineSerializer(BaseInlineSerializer):
|
|
|
407
441
|
if text_res:
|
|
408
442
|
text_res = f"{text_res}{delim}"
|
|
409
443
|
text_res = _wrap(text=text_res, wrap_tag=wrap_tag)
|
|
410
|
-
return
|
|
444
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
411
445
|
|
|
412
446
|
|
|
413
447
|
class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
@@ -423,8 +457,7 @@ class DocTagsFallbackSerializer(BaseFallbackSerializer):
|
|
|
423
457
|
**kwargs,
|
|
424
458
|
) -> SerializationResult:
|
|
425
459
|
"""Serializes the passed item."""
|
|
426
|
-
|
|
427
|
-
return SerializationResult(text=text_res)
|
|
460
|
+
return create_ser_result()
|
|
428
461
|
|
|
429
462
|
|
|
430
463
|
class DocTagsDocSerializer(DocSerializer):
|
|
@@ -443,24 +476,21 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
443
476
|
params: DocTagsParams = DocTagsParams()
|
|
444
477
|
|
|
445
478
|
@override
|
|
446
|
-
def
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
text_res = delim.join([p.text for p in parts])
|
|
450
|
-
return SerializationResult(text=text_res)
|
|
451
|
-
|
|
452
|
-
@override
|
|
453
|
-
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
|
|
479
|
+
def serialize_doc(
|
|
480
|
+
self, *, parts: list[SerializationResult], **kwargs
|
|
481
|
+
) -> SerializationResult:
|
|
454
482
|
"""Serialize a document out of its pages."""
|
|
455
483
|
delim = _get_delim(params=self.params)
|
|
484
|
+
text_res = delim.join([p.text for p in parts if p.text])
|
|
485
|
+
|
|
456
486
|
if self.params.add_page_break:
|
|
457
|
-
page_sep = f"
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
487
|
+
page_sep = f"<{DocumentToken.PAGE_BREAK.value}>"
|
|
488
|
+
for full_match, _, _ in self._get_page_breaks(text=text_res):
|
|
489
|
+
text_res = text_res.replace(full_match, page_sep)
|
|
490
|
+
|
|
461
491
|
wrap_tag = DocumentToken.DOCUMENT.value
|
|
462
|
-
text_res = f"<{wrap_tag}>{
|
|
463
|
-
return
|
|
492
|
+
text_res = f"<{wrap_tag}>{text_res}{delim}</{wrap_tag}>"
|
|
493
|
+
return create_ser_result(text=text_res, span_source=parts)
|
|
464
494
|
|
|
465
495
|
@override
|
|
466
496
|
def serialize_captions(
|
|
@@ -470,11 +500,10 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
470
500
|
) -> SerializationResult:
|
|
471
501
|
"""Serialize the item's captions."""
|
|
472
502
|
params = DocTagsParams(**kwargs)
|
|
473
|
-
|
|
474
|
-
|
|
503
|
+
results: list[SerializationResult] = []
|
|
475
504
|
if item.captions:
|
|
476
|
-
|
|
477
|
-
if
|
|
505
|
+
cap_res = super().serialize_captions(item, **kwargs)
|
|
506
|
+
if cap_res.text:
|
|
478
507
|
if params.add_location:
|
|
479
508
|
for caption in item.captions:
|
|
480
509
|
if caption.cref not in self.get_excluded_refs(**kwargs):
|
|
@@ -484,9 +513,14 @@ class DocTagsDocSerializer(DocSerializer):
|
|
|
484
513
|
xsize=params.xsize,
|
|
485
514
|
ysize=params.ysize,
|
|
486
515
|
)
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
text_res = "".join(
|
|
516
|
+
results.append(create_ser_result(text=loc_txt))
|
|
517
|
+
results.append(cap_res)
|
|
518
|
+
text_res = "".join([r.text for r in results])
|
|
490
519
|
if text_res:
|
|
491
520
|
text_res = _wrap(text=text_res, wrap_tag=DocumentToken.CAPTION.value)
|
|
492
|
-
return
|
|
521
|
+
return create_ser_result(text=text_res, span_source=results)
|
|
522
|
+
|
|
523
|
+
@override
|
|
524
|
+
def requires_page_break(self):
|
|
525
|
+
"""Whether to add page breaks."""
|
|
526
|
+
return self.params.add_page_break
|