docling-core 2.25.0__py3-none-any.whl → 2.26.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/experimental/serializer/base.py +23 -2
- docling_core/experimental/serializer/common.py +79 -34
- docling_core/experimental/serializer/doctags.py +83 -47
- docling_core/experimental/serializer/html.py +931 -0
- docling_core/experimental/serializer/html_styles.py +212 -0
- docling_core/experimental/serializer/markdown.py +95 -57
- docling_core/transforms/chunker/base.py +8 -2
- docling_core/transforms/chunker/hierarchical_chunker.py +130 -109
- docling_core/transforms/chunker/hybrid_chunker.py +54 -12
- docling_core/types/doc/document.py +702 -482
- docling_core/types/doc/labels.py +2 -0
- docling_core/types/doc/page.py +12 -17
- docling_core/types/doc/tokens.py +3 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/METADATA +1 -1
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/RECORD +18 -16
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/LICENSE +0 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/WHEEL +0 -0
- {docling_core-2.25.0.dist-info → docling_core-2.26.0.dist-info}/entry_points.txt +0 -0
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
import base64
|
|
4
4
|
import copy
|
|
5
5
|
import hashlib
|
|
6
|
-
import html
|
|
7
6
|
import itertools
|
|
8
7
|
import json
|
|
9
8
|
import logging
|
|
@@ -12,17 +11,12 @@ import os
|
|
|
12
11
|
import re
|
|
13
12
|
import sys
|
|
14
13
|
import typing
|
|
15
|
-
import warnings
|
|
16
14
|
from enum import Enum
|
|
17
15
|
from io import BytesIO
|
|
18
16
|
from pathlib import Path
|
|
19
17
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
20
|
-
from urllib.parse import
|
|
21
|
-
from xml.etree.cElementTree import SubElement, tostring
|
|
22
|
-
from xml.sax.saxutils import unescape
|
|
18
|
+
from urllib.parse import unquote
|
|
23
19
|
|
|
24
|
-
import latex2mathml.converter
|
|
25
|
-
import latex2mathml.exceptions
|
|
26
20
|
import pandas as pd
|
|
27
21
|
import yaml
|
|
28
22
|
from PIL import Image as PILImage
|
|
@@ -49,13 +43,10 @@ from docling_core.types.doc.labels import (
|
|
|
49
43
|
GraphCellLabel,
|
|
50
44
|
GraphLinkLabel,
|
|
51
45
|
GroupLabel,
|
|
46
|
+
PictureClassificationLabel,
|
|
52
47
|
)
|
|
53
48
|
from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
|
|
54
|
-
from docling_core.types.doc.utils import
|
|
55
|
-
get_html_tag_with_text_direction,
|
|
56
|
-
get_text_direction,
|
|
57
|
-
relative_path,
|
|
58
|
-
)
|
|
49
|
+
from docling_core.types.doc.utils import relative_path
|
|
59
50
|
|
|
60
51
|
_logger = logging.getLogger(__name__)
|
|
61
52
|
|
|
@@ -290,22 +281,6 @@ class PictureScatterChartData(PictureChartData):
|
|
|
290
281
|
points: List[ChartPoint]
|
|
291
282
|
|
|
292
283
|
|
|
293
|
-
PictureDataType = Annotated[
|
|
294
|
-
Union[
|
|
295
|
-
PictureClassificationData,
|
|
296
|
-
PictureDescriptionData,
|
|
297
|
-
PictureMoleculeData,
|
|
298
|
-
PictureMiscData,
|
|
299
|
-
PictureLineChartData,
|
|
300
|
-
PictureBarChartData,
|
|
301
|
-
PictureStackedBarChartData,
|
|
302
|
-
PicturePieChartData,
|
|
303
|
-
PictureScatterChartData,
|
|
304
|
-
],
|
|
305
|
-
Field(discriminator="kind"),
|
|
306
|
-
]
|
|
307
|
-
|
|
308
|
-
|
|
309
284
|
class TableCell(BaseModel):
|
|
310
285
|
"""TableCell."""
|
|
311
286
|
|
|
@@ -391,6 +366,35 @@ class TableData(BaseModel): # TBD
|
|
|
391
366
|
return table_data
|
|
392
367
|
|
|
393
368
|
|
|
369
|
+
class PictureTabularChartData(PictureChartData):
|
|
370
|
+
"""Base class for picture chart data.
|
|
371
|
+
|
|
372
|
+
Attributes:
|
|
373
|
+
title (str): The title of the chart.
|
|
374
|
+
chart_data (TableData): Chart data in the table format.
|
|
375
|
+
"""
|
|
376
|
+
|
|
377
|
+
kind: Literal["tabular_chart_data"] = "tabular_chart_data"
|
|
378
|
+
chart_data: TableData
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
PictureDataType = Annotated[
|
|
382
|
+
Union[
|
|
383
|
+
PictureClassificationData,
|
|
384
|
+
PictureDescriptionData,
|
|
385
|
+
PictureMoleculeData,
|
|
386
|
+
PictureMiscData,
|
|
387
|
+
PictureTabularChartData,
|
|
388
|
+
PictureLineChartData,
|
|
389
|
+
PictureBarChartData,
|
|
390
|
+
PictureStackedBarChartData,
|
|
391
|
+
PicturePieChartData,
|
|
392
|
+
PictureScatterChartData,
|
|
393
|
+
],
|
|
394
|
+
Field(discriminator="kind"),
|
|
395
|
+
]
|
|
396
|
+
|
|
397
|
+
|
|
394
398
|
class DocumentOrigin(BaseModel):
|
|
395
399
|
"""FileSource."""
|
|
396
400
|
|
|
@@ -458,8 +462,12 @@ class RefItem(BaseModel):
|
|
|
458
462
|
populate_by_name=True,
|
|
459
463
|
)
|
|
460
464
|
|
|
465
|
+
def _split_ref_to_path(self):
|
|
466
|
+
"""Get the path of the reference."""
|
|
467
|
+
return self.cref.split("/")
|
|
468
|
+
|
|
461
469
|
def resolve(self, doc: "DoclingDocument"):
|
|
462
|
-
"""
|
|
470
|
+
"""Resolve the path in the document."""
|
|
463
471
|
path_components = self.cref.split("/")
|
|
464
472
|
if (num_comps := len(path_components)) == 3:
|
|
465
473
|
_, path, index_str = path_components
|
|
@@ -624,10 +632,98 @@ class NodeItem(BaseModel):
|
|
|
624
632
|
|
|
625
633
|
model_config = ConfigDict(extra="forbid")
|
|
626
634
|
|
|
627
|
-
def get_ref(self):
|
|
635
|
+
def get_ref(self) -> RefItem:
|
|
628
636
|
"""get_ref."""
|
|
629
637
|
return RefItem(cref=self.self_ref)
|
|
630
638
|
|
|
639
|
+
def _get_parent_ref(
|
|
640
|
+
self, doc: "DoclingDocument", stack: list[int]
|
|
641
|
+
) -> Optional[RefItem]:
|
|
642
|
+
"""get_parent_ref."""
|
|
643
|
+
if len(stack) == 0:
|
|
644
|
+
return self.parent
|
|
645
|
+
elif len(stack) > 0 and stack[0] < len(self.children):
|
|
646
|
+
item = self.children[stack[0]].resolve(doc)
|
|
647
|
+
return item._get_parent_ref(doc=doc, stack=stack[1:])
|
|
648
|
+
|
|
649
|
+
return None
|
|
650
|
+
|
|
651
|
+
def _delete_child(self, doc: "DoclingDocument", stack: list[int]) -> bool:
|
|
652
|
+
"""Delete child node in tree."""
|
|
653
|
+
if len(stack) == 1 and stack[0] < len(self.children):
|
|
654
|
+
del self.children[stack[0]]
|
|
655
|
+
return True
|
|
656
|
+
elif len(stack) > 1 and stack[0] < len(self.children):
|
|
657
|
+
item = self.children[stack[0]].resolve(doc)
|
|
658
|
+
return item._delete_child(doc=doc, stack=stack[1:])
|
|
659
|
+
|
|
660
|
+
return False
|
|
661
|
+
|
|
662
|
+
def _update_child(
|
|
663
|
+
self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
|
|
664
|
+
) -> bool:
|
|
665
|
+
"""Update child node in tree."""
|
|
666
|
+
if len(stack) == 1 and stack[0] < len(self.children):
|
|
667
|
+
# ensure the parent is correct
|
|
668
|
+
new_item = new_ref.resolve(doc=doc)
|
|
669
|
+
new_item.parent = self.get_ref()
|
|
670
|
+
|
|
671
|
+
self.children[stack[0]] = new_ref
|
|
672
|
+
return True
|
|
673
|
+
elif len(stack) > 1 and stack[0] < len(self.children):
|
|
674
|
+
item = self.children[stack[0]].resolve(doc)
|
|
675
|
+
return item._update_child(doc=doc, stack=stack[1:], new_ref=new_ref)
|
|
676
|
+
|
|
677
|
+
return False
|
|
678
|
+
|
|
679
|
+
def _add_child(
|
|
680
|
+
self, doc: "DoclingDocument", stack: list[int], new_ref: RefItem
|
|
681
|
+
) -> bool:
|
|
682
|
+
"""Append child to node identified by stack."""
|
|
683
|
+
if len(stack) == 0:
|
|
684
|
+
|
|
685
|
+
# ensure the parent is correct
|
|
686
|
+
new_item = new_ref.resolve(doc=doc)
|
|
687
|
+
new_item.parent = self.get_ref()
|
|
688
|
+
|
|
689
|
+
self.children.append(new_ref)
|
|
690
|
+
return True
|
|
691
|
+
elif len(stack) > 0 and stack[0] < len(self.children):
|
|
692
|
+
item = self.children[stack[0]].resolve(doc)
|
|
693
|
+
return item._add_child(doc=doc, stack=stack[1:], new_ref=new_ref)
|
|
694
|
+
|
|
695
|
+
return False
|
|
696
|
+
|
|
697
|
+
def _add_sibling(
|
|
698
|
+
self,
|
|
699
|
+
doc: "DoclingDocument",
|
|
700
|
+
stack: list[int],
|
|
701
|
+
new_ref: RefItem,
|
|
702
|
+
after: bool = True,
|
|
703
|
+
) -> bool:
|
|
704
|
+
"""Add sibling node in tree."""
|
|
705
|
+
if len(stack) == 1 and stack[0] < len(self.children) and (not after):
|
|
706
|
+
# ensure the parent is correct
|
|
707
|
+
new_item = new_ref.resolve(doc=doc)
|
|
708
|
+
new_item.parent = self.get_ref()
|
|
709
|
+
|
|
710
|
+
self.children.insert(stack[0], new_ref)
|
|
711
|
+
return True
|
|
712
|
+
elif len(stack) == 1 and stack[0] < len(self.children) and (after):
|
|
713
|
+
# ensure the parent is correct
|
|
714
|
+
new_item = new_ref.resolve(doc=doc)
|
|
715
|
+
new_item.parent = self.get_ref()
|
|
716
|
+
|
|
717
|
+
self.children.insert(stack[0] + 1, new_ref)
|
|
718
|
+
return True
|
|
719
|
+
elif len(stack) > 1 and stack[0] < len(self.children):
|
|
720
|
+
item = self.children[stack[0]].resolve(doc)
|
|
721
|
+
return item._add_sibling(
|
|
722
|
+
doc=doc, stack=stack[1:], new_ref=new_ref, after=after
|
|
723
|
+
)
|
|
724
|
+
|
|
725
|
+
return False
|
|
726
|
+
|
|
631
727
|
|
|
632
728
|
class GroupItem(NodeItem): # Container type, can't be a leaf node
|
|
633
729
|
"""GroupItem."""
|
|
@@ -953,7 +1049,9 @@ class FormulaItem(TextItem):
|
|
|
953
1049
|
class PictureItem(FloatingItem):
|
|
954
1050
|
"""PictureItem."""
|
|
955
1051
|
|
|
956
|
-
label: typing.Literal[DocItemLabel.PICTURE] =
|
|
1052
|
+
label: typing.Literal[DocItemLabel.PICTURE, DocItemLabel.CHART] = (
|
|
1053
|
+
DocItemLabel.PICTURE
|
|
1054
|
+
)
|
|
957
1055
|
|
|
958
1056
|
annotations: List[PictureDataType] = []
|
|
959
1057
|
|
|
@@ -1020,54 +1118,19 @@ class PictureItem(FloatingItem):
|
|
|
1020
1118
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
1021
1119
|
) -> str:
|
|
1022
1120
|
"""Export picture to HTML format."""
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
caption_text = ""
|
|
1028
|
-
if len(text) > 0:
|
|
1029
|
-
caption_text = get_html_tag_with_text_direction(
|
|
1030
|
-
html_tag="figcaption", text=text
|
|
1031
|
-
)
|
|
1032
|
-
|
|
1033
|
-
default_response = f"<figure>{caption_text}</figure>"
|
|
1034
|
-
|
|
1035
|
-
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
1036
|
-
return default_response
|
|
1037
|
-
|
|
1038
|
-
elif image_mode == ImageRefMode.EMBEDDED:
|
|
1039
|
-
# short-cut: we already have the image in base64
|
|
1040
|
-
if (
|
|
1041
|
-
isinstance(self.image, ImageRef)
|
|
1042
|
-
and isinstance(self.image.uri, AnyUrl)
|
|
1043
|
-
and self.image.uri.scheme == "data"
|
|
1044
|
-
):
|
|
1045
|
-
img_text = f'<img src="{self.image.uri}">'
|
|
1046
|
-
return f"<figure>{caption_text}{img_text}</figure>"
|
|
1047
|
-
|
|
1048
|
-
# get the self.image._pil or crop it out of the page-image
|
|
1049
|
-
img = self.get_image(doc)
|
|
1050
|
-
|
|
1051
|
-
if img is not None:
|
|
1052
|
-
imgb64 = self._image_to_base64(img)
|
|
1053
|
-
img_text = f'<img src="data:image/png;base64,{imgb64}">'
|
|
1054
|
-
|
|
1055
|
-
return f"<figure>{caption_text}{img_text}</figure>"
|
|
1056
|
-
else:
|
|
1057
|
-
return default_response
|
|
1058
|
-
|
|
1059
|
-
elif image_mode == ImageRefMode.REFERENCED:
|
|
1060
|
-
|
|
1061
|
-
if not isinstance(self.image, ImageRef) or (
|
|
1062
|
-
isinstance(self.image.uri, AnyUrl) and self.image.uri.scheme == "data"
|
|
1063
|
-
):
|
|
1064
|
-
return default_response
|
|
1065
|
-
|
|
1066
|
-
img_text = f'<img src="{quote(str(self.image.uri))}">'
|
|
1067
|
-
return f"<figure>{caption_text}{img_text}</figure>"
|
|
1121
|
+
from docling_core.experimental.serializer.html import (
|
|
1122
|
+
HTMLDocSerializer,
|
|
1123
|
+
HTMLParams,
|
|
1124
|
+
)
|
|
1068
1125
|
|
|
1069
|
-
|
|
1070
|
-
|
|
1126
|
+
serializer = HTMLDocSerializer(
|
|
1127
|
+
doc=doc,
|
|
1128
|
+
params=HTMLParams(
|
|
1129
|
+
image_mode=image_mode,
|
|
1130
|
+
),
|
|
1131
|
+
)
|
|
1132
|
+
text = serializer.serialize(item=self).text
|
|
1133
|
+
return text
|
|
1071
1134
|
|
|
1072
1135
|
@deprecated("Use export_to_doctags() instead.")
|
|
1073
1136
|
def export_to_document_tokens(self, *args, **kwargs):
|
|
@@ -1218,81 +1281,18 @@ class TableItem(FloatingItem):
|
|
|
1218
1281
|
add_caption: bool = True,
|
|
1219
1282
|
) -> str:
|
|
1220
1283
|
"""Export the table as html."""
|
|
1221
|
-
if doc is None:
|
|
1222
|
-
|
|
1223
|
-
"The `doc` argument will be mandatory in a future version. "
|
|
1224
|
-
"It must be provided to include a caption.",
|
|
1225
|
-
DeprecationWarning,
|
|
1226
|
-
)
|
|
1227
|
-
|
|
1228
|
-
nrows = self.data.num_rows
|
|
1229
|
-
ncols = self.data.num_cols
|
|
1230
|
-
|
|
1231
|
-
text = ""
|
|
1232
|
-
if doc is not None and add_caption and len(self.captions):
|
|
1233
|
-
text = html.escape(self.caption_text(doc))
|
|
1234
|
-
|
|
1235
|
-
if len(self.data.table_cells) == 0:
|
|
1236
|
-
return ""
|
|
1237
|
-
|
|
1238
|
-
body = ""
|
|
1239
|
-
|
|
1240
|
-
for i in range(nrows):
|
|
1241
|
-
body += "<tr>"
|
|
1242
|
-
for j in range(ncols):
|
|
1243
|
-
cell: TableCell = self.data.grid[i][j]
|
|
1244
|
-
|
|
1245
|
-
rowspan, rowstart = (
|
|
1246
|
-
cell.row_span,
|
|
1247
|
-
cell.start_row_offset_idx,
|
|
1248
|
-
)
|
|
1249
|
-
colspan, colstart = (
|
|
1250
|
-
cell.col_span,
|
|
1251
|
-
cell.start_col_offset_idx,
|
|
1252
|
-
)
|
|
1253
|
-
|
|
1254
|
-
if rowstart != i:
|
|
1255
|
-
continue
|
|
1256
|
-
if colstart != j:
|
|
1257
|
-
continue
|
|
1258
|
-
|
|
1259
|
-
content = html.escape(cell.text.strip())
|
|
1260
|
-
celltag = "td"
|
|
1261
|
-
if cell.column_header:
|
|
1262
|
-
celltag = "th"
|
|
1263
|
-
|
|
1264
|
-
opening_tag = f"{celltag}"
|
|
1265
|
-
if rowspan > 1:
|
|
1266
|
-
opening_tag += f' rowspan="{rowspan}"'
|
|
1267
|
-
if colspan > 1:
|
|
1268
|
-
opening_tag += f' colspan="{colspan}"'
|
|
1269
|
-
|
|
1270
|
-
text_dir = get_text_direction(content)
|
|
1271
|
-
if text_dir == "rtl":
|
|
1272
|
-
opening_tag += f' dir="{dir}"'
|
|
1273
|
-
|
|
1274
|
-
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
1275
|
-
body += "</tr>"
|
|
1276
|
-
|
|
1277
|
-
# dir = get_text_direction(text)
|
|
1278
|
-
|
|
1279
|
-
if len(text) > 0 and len(body) > 0:
|
|
1280
|
-
caption_text = get_html_tag_with_text_direction(
|
|
1281
|
-
html_tag="caption", text=text
|
|
1282
|
-
)
|
|
1283
|
-
body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
|
|
1284
|
+
if doc is not None:
|
|
1285
|
+
from docling_core.experimental.serializer.html import HTMLDocSerializer
|
|
1284
1286
|
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
caption_text = get_html_tag_with_text_direction(
|
|
1289
|
-
html_tag="caption", text=text
|
|
1290
|
-
)
|
|
1291
|
-
body = f"<table>{caption_text}</table>"
|
|
1287
|
+
serializer = HTMLDocSerializer(doc=doc)
|
|
1288
|
+
text = serializer.serialize(item=self).text
|
|
1289
|
+
return text
|
|
1292
1290
|
else:
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1291
|
+
_logger.error(
|
|
1292
|
+
"Usage of TableItem.export_to_html() without `doc` argument is "
|
|
1293
|
+
"deprecated.",
|
|
1294
|
+
)
|
|
1295
|
+
return ""
|
|
1296
1296
|
|
|
1297
1297
|
def export_to_otsl(
|
|
1298
1298
|
self,
|
|
@@ -1567,76 +1567,6 @@ class PageItem(BaseModel):
|
|
|
1567
1567
|
class DoclingDocument(BaseModel):
|
|
1568
1568
|
"""DoclingDocument."""
|
|
1569
1569
|
|
|
1570
|
-
_HTML_DEFAULT_HEAD: str = r"""<head>
|
|
1571
|
-
<link rel="icon" type="image/png"
|
|
1572
|
-
href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
|
|
1573
|
-
<meta charset="UTF-8">
|
|
1574
|
-
<title>
|
|
1575
|
-
Powered by Docling
|
|
1576
|
-
</title>
|
|
1577
|
-
<style>
|
|
1578
|
-
html {
|
|
1579
|
-
background-color: LightGray;
|
|
1580
|
-
}
|
|
1581
|
-
body {
|
|
1582
|
-
margin: 0 auto;
|
|
1583
|
-
width:800px;
|
|
1584
|
-
padding: 30px;
|
|
1585
|
-
background-color: White;
|
|
1586
|
-
font-family: Arial, sans-serif;
|
|
1587
|
-
box-shadow: 10px 10px 10px grey;
|
|
1588
|
-
}
|
|
1589
|
-
figure{
|
|
1590
|
-
display: block;
|
|
1591
|
-
width: 100%;
|
|
1592
|
-
margin: 0px;
|
|
1593
|
-
margin-top: 10px;
|
|
1594
|
-
margin-bottom: 10px;
|
|
1595
|
-
}
|
|
1596
|
-
img {
|
|
1597
|
-
display: block;
|
|
1598
|
-
margin: auto;
|
|
1599
|
-
margin-top: 10px;
|
|
1600
|
-
margin-bottom: 10px;
|
|
1601
|
-
max-width: 640px;
|
|
1602
|
-
max-height: 640px;
|
|
1603
|
-
}
|
|
1604
|
-
table {
|
|
1605
|
-
min-width:500px;
|
|
1606
|
-
background-color: White;
|
|
1607
|
-
border-collapse: collapse;
|
|
1608
|
-
cell-padding: 5px;
|
|
1609
|
-
margin: auto;
|
|
1610
|
-
margin-top: 10px;
|
|
1611
|
-
margin-bottom: 10px;
|
|
1612
|
-
}
|
|
1613
|
-
th, td {
|
|
1614
|
-
border: 1px solid black;
|
|
1615
|
-
padding: 8px;
|
|
1616
|
-
}
|
|
1617
|
-
th {
|
|
1618
|
-
font-weight: bold;
|
|
1619
|
-
}
|
|
1620
|
-
table tr:nth-child(even) td{
|
|
1621
|
-
background-color: LightGray;
|
|
1622
|
-
}
|
|
1623
|
-
math annotation {
|
|
1624
|
-
display: none;
|
|
1625
|
-
}
|
|
1626
|
-
.formula-not-decoded {
|
|
1627
|
-
background: repeating-linear-gradient(
|
|
1628
|
-
45deg, /* Angle of the stripes */
|
|
1629
|
-
LightGray, /* First color */
|
|
1630
|
-
LightGray 10px, /* Length of the first color */
|
|
1631
|
-
White 10px, /* Second color */
|
|
1632
|
-
White 20px /* Length of the second color */
|
|
1633
|
-
);
|
|
1634
|
-
margin: 0;
|
|
1635
|
-
text-align: center;
|
|
1636
|
-
}
|
|
1637
|
-
</style>
|
|
1638
|
-
</head>"""
|
|
1639
|
-
|
|
1640
1570
|
schema_name: typing.Literal["DoclingDocument"] = "DoclingDocument"
|
|
1641
1571
|
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
|
|
1642
1572
|
CURRENT_VERSION
|
|
@@ -1683,6 +1613,364 @@ class DoclingDocument(BaseModel):
|
|
|
1683
1613
|
item["content_layer"] = "furniture"
|
|
1684
1614
|
return data
|
|
1685
1615
|
|
|
1616
|
+
# ---------------------------
|
|
1617
|
+
# Public Manipulation methods
|
|
1618
|
+
# ---------------------------
|
|
1619
|
+
|
|
1620
|
+
def append_child_item(
|
|
1621
|
+
self, *, child: NodeItem, parent: Optional[NodeItem] = None
|
|
1622
|
+
) -> None:
|
|
1623
|
+
"""Adds an item."""
|
|
1624
|
+
if len(child.children) > 0:
|
|
1625
|
+
raise ValueError("Can not append a child with children")
|
|
1626
|
+
|
|
1627
|
+
parent = parent if parent is not None else self.body
|
|
1628
|
+
|
|
1629
|
+
success, stack = self._get_stack_of_item(item=parent)
|
|
1630
|
+
|
|
1631
|
+
if not success:
|
|
1632
|
+
raise ValueError(
|
|
1633
|
+
f"Could not resolve the parent node in the document tree: {parent}"
|
|
1634
|
+
)
|
|
1635
|
+
|
|
1636
|
+
# Append the item to the attributes of the doc
|
|
1637
|
+
self._append_item(item=child, parent_ref=parent.get_ref())
|
|
1638
|
+
|
|
1639
|
+
# Update the tree of the doc
|
|
1640
|
+
success = self.body._add_child(doc=self, new_ref=child.get_ref(), stack=stack)
|
|
1641
|
+
|
|
1642
|
+
# Clean the attribute (orphan) if not successful
|
|
1643
|
+
if not success:
|
|
1644
|
+
self._pop_item(item=child)
|
|
1645
|
+
raise ValueError(f"Could not append child: {child} to parent: {parent}")
|
|
1646
|
+
|
|
1647
|
+
def insert_item_after_sibling(
|
|
1648
|
+
self, *, new_item: NodeItem, sibling: NodeItem
|
|
1649
|
+
) -> None:
|
|
1650
|
+
"""Inserts an item, given its node_item instance, after other as a sibling."""
|
|
1651
|
+
self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=True)
|
|
1652
|
+
|
|
1653
|
+
def insert_item_before_sibling(
|
|
1654
|
+
self, *, new_item: NodeItem, sibling: NodeItem
|
|
1655
|
+
) -> None:
|
|
1656
|
+
"""Inserts an item, given its node_item instance, before other as a sibling."""
|
|
1657
|
+
self._insert_item_at_refitem(item=new_item, ref=sibling.get_ref(), after=False)
|
|
1658
|
+
|
|
1659
|
+
def delete_items(self, *, node_items: List[NodeItem]) -> None:
|
|
1660
|
+
"""Deletes an item, given its instance or ref, and any children it has."""
|
|
1661
|
+
refs = []
|
|
1662
|
+
for _ in node_items:
|
|
1663
|
+
refs.append(_.get_ref())
|
|
1664
|
+
|
|
1665
|
+
self._delete_items(refs=refs)
|
|
1666
|
+
|
|
1667
|
+
def replace_item(self, *, new_item: NodeItem, old_item: NodeItem) -> None:
|
|
1668
|
+
"""Replace item with new item."""
|
|
1669
|
+
self.insert_item_after_sibling(new_item=new_item, sibling=old_item)
|
|
1670
|
+
self.delete_items(node_items=[old_item])
|
|
1671
|
+
|
|
1672
|
+
# ----------------------------
|
|
1673
|
+
# Private Manipulation methods
|
|
1674
|
+
# ----------------------------
|
|
1675
|
+
|
|
1676
|
+
def _get_stack_of_item(self, item: NodeItem) -> tuple[bool, list[int]]:
|
|
1677
|
+
"""Find the stack indices of the item."""
|
|
1678
|
+
return self._get_stack_of_refitem(ref=item.get_ref())
|
|
1679
|
+
|
|
1680
|
+
def _get_stack_of_refitem(self, ref: RefItem) -> tuple[bool, list[int]]:
|
|
1681
|
+
"""Find the stack indices of the reference."""
|
|
1682
|
+
if ref == self.body.get_ref():
|
|
1683
|
+
return (True, [])
|
|
1684
|
+
|
|
1685
|
+
node = ref.resolve(doc=self)
|
|
1686
|
+
parent_ref = node._get_parent_ref(doc=self, stack=[])
|
|
1687
|
+
|
|
1688
|
+
if parent_ref is None:
|
|
1689
|
+
return (False, [])
|
|
1690
|
+
|
|
1691
|
+
stack: list[int] = []
|
|
1692
|
+
while parent_ref is not None:
|
|
1693
|
+
parent = parent_ref.resolve(doc=self)
|
|
1694
|
+
|
|
1695
|
+
index = parent.children.index(node.get_ref())
|
|
1696
|
+
stack.insert(0, index) # prepend the index
|
|
1697
|
+
|
|
1698
|
+
node = parent
|
|
1699
|
+
parent_ref = node._get_parent_ref(doc=self, stack=[])
|
|
1700
|
+
|
|
1701
|
+
return (True, stack)
|
|
1702
|
+
|
|
1703
|
+
def _insert_item_at_refitem(
|
|
1704
|
+
self, item: NodeItem, ref: RefItem, after: bool
|
|
1705
|
+
) -> RefItem:
|
|
1706
|
+
"""Insert node-item using the self-reference."""
|
|
1707
|
+
success, stack = self._get_stack_of_refitem(ref=ref)
|
|
1708
|
+
|
|
1709
|
+
if not success:
|
|
1710
|
+
raise ValueError(
|
|
1711
|
+
f"Could not insert at {ref.cref}: could not find the stack"
|
|
1712
|
+
)
|
|
1713
|
+
|
|
1714
|
+
return self._insert_item_at_stack(item=item, stack=stack, after=after)
|
|
1715
|
+
|
|
1716
|
+
def _append_item(self, *, item: NodeItem, parent_ref: RefItem) -> RefItem:
|
|
1717
|
+
"""Append item of its type."""
|
|
1718
|
+
cref: str = "" # to be updated
|
|
1719
|
+
|
|
1720
|
+
if isinstance(item, TextItem):
|
|
1721
|
+
item_label = "texts"
|
|
1722
|
+
item_index = len(self.texts)
|
|
1723
|
+
|
|
1724
|
+
cref = f"#/{item_label}/{item_index}"
|
|
1725
|
+
|
|
1726
|
+
item.self_ref = cref
|
|
1727
|
+
item.parent = parent_ref
|
|
1728
|
+
|
|
1729
|
+
self.texts.append(item)
|
|
1730
|
+
|
|
1731
|
+
elif isinstance(item, TableItem):
|
|
1732
|
+
item_label = "tables"
|
|
1733
|
+
item_index = len(self.tables)
|
|
1734
|
+
|
|
1735
|
+
cref = f"#/{item_label}/{item_index}"
|
|
1736
|
+
|
|
1737
|
+
item.self_ref = cref
|
|
1738
|
+
item.parent = parent_ref
|
|
1739
|
+
|
|
1740
|
+
self.tables.append(item)
|
|
1741
|
+
|
|
1742
|
+
elif isinstance(item, PictureItem):
|
|
1743
|
+
item_label = "pictures"
|
|
1744
|
+
item_index = len(self.pictures)
|
|
1745
|
+
|
|
1746
|
+
cref = f"#/{item_label}/{item_index}"
|
|
1747
|
+
|
|
1748
|
+
item.self_ref = cref
|
|
1749
|
+
item.parent = parent_ref
|
|
1750
|
+
|
|
1751
|
+
self.pictures.append(item)
|
|
1752
|
+
|
|
1753
|
+
elif isinstance(item, KeyValueItem):
|
|
1754
|
+
item_label = "key_value_items"
|
|
1755
|
+
item_index = len(self.key_value_items)
|
|
1756
|
+
|
|
1757
|
+
cref = f"#/{item_label}/{item_index}"
|
|
1758
|
+
|
|
1759
|
+
item.self_ref = cref
|
|
1760
|
+
item.parent = parent_ref
|
|
1761
|
+
|
|
1762
|
+
self.key_value_items.append(item)
|
|
1763
|
+
|
|
1764
|
+
elif isinstance(item, FormItem):
|
|
1765
|
+
item_label = "form_items"
|
|
1766
|
+
item_index = len(self.form_items)
|
|
1767
|
+
|
|
1768
|
+
cref = f"#/{item_label}/{item_index}"
|
|
1769
|
+
|
|
1770
|
+
item.self_ref = cref
|
|
1771
|
+
item.parent = parent_ref
|
|
1772
|
+
|
|
1773
|
+
self.form_items.append(item)
|
|
1774
|
+
else:
|
|
1775
|
+
raise ValueError(f"Item {item} is not supported for insertion")
|
|
1776
|
+
|
|
1777
|
+
return RefItem(cref=cref)
|
|
1778
|
+
|
|
1779
|
+
def _pop_item(self, *, item: NodeItem):
|
|
1780
|
+
"""Pop the last item of its type."""
|
|
1781
|
+
path = item.self_ref.split("/")
|
|
1782
|
+
|
|
1783
|
+
if len(path) != 3:
|
|
1784
|
+
raise ValueError(f"Can not pop item with path: {path}")
|
|
1785
|
+
|
|
1786
|
+
item_label = path[1]
|
|
1787
|
+
item_index = int(path[2])
|
|
1788
|
+
|
|
1789
|
+
if (
|
|
1790
|
+
len(self.__getattribute__(item_label)) + 1 == item_index
|
|
1791
|
+
): # we can only pop the last item
|
|
1792
|
+
del self.__getattribute__(item_label)[item_index]
|
|
1793
|
+
else:
|
|
1794
|
+
msg = f"index:{item_index}, len:{len(self.__getattribute__(item_label))}"
|
|
1795
|
+
raise ValueError(f"Failed to pop: item is not last ({msg})")
|
|
1796
|
+
|
|
1797
|
+
def _insert_item_at_stack(
|
|
1798
|
+
self, item: NodeItem, stack: list[int], after: bool
|
|
1799
|
+
) -> RefItem:
|
|
1800
|
+
"""Insert node-item using the self-reference."""
|
|
1801
|
+
parent_ref = self.body._get_parent_ref(doc=self, stack=stack)
|
|
1802
|
+
|
|
1803
|
+
if parent_ref is None:
|
|
1804
|
+
raise ValueError(f"Could not find a parent at stack: {stack}")
|
|
1805
|
+
|
|
1806
|
+
new_ref = self._append_item(item=item, parent_ref=parent_ref)
|
|
1807
|
+
|
|
1808
|
+
success = self.body._add_sibling(
|
|
1809
|
+
doc=self, stack=stack, new_ref=new_ref, after=after
|
|
1810
|
+
)
|
|
1811
|
+
|
|
1812
|
+
if not success:
|
|
1813
|
+
self._pop_item(item=item)
|
|
1814
|
+
|
|
1815
|
+
return item.get_ref()
|
|
1816
|
+
|
|
1817
|
+
def _delete_items(self, refs: list[RefItem]) -> bool:
|
|
1818
|
+
"""Delete document item using the self-reference."""
|
|
1819
|
+
to_be_deleted_items: dict[tuple[int, ...], str] = {} # stack to cref
|
|
1820
|
+
|
|
1821
|
+
# Identify the to_be_deleted_items
|
|
1822
|
+
for item, stack in self._iterate_items_with_stack(with_groups=True):
|
|
1823
|
+
ref = item.get_ref()
|
|
1824
|
+
|
|
1825
|
+
if ref in refs:
|
|
1826
|
+
to_be_deleted_items[tuple(stack)] = ref.cref
|
|
1827
|
+
|
|
1828
|
+
substacks = [stack[0 : i + 1] for i in range(len(stack) - 1)]
|
|
1829
|
+
for substack in substacks:
|
|
1830
|
+
if tuple(substack) in to_be_deleted_items:
|
|
1831
|
+
to_be_deleted_items[tuple(stack)] = ref.cref
|
|
1832
|
+
|
|
1833
|
+
if len(to_be_deleted_items) == 0:
|
|
1834
|
+
raise ValueError("Nothing to be deleted ...")
|
|
1835
|
+
|
|
1836
|
+
# Clean the tree, reverse the order to not have to update
|
|
1837
|
+
for stack_, ref_ in reversed(sorted(to_be_deleted_items.items())):
|
|
1838
|
+
success = self.body._delete_child(doc=self, stack=list(stack_))
|
|
1839
|
+
|
|
1840
|
+
if not success:
|
|
1841
|
+
del to_be_deleted_items[stack_]
|
|
1842
|
+
else:
|
|
1843
|
+
_logger.info(f"deleted item in tree at stack: {stack_} => {ref_}")
|
|
1844
|
+
|
|
1845
|
+
# Create a new lookup of the orphans:
|
|
1846
|
+
# dict of item_label (`texts`, `tables`, ...) to a
|
|
1847
|
+
# dict of item_label with delta (default = -1).
|
|
1848
|
+
lookup: dict[str, dict[int, int]] = {}
|
|
1849
|
+
|
|
1850
|
+
for stack_, ref_ in to_be_deleted_items.items():
|
|
1851
|
+
path = ref_.split("/")
|
|
1852
|
+
if len(path) == 3:
|
|
1853
|
+
|
|
1854
|
+
item_label = path[1]
|
|
1855
|
+
item_index = int(path[2])
|
|
1856
|
+
|
|
1857
|
+
if item_label not in lookup:
|
|
1858
|
+
lookup[item_label] = {}
|
|
1859
|
+
|
|
1860
|
+
lookup[item_label][item_index] = -1
|
|
1861
|
+
|
|
1862
|
+
# Remove the orphans in reverse order
|
|
1863
|
+
for item_label, item_inds in lookup.items():
|
|
1864
|
+
for item_index, val in reversed(
|
|
1865
|
+
sorted(item_inds.items())
|
|
1866
|
+
): # make sure you delete the last in the list first!
|
|
1867
|
+
_logger.debug(f"deleting item in doc for {item_label} for {item_index}")
|
|
1868
|
+
del self.__getattribute__(item_label)[item_index]
|
|
1869
|
+
|
|
1870
|
+
self._update_breadth_first_with_lookup(
|
|
1871
|
+
node=self.body, refs_to_be_deleted=refs, lookup=lookup
|
|
1872
|
+
)
|
|
1873
|
+
|
|
1874
|
+
return True
|
|
1875
|
+
|
|
1876
|
+
# Update the references
|
|
1877
|
+
def _update_ref_with_lookup(
|
|
1878
|
+
self, item_label: str, item_index: int, lookup: dict[str, dict[int, int]]
|
|
1879
|
+
) -> RefItem:
|
|
1880
|
+
"""Update ref with lookup."""
|
|
1881
|
+
if item_label not in lookup: # Nothing to be done
|
|
1882
|
+
return RefItem(cref=f"#/{item_label}/{item_index}")
|
|
1883
|
+
|
|
1884
|
+
# Count how many items have been deleted in front of you
|
|
1885
|
+
delta = sum(
|
|
1886
|
+
val if item_index >= key else 0 for key, val in lookup[item_label].items()
|
|
1887
|
+
)
|
|
1888
|
+
new_index = item_index + delta
|
|
1889
|
+
|
|
1890
|
+
return RefItem(cref=f"#/{item_label}/{new_index}")
|
|
1891
|
+
|
|
1892
|
+
def _update_refitems_with_lookup(
|
|
1893
|
+
self,
|
|
1894
|
+
ref_items: list[RefItem],
|
|
1895
|
+
refs_to_be_deleted: list[RefItem],
|
|
1896
|
+
lookup: dict[str, dict[int, int]],
|
|
1897
|
+
) -> list[RefItem]:
|
|
1898
|
+
"""Update refitems with lookup."""
|
|
1899
|
+
new_refitems = []
|
|
1900
|
+
for ref_item in ref_items:
|
|
1901
|
+
|
|
1902
|
+
if (
|
|
1903
|
+
ref_item not in refs_to_be_deleted
|
|
1904
|
+
): # if ref_item is in ref, then delete/skip them
|
|
1905
|
+
path = ref_item._split_ref_to_path()
|
|
1906
|
+
if len(path) == 3:
|
|
1907
|
+
new_refitems.append(
|
|
1908
|
+
self._update_ref_with_lookup(
|
|
1909
|
+
item_label=path[1],
|
|
1910
|
+
item_index=int(path[2]),
|
|
1911
|
+
lookup=lookup,
|
|
1912
|
+
)
|
|
1913
|
+
)
|
|
1914
|
+
else:
|
|
1915
|
+
new_refitems.append(ref_item)
|
|
1916
|
+
|
|
1917
|
+
return new_refitems
|
|
1918
|
+
|
|
1919
|
+
def _update_breadth_first_with_lookup(
|
|
1920
|
+
self,
|
|
1921
|
+
node: NodeItem,
|
|
1922
|
+
refs_to_be_deleted: list[RefItem],
|
|
1923
|
+
lookup: dict[str, dict[int, int]],
|
|
1924
|
+
):
|
|
1925
|
+
"""Update breadth first with lookup."""
|
|
1926
|
+
# Update the captions, references and footnote references
|
|
1927
|
+
if isinstance(node, FloatingItem):
|
|
1928
|
+
node.captions = self._update_refitems_with_lookup(
|
|
1929
|
+
ref_items=node.captions,
|
|
1930
|
+
refs_to_be_deleted=refs_to_be_deleted,
|
|
1931
|
+
lookup=lookup,
|
|
1932
|
+
)
|
|
1933
|
+
node.references = self._update_refitems_with_lookup(
|
|
1934
|
+
ref_items=node.references,
|
|
1935
|
+
refs_to_be_deleted=refs_to_be_deleted,
|
|
1936
|
+
lookup=lookup,
|
|
1937
|
+
)
|
|
1938
|
+
node.footnotes = self._update_refitems_with_lookup(
|
|
1939
|
+
ref_items=node.footnotes,
|
|
1940
|
+
refs_to_be_deleted=refs_to_be_deleted,
|
|
1941
|
+
lookup=lookup,
|
|
1942
|
+
)
|
|
1943
|
+
|
|
1944
|
+
# Update the self_ref reference
|
|
1945
|
+
if node.parent is not None:
|
|
1946
|
+
path = node.parent._split_ref_to_path()
|
|
1947
|
+
if len(path) == 3:
|
|
1948
|
+
node.parent = self._update_ref_with_lookup(
|
|
1949
|
+
item_label=path[1], item_index=int(path[2]), lookup=lookup
|
|
1950
|
+
)
|
|
1951
|
+
|
|
1952
|
+
# Update the parent reference
|
|
1953
|
+
if node.self_ref is not None:
|
|
1954
|
+
path = node.self_ref.split("/")
|
|
1955
|
+
if len(path) == 3:
|
|
1956
|
+
_ref = self._update_ref_with_lookup(
|
|
1957
|
+
item_label=path[1], item_index=int(path[2]), lookup=lookup
|
|
1958
|
+
)
|
|
1959
|
+
node.self_ref = _ref.cref
|
|
1960
|
+
|
|
1961
|
+
# Update the child references
|
|
1962
|
+
node.children = self._update_refitems_with_lookup(
|
|
1963
|
+
ref_items=node.children,
|
|
1964
|
+
refs_to_be_deleted=refs_to_be_deleted,
|
|
1965
|
+
lookup=lookup,
|
|
1966
|
+
)
|
|
1967
|
+
|
|
1968
|
+
for i, child_ref in enumerate(node.children):
|
|
1969
|
+
node = child_ref.resolve(self)
|
|
1970
|
+
self._update_breadth_first_with_lookup(
|
|
1971
|
+
node=node, refs_to_be_deleted=refs_to_be_deleted, lookup=lookup
|
|
1972
|
+
)
|
|
1973
|
+
|
|
1686
1974
|
###################################
|
|
1687
1975
|
# TODO: refactor add* methods below
|
|
1688
1976
|
###################################
|
|
@@ -2321,21 +2609,33 @@ class DoclingDocument(BaseModel):
|
|
|
2321
2609
|
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
2322
2610
|
_level: int = 0, # fixed parameter, carries through the node nesting level
|
|
2323
2611
|
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
|
|
2324
|
-
"""
|
|
2325
|
-
|
|
2326
|
-
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2330
|
-
|
|
2331
|
-
:
|
|
2332
|
-
|
|
2333
|
-
|
|
2612
|
+
"""Iterate elements with level."""
|
|
2613
|
+
for item, stack in self._iterate_items_with_stack(
|
|
2614
|
+
root=root,
|
|
2615
|
+
with_groups=with_groups,
|
|
2616
|
+
traverse_pictures=traverse_pictures,
|
|
2617
|
+
page_no=page_no,
|
|
2618
|
+
included_content_layers=included_content_layers,
|
|
2619
|
+
):
|
|
2620
|
+
yield item, len(stack)
|
|
2621
|
+
|
|
2622
|
+
def _iterate_items_with_stack(
|
|
2623
|
+
self,
|
|
2624
|
+
root: Optional[NodeItem] = None,
|
|
2625
|
+
with_groups: bool = False,
|
|
2626
|
+
traverse_pictures: bool = False,
|
|
2627
|
+
page_no: Optional[int] = None,
|
|
2628
|
+
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
2629
|
+
_stack: Optional[list[int]] = None,
|
|
2630
|
+
) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
|
|
2631
|
+
"""Iterate elements with stack."""
|
|
2334
2632
|
my_layers = (
|
|
2335
2633
|
included_content_layers
|
|
2336
2634
|
if included_content_layers is not None
|
|
2337
2635
|
else DEFAULT_CONTENT_LAYERS
|
|
2338
2636
|
)
|
|
2637
|
+
my_stack: list[int] = _stack if _stack is not None else []
|
|
2638
|
+
|
|
2339
2639
|
if not root:
|
|
2340
2640
|
root = self.body
|
|
2341
2641
|
|
|
@@ -2355,25 +2655,31 @@ class DoclingDocument(BaseModel):
|
|
|
2355
2655
|
)
|
|
2356
2656
|
|
|
2357
2657
|
if should_yield:
|
|
2358
|
-
yield root,
|
|
2658
|
+
yield root, my_stack
|
|
2359
2659
|
|
|
2360
2660
|
# Handle picture traversal - only traverse children if requested
|
|
2361
2661
|
if isinstance(root, PictureItem) and not traverse_pictures:
|
|
2362
2662
|
return
|
|
2363
2663
|
|
|
2664
|
+
my_stack.append(-1)
|
|
2665
|
+
|
|
2364
2666
|
# Traverse children
|
|
2365
|
-
for child_ref in root.children:
|
|
2667
|
+
for child_ind, child_ref in enumerate(root.children):
|
|
2668
|
+
my_stack[-1] = child_ind
|
|
2366
2669
|
child = child_ref.resolve(self)
|
|
2670
|
+
|
|
2367
2671
|
if isinstance(child, NodeItem):
|
|
2368
|
-
yield from self.
|
|
2672
|
+
yield from self._iterate_items_with_stack(
|
|
2369
2673
|
child,
|
|
2370
2674
|
with_groups=with_groups,
|
|
2371
2675
|
traverse_pictures=traverse_pictures,
|
|
2372
2676
|
page_no=page_no,
|
|
2373
|
-
|
|
2677
|
+
_stack=my_stack,
|
|
2374
2678
|
included_content_layers=my_layers,
|
|
2375
2679
|
)
|
|
2376
2680
|
|
|
2681
|
+
my_stack.pop()
|
|
2682
|
+
|
|
2377
2683
|
def _clear_picture_pil_cache(self):
|
|
2378
2684
|
"""Clear cache storage of all images."""
|
|
2379
2685
|
for item, level in self.iterate_items(with_groups=False):
|
|
@@ -2646,6 +2952,7 @@ class DoclingDocument(BaseModel):
|
|
|
2646
2952
|
strict_text: bool = False,
|
|
2647
2953
|
escape_underscores: bool = True,
|
|
2648
2954
|
image_placeholder: str = "<!-- image -->",
|
|
2955
|
+
enable_chart_tables: bool = True,
|
|
2649
2956
|
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
2650
2957
|
indent: int = 4,
|
|
2651
2958
|
text_width: int = -1,
|
|
@@ -2713,6 +3020,7 @@ class DoclingDocument(BaseModel):
|
|
|
2713
3020
|
stop_idx=to_element,
|
|
2714
3021
|
escape_underscores=escape_underscores,
|
|
2715
3022
|
image_placeholder=image_placeholder,
|
|
3023
|
+
enable_chart_tables=enable_chart_tables,
|
|
2716
3024
|
image_mode=image_mode,
|
|
2717
3025
|
indent=indent,
|
|
2718
3026
|
wrap_width=text_width if text_width > 0 else None,
|
|
@@ -2763,12 +3071,14 @@ class DoclingDocument(BaseModel):
|
|
|
2763
3071
|
formula_to_mathml: bool = True,
|
|
2764
3072
|
page_no: Optional[int] = None,
|
|
2765
3073
|
html_lang: str = "en",
|
|
2766
|
-
html_head: str =
|
|
3074
|
+
html_head: str = "null", # should be deprecated
|
|
2767
3075
|
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
3076
|
+
split_page_view: bool = False,
|
|
2768
3077
|
):
|
|
2769
3078
|
"""Save to HTML."""
|
|
2770
3079
|
if isinstance(filename, str):
|
|
2771
3080
|
filename = Path(filename)
|
|
3081
|
+
|
|
2772
3082
|
artifacts_dir, reference_path = self._get_output_paths(filename, artifacts_dir)
|
|
2773
3083
|
|
|
2774
3084
|
if image_mode == ImageRefMode.REFERENCED:
|
|
@@ -2788,6 +3098,7 @@ class DoclingDocument(BaseModel):
|
|
|
2788
3098
|
html_lang=html_lang,
|
|
2789
3099
|
html_head=html_head,
|
|
2790
3100
|
included_content_layers=included_content_layers,
|
|
3101
|
+
split_page_view=split_page_view,
|
|
2791
3102
|
)
|
|
2792
3103
|
|
|
2793
3104
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
@@ -2836,245 +3147,51 @@ class DoclingDocument(BaseModel):
|
|
|
2836
3147
|
formula_to_mathml: bool = True,
|
|
2837
3148
|
page_no: Optional[int] = None,
|
|
2838
3149
|
html_lang: str = "en",
|
|
2839
|
-
html_head: str =
|
|
3150
|
+
html_head: str = "null", # should be deprecated ...
|
|
2840
3151
|
included_content_layers: Optional[set[ContentLayer]] = None,
|
|
3152
|
+
split_page_view: bool = False,
|
|
2841
3153
|
) -> str:
|
|
2842
3154
|
r"""Serialize to HTML."""
|
|
2843
|
-
|
|
3155
|
+
from docling_core.experimental.serializer.html import (
|
|
3156
|
+
HTMLDocSerializer,
|
|
3157
|
+
HTMLOutputStyle,
|
|
3158
|
+
HTMLParams,
|
|
3159
|
+
)
|
|
3160
|
+
|
|
3161
|
+
my_labels = labels if labels is not None else DOCUMENT_TOKENS_EXPORT_LABELS
|
|
2844
3162
|
my_layers = (
|
|
2845
3163
|
included_content_layers
|
|
2846
3164
|
if included_content_layers is not None
|
|
2847
3165
|
else DEFAULT_CONTENT_LAYERS
|
|
2848
3166
|
)
|
|
2849
3167
|
|
|
2850
|
-
|
|
2851
|
-
|
|
2852
|
-
|
|
2853
|
-
in_ordered_list: List[bool],
|
|
2854
|
-
html_texts: list[str],
|
|
2855
|
-
):
|
|
2856
|
-
|
|
2857
|
-
if len(in_ordered_list) == 0:
|
|
2858
|
-
return (in_ordered_list, html_texts)
|
|
2859
|
-
|
|
2860
|
-
while curr_level < prev_level and len(in_ordered_list) > 0:
|
|
2861
|
-
if in_ordered_list[-1]:
|
|
2862
|
-
html_texts.append("</ol>")
|
|
2863
|
-
else:
|
|
2864
|
-
html_texts.append("</ul>")
|
|
2865
|
-
|
|
2866
|
-
prev_level -= 1
|
|
2867
|
-
in_ordered_list.pop() # = in_ordered_list[:-1]
|
|
2868
|
-
|
|
2869
|
-
return (in_ordered_list, html_texts)
|
|
2870
|
-
|
|
2871
|
-
head_lines = [
|
|
2872
|
-
"<!DOCTYPE html>",
|
|
2873
|
-
f'<html lang="{html_lang}">',
|
|
2874
|
-
html_head,
|
|
2875
|
-
]
|
|
2876
|
-
html_texts: list[str] = []
|
|
2877
|
-
|
|
2878
|
-
prev_level = 0 # Track the previous item's level
|
|
2879
|
-
|
|
2880
|
-
in_ordered_list: List[bool] = [] # False
|
|
2881
|
-
|
|
2882
|
-
def _prepare_tag_content(
|
|
2883
|
-
text: str, do_escape_html=True, do_replace_newline=True
|
|
2884
|
-
) -> str:
|
|
2885
|
-
if do_escape_html:
|
|
2886
|
-
text = html.escape(text, quote=False)
|
|
2887
|
-
if do_replace_newline:
|
|
2888
|
-
text = text.replace("\n", "<br>")
|
|
2889
|
-
return text
|
|
2890
|
-
|
|
2891
|
-
for ix, (item, curr_level) in enumerate(
|
|
2892
|
-
self.iterate_items(
|
|
2893
|
-
self.body,
|
|
2894
|
-
with_groups=True,
|
|
2895
|
-
page_no=page_no,
|
|
2896
|
-
included_content_layers=my_layers,
|
|
2897
|
-
)
|
|
2898
|
-
):
|
|
2899
|
-
# If we've moved to a lower level, we're exiting one or more groups
|
|
2900
|
-
if curr_level < prev_level and len(in_ordered_list) > 0:
|
|
2901
|
-
# Calculate how many levels we've exited
|
|
2902
|
-
# level_difference = previous_level - level
|
|
2903
|
-
# Decrement list_nesting_level for each list group we've exited
|
|
2904
|
-
# list_nesting_level = max(0, list_nesting_level - level_difference)
|
|
2905
|
-
|
|
2906
|
-
in_ordered_list, html_texts = close_lists(
|
|
2907
|
-
curr_level=curr_level,
|
|
2908
|
-
prev_level=prev_level,
|
|
2909
|
-
in_ordered_list=in_ordered_list,
|
|
2910
|
-
html_texts=html_texts,
|
|
2911
|
-
)
|
|
2912
|
-
|
|
2913
|
-
prev_level = curr_level # Update previous_level for next iteration
|
|
2914
|
-
|
|
2915
|
-
if ix < from_element or to_element <= ix:
|
|
2916
|
-
continue # skip as many items as you want
|
|
2917
|
-
|
|
2918
|
-
if (isinstance(item, DocItem)) and (item.label not in my_labels):
|
|
2919
|
-
continue # skip any label that is not whitelisted
|
|
3168
|
+
output_style = HTMLOutputStyle.SINGLE_COLUMN
|
|
3169
|
+
if split_page_view:
|
|
3170
|
+
output_style = HTMLOutputStyle.SPLIT_PAGE
|
|
2920
3171
|
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
|
|
2928
|
-
|
|
2929
|
-
|
|
2930
|
-
|
|
2931
|
-
|
|
2932
|
-
|
|
2933
|
-
]:
|
|
2934
|
-
|
|
2935
|
-
text = "<ul>"
|
|
2936
|
-
html_texts.append(text)
|
|
2937
|
-
|
|
2938
|
-
# Increment list nesting level when entering a new list
|
|
2939
|
-
in_ordered_list.append(False)
|
|
2940
|
-
|
|
2941
|
-
elif isinstance(item, GroupItem):
|
|
2942
|
-
continue
|
|
2943
|
-
|
|
2944
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2945
|
-
text_inner = _prepare_tag_content(item.text)
|
|
2946
|
-
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
|
|
2947
|
-
|
|
2948
|
-
html_texts.append(text)
|
|
2949
|
-
|
|
2950
|
-
elif isinstance(item, SectionHeaderItem):
|
|
2951
|
-
|
|
2952
|
-
section_level: int = min(item.level + 1, 6)
|
|
2953
|
-
|
|
2954
|
-
text = get_html_tag_with_text_direction(
|
|
2955
|
-
html_tag=f"h{section_level}",
|
|
2956
|
-
text=_prepare_tag_content(item.text),
|
|
2957
|
-
)
|
|
2958
|
-
html_texts.append(text)
|
|
2959
|
-
|
|
2960
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
|
|
2961
|
-
|
|
2962
|
-
math_formula = _prepare_tag_content(
|
|
2963
|
-
item.text, do_escape_html=False, do_replace_newline=False
|
|
2964
|
-
)
|
|
2965
|
-
text = ""
|
|
2966
|
-
|
|
2967
|
-
def _image_fallback(item: TextItem):
|
|
2968
|
-
item_image = item.get_image(doc=self)
|
|
2969
|
-
if item_image is not None:
|
|
2970
|
-
img_ref = ImageRef.from_pil(item_image, dpi=72)
|
|
2971
|
-
return (
|
|
2972
|
-
"<figure>"
|
|
2973
|
-
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
|
|
2974
|
-
"</figure>"
|
|
2975
|
-
)
|
|
2976
|
-
|
|
2977
|
-
img_fallback = _image_fallback(item)
|
|
2978
|
-
|
|
2979
|
-
# If the formula is not processed correcty, use its image
|
|
2980
|
-
if (
|
|
2981
|
-
item.text == ""
|
|
2982
|
-
and item.orig != ""
|
|
2983
|
-
and image_mode == ImageRefMode.EMBEDDED
|
|
2984
|
-
and len(item.prov) > 0
|
|
2985
|
-
and img_fallback is not None
|
|
2986
|
-
):
|
|
2987
|
-
text = img_fallback
|
|
2988
|
-
|
|
2989
|
-
# Building a math equation in MathML format
|
|
2990
|
-
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
2991
|
-
elif formula_to_mathml and len(math_formula) > 0:
|
|
2992
|
-
try:
|
|
2993
|
-
mathml_element = latex2mathml.converter.convert_to_element(
|
|
2994
|
-
math_formula, display="block"
|
|
2995
|
-
)
|
|
2996
|
-
annotation = SubElement(
|
|
2997
|
-
mathml_element, "annotation", dict(encoding="TeX")
|
|
2998
|
-
)
|
|
2999
|
-
annotation.text = math_formula
|
|
3000
|
-
mathml = unescape(tostring(mathml_element, encoding="unicode"))
|
|
3001
|
-
text = f"<div>{mathml}</div>"
|
|
3002
|
-
except Exception as err:
|
|
3003
|
-
_logger.warning(
|
|
3004
|
-
"Malformed formula cannot be rendered. "
|
|
3005
|
-
f"Error {err.__class__.__name__}, formula={math_formula}"
|
|
3006
|
-
)
|
|
3007
|
-
if (
|
|
3008
|
-
image_mode == ImageRefMode.EMBEDDED
|
|
3009
|
-
and len(item.prov) > 0
|
|
3010
|
-
and img_fallback is not None
|
|
3011
|
-
):
|
|
3012
|
-
text = img_fallback
|
|
3013
|
-
else:
|
|
3014
|
-
text = f"<pre>{math_formula}</pre>"
|
|
3015
|
-
|
|
3016
|
-
elif math_formula != "":
|
|
3017
|
-
text = f"<pre>{math_formula}</pre>"
|
|
3018
|
-
|
|
3019
|
-
if text != "":
|
|
3020
|
-
html_texts.append(text)
|
|
3021
|
-
else:
|
|
3022
|
-
html_texts.append(
|
|
3023
|
-
'<div class="formula-not-decoded">Formula not decoded</div>'
|
|
3024
|
-
)
|
|
3025
|
-
|
|
3026
|
-
elif isinstance(item, ListItem):
|
|
3027
|
-
text = get_html_tag_with_text_direction(
|
|
3028
|
-
html_tag="li", text=_prepare_tag_content(item.text)
|
|
3029
|
-
)
|
|
3030
|
-
html_texts.append(text)
|
|
3031
|
-
|
|
3032
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
3033
|
-
text = get_html_tag_with_text_direction(
|
|
3034
|
-
html_tag="li", text=_prepare_tag_content(item.text)
|
|
3035
|
-
)
|
|
3036
|
-
html_texts.append(text)
|
|
3037
|
-
|
|
3038
|
-
elif isinstance(item, CodeItem):
|
|
3039
|
-
code_text = _prepare_tag_content(
|
|
3040
|
-
item.text, do_escape_html=False, do_replace_newline=False
|
|
3041
|
-
)
|
|
3042
|
-
text = f"<pre><code>{code_text}</code></pre>"
|
|
3043
|
-
html_texts.append(text)
|
|
3044
|
-
|
|
3045
|
-
elif isinstance(item, TextItem):
|
|
3046
|
-
|
|
3047
|
-
text = get_html_tag_with_text_direction(
|
|
3048
|
-
html_tag="p", text=_prepare_tag_content(item.text)
|
|
3049
|
-
)
|
|
3050
|
-
html_texts.append(text)
|
|
3051
|
-
|
|
3052
|
-
elif isinstance(item, TableItem):
|
|
3053
|
-
|
|
3054
|
-
text = item.export_to_html(doc=self, add_caption=True)
|
|
3055
|
-
html_texts.append(text)
|
|
3056
|
-
|
|
3057
|
-
elif isinstance(item, PictureItem):
|
|
3058
|
-
|
|
3059
|
-
html_texts.append(
|
|
3060
|
-
item.export_to_html(
|
|
3061
|
-
doc=self, add_caption=True, image_mode=image_mode
|
|
3062
|
-
)
|
|
3063
|
-
)
|
|
3064
|
-
|
|
3065
|
-
elif isinstance(item, DocItem) and item.label in my_labels:
|
|
3066
|
-
continue
|
|
3067
|
-
|
|
3068
|
-
html_texts.append("</html>")
|
|
3172
|
+
params = HTMLParams(
|
|
3173
|
+
labels=my_labels,
|
|
3174
|
+
layers=my_layers,
|
|
3175
|
+
pages={page_no} if page_no is not None else None,
|
|
3176
|
+
start_idx=from_element,
|
|
3177
|
+
stop_idx=to_element,
|
|
3178
|
+
image_mode=image_mode,
|
|
3179
|
+
formula_to_mathml=formula_to_mathml,
|
|
3180
|
+
html_head=html_head,
|
|
3181
|
+
html_lang=html_lang,
|
|
3182
|
+
output_style=output_style,
|
|
3183
|
+
)
|
|
3069
3184
|
|
|
3070
|
-
|
|
3071
|
-
|
|
3072
|
-
lines.extend(html_texts)
|
|
3185
|
+
if html_head == "null":
|
|
3186
|
+
params.html_head = None
|
|
3073
3187
|
|
|
3074
|
-
|
|
3075
|
-
|
|
3188
|
+
serializer = HTMLDocSerializer(
|
|
3189
|
+
doc=self,
|
|
3190
|
+
params=params,
|
|
3191
|
+
)
|
|
3192
|
+
ser_res = serializer.serialize()
|
|
3076
3193
|
|
|
3077
|
-
return
|
|
3194
|
+
return ser_res.text
|
|
3078
3195
|
|
|
3079
3196
|
def load_from_doctags( # noqa: C901
|
|
3080
3197
|
self,
|
|
@@ -3105,6 +3222,8 @@ class DoclingDocument(BaseModel):
|
|
|
3105
3222
|
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
|
|
3106
3223
|
"""Extract <loc_...> coords from the chunk, normalized by / 500."""
|
|
3107
3224
|
coords = re.findall(r"<loc_(\d+)>", text_chunk)
|
|
3225
|
+
if len(coords) > 4:
|
|
3226
|
+
coords = coords[:4]
|
|
3108
3227
|
if len(coords) == 4:
|
|
3109
3228
|
l, t, r, b = map(float, coords)
|
|
3110
3229
|
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
|
|
@@ -3135,11 +3254,28 @@ class DoclingDocument(BaseModel):
|
|
|
3135
3254
|
|
|
3136
3255
|
def otsl_parse_texts(texts, tokens):
|
|
3137
3256
|
split_word = TableToken.OTSL_NL.value
|
|
3257
|
+
# CLEAN tokens from extra tags, only structural OTSL allowed
|
|
3258
|
+
clean_tokens = []
|
|
3259
|
+
for t in tokens:
|
|
3260
|
+
if t in [
|
|
3261
|
+
TableToken.OTSL_ECEL.value,
|
|
3262
|
+
TableToken.OTSL_FCEL.value,
|
|
3263
|
+
TableToken.OTSL_LCEL.value,
|
|
3264
|
+
TableToken.OTSL_UCEL.value,
|
|
3265
|
+
TableToken.OTSL_XCEL.value,
|
|
3266
|
+
TableToken.OTSL_NL.value,
|
|
3267
|
+
TableToken.OTSL_CHED.value,
|
|
3268
|
+
TableToken.OTSL_RHED.value,
|
|
3269
|
+
TableToken.OTSL_SROW.value,
|
|
3270
|
+
]:
|
|
3271
|
+
clean_tokens.append(t)
|
|
3272
|
+
tokens = clean_tokens
|
|
3138
3273
|
split_row_tokens = [
|
|
3139
3274
|
list(y)
|
|
3140
3275
|
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
|
|
3141
3276
|
if not x
|
|
3142
3277
|
]
|
|
3278
|
+
|
|
3143
3279
|
table_cells = []
|
|
3144
3280
|
r_idx = 0
|
|
3145
3281
|
c_idx = 0
|
|
@@ -3291,6 +3427,40 @@ class DoclingDocument(BaseModel):
|
|
|
3291
3427
|
table_cells=table_cells,
|
|
3292
3428
|
)
|
|
3293
3429
|
|
|
3430
|
+
def extract_chart_type(text_chunk: str):
    """Return the chart classification label tagged inside a text chunk.

    Candidate labels are probed in a fixed order; for each one the chunk is
    searched for the literal tag ``<label>``. The first candidate found wins.
    Alias spellings are translated to their canonical label before the
    result is built.

    Args:
        text_chunk: Raw chunk of text to search for a chart-type tag.

    Returns:
        A ``PictureClassificationLabel`` for the first matching tag, or
        ``None`` when none of the candidate tags occurs in ``text_chunk``.
    """
    # Current SmolDocling can predict different labels; these aliases are
    # folded onto the canonical chart labels.
    alias_to_label = {
        "line": PictureClassificationLabel.LINE_CHART,
        "dot_line": PictureClassificationLabel.LINE_CHART,
        "vbar_categorical": PictureClassificationLabel.BAR_CHART,
        "hbar_categorical": PictureClassificationLabel.BAR_CHART,
    }

    # Probe order matters: canonical labels first, then the aliases in the
    # order they are declared above.
    candidates = [
        PictureClassificationLabel.PIE_CHART,
        PictureClassificationLabel.BAR_CHART,
        PictureClassificationLabel.STACKED_BAR_CHART,
        PictureClassificationLabel.LINE_CHART,
        PictureClassificationLabel.FLOW_CHART,
        PictureClassificationLabel.SCATTER_CHART,
        PictureClassificationLabel.HEATMAP,
        *alias_to_label,
    ]

    for candidate in candidates:
        if f"<{candidate}>" not in text_chunk:
            continue
        # Aliases resolve to their canonical label; everything else is
        # passed through unchanged.
        return PictureClassificationLabel(alias_to_label.get(candidate, candidate))

    return None
|
|
3463
|
+
|
|
3294
3464
|
def parse_key_value_item(
|
|
3295
3465
|
tokens: str, image: Optional[PILImage.Image] = None
|
|
3296
3466
|
) -> Tuple[GraphData, Optional[ProvenanceItem]]:
|
|
@@ -3422,10 +3592,9 @@ class DoclingDocument(BaseModel):
|
|
|
3422
3592
|
rf"{DocumentToken.ORDERED_LIST.value}|"
|
|
3423
3593
|
rf"{DocumentToken.UNORDERED_LIST.value}|"
|
|
3424
3594
|
rf"{DocItemLabel.KEY_VALUE_REGION}|"
|
|
3595
|
+
rf"{DocumentToken.CHART.value}|"
|
|
3425
3596
|
rf"{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
|
|
3426
3597
|
)
|
|
3427
|
-
|
|
3428
|
-
# DocumentToken.OTSL
|
|
3429
3598
|
pattern = re.compile(tag_pattern, re.DOTALL)
|
|
3430
3599
|
|
|
3431
3600
|
# Go through each match in order
|
|
@@ -3433,18 +3602,17 @@ class DoclingDocument(BaseModel):
|
|
|
3433
3602
|
full_chunk = match.group(0)
|
|
3434
3603
|
tag_name = match.group("tag")
|
|
3435
3604
|
|
|
3436
|
-
bbox = extract_bounding_box(full_chunk)
|
|
3605
|
+
bbox = extract_bounding_box(full_chunk) # Extracts first bbox
|
|
3437
3606
|
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
|
|
3438
3607
|
|
|
3439
3608
|
if tag_name == DocumentToken.OTSL.value:
|
|
3440
3609
|
table_data = parse_table_content(full_chunk)
|
|
3441
|
-
bbox = extract_bounding_box(full_chunk) if image else None
|
|
3442
3610
|
caption, caption_bbox = extract_caption(full_chunk)
|
|
3443
3611
|
if caption is not None and caption_bbox is not None:
|
|
3444
3612
|
caption.prov.append(
|
|
3445
3613
|
ProvenanceItem(
|
|
3446
3614
|
bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
|
|
3447
|
-
charspan=(0,
|
|
3615
|
+
charspan=(0, len(caption.text)),
|
|
3448
3616
|
page_no=page_no,
|
|
3449
3617
|
)
|
|
3450
3618
|
)
|
|
@@ -3458,8 +3626,13 @@ class DoclingDocument(BaseModel):
|
|
|
3458
3626
|
else:
|
|
3459
3627
|
self.add_table(data=table_data, caption=caption)
|
|
3460
3628
|
|
|
3461
|
-
elif tag_name
|
|
3462
|
-
|
|
3629
|
+
elif tag_name in [DocItemLabel.PICTURE, DocItemLabel.CHART]:
|
|
3630
|
+
caption, caption_bbox = extract_caption(full_chunk)
|
|
3631
|
+
table_data = None
|
|
3632
|
+
chart_type = None
|
|
3633
|
+
if tag_name == DocumentToken.CHART.value:
|
|
3634
|
+
table_data = parse_table_content(full_chunk)
|
|
3635
|
+
chart_type = extract_chart_type(full_chunk)
|
|
3463
3636
|
if image:
|
|
3464
3637
|
if bbox:
|
|
3465
3638
|
im_width, im_height = image.size
|
|
@@ -3483,30 +3656,77 @@ class DoclingDocument(BaseModel):
|
|
|
3483
3656
|
),
|
|
3484
3657
|
)
|
|
3485
3658
|
# If there is a caption to an image, add it as well
|
|
3486
|
-
if
|
|
3487
|
-
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3659
|
+
if caption is not None and caption_bbox is not None:
|
|
3660
|
+
caption.prov.append(
|
|
3661
|
+
ProvenanceItem(
|
|
3662
|
+
bbox=caption_bbox.resize_by_scale(
|
|
3663
|
+
pg_width, pg_height
|
|
3664
|
+
),
|
|
3665
|
+
charspan=(0, len(caption.text)),
|
|
3666
|
+
page_no=page_no,
|
|
3667
|
+
)
|
|
3491
3668
|
)
|
|
3492
|
-
pic.captions.append(
|
|
3669
|
+
pic.captions.append(caption.get_ref())
|
|
3670
|
+
pic_title = "picture"
|
|
3671
|
+
if chart_type is not None:
|
|
3672
|
+
pic.annotations.append(
|
|
3673
|
+
PictureClassificationData(
|
|
3674
|
+
provenance="load_from_doctags",
|
|
3675
|
+
predicted_classes=[
|
|
3676
|
+
# chart_type
|
|
3677
|
+
PictureClassificationClass(
|
|
3678
|
+
class_name=chart_type, confidence=1.0
|
|
3679
|
+
)
|
|
3680
|
+
],
|
|
3681
|
+
)
|
|
3682
|
+
)
|
|
3683
|
+
pic_title = chart_type
|
|
3684
|
+
if table_data is not None:
|
|
3685
|
+
# Add chart data as PictureTabularChartData
|
|
3686
|
+
pd = PictureTabularChartData(
|
|
3687
|
+
chart_data=table_data, title=pic_title
|
|
3688
|
+
)
|
|
3689
|
+
pic.annotations.append(pd)
|
|
3493
3690
|
else:
|
|
3494
3691
|
if bbox:
|
|
3495
3692
|
# In case we don't have access to a binary of an image
|
|
3496
|
-
self.add_picture(
|
|
3693
|
+
pic = self.add_picture(
|
|
3497
3694
|
parent=None,
|
|
3498
3695
|
prov=ProvenanceItem(
|
|
3499
3696
|
bbox=bbox, charspan=(0, 0), page_no=page_no
|
|
3500
3697
|
),
|
|
3501
3698
|
)
|
|
3502
3699
|
# If there is a caption to an image, add it as well
|
|
3503
|
-
if
|
|
3504
|
-
|
|
3505
|
-
|
|
3506
|
-
|
|
3507
|
-
|
|
3700
|
+
if caption is not None and caption_bbox is not None:
|
|
3701
|
+
caption.prov.append(
|
|
3702
|
+
ProvenanceItem(
|
|
3703
|
+
bbox=caption_bbox.resize_by_scale(
|
|
3704
|
+
pg_width, pg_height
|
|
3705
|
+
),
|
|
3706
|
+
charspan=(0, len(caption.text)),
|
|
3707
|
+
page_no=page_no,
|
|
3708
|
+
)
|
|
3709
|
+
)
|
|
3710
|
+
pic.captions.append(caption.get_ref())
|
|
3711
|
+
if chart_type is not None:
|
|
3712
|
+
pic.annotations.append(
|
|
3713
|
+
PictureClassificationData(
|
|
3714
|
+
provenance="load_from_doctags",
|
|
3715
|
+
predicted_classes=[
|
|
3716
|
+
# chart_type
|
|
3717
|
+
PictureClassificationClass(
|
|
3718
|
+
class_name=chart_type, confidence=1.0
|
|
3719
|
+
)
|
|
3720
|
+
],
|
|
3721
|
+
)
|
|
3508
3722
|
)
|
|
3509
|
-
|
|
3723
|
+
if table_data is not None:
|
|
3724
|
+
# Add chart data as PictureTabularChartData
|
|
3725
|
+
pd = PictureTabularChartData(
|
|
3726
|
+
chart_data=table_data, title=pic_title
|
|
3727
|
+
)
|
|
3728
|
+
pic.annotations.append(pd)
|
|
3729
|
+
|
|
3510
3730
|
elif tag_name == DocItemLabel.KEY_VALUE_REGION:
|
|
3511
3731
|
key_value_data, kv_item_prov = parse_key_value_item(
|
|
3512
3732
|
full_chunk, image
|