docling-core 2.6.1__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/document.py +94 -2
- docling_core/types/doc/tokens.py +202 -0
- {docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/METADATA +2 -2
- {docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/RECORD +7 -6
- {docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/LICENSE +0 -0
- {docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/WHEEL +0 -0
- {docling_core-2.6.1.dist-info → docling_core-2.7.0.dist-info}/entry_points.txt +0 -0
|
@@ -37,8 +37,8 @@ from docling_core.types.base import _JSON_POINTER_REGEX
|
|
|
37
37
|
from docling_core.types.doc import BoundingBox, Size
|
|
38
38
|
from docling_core.types.doc.base import ImageRefMode
|
|
39
39
|
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
|
|
40
|
+
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
40
41
|
from docling_core.types.doc.utils import relative_path
|
|
41
|
-
from docling_core.types.legacy_doc.tokens import DocumentToken
|
|
42
42
|
|
|
43
43
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
44
44
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
@@ -1008,7 +1008,6 @@ class TableItem(FloatingItem):
|
|
|
1008
1008
|
DeprecationWarning,
|
|
1009
1009
|
)
|
|
1010
1010
|
|
|
1011
|
-
body = ""
|
|
1012
1011
|
nrows = self.data.num_rows
|
|
1013
1012
|
ncols = self.data.num_cols
|
|
1014
1013
|
|
|
@@ -1065,6 +1064,99 @@ class TableItem(FloatingItem):
|
|
|
1065
1064
|
|
|
1066
1065
|
return body
|
|
1067
1066
|
|
|
1067
|
+
def export_to_otsl(
    self,
    doc: "DoclingDocument",
    add_cell_location: bool = True,
    add_cell_text: bool = True,
    xsize: int = 100,
    ysize: int = 100,
) -> str:
    """Export the table as an OTSL token string.

    The table grid is walked row by row. Each grid position contributes one
    structure token, and every row is terminated by a new-line token:

    * empty / full cells: ``<ecel>``, ``<fcel>``
    * span continuations (horizontal, vertical, 2d): ``<lcel>``, ``<ucel>``,
      ``<xcel>``
    * new line: ``<nl>``
    * headers (column, row, section row): ``<ched>``, ``<rhed>``, ``<srow>``

    Only the anchor position of a cell (its start row and start column) that
    has non-blank text may be followed by a location string and the text.

    Args:
        doc: Document owning the table; ``doc.pages`` supplies the page size
            used to normalise cell bounding boxes.
        add_cell_location: Emit the location string of non-empty anchor cells.
        add_cell_text: Emit the stripped text of non-empty anchor cells.
        xsize: Normalisation range passed to the location tokens for x.
        ysize: Normalisation range passed to the location tokens for y.

    Returns:
        The concatenated token string, or ``""`` when the table has no cells.
    """
    nrows = self.data.num_rows
    ncols = self.data.num_cols
    if len(self.data.table_cells) == 0:
        return ""

    page_no = 0
    if len(self.prov) > 0:
        page_no = self.prov[0].page_no

    # The page size does not depend on the cell, so resolve it once. When
    # the page is unknown, locations are skipped instead of failing on an
    # unbound page size inside the loop.
    page_size = None
    if add_cell_location and page_no in doc.pages:
        page_size = doc.pages[page_no].size.as_tuple()

    body = []
    for i in range(nrows):
        for j in range(ncols):
            cell: TableCell = self.data.grid[i][j]
            content = cell.text.strip()
            rowspan, rowstart = cell.row_span, cell.start_row_offset_idx
            colspan, colstart = cell.col_span, cell.start_col_offset_idx

            if rowstart == i and colstart == j:
                # Anchor position of the cell.
                if len(content) == 0:
                    body.append(TableToken.OTSL_ECEL.value)
                    continue

                if cell.column_header:
                    body.append(TableToken.OTSL_CHED.value)
                elif cell.row_header:
                    body.append(TableToken.OTSL_RHED.value)
                elif cell.row_section:
                    body.append(TableToken.OTSL_SROW.value)
                else:
                    body.append(TableToken.OTSL_FCEL.value)

                # The location is only built where it is actually emitted.
                if (
                    add_cell_location
                    and page_size is not None
                    and cell.bbox is not None
                ):
                    page_w, page_h = page_size
                    body.append(
                        DocumentToken.get_location(
                            bbox=cell.bbox.to_bottom_left_origin(page_h).as_tuple(),
                            page_w=page_w,
                            page_h=page_h,
                            xsize=xsize,
                            ysize=ysize,
                            page_i=page_no,
                        )
                    )
                if add_cell_text:
                    body.append(content)
            else:
                # Continuation of a spanning cell. A continuation of a cell
                # that spans both rows and columns is always emitted as a
                # cross cell, including positions in the span's first row or
                # first column.
                add_cross_cell = False
                if rowstart != i:
                    if colspan == 1:
                        body.append(TableToken.OTSL_UCEL.value)
                    else:
                        add_cross_cell = True
                if colstart != j:
                    if rowspan == 1:
                        body.append(TableToken.OTSL_LCEL.value)
                    else:
                        add_cross_cell = True
                if add_cross_cell:
                    body.append(TableToken.OTSL_XCEL.value)
        body.append(TableToken.OTSL_NL.value)

    return "".join(body)
|
|
1159
|
+
|
|
1068
1160
|
def export_to_document_tokens(
|
|
1069
1161
|
self,
|
|
1070
1162
|
doc: "DoclingDocument",
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""Tokens used in the docling document model."""
|
|
7
|
+
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Annotated, Tuple
|
|
10
|
+
|
|
11
|
+
from pydantic import Field
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TableToken(Enum):
    """Special tokens for an LLM friendly representation of a table."""

    # Cell labels.
    CELL_LABEL_COLUMN_HEADER = "<column_header>"
    CELL_LABEL_ROW_HEADER = "<row_header>"
    # The misspelled name stays the canonical member so existing callers keep
    # working; the correctly spelled name below is an enum alias of it.
    CELL_LABEL_SECTION_HEADERE = "<section_header>"
    CELL_LABEL_SECTION_HEADER = "<section_header>"
    CELL_LABEL_DATA = "<data>"

    # OTSL structure tokens.
    OTSL_ECEL = "<ecel>"  # empty cell
    OTSL_FCEL = "<fcel>"  # cell with content
    OTSL_LCEL = "<lcel>"  # left looking cell
    OTSL_UCEL = "<ucel>"  # up looking cell
    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell)
    OTSL_NL = "<nl>"  # new line
    OTSL_CHED = "<ched>"  # column header cell
    OTSL_RHED = "<rhed>"  # row header cell
    OTSL_SROW = "<srow>"  # section row cell

    @classmethod
    def get_special_tokens(cls):
        """Return the string value of every table token, in definition order.

        Enum aliases are not yielded by iteration, so each token string
        appears exactly once.
        """
        return [token.value for token in cls]

    @staticmethod
    def is_known_token(label):
        """Return True if ``label`` is the string value of a table token."""
        return label in TableToken.get_special_tokens()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DocumentToken(Enum):
    """Special tokens for an LLM friendly representation of a document."""

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ) -> list[str]:
        """Return all special document tokens.

        The list starts with the enum values and is followed by the
        dynamically generated row, column, section-header, subtitle-level,
        page and location tokens.

        Args:
            max_rows: Highest row index (inclusive) to generate tokens for.
            max_cols: Highest column index (inclusive) to generate tokens for.
            max_pages: Highest page index (inclusive) to generate tokens for.
            page_dimension: Location tokens are generated from 0 up to and
                including the larger of the two values.
        """
        special_tokens = [token.value for token in cls]

        # Dynamically generated row and column tokens.
        for i in range(max_rows + 1):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(max_cols + 1):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # FIXME: this is a synonym of section header
        for i in range(6):
            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]

        # Dynamically generated page tokens.
        for i in range(max_pages + 1):
            special_tokens += [f"<page_{i}>", f"</page_{i}>"]

        # Dynamically generated location tokens (opening form only).
        for i in range(max(page_dimension) + 1):
            special_tokens.append(f"<loc_{i}>")

        return special_tokens

    @staticmethod
    def is_known_token(label) -> bool:
        """Return True if ``label`` is in the default special-token list."""
        return label in DocumentToken.get_special_tokens()

    @staticmethod
    def get_row_token(row: int, beg: bool = True) -> str:
        """Return the opening (``beg`` true) or closing token of a row."""
        # The default used to be the builtin ``bool`` itself (always truthy);
        # ``True`` yields the same result when ``beg`` is omitted.
        if beg:
            return f"<row_{row}>"
        return f"</row_{row}>"

    @staticmethod
    def get_col_token(col: int, beg: bool = True) -> str:
        """Return the opening (``beg`` true) or closing token of a column."""
        # The default used to be the builtin ``bool`` itself (always truthy);
        # ``True`` yields the same result when ``beg`` is omitted.
        if beg:
            return f"<col_{col}>"
        return f"</col_{col}>"

    @staticmethod
    def get_page_token(page: int) -> str:
        """Return the opening page token for page index ``page``."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100) -> str:
        """Return the location token for a normalised coordinate.

        ``val`` is scaled by ``rnorm``, rounded, and clamped to the range
        ``[0, rnorm]``.
        """
        val_ = round(rnorm * val)

        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"

    @staticmethod
    def get_location(
        # bbox: Tuple[float, float, float, float],
        # Quoted so the annotation is not evaluated when the method is defined.
        bbox: "Annotated[list[float], Field(min_length=4, max_length=4)]",
        page_w: float,
        page_h: float,
        xsize: int = 100,
        ysize: int = 100,
        page_i: int = -1,
    ) -> str:
        """Get the location string given a bbox and the page dimensions.

        Args:
            bbox: Four coordinates; ``bbox[0] <= bbox[2]`` and
                ``bbox[1] <= bbox[3]`` are asserted.
            page_w: Page width used to normalise ``bbox[0]`` and ``bbox[2]``.
            page_h: Page height used to normalise ``bbox[1]`` and ``bbox[3]``.
            xsize: Range of the x location tokens.
            ysize: Range of the y location tokens.
            page_i: Page index to embed as a page token; ``-1`` omits it.

        Returns:
            ``<location>``, the optional page token, the four location tokens
            (x0, y0, x1, y1) and ``</location>``, concatenated.
        """
        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"

        x0 = bbox[0] / page_w
        y0 = bbox[1] / page_h
        x1 = bbox[2] / page_w
        y1 = bbox[3] / page_h

        page_tok = ""
        if page_i != -1:
            page_tok = DocumentToken.get_page_token(page=page_i)

        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)

        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
        loc_str += f"{DocumentToken.END_LOCATION.value}"

        return loc_str
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.6.1
|
|
3
|
+
Version: 2.7.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -29,7 +29,7 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
|
|
|
29
29
|
Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
|
|
30
30
|
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
31
31
|
Requires-Dist: pillow (>=10.3.0,<11.0.0)
|
|
32
|
-
Requires-Dist: pydantic (>=2.6.0,<2.10)
|
|
32
|
+
Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
|
|
33
33
|
Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
34
34
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
35
35
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
@@ -21,8 +21,9 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
|
|
|
21
21
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
22
22
|
docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
|
|
23
23
|
docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
|
|
24
|
-
docling_core/types/doc/document.py,sha256=
|
|
24
|
+
docling_core/types/doc/document.py,sha256=LXmDD0qZiB34WTWSTklcdWndetOqumMFN3yJEqifb8M,90500
|
|
25
25
|
docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
|
|
26
|
+
docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
|
|
26
27
|
docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
|
|
27
28
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
28
29
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -51,8 +52,8 @@ docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6
|
|
|
51
52
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
52
53
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
53
54
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
54
|
-
docling_core-2.
|
|
55
|
-
docling_core-2.
|
|
56
|
-
docling_core-2.
|
|
57
|
-
docling_core-2.
|
|
58
|
-
docling_core-2.
|
|
55
|
+
docling_core-2.7.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
56
|
+
docling_core-2.7.0.dist-info/METADATA,sha256=ht4UM23KfXIPp2aeUjSr9AUruTANa-kSt9kDwHQyeNk,5547
|
|
57
|
+
docling_core-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
58
|
+
docling_core-2.7.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
|
|
59
|
+
docling_core-2.7.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|