docling-core 2.6.1__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -37,8 +37,8 @@ from docling_core.types.base import _JSON_POINTER_REGEX
37
37
  from docling_core.types.doc import BoundingBox, Size
38
38
  from docling_core.types.doc.base import ImageRefMode
39
39
  from docling_core.types.doc.labels import DocItemLabel, GroupLabel
40
+ from docling_core.types.doc.tokens import DocumentToken, TableToken
40
41
  from docling_core.types.doc.utils import relative_path
41
- from docling_core.types.legacy_doc.tokens import DocumentToken
42
42
 
43
43
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
44
44
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -1008,7 +1008,6 @@ class TableItem(FloatingItem):
1008
1008
  DeprecationWarning,
1009
1009
  )
1010
1010
 
1011
- body = ""
1012
1011
  nrows = self.data.num_rows
1013
1012
  ncols = self.data.num_cols
1014
1013
 
@@ -1065,6 +1064,99 @@ class TableItem(FloatingItem):
1065
1064
 
1066
1065
  return body
1067
1066
 
1067
def export_to_otsl(
    self,
    doc: "DoclingDocument",
    add_cell_location: bool = True,
    add_cell_text: bool = True,
    xsize: int = 100,
    ysize: int = 100,
) -> str:
    """Export the table as a string of OTSL tokens.

    The table grid is walked row by row. Each grid position emits one
    structural token, and every row is terminated by a new-line token:

    * ``ecel`` / ``fcel``: empty cell / cell with content,
    * ``ched`` / ``rhed`` / ``srow``: column header, row header, section
      row (used instead of ``fcel`` for non-empty cells flagged as such),
    * ``lcel`` / ``ucel`` / ``xcel``: continuation of a horizontal,
      vertical or 2d span,
    * ``nl``: end of a table row.

    Args:
        doc: Document owning the table; ``doc.pages`` is used to look up
            the page size needed to normalise cell bounding boxes.
        add_cell_location: If true, append the location string of each
            non-empty cell after its structural token.
        add_cell_text: If true, append the stripped text of each
            non-empty cell after its structural token (and location).
        xsize: Horizontal resolution passed to
            ``DocumentToken.get_location``.
        ysize: Vertical resolution passed to
            ``DocumentToken.get_location``.

    Returns:
        The concatenated token string, or ``""`` when the table has no
        cells.
    """
    body = []
    nrows = self.data.num_rows
    ncols = self.data.num_cols
    if len(self.data.table_cells) == 0:
        return ""

    # The page of the first provenance entry is used for all cells.
    # NOTE(review): the fallback page number 0 is looked up in doc.pages
    # below; confirm that key exists for tables without provenance.
    page_no = 0
    if len(self.prov) > 0:
        page_no = self.prov[0].page_no

    for i in range(nrows):
        for j in range(ncols):
            cell: TableCell = self.data.grid[i][j]
            content = cell.text.strip()
            rowspan, rowstart = cell.row_span, cell.start_row_offset_idx
            colspan, colstart = cell.col_span, cell.start_col_offset_idx

            # Always bind cell_loc: it used to be assigned only when the
            # document had pages, which left it undefined (and raised
            # UnboundLocalError below) for documents without pages.
            cell_loc = ""
            if len(doc.pages.keys()):
                page_w, page_h = doc.pages[page_no].size.as_tuple()
                if cell.bbox is not None:
                    cell_loc = DocumentToken.get_location(
                        bbox=cell.bbox.to_bottom_left_origin(page_h).as_tuple(),
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        page_i=page_no,
                    )

            if rowstart == i and colstart == j:
                # Top-left position of the cell: emit the cell itself.
                if len(content) > 0:
                    if cell.column_header:
                        body.append(str(TableToken.OTSL_CHED.value))
                    elif cell.row_header:
                        body.append(str(TableToken.OTSL_RHED.value))
                    elif cell.row_section:
                        body.append(str(TableToken.OTSL_SROW.value))
                    else:
                        body.append(str(TableToken.OTSL_FCEL.value))
                    if add_cell_location:
                        body.append(str(cell_loc))
                    if add_cell_text:
                        body.append(str(content))
                else:
                    body.append(str(TableToken.OTSL_ECEL.value))
            else:
                # Position covered by a span that started elsewhere.
                add_cross_cell = False
                if rowstart != i:
                    if colspan == 1:
                        body.append(str(TableToken.OTSL_UCEL.value))
                    else:
                        add_cross_cell = True
                if colstart != j:
                    if rowspan == 1:
                        body.append(str(TableToken.OTSL_LCEL.value))
                    else:
                        add_cross_cell = True
                if add_cross_cell:
                    body.append(str(TableToken.OTSL_XCEL.value))
        body.append(str(TableToken.OTSL_NL.value))
    body_str = "".join(body)
    return body_str
1159
+
1068
1160
  def export_to_document_tokens(
1069
1161
  self,
1070
1162
  doc: "DoclingDocument",
@@ -0,0 +1,202 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Tokens used in the docling document model."""
7
+
8
+ from enum import Enum
9
+ from typing import Annotated, Tuple
10
+
11
+ from pydantic import Field
12
+
13
+
14
class TableToken(Enum):
    """Class to represent an LLM friendly representation of a Table."""

    CELL_LABEL_COLUMN_HEADER = "<column_header>"
    CELL_LABEL_ROW_HEADER = "<row_header>"
    # The misspelled name is kept because callers may already use it.
    CELL_LABEL_SECTION_HEADERE = "<section_header>"
    # Correctly spelled alias. It shares the value above, so Enum treats it
    # as an alias: it resolves to the same member and is not yielded twice
    # when iterating the class.
    CELL_LABEL_SECTION_HEADER = "<section_header>"
    CELL_LABEL_DATA = "<data>"

    OTSL_ECEL = "<ecel>"  # empty cell
    OTSL_FCEL = "<fcel>"  # cell with content
    OTSL_LCEL = "<lcel>"  # left looking cell
    OTSL_UCEL = "<ucel>"  # up looking cell
    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell)
    OTSL_NL = "<nl>"  # new line
    OTSL_CHED = "<ched>"  # column header cell
    OTSL_RHED = "<rhed>"  # row header cell
    OTSL_SROW = "<srow>"  # section row cell

    @classmethod
    def get_special_tokens(cls) -> list:
        """Return the string values of all table tokens, in definition order."""
        special_tokens = [token.value for token in cls]
        return special_tokens

    @staticmethod
    def is_known_token(label) -> bool:
        """Return True if ``label`` equals the value of one of the tokens."""
        return label in TableToken.get_special_tokens()
42
+
43
+
44
class DocumentToken(Enum):
    """Class to represent an LLM friendly representation of a Document."""

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ) -> list:
        """Return all special document tokens.

        The result starts with the values of the enum members, followed by
        the dynamically generated row, column, section-header,
        subtitle-level, page and location tokens.

        Args:
            max_rows: Highest row index (inclusive) to generate tokens for.
            max_cols: Highest column index (inclusive) to generate tokens for.
            max_pages: Highest page index (inclusive) to generate tokens for.
            page_dimension: Location tokens are generated for indices from 0
                up to and including the larger of the two entries.

        Returns:
            The list of token strings.
        """
        special_tokens = [token.value for token in cls]

        # Adding dynamically generated row and col tokens
        for i in range(0, max_rows + 1):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(0, max_cols + 1):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # FIXME: this is synonym of section header
        for i in range(6):
            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]

        # Adding dynamically generated page-tokens
        for i in range(0, max_pages + 1):
            special_tokens.append(f"<page_{i}>")
            special_tokens.append(f"</page_{i}>")

        # Adding dynamically generated location-tokens
        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
            special_tokens.append(f"<loc_{i}>")

        return special_tokens

    @staticmethod
    def is_known_token(label) -> bool:
        """Return True if ``label`` is one of the default special tokens."""
        return label in DocumentToken.get_special_tokens()

    @staticmethod
    def get_row_token(row: int, beg: bool = True) -> str:
        """Return the opening (``beg`` true) or closing row token for ``row``."""
        # The default used to be the ``bool`` type object itself (a typo for
        # an annotation); it is truthy, so ``True`` keeps the same default.
        if beg:
            return f"<row_{row}>"
        else:
            return f"</row_{row}>"

    @staticmethod
    def get_col_token(col: int, beg: bool = True) -> str:
        """Return the opening (``beg`` true) or closing column token for ``col``."""
        # Same default as before: the former ``beg=bool`` default was truthy.
        if beg:
            return f"<col_{col}>"
        else:
            return f"</col_{col}>"

    @staticmethod
    def get_page_token(page: int) -> str:
        """Return the opening page token for ``page``."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100) -> str:
        """Return the location token for a normalised coordinate.

        ``val`` is scaled by ``rnorm`` and rounded; the resulting index is
        clamped to the range ``[0, rnorm]``.
        """
        val_ = round(rnorm * val)

        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"

    @staticmethod
    def get_location(
        bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
        page_w: float,
        page_h: float,
        xsize: int = 100,
        ysize: int = 100,
        page_i: int = -1,
    ) -> str:
        """Get the location string given a bbox and the page dimensions.

        Args:
            bbox: Four coordinates ``(x0, y0, x1, y1)`` with ``x0 <= x1`` and
                ``y0 <= y1``. Only indexing is used, so any 4-item sequence
                works at runtime.
            page_w: Page width used to normalise the x coordinates.
            page_h: Page height used to normalise the y coordinates.
            xsize: Number of horizontal location bins.
            ysize: Number of vertical location bins.
            page_i: Page index to embed as a page token; ``-1`` omits it.

        Returns:
            ``<location>``, an optional page token, four location tokens
            (x0, y0, x1, y1) and ``</location>``, concatenated.

        Raises:
            AssertionError: If the bbox coordinates are not ordered.
        """
        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"

        x0 = bbox[0] / page_w
        y0 = bbox[1] / page_h
        x1 = bbox[2] / page_w
        y1 = bbox[3] / page_h

        page_tok = ""
        if page_i != -1:
            page_tok = DocumentToken.get_page_token(page=page_i)

        # min/max keep the emitted tokens ordered even when the asserts
        # above are stripped (python -O).
        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)

        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
        loc_str += f"{DocumentToken.END_LOCATION.value}"

        return loc_str
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.6.1
3
+ Version: 2.7.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
29
29
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
30
30
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
31
31
  Requires-Dist: pillow (>=10.3.0,<11.0.0)
32
- Requires-Dist: pydantic (>=2.6.0,<2.10)
32
+ Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
33
33
  Requires-Dist: pyyaml (>=5.1,<7.0.0)
34
34
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
35
35
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
@@ -21,8 +21,9 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
21
21
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
22
22
  docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
23
23
  docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
24
- docling_core/types/doc/document.py,sha256=8qVhet6eQtvju286zUkdOU0NXnkZ0AoOVAysMEZ3Aws,87099
24
+ docling_core/types/doc/document.py,sha256=LXmDD0qZiB34WTWSTklcdWndetOqumMFN3yJEqifb8M,90500
25
25
  docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
26
+ docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
26
27
  docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
27
28
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
28
29
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
@@ -51,8 +52,8 @@ docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6
51
52
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
52
53
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
53
54
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
54
- docling_core-2.6.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
55
- docling_core-2.6.1.dist-info/METADATA,sha256=aHtmbajidCAFKmJiAq-sSW-rSjZhHAMsqSEfRrpYBes,5519
56
- docling_core-2.6.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
57
- docling_core-2.6.1.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
58
- docling_core-2.6.1.dist-info/RECORD,,
55
+ docling_core-2.7.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
56
+ docling_core-2.7.0.dist-info/METADATA,sha256=ht4UM23KfXIPp2aeUjSr9AUruTANa-kSt9kDwHQyeNk,5547
57
+ docling_core-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
58
+ docling_core-2.7.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
59
+ docling_core-2.7.0.dist-info/RECORD,,