docling-core 2.6.0__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of docling-core has been flagged as potentially problematic.

@@ -37,8 +37,8 @@ from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.labels import DocItemLabel, GroupLabel
-from docling_core.types.legacy_doc.tokens import DocumentToken
-from docling_core.utils.file import relative_path
+from docling_core.types.doc.tokens import DocumentToken, TableToken
+from docling_core.types.doc.utils import relative_path
 
 Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
 LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -1008,7 +1008,6 @@ class TableItem(FloatingItem):
                 DeprecationWarning,
             )
 
-        body = ""
         nrows = self.data.num_rows
         ncols = self.data.num_cols
 
@@ -1065,6 +1064,99 @@ class TableItem(FloatingItem):
 
         return body
 
+    def export_to_otsl(
+        self,
+        doc: "DoclingDocument",
+        add_cell_location: bool = True,
+        add_cell_text: bool = True,
+        xsize: int = 100,
+        ysize: int = 100,
+    ) -> str:
+        """Export the table as OTSL."""
+        # Possible OTSL tokens...
+        #
+        # Empty and full cells:
+        # "ecel", "fcel"
+        #
+        # Cell spans (horisontal, vertical, 2d):
+        # "lcel", "ucel", "xcel"
+        #
+        # New line:
+        # "nl"
+        #
+        # Headers (column, row, section row):
+        # "ched", "rhed", "srow"
+
+        body = []
+        nrows = self.data.num_rows
+        ncols = self.data.num_cols
+        if len(self.data.table_cells) == 0:
+            return ""
+
+        page_no = 0
+        if len(self.prov) > 0:
+            page_no = self.prov[0].page_no
+
+        for i in range(nrows):
+            for j in range(ncols):
+                cell: TableCell = self.data.grid[i][j]
+                content = cell.text.strip()
+                rowspan, rowstart = (
+                    cell.row_span,
+                    cell.start_row_offset_idx,
+                )
+                colspan, colstart = (
+                    cell.col_span,
+                    cell.start_col_offset_idx,
+                )
+
+                if len(doc.pages.keys()):
+                    page_w, page_h = doc.pages[page_no].size.as_tuple()
+                cell_loc = ""
+                if cell.bbox is not None:
+                    cell_loc = DocumentToken.get_location(
+                        bbox=cell.bbox.to_bottom_left_origin(page_h).as_tuple(),
+                        page_w=page_w,
+                        page_h=page_h,
+                        xsize=xsize,
+                        ysize=ysize,
+                        page_i=page_no,
+                    )
+
+                if rowstart == i and colstart == j:
+                    if len(content) > 0:
+                        if cell.column_header:
+                            body.append(str(TableToken.OTSL_CHED.value))
+                        elif cell.row_header:
+                            body.append(str(TableToken.OTSL_RHED.value))
+                        elif cell.row_section:
+                            body.append(str(TableToken.OTSL_SROW.value))
+                        else:
+                            body.append(str(TableToken.OTSL_FCEL.value))
+                        if add_cell_location:
+                            body.append(str(cell_loc))
+                        if add_cell_text:
+                            body.append(str(content))
+                    else:
+                        body.append(str(TableToken.OTSL_ECEL.value))
+                else:
+                    add_cross_cell = False
+                    if rowstart != i:
+                        if colspan == 1:
+                            body.append(str(TableToken.OTSL_UCEL.value))
+                        else:
+                            add_cross_cell = True
+                    if colstart != j:
+                        if rowspan == 1:
+                            body.append(str(TableToken.OTSL_LCEL.value))
+                        else:
+                            add_cross_cell = True
+                    if add_cross_cell:
+                        body.append(str(TableToken.OTSL_XCEL.value))
+            body.append(str(TableToken.OTSL_NL.value))
+        body_str = "".join(body)
+        return body_str
+
     def export_to_document_tokens(
         self,
         doc: "DoclingDocument",
@@ -0,0 +1,202 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Tokens used in the docling document model."""
+
+from enum import Enum
+from typing import Annotated, Tuple
+
+from pydantic import Field
+
+
+class TableToken(Enum):
+    """Class to represent an LLM friendly representation of a Table."""
+
+    CELL_LABEL_COLUMN_HEADER = "<column_header>"
+    CELL_LABEL_ROW_HEADER = "<row_header>"
+    CELL_LABEL_SECTION_HEADERE = "<section_header>"
+    CELL_LABEL_DATA = "<data>"
+
+    OTSL_ECEL = "<ecel>"  # empty cell
+    OTSL_FCEL = "<fcel>"  # cell with content
+    OTSL_LCEL = "<lcel>"  # left looking cell,
+    OTSL_UCEL = "<ucel>"  # up looking cell,
+    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell),
+    OTSL_NL = "<nl>"  # new line,
+    OTSL_CHED = "<ched>"  # - column header cell,
+    OTSL_RHED = "<rhed>"  # - row header cell,
+    OTSL_SROW = "<srow>"  # - section row cell
+
+    @classmethod
+    def get_special_tokens(cls):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+        return special_tokens
+
+    @staticmethod
+    def is_known_token(label):
+        """Function to check if label is in tokens."""
+        return label in TableToken.get_special_tokens()
+
+
+class DocumentToken(Enum):
+    """Class to represent an LLM friendly representation of a Document."""
+
+    BEG_DOCUMENT = "<document>"
+    END_DOCUMENT = "</document>"
+
+    BEG_TITLE = "<title>"
+    END_TITLE = "</title>"
+
+    BEG_ABSTRACT = "<abstract>"
+    END_ABSTRACT = "</abstract>"
+
+    BEG_DOI = "<doi>"
+    END_DOI = "</doi>"
+    BEG_DATE = "<date>"
+    END_DATE = "</date>"
+
+    BEG_AUTHORS = "<authors>"
+    END_AUTHORS = "</authors>"
+    BEG_AUTHOR = "<author>"
+    END_AUTHOR = "</author>"
+
+    BEG_AFFILIATIONS = "<affiliations>"
+    END_AFFILIATIONS = "</affiliations>"
+    BEG_AFFILIATION = "<affiliation>"
+    END_AFFILIATION = "</affiliation>"
+
+    BEG_HEADER = "<section-header>"
+    END_HEADER = "</section-header>"
+    BEG_TEXT = "<text>"
+    END_TEXT = "</text>"
+    BEG_PARAGRAPH = "<paragraph>"
+    END_PARAGRAPH = "</paragraph>"
+    BEG_TABLE = "<table>"
+    END_TABLE = "</table>"
+    BEG_FIGURE = "<figure>"
+    END_FIGURE = "</figure>"
+    BEG_CAPTION = "<caption>"
+    END_CAPTION = "</caption>"
+    BEG_EQUATION = "<equation>"
+    END_EQUATION = "</equation>"
+    BEG_LIST = "<list>"
+    END_LIST = "</list>"
+    BEG_LISTITEM = "<list-item>"
+    END_LISTITEM = "</list-item>"
+
+    BEG_LOCATION = "<location>"
+    END_LOCATION = "</location>"
+    BEG_GROUP = "<group>"
+    END_GROUP = "</group>"
+
+    @classmethod
+    def get_special_tokens(
+        cls,
+        max_rows: int = 100,
+        max_cols: int = 100,
+        max_pages: int = 1000,
+        page_dimension: Tuple[int, int] = (100, 100),
+    ):
+        """Function to get all special document tokens."""
+        special_tokens = [token.value for token in cls]
+
+        # Adding dynamically generated row and col tokens
+        for i in range(0, max_rows + 1):
+            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
+
+        for i in range(0, max_cols + 1):
+            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
+
+        for i in range(6):
+            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
+
+        # FIXME: this is synonym of section header
+        for i in range(6):
+            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
+
+        # Adding dynamically generated page-tokens
+        for i in range(0, max_pages + 1):
+            special_tokens.append(f"<page_{i}>")
+            special_tokens.append(f"</page_{i}>")
+
+        # Adding dynamically generated location-tokens
+        for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
+            special_tokens.append(f"<loc_{i}>")
+
+        return special_tokens
+
+    @staticmethod
+    def is_known_token(label):
+        """Function to check if label is in tokens."""
+        return label in DocumentToken.get_special_tokens()
+
+    @staticmethod
+    def get_row_token(row: int, beg=bool) -> str:
+        """Function to get page tokens."""
+        if beg:
+            return f"<row_{row}>"
+        else:
+            return f"</row_{row}>"
+
+    @staticmethod
+    def get_col_token(col: int, beg=bool) -> str:
+        """Function to get page tokens."""
+        if beg:
+            return f"<col_{col}>"
+        else:
+            return f"</col_{col}>"
+
+    @staticmethod
+    def get_page_token(page: int):
+        """Function to get page tokens."""
+        return f"<page_{page}>"
+
+    @staticmethod
+    def get_location_token(val: float, rnorm: int = 100):
+        """Function to get location tokens."""
+        val_ = round(rnorm * val)
+
+        if val_ < 0:
+            return "<loc_0>"
+
+        if val_ > rnorm:
+            return f"<loc_{rnorm}>"
+
+        return f"<loc_{val_}>"
+
+    @staticmethod
+    def get_location(
+        # bbox: Tuple[float, float, float, float],
+        bbox: Annotated[list[float], Field(min_length=4, max_length=4)],
+        page_w: float,
+        page_h: float,
+        xsize: int = 100,
+        ysize: int = 100,
+        page_i: int = -1,
+    ):
+        """Get the location string give bbox and page-dim."""
+        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
+        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"
+
+        x0 = bbox[0] / page_w
+        y0 = bbox[1] / page_h
+        x1 = bbox[2] / page_w
+        y1 = bbox[3] / page_h
+
+        page_tok = ""
+        if page_i != -1:
+            page_tok = DocumentToken.get_page_token(page=page_i)
+
+        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
+        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
+        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
+        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
+
+        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
+        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
+        loc_str += f"{DocumentToken.END_LOCATION.value}"
+
+        return loc_str
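To make the token scheme concrete, here is an illustrative call to DocumentToken.get_location() with made-up page dimensions and bounding box (bottom-left origin, as the method expects); the expected output follows from the normalization code shown above:

from docling_core.types.doc.tokens import DocumentToken

loc = DocumentToken.get_location(
    bbox=[72.0, 700.0, 540.0, 720.0],  # x0, y0, x1, y1 on a 612x792 pt page (made up)
    page_w=612.0,
    page_h=792.0,
    xsize=100,
    ysize=100,
    page_i=1,
)
print(loc)
# <location><page_1><loc_12><loc_88><loc_88><loc_91></location>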
@@ -0,0 +1,48 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Utils for document types."""
+
+from pathlib import Path
+
+
+def relative_path(src: Path, target: Path) -> Path:
+    """Compute the relative path from `src` to `target`.
+
+    Args:
+        src (str | Path): The source directory or file path (must be absolute).
+        target (str | Path): The target directory or file path (must be absolute).
+
+    Returns:
+        Path: The relative path from `src` to `target`.
+
+    Raises:
+        ValueError: If either `src` or `target` is not an absolute path.
+    """
+    src = Path(src).resolve()
+    target = Path(target).resolve()
+
+    # Ensure both paths are absolute
+    if not src.is_absolute():
+        raise ValueError(f"The source path must be absolute: {src}")
+    if not target.is_absolute():
+        raise ValueError(f"The target path must be absolute: {target}")
+
+    # Find the common ancestor
+    common_parts = []
+    for src_part, target_part in zip(src.parts, target.parts):
+        if src_part == target_part:
+            common_parts.append(src_part)
+        else:
+            break
+
+    # Determine the path to go up from src to the common ancestor
+    up_segments = [".."] * (len(src.parts) - len(common_parts))
+
+    # Add the path from the common ancestor to the target
+    down_segments = target.parts[len(common_parts) :]
+
+    # Combine and return the result
+    return Path(*up_segments, *down_segments)
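A quick illustration of the relocated helper, using made-up absolute POSIX paths (on Windows, Path.resolve() would also prepend a drive letter):

from pathlib import Path

from docling_core.types.doc.utils import relative_path

# From an image directory back to a sibling assets folder.
rel = relative_path(
    Path("/data/docs/report/images"),
    Path("/data/docs/assets/logo.png"),
)
print(rel)  # ../../assets/logo.png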
@@ -15,6 +15,7 @@ import requests
 from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
 from typing_extensions import deprecated
 
+from docling_core.types.doc.utils import relative_path  # noqa
 from docling_core.types.io import DocumentStream
 
 
@@ -168,43 +169,3 @@ def resolve_file_source(
         source=source,
         headers=headers,
     )
-
-
-def relative_path(src: Path, target: Path) -> Path:
-    """Compute the relative path from `src` to `target`.
-
-    Args:
-        src (str | Path): The source directory or file path (must be absolute).
-        target (str | Path): The target directory or file path (must be absolute).
-
-    Returns:
-        Path: The relative path from `src` to `target`.
-
-    Raises:
-        ValueError: If either `src` or `target` is not an absolute path.
-    """
-    src = Path(src).resolve()
-    target = Path(target).resolve()
-
-    # Ensure both paths are absolute
-    if not src.is_absolute():
-        raise ValueError(f"The source path must be absolute: {src}")
-    if not target.is_absolute():
-        raise ValueError(f"The target path must be absolute: {target}")
-
-    # Find the common ancestor
-    common_parts = []
-    for src_part, target_part in zip(src.parts, target.parts):
-        if src_part == target_part:
-            common_parts.append(src_part)
-        else:
-            break
-
-    # Determine the path to go up from src to the common ancestor
-    up_segments = [".."] * (len(src.parts) - len(common_parts))
-
-    # Add the path from the common ancestor to the target
-    down_segments = target.parts[len(common_parts) :]
-
-    # Combine and return the result
-    return Path(*up_segments, *down_segments)
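Although the function body was removed from docling_core/utils/file.py, the `# noqa` re-export added in the hunk above suggests the old import path keeps working. A small sanity check; this is an assumption based on the diff, not verified against the released wheel:

# Both paths are expected to resolve to the same function object in 2.7.0,
# because utils/file.py now re-exports it (assumption from the diff above).
from docling_core.types.doc.utils import relative_path as new_relative_path
from docling_core.utils.file import relative_path as old_relative_path

assert new_relative_path is old_relative_path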
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.6.0
+Version: 2.7.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: jsonref (>=1.1.0,<2.0.0)
 Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
 Requires-Dist: pillow (>=10.3.0,<11.0.0)
-Requires-Dist: pydantic (>=2.6.0,<2.10)
+Requires-Dist: pydantic (>=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2)
 Requires-Dist: pyyaml (>=5.1,<7.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
@@ -21,8 +21,10 @@ docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HX
 docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
 docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
 docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
-docling_core/types/doc/document.py,sha256=K6ixUeB0vyrnd3_ljM0Ed_8JBdltLPCsrGz7IoLgjUI,87094
+docling_core/types/doc/document.py,sha256=LXmDD0qZiB34WTWSTklcdWndetOqumMFN3yJEqifb8M,90500
 docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
+docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
+docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
 docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
 docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
 docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
@@ -45,13 +47,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
 docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
 docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
 docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
-docling_core/utils/file.py,sha256=B1Iu8buqk_Yz4bhrGf7NyFIiYlsa_MC37vZLwQHqKLU,6876
+docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
 docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
 docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
 docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
 docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
-docling_core-2.6.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
-docling_core-2.6.0.dist-info/METADATA,sha256=LhnsqU5AgndZllazTDXe_acmPWQ6NuMuH_b6-d4K1gM,5519
-docling_core-2.6.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-docling_core-2.6.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
-docling_core-2.6.0.dist-info/RECORD,,
+docling_core-2.7.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
+docling_core-2.7.0.dist-info/METADATA,sha256=ht4UM23KfXIPp2aeUjSr9AUruTANa-kSt9kDwHQyeNk,5547
+docling_core-2.7.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+docling_core-2.7.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
+docling_core-2.7.0.dist-info/RECORD,,