docling-core 1.7.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +12 -8
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1288 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/METADATA +11 -11
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
- docling_core-2.0.0.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.1.dist-info/entry_points.txt +0 -5
- docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
- {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
docling_core/types/doc/base.py
CHANGED
|
@@ -1,485 +1,170 @@
|
|
|
1
|
-
|
|
2
|
-
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
-
# SPDX-License-Identifier: MIT
|
|
4
|
-
#
|
|
1
|
+
"""Models for the base data types."""
|
|
5
2
|
|
|
6
|
-
|
|
7
|
-
from
|
|
3
|
+
import copy
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Tuple
|
|
8
6
|
|
|
9
|
-
|
|
10
|
-
from pydantic import BaseModel, Field, PositiveInt, StrictStr
|
|
7
|
+
from pydantic import BaseModel
|
|
11
8
|
|
|
12
|
-
from docling_core.search.mapping import es_field
|
|
13
|
-
from docling_core.types.doc.tokens import DocumentToken
|
|
14
|
-
from docling_core.utils.alias import AliasModel
|
|
15
9
|
|
|
16
|
-
|
|
10
|
+
class CoordOrigin(str, Enum):
|
|
11
|
+
"""CoordOrigin."""
|
|
17
12
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
Literal["y0"],
|
|
21
|
-
Literal["x1"],
|
|
22
|
-
Literal["y1"],
|
|
23
|
-
Literal["font"],
|
|
24
|
-
Literal["text"],
|
|
25
|
-
]
|
|
13
|
+
TOPLEFT = "TOPLEFT"
|
|
14
|
+
BOTTOMLEFT = "BOTTOMLEFT"
|
|
26
15
|
|
|
27
|
-
BoundingBox = Annotated[list[float], Field(min_length=4, max_length=4)]
|
|
28
16
|
|
|
29
|
-
|
|
17
|
+
class Size(BaseModel):
|
|
18
|
+
"""Size."""
|
|
30
19
|
|
|
20
|
+
width: float = 0.0
|
|
21
|
+
height: float = 0.0
|
|
31
22
|
|
|
32
|
-
|
|
33
|
-
|
|
23
|
+
def as_tuple(self):
|
|
24
|
+
"""as_tuple."""
|
|
25
|
+
return (self.width, self.height)
|
|
34
26
|
|
|
35
|
-
data: Optional[list[CellData]] = None
|
|
36
|
-
header: CellHeader = ("x0", "y0", "x1", "y1", "font", "text")
|
|
37
27
|
|
|
28
|
+
class BoundingBox(BaseModel):
|
|
29
|
+
"""BoundingBox."""
|
|
38
30
|
|
|
39
|
-
|
|
40
|
-
|
|
31
|
+
l: float # left
|
|
32
|
+
t: float # top
|
|
33
|
+
r: float # right
|
|
34
|
+
b: float # bottom
|
|
41
35
|
|
|
42
|
-
|
|
43
|
-
path: str
|
|
44
|
-
page: Optional[PositiveInt] = None
|
|
36
|
+
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
|
45
37
|
|
|
38
|
+
@property
|
|
39
|
+
def width(self):
|
|
40
|
+
"""width."""
|
|
41
|
+
return self.r - self.l
|
|
46
42
|
|
|
47
|
-
|
|
48
|
-
|
|
43
|
+
@property
|
|
44
|
+
def height(self):
|
|
45
|
+
"""height."""
|
|
46
|
+
return abs(self.t - self.b)
|
|
49
47
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
pdf_images: Optional[list[S3Resource]] = Field(default=None, alias="pdf-images")
|
|
53
|
-
json_document: Optional[S3Resource] = Field(default=None, alias="json-document")
|
|
54
|
-
json_meta: Optional[S3Resource] = Field(default=None, alias="json-meta")
|
|
55
|
-
glm_json_document: Optional[S3Resource] = Field(
|
|
56
|
-
default=None, alias="glm-json-document"
|
|
57
|
-
)
|
|
58
|
-
figures: Optional[list[S3Resource]] = None
|
|
48
|
+
def scaled(self, scale: float) -> "BoundingBox":
|
|
49
|
+
"""scaled.
|
|
59
50
|
|
|
51
|
+
:param scale: float:
|
|
60
52
|
|
|
61
|
-
|
|
62
|
-
|
|
53
|
+
"""
|
|
54
|
+
out_bbox = copy.deepcopy(self)
|
|
55
|
+
out_bbox.l *= scale
|
|
56
|
+
out_bbox.r *= scale
|
|
57
|
+
out_bbox.t *= scale
|
|
58
|
+
out_bbox.b *= scale
|
|
63
59
|
|
|
64
|
-
|
|
65
|
-
alias="__ref_s3_data", examples=["#/_s3_data/figures/0"]
|
|
66
|
-
)
|
|
60
|
+
return out_bbox
|
|
67
61
|
|
|
62
|
+
def normalized(self, page_size: Size) -> "BoundingBox":
|
|
63
|
+
"""normalized.
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
"""Provenance."""
|
|
65
|
+
:param page_size: Size:
|
|
71
66
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
67
|
+
"""
|
|
68
|
+
out_bbox = copy.deepcopy(self)
|
|
69
|
+
out_bbox.l /= page_size.width
|
|
70
|
+
out_bbox.r /= page_size.width
|
|
71
|
+
out_bbox.t /= page_size.height
|
|
72
|
+
out_bbox.b /= page_size.height
|
|
78
73
|
|
|
74
|
+
return out_bbox
|
|
79
75
|
|
|
80
|
-
|
|
81
|
-
|
|
76
|
+
def as_tuple(self):
|
|
77
|
+
"""as_tuple."""
|
|
78
|
+
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
79
|
+
return (self.l, self.t, self.r, self.b)
|
|
80
|
+
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
81
|
+
return (self.l, self.b, self.r, self.t)
|
|
82
82
|
|
|
83
|
-
|
|
84
|
-
|
|
83
|
+
@classmethod
|
|
84
|
+
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
|
85
|
+
"""from_tuple.
|
|
85
86
|
|
|
87
|
+
:param coord: Tuple[float:
|
|
88
|
+
:param ...]:
|
|
89
|
+
:param origin: CoordOrigin:
|
|
86
90
|
|
|
87
|
-
|
|
88
|
-
|
|
91
|
+
"""
|
|
92
|
+
if origin == CoordOrigin.TOPLEFT:
|
|
93
|
+
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
|
94
|
+
if r < l:
|
|
95
|
+
l, r = r, l
|
|
96
|
+
if b < t:
|
|
97
|
+
b, t = t, b
|
|
89
98
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
99
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
|
100
|
+
elif origin == CoordOrigin.BOTTOMLEFT:
|
|
101
|
+
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
|
102
|
+
if r < l:
|
|
103
|
+
l, r = r, l
|
|
104
|
+
if b > t:
|
|
105
|
+
b, t = t, b
|
|
95
106
|
|
|
107
|
+
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
|
96
108
|
|
|
97
|
-
|
|
98
|
-
|
|
109
|
+
def area(self) -> float:
|
|
110
|
+
"""area."""
|
|
111
|
+
area = (self.r - self.l) * (self.b - self.t)
|
|
112
|
+
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
113
|
+
area = -area
|
|
114
|
+
return area
|
|
99
115
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
text: Optional[str] = Field(
|
|
138
|
-
default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
|
|
139
|
-
)
|
|
140
|
-
obj_type: str = Field(
|
|
141
|
-
alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
142
|
-
)
|
|
143
|
-
|
|
144
|
-
def get_location_tokens(
|
|
145
|
-
self,
|
|
146
|
-
new_line: str,
|
|
147
|
-
page_w: float,
|
|
148
|
-
page_h: float,
|
|
149
|
-
xsize: int = 100,
|
|
150
|
-
ysize: int = 100,
|
|
151
|
-
add_page_index: bool = True,
|
|
152
|
-
) -> str:
|
|
153
|
-
"""Get the location string for the BaseCell."""
|
|
154
|
-
if self.prov is None:
|
|
155
|
-
return ""
|
|
156
|
-
|
|
157
|
-
location = ""
|
|
158
|
-
for prov in self.prov:
|
|
159
|
-
|
|
160
|
-
page_i = -1
|
|
161
|
-
if add_page_index:
|
|
162
|
-
page_i = prov.page
|
|
163
|
-
|
|
164
|
-
loc_str = DocumentToken.get_location(
|
|
165
|
-
bbox=prov.bbox,
|
|
166
|
-
page_w=page_w,
|
|
167
|
-
page_h=page_h,
|
|
168
|
-
xsize=xsize,
|
|
169
|
-
ysize=ysize,
|
|
170
|
-
page_i=page_i,
|
|
171
|
-
)
|
|
172
|
-
location += f"{loc_str}{new_line}"
|
|
173
|
-
|
|
174
|
-
return location
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
class Table(BaseCell):
|
|
178
|
-
"""Table."""
|
|
179
|
-
|
|
180
|
-
num_cols: int = Field(alias="#-cols")
|
|
181
|
-
num_rows: int = Field(alias="#-rows")
|
|
182
|
-
data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
|
|
183
|
-
model: Optional[str] = None
|
|
184
|
-
|
|
185
|
-
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
186
|
-
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
187
|
-
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
188
|
-
)
|
|
189
|
-
|
|
190
|
-
def _get_tablecell_span(self, cell: TableCell, ix: int):
|
|
191
|
-
if cell.spans is None:
|
|
192
|
-
span = set()
|
|
193
|
-
else:
|
|
194
|
-
span = set([s[ix] for s in cell.spans])
|
|
195
|
-
if len(span) == 0:
|
|
196
|
-
return 1, None, None
|
|
197
|
-
return len(span), min(span), max(span)
|
|
198
|
-
|
|
199
|
-
def export_to_dataframe(self) -> pd.DataFrame:
|
|
200
|
-
"""Export the table as a Pandas DataFrame."""
|
|
201
|
-
if self.data is None or self.num_rows == 0 or self.num_cols == 0:
|
|
202
|
-
return pd.DataFrame()
|
|
203
|
-
|
|
204
|
-
# Count how many rows are column headers
|
|
205
|
-
num_headers = 0
|
|
206
|
-
for i, row in enumerate(self.data):
|
|
207
|
-
if len(row) == 0:
|
|
208
|
-
raise RuntimeError(f"Invalid table. {len(row)=} but {self.num_cols=}.")
|
|
209
|
-
|
|
210
|
-
any_header = False
|
|
211
|
-
for cell in row:
|
|
212
|
-
if cell.obj_type == "col_header":
|
|
213
|
-
any_header = True
|
|
214
|
-
break
|
|
215
|
-
|
|
216
|
-
if any_header:
|
|
217
|
-
num_headers += 1
|
|
218
|
-
else:
|
|
219
|
-
break
|
|
220
|
-
|
|
221
|
-
# Create the column names from all col_headers
|
|
222
|
-
columns: Optional[List[str]] = None
|
|
223
|
-
if num_headers > 0:
|
|
224
|
-
columns = ["" for _ in range(self.num_cols)]
|
|
225
|
-
for i in range(num_headers):
|
|
226
|
-
for j, cell in enumerate(self.data[i]):
|
|
227
|
-
col_name = cell.text
|
|
228
|
-
if columns[j] != "":
|
|
229
|
-
col_name = f".{col_name}"
|
|
230
|
-
columns[j] += col_name
|
|
231
|
-
|
|
232
|
-
# Create table data
|
|
233
|
-
table_data = [[cell.text for cell in row] for row in self.data[num_headers:]]
|
|
234
|
-
|
|
235
|
-
# Create DataFrame
|
|
236
|
-
df = pd.DataFrame(table_data, columns=columns)
|
|
237
|
-
|
|
238
|
-
return df
|
|
239
|
-
|
|
240
|
-
def export_to_html(self) -> str:
|
|
241
|
-
"""Export the table as html."""
|
|
242
|
-
body = ""
|
|
243
|
-
nrows = self.num_rows
|
|
244
|
-
ncols = self.num_cols
|
|
245
|
-
|
|
246
|
-
if self.data is None:
|
|
247
|
-
return ""
|
|
248
|
-
for i in range(nrows):
|
|
249
|
-
body += "<tr>"
|
|
250
|
-
for j in range(ncols):
|
|
251
|
-
cell: TableCell = self.data[i][j]
|
|
252
|
-
|
|
253
|
-
rowspan, rowstart, rowend = self._get_tablecell_span(cell, 0)
|
|
254
|
-
colspan, colstart, colend = self._get_tablecell_span(cell, 1)
|
|
255
|
-
|
|
256
|
-
if rowstart is not None and rowstart != i:
|
|
257
|
-
continue
|
|
258
|
-
if colstart is not None and colstart != j:
|
|
259
|
-
continue
|
|
260
|
-
|
|
261
|
-
if rowstart is None:
|
|
262
|
-
rowstart = i
|
|
263
|
-
if colstart is None:
|
|
264
|
-
colstart = j
|
|
265
|
-
|
|
266
|
-
content = cell.text.strip()
|
|
267
|
-
label = cell.obj_type
|
|
268
|
-
celltag = "td"
|
|
269
|
-
if label in ["row_header", "row_multi_header", "row_title"]:
|
|
270
|
-
pass
|
|
271
|
-
elif label in ["col_header", "col_multi_header"]:
|
|
272
|
-
celltag = "th"
|
|
273
|
-
|
|
274
|
-
opening_tag = f"{celltag}"
|
|
275
|
-
if rowspan > 1:
|
|
276
|
-
opening_tag += f' rowspan="{rowspan}"'
|
|
277
|
-
if colspan > 1:
|
|
278
|
-
opening_tag += f' colspan="{colspan}"'
|
|
279
|
-
|
|
280
|
-
body += f"<{opening_tag}>{content}</{celltag}>"
|
|
281
|
-
body += "</tr>"
|
|
282
|
-
body = f"<table>{body}</table>"
|
|
283
|
-
|
|
284
|
-
return body
|
|
285
|
-
|
|
286
|
-
def export_to_document_tokens(
|
|
287
|
-
self,
|
|
288
|
-
new_line: str = "\n",
|
|
289
|
-
page_w: float = 0.0,
|
|
290
|
-
page_h: float = 0.0,
|
|
291
|
-
xsize: int = 100,
|
|
292
|
-
ysize: int = 100,
|
|
293
|
-
add_location: bool = True,
|
|
294
|
-
add_caption: bool = True,
|
|
295
|
-
add_content: bool = True,
|
|
296
|
-
add_cell_location: bool = True,
|
|
297
|
-
add_cell_label: bool = True,
|
|
298
|
-
add_cell_text: bool = True,
|
|
299
|
-
add_page_index: bool = True,
|
|
300
|
-
):
|
|
301
|
-
"""Export table to document tokens format."""
|
|
302
|
-
body = f"{DocumentToken.BEG_TABLE.value}{new_line}"
|
|
303
|
-
|
|
304
|
-
if add_location:
|
|
305
|
-
body += self.get_location_tokens(
|
|
306
|
-
new_line=new_line,
|
|
307
|
-
page_w=page_w,
|
|
308
|
-
page_h=page_h,
|
|
309
|
-
xsize=xsize,
|
|
310
|
-
ysize=ysize,
|
|
311
|
-
add_page_index=add_page_index,
|
|
312
|
-
)
|
|
313
|
-
|
|
314
|
-
if add_caption and self.text is not None and len(self.text) > 0:
|
|
315
|
-
body += f"{DocumentToken.BEG_CAPTION.value}"
|
|
316
|
-
body += f"{self.text.strip()}"
|
|
317
|
-
body += f"{DocumentToken.END_CAPTION.value}"
|
|
318
|
-
body += f"{new_line}"
|
|
319
|
-
|
|
320
|
-
if add_content and self.data is not None and len(self.data) > 0:
|
|
321
|
-
for i, row in enumerate(self.data):
|
|
322
|
-
body += f"<row_{i}>"
|
|
323
|
-
for j, col in enumerate(row):
|
|
324
|
-
|
|
325
|
-
text = ""
|
|
326
|
-
if add_cell_text:
|
|
327
|
-
text = col.text.strip()
|
|
328
|
-
|
|
329
|
-
cell_loc = ""
|
|
330
|
-
if (
|
|
331
|
-
col.bbox is not None
|
|
332
|
-
and add_cell_location
|
|
333
|
-
and add_page_index
|
|
334
|
-
and self.prov is not None
|
|
335
|
-
and len(self.prov) > 0
|
|
336
|
-
):
|
|
337
|
-
cell_loc = DocumentToken.get_location(
|
|
338
|
-
bbox=col.bbox,
|
|
339
|
-
page_w=page_w,
|
|
340
|
-
page_h=page_h,
|
|
341
|
-
xsize=xsize,
|
|
342
|
-
ysize=ysize,
|
|
343
|
-
page_i=self.prov[0].page,
|
|
344
|
-
)
|
|
345
|
-
elif (
|
|
346
|
-
col.bbox is not None
|
|
347
|
-
and add_cell_location
|
|
348
|
-
and not add_page_index
|
|
349
|
-
):
|
|
350
|
-
cell_loc = DocumentToken.get_location(
|
|
351
|
-
bbox=col.bbox,
|
|
352
|
-
page_w=page_w,
|
|
353
|
-
page_h=page_h,
|
|
354
|
-
xsize=xsize,
|
|
355
|
-
ysize=ysize,
|
|
356
|
-
page_i=-1,
|
|
357
|
-
)
|
|
358
|
-
|
|
359
|
-
cell_label = ""
|
|
360
|
-
if (
|
|
361
|
-
add_cell_label
|
|
362
|
-
and col.obj_type is not None
|
|
363
|
-
and len(col.obj_type) > 0
|
|
364
|
-
):
|
|
365
|
-
cell_label = f"<{col.obj_type}>"
|
|
366
|
-
|
|
367
|
-
body += f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>"
|
|
368
|
-
|
|
369
|
-
body += f"</row_{i}>{new_line}"
|
|
370
|
-
|
|
371
|
-
body += f"{DocumentToken.END_TABLE.value}{new_line}"
|
|
372
|
-
|
|
373
|
-
return body
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
# FIXME: let's add some figure specific data-types later
|
|
377
|
-
class Figure(BaseCell):
|
|
378
|
-
"""Figure."""
|
|
379
|
-
|
|
380
|
-
# FIXME: we need to check why we have bounding_box (this should be in prov)
|
|
381
|
-
bounding_box: Optional[BoundingBoxContainer] = Field(
|
|
382
|
-
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
|
|
383
|
-
)
|
|
384
|
-
|
|
385
|
-
def export_to_document_tokens(
|
|
386
|
-
self,
|
|
387
|
-
new_line: str = "\n",
|
|
388
|
-
page_w: float = 0.0,
|
|
389
|
-
page_h: float = 0.0,
|
|
390
|
-
xsize: int = 100,
|
|
391
|
-
ysize: int = 100,
|
|
392
|
-
add_location: bool = True,
|
|
393
|
-
add_caption: bool = True,
|
|
394
|
-
add_content: bool = True, # not used at the moment
|
|
395
|
-
add_page_index: bool = True,
|
|
396
|
-
):
|
|
397
|
-
"""Export figure to document tokens format."""
|
|
398
|
-
body = f"{DocumentToken.BEG_FIGURE.value}{new_line}"
|
|
399
|
-
|
|
400
|
-
if add_location:
|
|
401
|
-
body += self.get_location_tokens(
|
|
402
|
-
new_line=new_line,
|
|
403
|
-
page_w=page_w,
|
|
404
|
-
page_h=page_h,
|
|
405
|
-
xsize=xsize,
|
|
406
|
-
ysize=ysize,
|
|
407
|
-
add_page_index=add_page_index,
|
|
116
|
+
def intersection_area_with(self, other: "BoundingBox") -> float:
|
|
117
|
+
"""intersection_area_with.
|
|
118
|
+
|
|
119
|
+
:param other: "BoundingBox":
|
|
120
|
+
|
|
121
|
+
"""
|
|
122
|
+
# Calculate intersection coordinates
|
|
123
|
+
left = max(self.l, other.l)
|
|
124
|
+
top = max(self.t, other.t)
|
|
125
|
+
right = min(self.r, other.r)
|
|
126
|
+
bottom = min(self.b, other.b)
|
|
127
|
+
|
|
128
|
+
# Calculate intersection dimensions
|
|
129
|
+
width = right - left
|
|
130
|
+
height = bottom - top
|
|
131
|
+
|
|
132
|
+
# If the bounding boxes do not overlap, width or height will be negative
|
|
133
|
+
if width <= 0 or height <= 0:
|
|
134
|
+
return 0.0
|
|
135
|
+
|
|
136
|
+
return width * height
|
|
137
|
+
|
|
138
|
+
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
|
139
|
+
"""to_bottom_left_origin.
|
|
140
|
+
|
|
141
|
+
:param page_height:
|
|
142
|
+
|
|
143
|
+
"""
|
|
144
|
+
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
145
|
+
return self
|
|
146
|
+
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
|
147
|
+
return BoundingBox(
|
|
148
|
+
l=self.l,
|
|
149
|
+
r=self.r,
|
|
150
|
+
t=page_height - self.t,
|
|
151
|
+
b=page_height - self.b,
|
|
152
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
|
408
153
|
)
|
|
409
154
|
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
name: Optional[StrictStr] = Field(
|
|
426
|
-
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
|
|
427
|
-
)
|
|
428
|
-
font: Optional[str] = None
|
|
429
|
-
|
|
430
|
-
def export_to_document_tokens(
|
|
431
|
-
self,
|
|
432
|
-
new_line: str = "\n",
|
|
433
|
-
page_w: float = 0.0,
|
|
434
|
-
page_h: float = 0.0,
|
|
435
|
-
xsize: int = 100,
|
|
436
|
-
ysize: int = 100,
|
|
437
|
-
add_location: bool = True,
|
|
438
|
-
add_content: bool = True,
|
|
439
|
-
add_page_index: bool = True,
|
|
440
|
-
):
|
|
441
|
-
"""Export text element to document tokens format."""
|
|
442
|
-
body = f"<{self.obj_type}>"
|
|
443
|
-
|
|
444
|
-
assert DocumentToken.is_known_token(
|
|
445
|
-
body
|
|
446
|
-
), f"failed DocumentToken.is_known_token({body})"
|
|
447
|
-
|
|
448
|
-
if add_location:
|
|
449
|
-
body += self.get_location_tokens(
|
|
450
|
-
new_line="",
|
|
451
|
-
page_w=page_w,
|
|
452
|
-
page_h=page_h,
|
|
453
|
-
xsize=xsize,
|
|
454
|
-
ysize=ysize,
|
|
455
|
-
add_page_index=add_page_index,
|
|
155
|
+
def to_top_left_origin(self, page_height):
|
|
156
|
+
"""to_top_left_origin.
|
|
157
|
+
|
|
158
|
+
:param page_height:
|
|
159
|
+
|
|
160
|
+
"""
|
|
161
|
+
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
162
|
+
return self
|
|
163
|
+
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
164
|
+
return BoundingBox(
|
|
165
|
+
l=self.l,
|
|
166
|
+
r=self.r,
|
|
167
|
+
t=page_height - self.t, # self.b
|
|
168
|
+
b=page_height - self.b, # self.t
|
|
169
|
+
coord_origin=CoordOrigin.TOPLEFT,
|
|
456
170
|
)
|
|
457
|
-
|
|
458
|
-
if add_content and self.text is not None:
|
|
459
|
-
body += self.text.strip()
|
|
460
|
-
|
|
461
|
-
body += f"</{self.obj_type}>{new_line}"
|
|
462
|
-
|
|
463
|
-
return body
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
class ListItem(BaseText):
|
|
467
|
-
"""List item."""
|
|
468
|
-
|
|
469
|
-
identifier: str
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
class Ref(AliasModel):
|
|
473
|
-
"""Reference."""
|
|
474
|
-
|
|
475
|
-
name: str
|
|
476
|
-
obj_type: str = Field(alias="type")
|
|
477
|
-
ref: str = Field(alias="$ref")
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
class PageReference(BaseModel):
|
|
481
|
-
"""Page reference."""
|
|
482
|
-
|
|
483
|
-
hash: str = Field(json_schema_extra=es_field(type="keyword", ignore_above=8191))
|
|
484
|
-
model: str = Field(json_schema_extra=es_field(suppress=True))
|
|
485
|
-
page: PositiveInt = Field(json_schema_extra=es_field(type="short"))
|