docling-core 1.7.2__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +2 -8
- docling_core/transforms/chunker/base.py +27 -40
- docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
- docling_core/types/__init__.py +3 -18
- docling_core/types/doc/__init__.py +25 -0
- docling_core/types/doc/base.py +136 -451
- docling_core/types/doc/document.py +1289 -559
- docling_core/types/{experimental → doc}/labels.py +4 -1
- docling_core/types/legacy_doc/__init__.py +6 -0
- docling_core/types/legacy_doc/base.py +485 -0
- docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
- docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
- docling_core/types/legacy_doc/document.py +715 -0
- docling_core/types/rec/subject.py +1 -1
- docling_core/utils/generate_docs.py +82 -0
- docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
- docling_core/utils/validators.py +3 -3
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/METADATA +17 -17
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/RECORD +24 -31
- docling_core-2.0.1.dist-info/entry_points.txt +5 -0
- docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core/transforms/id_generator/base.py +0 -30
- docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core/types/experimental/__init__.py +0 -30
- docling_core/types/experimental/base.py +0 -167
- docling_core/types/experimental/document.py +0 -1192
- docling_core/utils/ds_generate_docs.py +0 -144
- docling_core-1.7.2.dist-info/entry_points.txt +0 -5
- /docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/LICENSE +0 -0
- {docling_core-1.7.2.dist-info → docling_core-2.0.1.dist-info}/WHEEL +0 -0
|
@@ -1,167 +0,0 @@
|
|
|
1
|
-
"""Models for the base data types."""
|
|
2
|
-
|
|
3
|
-
import copy
|
|
4
|
-
from enum import Enum
|
|
5
|
-
from typing import Tuple
|
|
6
|
-
|
|
7
|
-
from pydantic import BaseModel
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class CoordOrigin(str, Enum):
|
|
11
|
-
"""CoordOrigin."""
|
|
12
|
-
|
|
13
|
-
TOPLEFT = "TOPLEFT"
|
|
14
|
-
BOTTOMLEFT = "BOTTOMLEFT"
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class Size(BaseModel):
|
|
18
|
-
"""Size."""
|
|
19
|
-
|
|
20
|
-
width: float = 0.0
|
|
21
|
-
height: float = 0.0
|
|
22
|
-
|
|
23
|
-
def as_tuple(self):
|
|
24
|
-
"""as_tuple."""
|
|
25
|
-
return (self.width, self.height)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class BoundingBox(BaseModel):
|
|
29
|
-
"""BoundingBox."""
|
|
30
|
-
|
|
31
|
-
l: float # left
|
|
32
|
-
t: float # top
|
|
33
|
-
r: float # right
|
|
34
|
-
b: float # bottom
|
|
35
|
-
|
|
36
|
-
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
|
37
|
-
|
|
38
|
-
@property
|
|
39
|
-
def width(self):
|
|
40
|
-
"""width."""
|
|
41
|
-
return self.r - self.l
|
|
42
|
-
|
|
43
|
-
@property
|
|
44
|
-
def height(self):
|
|
45
|
-
"""height."""
|
|
46
|
-
return abs(self.t - self.b)
|
|
47
|
-
|
|
48
|
-
def scaled(self, scale: float) -> "BoundingBox":
|
|
49
|
-
"""scaled.
|
|
50
|
-
|
|
51
|
-
:param scale: float:
|
|
52
|
-
|
|
53
|
-
"""
|
|
54
|
-
out_bbox = copy.deepcopy(self)
|
|
55
|
-
out_bbox.l *= scale
|
|
56
|
-
out_bbox.r *= scale
|
|
57
|
-
out_bbox.t *= scale
|
|
58
|
-
out_bbox.b *= scale
|
|
59
|
-
|
|
60
|
-
return out_bbox
|
|
61
|
-
|
|
62
|
-
def normalized(self, page_size: Size) -> "BoundingBox":
|
|
63
|
-
"""normalized.
|
|
64
|
-
|
|
65
|
-
:param page_size: Size:
|
|
66
|
-
|
|
67
|
-
"""
|
|
68
|
-
out_bbox = copy.deepcopy(self)
|
|
69
|
-
out_bbox.l /= page_size.width
|
|
70
|
-
out_bbox.r /= page_size.width
|
|
71
|
-
out_bbox.t /= page_size.height
|
|
72
|
-
out_bbox.b /= page_size.height
|
|
73
|
-
|
|
74
|
-
return out_bbox
|
|
75
|
-
|
|
76
|
-
def as_tuple(self):
|
|
77
|
-
"""as_tuple."""
|
|
78
|
-
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
79
|
-
return (self.l, self.t, self.r, self.b)
|
|
80
|
-
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
81
|
-
return (self.l, self.b, self.r, self.t)
|
|
82
|
-
|
|
83
|
-
@classmethod
|
|
84
|
-
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
|
85
|
-
"""from_tuple.
|
|
86
|
-
|
|
87
|
-
:param coord: Tuple[float:
|
|
88
|
-
:param ...]:
|
|
89
|
-
:param origin: CoordOrigin:
|
|
90
|
-
|
|
91
|
-
"""
|
|
92
|
-
if origin == CoordOrigin.TOPLEFT:
|
|
93
|
-
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
|
94
|
-
if r < l:
|
|
95
|
-
l, r = r, l
|
|
96
|
-
if b < t:
|
|
97
|
-
b, t = t, b
|
|
98
|
-
|
|
99
|
-
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
|
100
|
-
elif origin == CoordOrigin.BOTTOMLEFT:
|
|
101
|
-
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
|
102
|
-
if r < l:
|
|
103
|
-
l, r = r, l
|
|
104
|
-
if b > t:
|
|
105
|
-
b, t = t, b
|
|
106
|
-
|
|
107
|
-
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
|
108
|
-
|
|
109
|
-
def area(self) -> float:
|
|
110
|
-
"""area."""
|
|
111
|
-
return (self.r - self.l) * (self.b - self.t)
|
|
112
|
-
|
|
113
|
-
def intersection_area_with(self, other: "BoundingBox") -> float:
|
|
114
|
-
"""intersection_area_with.
|
|
115
|
-
|
|
116
|
-
:param other: "BoundingBox":
|
|
117
|
-
|
|
118
|
-
"""
|
|
119
|
-
# Calculate intersection coordinates
|
|
120
|
-
left = max(self.l, other.l)
|
|
121
|
-
top = max(self.t, other.t)
|
|
122
|
-
right = min(self.r, other.r)
|
|
123
|
-
bottom = min(self.b, other.b)
|
|
124
|
-
|
|
125
|
-
# Calculate intersection dimensions
|
|
126
|
-
width = right - left
|
|
127
|
-
height = bottom - top
|
|
128
|
-
|
|
129
|
-
# If the bounding boxes do not overlap, width or height will be negative
|
|
130
|
-
if width <= 0 or height <= 0:
|
|
131
|
-
return 0.0
|
|
132
|
-
|
|
133
|
-
return width * height
|
|
134
|
-
|
|
135
|
-
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
|
136
|
-
"""to_bottom_left_origin.
|
|
137
|
-
|
|
138
|
-
:param page_height:
|
|
139
|
-
|
|
140
|
-
"""
|
|
141
|
-
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
142
|
-
return self
|
|
143
|
-
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
|
144
|
-
return BoundingBox(
|
|
145
|
-
l=self.l,
|
|
146
|
-
r=self.r,
|
|
147
|
-
t=page_height - self.t,
|
|
148
|
-
b=page_height - self.b,
|
|
149
|
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
|
150
|
-
)
|
|
151
|
-
|
|
152
|
-
def to_top_left_origin(self, page_height):
|
|
153
|
-
"""to_top_left_origin.
|
|
154
|
-
|
|
155
|
-
:param page_height:
|
|
156
|
-
|
|
157
|
-
"""
|
|
158
|
-
if self.coord_origin == CoordOrigin.TOPLEFT:
|
|
159
|
-
return self
|
|
160
|
-
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
|
161
|
-
return BoundingBox(
|
|
162
|
-
l=self.l,
|
|
163
|
-
r=self.r,
|
|
164
|
-
t=page_height - self.t, # self.b
|
|
165
|
-
b=page_height - self.b, # self.t
|
|
166
|
-
coord_origin=CoordOrigin.TOPLEFT,
|
|
167
|
-
)
|