docling-core 1.7.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (36)
  1. docling_core/transforms/chunker/__init__.py +2 -8
  2. docling_core/transforms/chunker/base.py +27 -40
  3. docling_core/transforms/chunker/hierarchical_chunker.py +144 -312
  4. docling_core/types/__init__.py +12 -8
  5. docling_core/types/doc/__init__.py +25 -0
  6. docling_core/types/doc/base.py +136 -451
  7. docling_core/types/doc/document.py +1288 -559
  8. docling_core/types/{experimental → doc}/labels.py +4 -1
  9. docling_core/types/legacy_doc/__init__.py +6 -0
  10. docling_core/types/legacy_doc/base.py +485 -0
  11. docling_core/types/{doc → legacy_doc}/doc_ann.py +1 -1
  12. docling_core/types/{doc → legacy_doc}/doc_ocr.py +1 -1
  13. docling_core/types/{doc → legacy_doc}/doc_raw.py +1 -1
  14. docling_core/types/legacy_doc/document.py +715 -0
  15. docling_core/types/rec/subject.py +1 -1
  16. docling_core/utils/generate_docs.py +82 -0
  17. docling_core/utils/{ds_generate_jsonschema.py → generate_jsonschema.py} +4 -4
  18. docling_core/utils/validators.py +3 -3
  19. {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/METADATA +11 -11
  20. {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/RECORD +24 -31
  21. docling_core-2.0.0.dist-info/entry_points.txt +5 -0
  22. docling_core/transforms/id_generator/__init__.py +0 -12
  23. docling_core/transforms/id_generator/base.py +0 -30
  24. docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  25. docling_core/transforms/id_generator/uuid_generator.py +0 -34
  26. docling_core/transforms/metadata_extractor/__init__.py +0 -13
  27. docling_core/transforms/metadata_extractor/base.py +0 -59
  28. docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  29. docling_core/types/experimental/__init__.py +0 -30
  30. docling_core/types/experimental/base.py +0 -167
  31. docling_core/types/experimental/document.py +0 -1192
  32. docling_core/utils/ds_generate_docs.py +0 -144
  33. docling_core-1.7.1.dist-info/entry_points.txt +0 -5
  34. docling_core/types/{doc → legacy_doc}/tokens.py +0 -0
  35. {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/LICENSE +0 -0
  36. {docling_core-1.7.1.dist-info → docling_core-2.0.0.dist-info}/WHEEL +0 -0
@@ -1,167 +0,0 @@
1
- """Models for the base data types."""
2
-
3
- import copy
4
- from enum import Enum
5
- from typing import Tuple
6
-
7
- from pydantic import BaseModel
8
-
9
-
10
- class CoordOrigin(str, Enum):
11
- """CoordOrigin."""
12
-
13
- TOPLEFT = "TOPLEFT"
14
- BOTTOMLEFT = "BOTTOMLEFT"
15
-
16
-
17
- class Size(BaseModel):
18
- """Size."""
19
-
20
- width: float = 0.0
21
- height: float = 0.0
22
-
23
- def as_tuple(self):
24
- """as_tuple."""
25
- return (self.width, self.height)
26
-
27
-
28
- class BoundingBox(BaseModel):
29
- """BoundingBox."""
30
-
31
- l: float # left
32
- t: float # top
33
- r: float # right
34
- b: float # bottom
35
-
36
- coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
37
-
38
- @property
39
- def width(self):
40
- """width."""
41
- return self.r - self.l
42
-
43
- @property
44
- def height(self):
45
- """height."""
46
- return abs(self.t - self.b)
47
-
48
- def scaled(self, scale: float) -> "BoundingBox":
49
- """scaled.
50
-
51
- :param scale: float:
52
-
53
- """
54
- out_bbox = copy.deepcopy(self)
55
- out_bbox.l *= scale
56
- out_bbox.r *= scale
57
- out_bbox.t *= scale
58
- out_bbox.b *= scale
59
-
60
- return out_bbox
61
-
62
- def normalized(self, page_size: Size) -> "BoundingBox":
63
- """normalized.
64
-
65
- :param page_size: Size:
66
-
67
- """
68
- out_bbox = copy.deepcopy(self)
69
- out_bbox.l /= page_size.width
70
- out_bbox.r /= page_size.width
71
- out_bbox.t /= page_size.height
72
- out_bbox.b /= page_size.height
73
-
74
- return out_bbox
75
-
76
- def as_tuple(self):
77
- """as_tuple."""
78
- if self.coord_origin == CoordOrigin.TOPLEFT:
79
- return (self.l, self.t, self.r, self.b)
80
- elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
81
- return (self.l, self.b, self.r, self.t)
82
-
83
- @classmethod
84
- def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
85
- """from_tuple.
86
-
87
- :param coord: Tuple[float:
88
- :param ...]:
89
- :param origin: CoordOrigin:
90
-
91
- """
92
- if origin == CoordOrigin.TOPLEFT:
93
- l, t, r, b = coord[0], coord[1], coord[2], coord[3]
94
- if r < l:
95
- l, r = r, l
96
- if b < t:
97
- b, t = t, b
98
-
99
- return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
100
- elif origin == CoordOrigin.BOTTOMLEFT:
101
- l, b, r, t = coord[0], coord[1], coord[2], coord[3]
102
- if r < l:
103
- l, r = r, l
104
- if b > t:
105
- b, t = t, b
106
-
107
- return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
108
-
109
- def area(self) -> float:
110
- """area."""
111
- return (self.r - self.l) * (self.b - self.t)
112
-
113
- def intersection_area_with(self, other: "BoundingBox") -> float:
114
- """intersection_area_with.
115
-
116
- :param other: "BoundingBox":
117
-
118
- """
119
- # Calculate intersection coordinates
120
- left = max(self.l, other.l)
121
- top = max(self.t, other.t)
122
- right = min(self.r, other.r)
123
- bottom = min(self.b, other.b)
124
-
125
- # Calculate intersection dimensions
126
- width = right - left
127
- height = bottom - top
128
-
129
- # If the bounding boxes do not overlap, width or height will be negative
130
- if width <= 0 or height <= 0:
131
- return 0.0
132
-
133
- return width * height
134
-
135
- def to_bottom_left_origin(self, page_height) -> "BoundingBox":
136
- """to_bottom_left_origin.
137
-
138
- :param page_height:
139
-
140
- """
141
- if self.coord_origin == CoordOrigin.BOTTOMLEFT:
142
- return self
143
- elif self.coord_origin == CoordOrigin.TOPLEFT:
144
- return BoundingBox(
145
- l=self.l,
146
- r=self.r,
147
- t=page_height - self.t,
148
- b=page_height - self.b,
149
- coord_origin=CoordOrigin.BOTTOMLEFT,
150
- )
151
-
152
- def to_top_left_origin(self, page_height):
153
- """to_top_left_origin.
154
-
155
- :param page_height:
156
-
157
- """
158
- if self.coord_origin == CoordOrigin.TOPLEFT:
159
- return self
160
- elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
161
- return BoundingBox(
162
- l=self.l,
163
- r=self.r,
164
- t=page_height - self.t, # self.b
165
- b=page_height - self.b, # self.t
166
- coord_origin=CoordOrigin.TOPLEFT,
167
- )