hopeit.dataframes 0.26.0rc0__tar.gz → 0.26.0rc2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/PKG-INFO +4 -3
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/pyproject.toml +4 -3
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/__init__.py +5 -13
- hopeit_dataframes-0.26.0rc2/src/hopeit/dataframes/datablocks.py +294 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/dataframe.py +5 -4
- hopeit_dataframes-0.26.0rc2/src/hopeit/dataframes/serialization/dataset.py +118 -0
- hopeit_dataframes-0.26.0rc2/src/hopeit/dataframes/serialization/files.py +213 -0
- hopeit_dataframes-0.26.0rc2/src/hopeit/dataframes/serialization/protocol.py +28 -0
- hopeit_dataframes-0.26.0rc2/src/hopeit/dataframes/serialization/settings.py +34 -0
- hopeit_dataframes-0.26.0rc2/src/hopeit/dataframes/setup/dataframes.py +30 -0
- hopeit_dataframes-0.26.0rc2/src/hopeit/dataframes/setup/register_database.py +26 -0
- hopeit_dataframes-0.26.0rc2/src/hopeit/dataframes/setup/registry.py +70 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit.dataframes.egg-info/PKG-INFO +4 -3
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit.dataframes.egg-info/SOURCES.txt +4 -1
- hopeit_dataframes-0.26.0rc2/src/hopeit.dataframes.egg-info/requires.txt +5 -0
- hopeit_dataframes-0.26.0rc0/src/hopeit/dataframes/datablocks.py +0 -140
- hopeit_dataframes-0.26.0rc0/src/hopeit/dataframes/serialization/dataset.py +0 -93
- hopeit_dataframes-0.26.0rc0/src/hopeit/dataframes/serialization/files.py +0 -85
- hopeit_dataframes-0.26.0rc0/src/hopeit/dataframes/serialization/settings.py +0 -13
- hopeit_dataframes-0.26.0rc0/src/hopeit/dataframes/setup/dataframes.py +0 -36
- hopeit_dataframes-0.26.0rc0/src/hopeit.dataframes.egg-info/requires.txt +0 -4
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/README.md +0 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/setup.cfg +0 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/py.typed +0 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/serialization/__init__.py +0 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/serialization/py.typed +0 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/setup/__init__.py +0 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/setup/py.typed +0 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
- {hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.26.
|
|
3
|
+
Version: 0.26.0rc2
|
|
4
4
|
Summary: Hopeit Engine Dataframes for Pandas
|
|
5
5
|
Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
|
|
6
6
|
License: Apache 2
|
|
@@ -24,9 +24,10 @@ Classifier: Topic :: Internet :: WWW/HTTP
|
|
|
24
24
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
25
|
Classifier: Framework :: AsyncIO
|
|
26
26
|
Description-Content-Type: text/markdown
|
|
27
|
-
Requires-Dist: hopeit.engine>=0.26.
|
|
28
|
-
Requires-Dist: hopeit.fs-storage>=0.26.
|
|
27
|
+
Requires-Dist: hopeit.engine>=0.26.0rc2
|
|
28
|
+
Requires-Dist: hopeit.fs-storage>=0.26.0rc2
|
|
29
29
|
Requires-Dist: pandas>=2.2.3
|
|
30
|
+
Requires-Dist: pyarrow>=19.0.1
|
|
30
31
|
Requires-Dist: numpy>=1.26.4
|
|
31
32
|
|
|
32
33
|
# hopeit.engine dataframes plugin
|
|
@@ -4,15 +4,16 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hopeit.dataframes"
|
|
7
|
-
version = "0.26.
|
|
7
|
+
version = "0.26.0rc2"
|
|
8
8
|
|
|
9
9
|
description = "Hopeit Engine Dataframes for Pandas"
|
|
10
10
|
dynamic = ["readme"]
|
|
11
11
|
|
|
12
12
|
dependencies = [
|
|
13
|
-
"hopeit.engine>=0.26.
|
|
14
|
-
"hopeit.fs-storage>=0.26.
|
|
13
|
+
"hopeit.engine>=0.26.0rc2",
|
|
14
|
+
"hopeit.fs-storage>=0.26.0rc2",
|
|
15
15
|
"pandas>=2.2.3",
|
|
16
|
+
"pyarrow>=19.0.1",
|
|
16
17
|
"numpy>=1.26.4"
|
|
17
18
|
]
|
|
18
19
|
|
{hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/__init__.py
RENAMED
|
@@ -36,13 +36,11 @@ class MyWebResponse:
|
|
|
36
36
|
dataset_name: str
|
|
37
37
|
example_data: List[MyData.DataObject]
|
|
38
38
|
|
|
39
|
-
#
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
location="/tmp/data",
|
|
43
|
-
partition_dateformat="%Y/%m/%d/%H/",
|
|
44
|
-
))
|
|
39
|
+
# Initialization: this step is not needed if SETUP event is configured in app
|
|
40
|
+
settings = DataframesSettings(...) # settings example in `plugin-config.json`
|
|
41
|
+
await registry.init_registry(settings)
|
|
45
42
|
|
|
43
|
+
# Usage
|
|
46
44
|
df = pd.DataFrame([ # Create or load a pandas DataFrame
|
|
47
45
|
{"field1": 1, "field2": "text1"},
|
|
48
46
|
{"field1": 2, "field2": "text2"},
|
|
@@ -77,10 +75,8 @@ from typing import Dict, Generic, Iterator, List, Type
|
|
|
77
75
|
import numpy as np
|
|
78
76
|
import pandas as pd
|
|
79
77
|
from hopeit.dataframes.dataframe import DataFrameT, dataframe
|
|
80
|
-
from hopeit.dataframes.datablocks import DataBlocks
|
|
81
78
|
from hopeit.dataframes.serialization.dataset import Dataset
|
|
82
|
-
from hopeit.dataframes.
|
|
83
|
-
from hopeit.dataframes.setup.dataframes import register_serialization
|
|
79
|
+
from hopeit.dataframes.datablocks import DataBlocks
|
|
84
80
|
from hopeit.dataobjects import DataObject
|
|
85
81
|
|
|
86
82
|
__all__ = ["DataBlocks", "DataFrames", "Dataset", "dataframe"]
|
|
@@ -91,10 +87,6 @@ class DataFrames(Generic[DataFrameT, DataObject]):
|
|
|
91
87
|
Dataframes manipulation utilities methods
|
|
92
88
|
"""
|
|
93
89
|
|
|
94
|
-
@staticmethod
|
|
95
|
-
def setup(settings: DatasetSerialization):
|
|
96
|
-
register_serialization(settings)
|
|
97
|
-
|
|
98
90
|
@staticmethod
|
|
99
91
|
def from_df(
|
|
100
92
|
datatype: Type[DataFrameT], df: pd.DataFrame, **series: Dict[str, pd.Series]
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataBlocks is a utility that allows users of the dataframes plugin to create dataobjects
|
|
3
|
+
that contain combined properties with one or multiple Datasets but can be manipulated
|
|
4
|
+
and saved as a single flat pandas DataFrame.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import AsyncGenerator, Generic, Optional, Type, TypeVar, get_args, get_origin
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
from hopeit.dataobjects import dataobject, dataclass, fields
|
|
12
|
+
|
|
13
|
+
from hopeit.dataframes.serialization.dataset import Dataset, DatasetLoadError
|
|
14
|
+
from hopeit.dataframes.serialization.protocol import find_dataframe_type
|
|
15
|
+
from hopeit.dataframes.setup.registry import get_dataset_storage
|
|
16
|
+
|
|
17
|
+
DataBlockType = TypeVar("DataBlockType")
|
|
18
|
+
DataBlockItemType = TypeVar("DataBlockItemType")
|
|
19
|
+
DataFrameType = TypeVar("DataFrameType")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataobject
|
|
23
|
+
@dataclass
|
|
24
|
+
class DataBlockMetadata:
|
|
25
|
+
partition_dt: Optional[datetime] = None
|
|
26
|
+
database_key: Optional[str] = None
|
|
27
|
+
group_key: Optional[str] = None
|
|
28
|
+
collection: Optional[str] = None
|
|
29
|
+
|
|
30
|
+
@classmethod
|
|
31
|
+
def default(cls) -> "DataBlockMetadata":
|
|
32
|
+
return cls()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataobject
|
|
36
|
+
@dataclass
|
|
37
|
+
class DataBlockQuery:
|
|
38
|
+
from_partition_dt: datetime
|
|
39
|
+
to_partition_dt: datetime
|
|
40
|
+
select: list[str] | None = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TempDataBlock(Generic[DataBlockType, DataBlockItemType]):
|
|
44
|
+
"""
|
|
45
|
+
TempDataBlock allows converting a pandas DataFrame to and from dataobjects
|
|
46
|
+
using DataBlockType and DataBlockItemType schemas. So from a flat pandas
|
|
47
|
+
dataframe, an object containing subsections of the data can be created.
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
def __init__(self, datatype: Type[DataBlockType], df: pd.DataFrame) -> None:
|
|
51
|
+
self.datatype = datatype
|
|
52
|
+
self.df = df
|
|
53
|
+
|
|
54
|
+
@classmethod
|
|
55
|
+
def from_dataobjects(
|
|
56
|
+
cls, datatype: Type[DataBlockType], items: list[DataBlockItemType]
|
|
57
|
+
) -> "TempDataBlock[DataBlockType, DataBlockItemType]":
|
|
58
|
+
result_df: Optional[pd.DataFrame] = None
|
|
59
|
+
for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
|
|
60
|
+
if get_origin(field_info.annotation) is Dataset:
|
|
61
|
+
block_items = (getattr(item, field_name) for item in items)
|
|
62
|
+
block_type = get_args(field_info.annotation)[0]
|
|
63
|
+
block = block_type._from_dataobjects(block_items)
|
|
64
|
+
block_df = block._df
|
|
65
|
+
else:
|
|
66
|
+
block_df = pd.DataFrame({field_name: [getattr(item, field_name) for item in items]})
|
|
67
|
+
|
|
68
|
+
if result_df is None:
|
|
69
|
+
result_df = block_df
|
|
70
|
+
else:
|
|
71
|
+
# Skips duplicated column names so they are included only once
|
|
72
|
+
result_df = result_df.join(
|
|
73
|
+
block_df[[col for col in block_df.columns if col not in result_df.columns]]
|
|
74
|
+
)
|
|
75
|
+
assert result_df is not None
|
|
76
|
+
return cls(datatype, result_df)
|
|
77
|
+
|
|
78
|
+
def to_dataobjects(
|
|
79
|
+
self, item_type: Type[DataBlockItemType], *, normalize_null_values: bool = False
|
|
80
|
+
) -> list[DataBlockItemType]:
|
|
81
|
+
keys: list[str] = []
|
|
82
|
+
entries: list[list] = []
|
|
83
|
+
for field_name, field_info in fields(self.datatype).items(): # type: ignore[type-var]
|
|
84
|
+
if get_origin(field_info.annotation) is Dataset:
|
|
85
|
+
block_type = get_args(field_info.annotation)[0]
|
|
86
|
+
keys.append(field_name)
|
|
87
|
+
dataframe = block_type._from_df(self.df)
|
|
88
|
+
entries.append(
|
|
89
|
+
dataframe._to_dataobjects(normalize_null_values=normalize_null_values)
|
|
90
|
+
)
|
|
91
|
+
else:
|
|
92
|
+
keys.append(field_name)
|
|
93
|
+
entries.append(self.df[field_name].to_list())
|
|
94
|
+
|
|
95
|
+
return [
|
|
96
|
+
item_type(**{field_name: entry[i] for i, field_name in enumerate(keys)})
|
|
97
|
+
for entry in zip(*entries)
|
|
98
|
+
]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
102
|
+
"""
|
|
103
|
+
DataBlocks is a utility class that allows users to create dataobjects containing multiple Datasets.
|
|
104
|
+
These dataobjects can be converted and saved as a single pandas DataFrame.
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
@classmethod
|
|
108
|
+
async def load(
|
|
109
|
+
cls,
|
|
110
|
+
datablock: DataBlockType,
|
|
111
|
+
*,
|
|
112
|
+
select: Optional[list[str]] = None,
|
|
113
|
+
database_key: Optional[str] = None,
|
|
114
|
+
) -> pd.DataFrame:
|
|
115
|
+
"""
|
|
116
|
+
Converts a DataBlockType object to a pandas DataFrame, by reading the underlying Dataset/s and
|
|
117
|
+
putting all the fields defined in the DataBlockType in a flat pandas DataFrame.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
datablock (DataBlockType): The data block to convert.
|
|
121
|
+
select (Optional[list[str]]): Optional list of field names to select.
|
|
122
|
+
database_key (Optional[str]): Optional database key for loading data.
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
pd.DataFrame: The resulting pandas DataFrame.
|
|
126
|
+
"""
|
|
127
|
+
keys = [
|
|
128
|
+
field_name
|
|
129
|
+
for field_name, field_info in fields(datablock).items() # type: ignore[arg-type]
|
|
130
|
+
if get_origin(field_info.annotation) is Dataset
|
|
131
|
+
and (select is None or field_name in select)
|
|
132
|
+
]
|
|
133
|
+
|
|
134
|
+
# Filter/validate selected field names using saved schema,
|
|
135
|
+
# generates a single field for every common/duplicated field in the datasets
|
|
136
|
+
field_names = list(
|
|
137
|
+
dict.fromkeys(
|
|
138
|
+
[
|
|
139
|
+
field_name
|
|
140
|
+
for key in keys
|
|
141
|
+
for field_name in getattr(datablock, key).schema["properties"].keys()
|
|
142
|
+
]
|
|
143
|
+
)
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
# Load data from first dataset (datablock uses a single file for all datasets)
|
|
147
|
+
dataset: Dataset = getattr(datablock, keys[0])
|
|
148
|
+
storage = await get_dataset_storage(database_key)
|
|
149
|
+
result_df = await DataBlocks._load_datablock_df(storage, dataset, field_names, database_key)
|
|
150
|
+
|
|
151
|
+
# Add missing optional fields using class schema (allows schema evolution)
|
|
152
|
+
cls._adapt_to_schema(datablock, keys, result_df)
|
|
153
|
+
|
|
154
|
+
# Adding constant value fields
|
|
155
|
+
for field_name, field_info in fields(datablock).items(): # type: ignore[arg-type]
|
|
156
|
+
if get_origin(field_info.annotation) is not Dataset:
|
|
157
|
+
result_df[field_name] = getattr(datablock, field_name) # type: ignore[index]
|
|
158
|
+
|
|
159
|
+
return result_df
|
|
160
|
+
|
|
161
|
+
@staticmethod
|
|
162
|
+
async def save(
|
|
163
|
+
datatype: Type[DataBlockType],
|
|
164
|
+
df: pd.DataFrame,
|
|
165
|
+
metadata: DataBlockMetadata | None = None,
|
|
166
|
+
**kwargs, # Non-Dataset field values for DataBlockType
|
|
167
|
+
) -> DataBlockType:
|
|
168
|
+
"""
|
|
169
|
+
Creates a DataBlockType object from a pandas DataFrame, by saving the pandas Dataframe to a single
|
|
170
|
+
location, usually a file, and returning a dataobject with Datasets that reference the saved data.
|
|
171
|
+
The returned DataBlock can be retrieved in one shot using `DataBlocks.load` to get back a flat pandas
|
|
172
|
+
DataFrame, or each of the individual Datasets can be loaded independently.
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
datatype (Type[DataBlockType]): The type of the data block.
|
|
176
|
+
df (pd.DataFrame): The pandas DataFrame to convert.
|
|
177
|
+
metadata (Optional[DataBlockMetadata]): Optional metadata for the data block.
|
|
178
|
+
**kwargs: Additional non-Dataset field values for the DataBlockType.
|
|
179
|
+
|
|
180
|
+
Returns:
|
|
181
|
+
DataBlockType: The resulting data block.
|
|
182
|
+
"""
|
|
183
|
+
if metadata is None:
|
|
184
|
+
metadata = DataBlockMetadata.default()
|
|
185
|
+
|
|
186
|
+
storage = await get_dataset_storage(metadata.database_key)
|
|
187
|
+
|
|
188
|
+
block_dataset = await Dataset._save_df(
|
|
189
|
+
storage,
|
|
190
|
+
df,
|
|
191
|
+
datatype,
|
|
192
|
+
database_key=metadata.database_key,
|
|
193
|
+
partition_dt=metadata.partition_dt,
|
|
194
|
+
group_key=metadata.group_key,
|
|
195
|
+
collection=metadata.collection,
|
|
196
|
+
save_schema=True, # Required for datablocks
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
blocks = {}
|
|
200
|
+
for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
|
|
201
|
+
if get_origin(field_info.annotation) is Dataset:
|
|
202
|
+
block_type = get_args(field_info.annotation)[0]
|
|
203
|
+
blocks[field_name] = block_dataset._adapt(block_type)
|
|
204
|
+
else:
|
|
205
|
+
blocks[field_name] = kwargs[field_name]
|
|
206
|
+
|
|
207
|
+
return datatype(**blocks)
|
|
208
|
+
|
|
209
|
+
@staticmethod
|
|
210
|
+
def default(datatype: Type[DataBlockType]) -> DataBlockType:
|
|
211
|
+
return datatype(**{field_name: [] for field_name in list(fields(datatype))}) # type: ignore[type-var]
|
|
212
|
+
|
|
213
|
+
@classmethod
|
|
214
|
+
async def load_batch(
|
|
215
|
+
cls,
|
|
216
|
+
datatype: Type[DataBlockType],
|
|
217
|
+
query: DataBlockQuery,
|
|
218
|
+
metadata: DataBlockMetadata | None = None,
|
|
219
|
+
**kwargs, # Non-Dataset field values for DataBlockType
|
|
220
|
+
) -> AsyncGenerator[pd.DataFrame, None]:
|
|
221
|
+
if metadata is None:
|
|
222
|
+
metadata = DataBlockMetadata.default()
|
|
223
|
+
|
|
224
|
+
storage = await get_dataset_storage(metadata.database_key)
|
|
225
|
+
|
|
226
|
+
async for block_dataset in storage._get_batch( # type: ignore[attr-defined]
|
|
227
|
+
datatype,
|
|
228
|
+
database_key=metadata.database_key,
|
|
229
|
+
from_partition_dt=query.from_partition_dt,
|
|
230
|
+
to_partition_dt=query.to_partition_dt,
|
|
231
|
+
group_key=metadata.group_key,
|
|
232
|
+
collection=metadata.collection,
|
|
233
|
+
):
|
|
234
|
+
dataset_types = [
|
|
235
|
+
(field_name, get_args(field_info.annotation)[0])
|
|
236
|
+
for field_name, field_info in fields(datatype).items() # type: ignore[type-var]
|
|
237
|
+
if get_origin(field_info.annotation) is Dataset
|
|
238
|
+
and (query.select is None or field_name in query.select)
|
|
239
|
+
]
|
|
240
|
+
field_names = list(
|
|
241
|
+
dict.fromkeys(
|
|
242
|
+
[
|
|
243
|
+
field_name
|
|
244
|
+
for _, dataset_type in dataset_types
|
|
245
|
+
for field_name, _ in fields(dataset_type).items()
|
|
246
|
+
]
|
|
247
|
+
)
|
|
248
|
+
)
|
|
249
|
+
result_df = await DataBlocks._load_datablock_df(
|
|
250
|
+
storage, block_dataset, field_names, metadata.database_key
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
# Adding constant value fields
|
|
254
|
+
for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
|
|
255
|
+
if get_origin(field_info.annotation) is not Dataset:
|
|
256
|
+
result_df[field_name] = kwargs.get(field_name)
|
|
257
|
+
|
|
258
|
+
yield result_df
|
|
259
|
+
|
|
260
|
+
@staticmethod
|
|
261
|
+
def _get_datablock_keys(
|
|
262
|
+
datablocktype: Type[DataBlockType],
|
|
263
|
+
*,
|
|
264
|
+
select: Optional[list[str]] = None,
|
|
265
|
+
) -> list[str]:
|
|
266
|
+
return [
|
|
267
|
+
field_name
|
|
268
|
+
for field_name, field_info in fields(datablocktype).items() # type: ignore[type-var]
|
|
269
|
+
if get_origin(field_info.annotation) is Dataset
|
|
270
|
+
and (select is None or field_name in select)
|
|
271
|
+
]
|
|
272
|
+
|
|
273
|
+
@staticmethod
|
|
274
|
+
async def _load_datablock_df(
|
|
275
|
+
storage: object,
|
|
276
|
+
dataset: Dataset,
|
|
277
|
+
columns: Optional[list[str]] = None,
|
|
278
|
+
database_key: Optional[str] = None,
|
|
279
|
+
) -> pd.DataFrame:
|
|
280
|
+
try:
|
|
281
|
+
return await dataset._load_df(storage, columns)
|
|
282
|
+
except (RuntimeError, IOError, KeyError) as e:
|
|
283
|
+
raise DatasetLoadError(
|
|
284
|
+
f"Error {type(e).__name__}: {e} loading datablock of type {dataset.datatype} "
|
|
285
|
+
f"at location {dataset.partition_key}/{dataset.key}"
|
|
286
|
+
) from e
|
|
287
|
+
|
|
288
|
+
@classmethod
|
|
289
|
+
def _adapt_to_schema(cls, datablock: DataBlockType, keys: list[str], df: pd.DataFrame) -> None:
|
|
290
|
+
for key in keys:
|
|
291
|
+
datatype = find_dataframe_type(getattr(datablock, key).datatype) # type: ignore[var-annotated]
|
|
292
|
+
valid_df = datatype._from_df(df)._df
|
|
293
|
+
for col in valid_df.columns:
|
|
294
|
+
df[col] = valid_df[col]
|
{hopeit_dataframes-0.26.0rc0 → hopeit_dataframes-0.26.0rc2}/src/hopeit/dataframes/dataframe.py
RENAMED
|
@@ -124,7 +124,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
124
124
|
raise NotImplementedError # must use @dataframe decorator # pragma: no cover
|
|
125
125
|
|
|
126
126
|
@staticmethod
|
|
127
|
-
def __init_from_series__(self, **series: pd.Series): # pylint: disable=bad-staticmethod-argument
|
|
127
|
+
def __init_from_series__(self, **series: pd.Series) -> None: # pylint: disable=bad-staticmethod-argument
|
|
128
128
|
df = pd.DataFrame(series)
|
|
129
129
|
df.index.name = None # Removes index name to avoid collisions with series name
|
|
130
130
|
if self.__data_object__["validate"]:
|
|
@@ -240,9 +240,10 @@ def dataframe(
|
|
|
240
240
|
return amended_class
|
|
241
241
|
return cls
|
|
242
242
|
|
|
243
|
-
def add_dataframe_metadata(cls):
|
|
243
|
+
def add_dataframe_metadata(cls) -> None:
|
|
244
244
|
serialized_fields = {k: (v.annotation, v) for k, v in fields(cls).items()}
|
|
245
|
-
|
|
245
|
+
dataobject_name = str(cls.__name__) + "DataObject"
|
|
246
|
+
dataobject_type = create_model(dataobject_name, **serialized_fields) # type: ignore[call-overload]
|
|
246
247
|
dataobject_type = dataobject(dataobject_type, unsafe=True)
|
|
247
248
|
|
|
248
249
|
setattr(cls, "DataObject", dataobject_type)
|
|
@@ -255,7 +256,7 @@ def dataframe(
|
|
|
255
256
|
),
|
|
256
257
|
)
|
|
257
258
|
|
|
258
|
-
def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool):
|
|
259
|
+
def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool) -> None:
|
|
259
260
|
setattr(
|
|
260
261
|
cls,
|
|
261
262
|
"__data_object__",
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Dataset objects definition, used as a result of serialized dataframes"""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any, Dict, Generic, Optional, Type, TypeVar
|
|
5
|
+
|
|
6
|
+
from hopeit.dataobjects import dataclass, dataobject
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from pydantic import TypeAdapter
|
|
9
|
+
|
|
10
|
+
from hopeit.dataframes.setup.registry import get_dataset_storage
|
|
11
|
+
from hopeit.dataframes.serialization.protocol import find_dataframe_type
|
|
12
|
+
|
|
13
|
+
DataFrameT = TypeVar("DataFrameT")
|
|
14
|
+
GenericDataFrameT = TypeVar("GenericDataFrameT")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DatasetLoadError(Exception):
|
|
18
|
+
pass
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DatasetConvertError(Exception):
|
|
22
|
+
pass
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataobject
|
|
26
|
+
@dataclass
|
|
27
|
+
class Dataset(Generic[DataFrameT]):
|
|
28
|
+
"""Persisted representation of a @dataframe object"""
|
|
29
|
+
|
|
30
|
+
protocol: str
|
|
31
|
+
partition_key: str
|
|
32
|
+
key: str
|
|
33
|
+
datatype: str
|
|
34
|
+
partition_dt: Optional[datetime] = None
|
|
35
|
+
database_key: Optional[str] = None
|
|
36
|
+
group_key: Optional[str] = None
|
|
37
|
+
collection: Optional[str] = None
|
|
38
|
+
schema: Optional[Dict[str, Any]] = None
|
|
39
|
+
|
|
40
|
+
@classmethod
|
|
41
|
+
async def save(
|
|
42
|
+
cls,
|
|
43
|
+
dataframe: DataFrameT,
|
|
44
|
+
*,
|
|
45
|
+
partition_dt: Optional[datetime] = None,
|
|
46
|
+
database_key: Optional[str] = None,
|
|
47
|
+
group_key: Optional[str] = None,
|
|
48
|
+
collection: Optional[str] = None,
|
|
49
|
+
save_schema: bool = False,
|
|
50
|
+
) -> "Dataset[DataFrameT]":
|
|
51
|
+
storage = await get_dataset_storage(database_key)
|
|
52
|
+
return await storage.save( # type: ignore[attr-defined]
|
|
53
|
+
dataframe,
|
|
54
|
+
partition_dt=partition_dt,
|
|
55
|
+
database_key=database_key,
|
|
56
|
+
group_key=group_key,
|
|
57
|
+
collection=collection,
|
|
58
|
+
save_schema=save_schema,
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
@classmethod
|
|
62
|
+
async def load(
|
|
63
|
+
cls, dataset: "Dataset[DataFrameT]", database_key: Optional[str] = None
|
|
64
|
+
) -> DataFrameT:
|
|
65
|
+
try:
|
|
66
|
+
storage = await get_dataset_storage(database_key)
|
|
67
|
+
df = await dataset._load_df(storage)
|
|
68
|
+
return dataset._convert(df)
|
|
69
|
+
except (RuntimeError, IOError, KeyError) as e:
|
|
70
|
+
raise DatasetLoadError(
|
|
71
|
+
f"Error {type(e).__name__}: {e} loading dataset of type {dataset.datatype} "
|
|
72
|
+
f"at location {dataset.partition_key}/{dataset.key}"
|
|
73
|
+
) from e
|
|
74
|
+
|
|
75
|
+
async def _load_df(self, storage: object, columns: Optional[list[str]] = None) -> pd.DataFrame:
|
|
76
|
+
return await storage.load_df(self, columns) # type: ignore[attr-defined]
|
|
77
|
+
|
|
78
|
+
def _convert(self, df: pd.DataFrame) -> DataFrameT:
|
|
79
|
+
"""Converts loaded pandas Dataframe to @dataframe annotated object using Dataset metadata"""
|
|
80
|
+
datatype: Type[DataFrameT] = find_dataframe_type(self.datatype)
|
|
81
|
+
return datatype._from_df(df) # type: ignore[attr-defined]
|
|
82
|
+
|
|
83
|
+
def _adapt(self, datatype: DataFrameT) -> "Dataset[DataFrameT]":
|
|
84
|
+
"""Adapts a more generic dataset that contains combined fields to be type specific"""
|
|
85
|
+
return Dataset(
|
|
86
|
+
protocol=self.protocol,
|
|
87
|
+
partition_key=self.partition_key,
|
|
88
|
+
key=self.key,
|
|
89
|
+
datatype=f"{datatype.__module__}.{datatype.__qualname__}", # type: ignore[attr-defined]
|
|
90
|
+
partition_dt=self.partition_dt,
|
|
91
|
+
database_key=self.database_key,
|
|
92
|
+
group_key=self.group_key,
|
|
93
|
+
collection=self.collection,
|
|
94
|
+
schema=TypeAdapter(datatype).json_schema() if self.schema else None,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
@classmethod
|
|
98
|
+
async def _save_df(
|
|
99
|
+
cls,
|
|
100
|
+
storage: object,
|
|
101
|
+
df: pd.DataFrame,
|
|
102
|
+
datatype: Type[GenericDataFrameT],
|
|
103
|
+
*,
|
|
104
|
+
partition_dt: Optional[datetime],
|
|
105
|
+
database_key: Optional[str],
|
|
106
|
+
group_key: Optional[str],
|
|
107
|
+
collection: Optional[str],
|
|
108
|
+
save_schema: bool,
|
|
109
|
+
) -> "Dataset[GenericDataFrameT]":
|
|
110
|
+
return await storage.save_df( # type: ignore[attr-defined]
|
|
111
|
+
df,
|
|
112
|
+
datatype,
|
|
113
|
+
partition_dt=partition_dt,
|
|
114
|
+
database_key=database_key,
|
|
115
|
+
group_key=group_key,
|
|
116
|
+
collection=collection,
|
|
117
|
+
save_schema=save_schema,
|
|
118
|
+
)
|