hopeit.dataframes 0.24.2__tar.gz → 0.25.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/PKG-INFO +3 -4
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/setup.py +1 -2
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/__init__.py +55 -31
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/dataframe.py +40 -84
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/dataset.py +12 -4
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/files.py +3 -40
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/settings.py +1 -2
- hopeit_dataframes-0.25.0/src/hopeit/dataframes/setup/dataframes.py +36 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/PKG-INFO +3 -4
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/SOURCES.txt +0 -1
- hopeit_dataframes-0.25.0/src/hopeit.dataframes.egg-info/requires.txt +6 -0
- hopeit.dataframes-0.24.2/src/hopeit/dataframes/dataframeobject.py +0 -184
- hopeit.dataframes-0.24.2/src/hopeit/dataframes/setup/dataframes.py +0 -52
- hopeit.dataframes-0.24.2/src/hopeit.dataframes.egg-info/requires.txt +0 -6
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/README.md +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/setup.cfg +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/py.typed +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/__init__.py +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/setup/__init__.py +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.25.0
|
|
4
4
|
Summary: Hopeit Engine Dataframes Toolkit
|
|
5
5
|
Home-page: https://github.com/hopeit-git/hopeit.engine
|
|
6
6
|
Author: Leo Smerling and Pablo Canto
|
|
@@ -13,7 +13,6 @@ Project-URL: GitHub: repo, https://github.com/hopeit-git/hopeit.engine
|
|
|
13
13
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
14
|
Classifier: Intended Audience :: Developers
|
|
15
15
|
Classifier: Programming Language :: Python
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.9
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
@@ -24,9 +23,9 @@ Classifier: Operating System :: Microsoft :: Windows
|
|
|
24
23
|
Classifier: Topic :: Internet :: WWW/HTTP
|
|
25
24
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
26
25
|
Classifier: Framework :: AsyncIO
|
|
27
|
-
Requires-Python: >=3.
|
|
26
|
+
Requires-Python: >=3.9
|
|
28
27
|
Description-Content-Type: text/markdown
|
|
29
|
-
Requires-Dist: hopeit.engine[fs-storage]==0.
|
|
28
|
+
Requires-Dist: hopeit.engine[fs-storage]==0.25.0
|
|
30
29
|
Requires-Dist: pandas
|
|
31
30
|
Requires-Dist: numpy
|
|
32
31
|
Provides-Extra: pyarrow
|
|
@@ -18,7 +18,6 @@ setuptools.setup(
|
|
|
18
18
|
"License :: OSI Approved :: Apache Software License",
|
|
19
19
|
"Intended Audience :: Developers",
|
|
20
20
|
"Programming Language :: Python",
|
|
21
|
-
"Programming Language :: Python :: 3.8",
|
|
22
21
|
"Programming Language :: Python :: 3.9",
|
|
23
22
|
"Programming Language :: Python :: 3.10",
|
|
24
23
|
"Programming Language :: Python :: 3.11",
|
|
@@ -46,7 +45,7 @@ setuptools.setup(
|
|
|
46
45
|
package_data={
|
|
47
46
|
"hopeit.dataframes": ["py.typed"],
|
|
48
47
|
},
|
|
49
|
-
python_requires=">=3.
|
|
48
|
+
python_requires=">=3.9",
|
|
50
49
|
install_requires=[
|
|
51
50
|
f"hopeit.engine[fs-storage]=={version['ENGINE_VERSION']}",
|
|
52
51
|
"pandas",
|
|
@@ -1,37 +1,74 @@
|
|
|
1
1
|
"""
|
|
2
2
|
hopeit.engine dataframes plugin entry point
|
|
3
3
|
|
|
4
|
-
This module exposes the
|
|
4
|
+
This module exposes the 2 main constructions to be used inside apps,
|
|
5
|
+
to extend @dataobject functionality to support working with `pandas DataFrames`
|
|
5
6
|
`@dataframe` dataclass annotation
|
|
6
|
-
`@dataframeobject` dataclass annotation
|
|
7
7
|
`DataFrames` class to handle manipulation of dataframe/dataframeobjects
|
|
8
8
|
|
|
9
9
|
Usage:
|
|
10
10
|
```
|
|
11
|
-
from
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
16
|
+
from hopeit.dataframes import DataFrames, Dataset, dataframe
|
|
17
|
+
from hopeit.dataobjects import dataobject, dataclass
|
|
18
|
+
from hopeit.dataobjects.payload import Payload
|
|
12
19
|
|
|
13
20
|
@dataframe
|
|
14
21
|
@dataclass
|
|
15
|
-
class
|
|
22
|
+
class MyData:
|
|
16
23
|
field1: int
|
|
17
24
|
field2: str
|
|
18
25
|
...
|
|
19
26
|
|
|
20
|
-
@
|
|
27
|
+
@dataobject
|
|
21
28
|
@dataclass
|
|
22
29
|
class MyDataset:
|
|
23
30
|
dataset_name: str
|
|
24
|
-
example_data:
|
|
31
|
+
example_data: Dataset[MyData]
|
|
32
|
+
|
|
33
|
+
@dataobject
|
|
34
|
+
@dataclass
|
|
35
|
+
class MyWebResponse:
|
|
36
|
+
dataset_name: str
|
|
37
|
+
example_data: List[MyData.DataObject]
|
|
38
|
+
|
|
39
|
+
# This step is not needed if SETUP event is configured in app
|
|
40
|
+
DataFrames.setup(DatasetSerialization(
|
|
41
|
+
protocol="hopeit.dataframes.serialization.files.DatasetFileStorage",
|
|
42
|
+
location="/tmp/data",
|
|
43
|
+
partition_dateformat="%Y/%m/%d/%H/",
|
|
44
|
+
))
|
|
45
|
+
|
|
46
|
+
df = pd.DataFrame([ # Create or load a pandas DataFrame
|
|
47
|
+
{"field1": 1, "field2": "text1"},
|
|
48
|
+
{"field1": 2, "field2": "text2"},
|
|
49
|
+
])
|
|
50
|
+
|
|
51
|
+
my_data: MyData = DataFrames.from_df(MyData, df)
|
|
25
52
|
|
|
53
|
+
# return dataset after saving data to disk
|
|
54
|
+
my_dataset = MyDataset(
|
|
55
|
+
dataset_name="example",
|
|
56
|
+
example_data=await Dataset.save(my_data)
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
print(Payload.to_json(my_dataset))
|
|
26
60
|
|
|
27
|
-
|
|
61
|
+
my_data_again: MyData = await my_dataset.example_data.load()
|
|
28
62
|
|
|
29
|
-
|
|
63
|
+
print(DataFrames.df(my_data_again))
|
|
30
64
|
|
|
31
|
-
return
|
|
65
|
+
# return dataframe converted to list of dataobjects that can be directly converted to json
|
|
66
|
+
my_json_response = MyWebResponse(
|
|
32
67
|
dataset_name="example",
|
|
33
|
-
example_data=my_data
|
|
68
|
+
example_data=DataFrames.to_dataobjects(my_data)
|
|
34
69
|
)
|
|
70
|
+
|
|
71
|
+
print(Payload.to_json(my_json_response))
|
|
35
72
|
```
|
|
36
73
|
"""
|
|
37
74
|
|
|
@@ -40,35 +77,22 @@ from typing import Dict, Generic, Iterator, List, Type
|
|
|
40
77
|
import numpy as np
|
|
41
78
|
import pandas as pd
|
|
42
79
|
from hopeit.dataframes.dataframe import DataFrameT, dataframe
|
|
43
|
-
from hopeit.dataframes.
|
|
80
|
+
from hopeit.dataframes.serialization.dataset import Dataset
|
|
81
|
+
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
82
|
+
from hopeit.dataframes.setup.dataframes import register_serialization
|
|
44
83
|
from hopeit.dataobjects import DataObject
|
|
45
84
|
|
|
46
|
-
__all__ = ["DataFrames", "
|
|
85
|
+
__all__ = ["DataFrames", "Dataset", "dataframe"]
|
|
47
86
|
|
|
48
87
|
|
|
49
|
-
class DataFrames(Generic[DataFrameT,
|
|
88
|
+
class DataFrames(Generic[DataFrameT, DataObject]):
|
|
50
89
|
"""
|
|
51
90
|
Dataframes manipulation utilities methods
|
|
52
91
|
"""
|
|
53
92
|
|
|
54
93
|
@staticmethod
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
and converts to a `DataObject` json-compatible with pointers to saved
|
|
58
|
-
locations.
|
|
59
|
-
|
|
60
|
-
This method can be used to i.e. return `@dataframeobject`s as a JSON response
|
|
61
|
-
"""
|
|
62
|
-
return await obj._serialize() # type: ignore # pylint: disable=protected-access
|
|
63
|
-
|
|
64
|
-
@staticmethod
|
|
65
|
-
async def deserialize(
|
|
66
|
-
datatype: Type[DataFrameObjectT], dataobject: DataObject
|
|
67
|
-
) -> DataFrameObjectT:
|
|
68
|
-
"""Deserialize/load contents of serialized dataobject fields of a `@dataframeobject`
|
|
69
|
-
loading saved Dataset information for @dataframe fields
|
|
70
|
-
"""
|
|
71
|
-
return await datatype._deserialize(dataobject) # type: ignore # pylint: disable=protected-access
|
|
94
|
+
def setup(settings: DatasetSerialization):
|
|
95
|
+
register_serialization(settings)
|
|
72
96
|
|
|
73
97
|
@staticmethod
|
|
74
98
|
def from_df(
|
|
@@ -88,7 +112,7 @@ class DataFrames(Generic[DataFrameT, DataFrameObjectT, DataObject]):
|
|
|
88
112
|
|
|
89
113
|
@staticmethod
|
|
90
114
|
def from_dataobjects(
|
|
91
|
-
datatype: Type[DataFrameT], dataobjects: Iterator[
|
|
115
|
+
datatype: Type[DataFrameT], dataobjects: Iterator[DataObject]
|
|
92
116
|
) -> DataFrameT:
|
|
93
117
|
"""Converts standard json serializable `@dataobject`s to a single `@dataframe`"""
|
|
94
118
|
return datatype._from_dataobjects(dataobjects) # type: ignore # pylint: disable=protected-access
|
|
@@ -1,58 +1,49 @@
|
|
|
1
1
|
"""
|
|
2
2
|
DataFrames type abstractions.
|
|
3
|
-
|
|
4
|
-
Example:
|
|
5
|
-
|
|
6
|
-
from hopeit.dataobjects import dataclass # equivalent to `dataclasses.dataclass`
|
|
7
|
-
from hopeit.dataframes import dataframe
|
|
8
|
-
|
|
9
|
-
@dataframe
|
|
10
|
-
@dataclass
|
|
11
|
-
class MyObject:
|
|
12
|
-
name: str
|
|
13
|
-
number: int
|
|
14
3
|
"""
|
|
15
4
|
|
|
16
|
-
|
|
5
|
+
import dataclasses
|
|
17
6
|
from datetime import date, datetime, timezone
|
|
18
|
-
from
|
|
7
|
+
from functools import partial
|
|
8
|
+
from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar
|
|
19
9
|
|
|
20
10
|
import numpy as np
|
|
21
11
|
import pandas as pd
|
|
22
|
-
from
|
|
12
|
+
from pydantic import create_model
|
|
13
|
+
from pydantic.fields import FieldInfo
|
|
14
|
+
|
|
23
15
|
from hopeit.dataobjects import (
|
|
24
16
|
DataObject,
|
|
25
17
|
StreamEventMixin,
|
|
26
18
|
StreamEventParams,
|
|
27
19
|
dataobject,
|
|
20
|
+
fields,
|
|
28
21
|
)
|
|
22
|
+
from hopeit.dataobjects.payload import Payload
|
|
29
23
|
|
|
30
24
|
DataFrameT = TypeVar("DataFrameT")
|
|
31
25
|
|
|
32
26
|
|
|
33
|
-
@dataclass
|
|
34
|
-
class DataFrameMetadata
|
|
27
|
+
@dataclasses.dataclass
|
|
28
|
+
class DataFrameMetadata:
|
|
35
29
|
columns: List[str]
|
|
36
|
-
fields: Dict[str,
|
|
37
|
-
serialized_type: Type[DataObject]
|
|
30
|
+
fields: Dict[str, FieldInfo]
|
|
38
31
|
|
|
39
32
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
Helper class used to access attributes in @dataframe
|
|
44
|
-
decorated objects, based on dot notation expressions
|
|
45
|
-
"""
|
|
33
|
+
# Functions to do type coercion
|
|
34
|
+
def _series_to_int(x: pd.Series) -> pd.Series:
|
|
35
|
+
return x.astype(np.int64)
|
|
46
36
|
|
|
47
|
-
datatypes: Optional[str]
|
|
48
37
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
38
|
+
def _series_to_float(x: pd.Series) -> pd.Series:
|
|
39
|
+
return x.astype(np.float64)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _series_to_str(x: pd.Series) -> pd.Series:
|
|
43
|
+
return x.astype(str)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
_series_to_utc_datetime = partial(pd.to_datetime, utc=True)
|
|
56
47
|
|
|
57
48
|
|
|
58
49
|
class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
@@ -63,11 +54,11 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
63
54
|
"""
|
|
64
55
|
|
|
65
56
|
DATATYPE_MAPPING = {
|
|
66
|
-
int:
|
|
67
|
-
float:
|
|
68
|
-
str:
|
|
57
|
+
int: _series_to_int,
|
|
58
|
+
float: _series_to_float,
|
|
59
|
+
str: _series_to_str,
|
|
69
60
|
date: pd.to_datetime,
|
|
70
|
-
datetime:
|
|
61
|
+
datetime: _series_to_utc_datetime,
|
|
71
62
|
}
|
|
72
63
|
|
|
73
64
|
def __init__(self) -> None:
|
|
@@ -78,9 +69,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
78
69
|
raise NotImplementedError # must use @dataframe decorator # pragma: no cover
|
|
79
70
|
|
|
80
71
|
@staticmethod
|
|
81
|
-
def __init_from_series__(
|
|
82
|
-
self, **series: pd.Series
|
|
83
|
-
): # pylint: disable=bad-staticmethod-argument
|
|
72
|
+
def __init_from_series__(self, **series: pd.Series): # pylint: disable=bad-staticmethod-argument
|
|
84
73
|
df = pd.DataFrame(series)
|
|
85
74
|
df.index.name = None # Removes index name to avoid colisions with series name
|
|
86
75
|
if self.__data_object__["validate"]:
|
|
@@ -99,7 +88,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
99
88
|
|
|
100
89
|
@classmethod
|
|
101
90
|
def _from_dataobjects(cls, items: Iterator[DataObject]) -> DataFrameT:
|
|
102
|
-
return cls._from_df(pd.DataFrame(
|
|
91
|
+
return cls._from_df(pd.DataFrame(Payload.to_obj(item) for item in items)) # type: ignore[misc]
|
|
103
92
|
|
|
104
93
|
@classmethod
|
|
105
94
|
def _from_df_unsafe(cls, df: pd.DataFrame, **series: pd.Series) -> DataFrameT:
|
|
@@ -116,40 +105,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
116
105
|
return self._from_df(self.__df[key])
|
|
117
106
|
|
|
118
107
|
def _to_dataobjects(self) -> List[DataObject]:
|
|
119
|
-
return [
|
|
120
|
-
self.__dataframe__.serialized_type(**fields)
|
|
121
|
-
for fields in self.__df.to_dict(orient="records")
|
|
122
|
-
]
|
|
123
|
-
|
|
124
|
-
def to_json(self, *args, **kwargs) -> str:
|
|
125
|
-
raise NotImplementedError(
|
|
126
|
-
"Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
|
|
127
|
-
)
|
|
128
|
-
|
|
129
|
-
def to_dict(self, *args, **kwargs) -> Dict[str, Any]:
|
|
130
|
-
raise NotImplementedError(
|
|
131
|
-
"Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
|
|
132
|
-
)
|
|
133
|
-
|
|
134
|
-
@classmethod
|
|
135
|
-
def from_json(cls, *args, **kwargs) -> DataObject:
|
|
136
|
-
return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
|
|
137
|
-
|
|
138
|
-
@classmethod
|
|
139
|
-
def from_dict(
|
|
140
|
-
cls,
|
|
141
|
-
*args,
|
|
142
|
-
**kwargs,
|
|
143
|
-
) -> DataObject:
|
|
144
|
-
return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
|
|
145
|
-
|
|
146
|
-
@classmethod
|
|
147
|
-
def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
|
|
148
|
-
if cls.__data_object__["schema"]:
|
|
149
|
-
schema = cls.__dataframe__.serialized_type.json_schema(*args, **kwargs)
|
|
150
|
-
schema[cls.__name__] = schema[cls.__dataframe__.serialized_type.__name__]
|
|
151
|
-
return schema
|
|
152
|
-
return {}
|
|
108
|
+
return [self.DataObject(**fields) for fields in self.__df.to_dict(orient="records")]
|
|
153
109
|
|
|
154
110
|
def event_id(self, *args, **kwargs) -> str:
|
|
155
111
|
return ""
|
|
@@ -174,7 +130,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
174
130
|
|
|
175
131
|
def _coerce_datatypes(self, df: pd.DataFrame) -> Dict[str, pd.Series]:
|
|
176
132
|
return {
|
|
177
|
-
name: self.DATATYPE_MAPPING[field.
|
|
133
|
+
name: self.DATATYPE_MAPPING[field.annotation](df[name]) # type: ignore
|
|
178
134
|
for name, field in self.__dataframe__.fields.items()
|
|
179
135
|
}
|
|
180
136
|
|
|
@@ -193,7 +149,7 @@ def dataframe(
|
|
|
193
149
|
if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
|
|
194
150
|
amended_class = type(
|
|
195
151
|
cls.__name__,
|
|
196
|
-
(DataFrameMixin,
|
|
152
|
+
(DataFrameMixin,) + cls.__mro__,
|
|
197
153
|
dict(cls.__dict__),
|
|
198
154
|
)
|
|
199
155
|
setattr(amended_class, "__init__", DataFrameMixin.__init_from_series__)
|
|
@@ -201,17 +157,17 @@ def dataframe(
|
|
|
201
157
|
return cls
|
|
202
158
|
|
|
203
159
|
def add_dataframe_metadata(cls):
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
160
|
+
serialized_fields = {k: (v.annotation, v) for k, v in fields(cls).items()}
|
|
161
|
+
dataobject_type = create_model(cls.__name__ + "DataObject", **serialized_fields)
|
|
162
|
+
dataobject_type = dataobject(dataobject_type, unsafe=True)
|
|
207
163
|
|
|
164
|
+
setattr(cls, "DataObject", dataobject_type)
|
|
208
165
|
setattr(
|
|
209
166
|
cls,
|
|
210
167
|
"__dataframe__",
|
|
211
168
|
DataFrameMetadata(
|
|
212
|
-
columns=
|
|
213
|
-
fields=
|
|
214
|
-
serialized_type=serialized_type,
|
|
169
|
+
columns=list(fields(cls).keys()),
|
|
170
|
+
fields=dict(fields(cls).items()),
|
|
215
171
|
),
|
|
216
172
|
)
|
|
217
173
|
|
|
@@ -226,14 +182,14 @@ def dataframe(
|
|
|
226
182
|
setattr(cls, "event_ts", StreamEventMixin.event_ts)
|
|
227
183
|
|
|
228
184
|
def set_fields_optional(cls):
|
|
229
|
-
for field in fields(cls):
|
|
185
|
+
for _, field in fields(cls).items():
|
|
230
186
|
field.default = None
|
|
231
187
|
|
|
232
188
|
def wrap(cls) -> Type[DataFrameMixin]:
|
|
233
189
|
if hasattr(cls, "__dataframe__"):
|
|
234
190
|
return cls
|
|
191
|
+
add_dataframe_metadata(cls)
|
|
235
192
|
amended_class = add_dataframe_mixin(cls)
|
|
236
|
-
add_dataframe_metadata(amended_class)
|
|
237
193
|
add_dataobject_annotations(amended_class, unsafe, validate, schema)
|
|
238
194
|
set_fields_optional(amended_class)
|
|
239
195
|
return amended_class
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/dataset.py
RENAMED
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
"""Dataset objects definition, used as a result of serialized dataframes
|
|
2
|
-
"""
|
|
1
|
+
"""Dataset objects definition, used as a result of serialized dataframes"""
|
|
3
2
|
|
|
4
3
|
from importlib import import_module
|
|
5
|
-
from typing import Type, TypeVar
|
|
4
|
+
from typing import Generic, Type, TypeVar
|
|
6
5
|
|
|
7
6
|
from hopeit.dataobjects import dataclass, dataobject
|
|
8
7
|
|
|
@@ -11,12 +10,21 @@ DataFrameT = TypeVar("DataFrameT")
|
|
|
11
10
|
|
|
12
11
|
@dataobject
|
|
13
12
|
@dataclass
|
|
14
|
-
class Dataset:
|
|
13
|
+
class Dataset(Generic[DataFrameT]):
|
|
14
|
+
"""Persisted representation of a @dataframe object"""
|
|
15
|
+
|
|
15
16
|
protocol: str
|
|
16
17
|
partition_key: str
|
|
17
18
|
key: str
|
|
18
19
|
datatype: str
|
|
19
20
|
|
|
21
|
+
async def load(self) -> DataFrameT:
|
|
22
|
+
return await self.__storage.load(self) # type: ignore[attr-defined]
|
|
23
|
+
|
|
24
|
+
@classmethod
|
|
25
|
+
async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
|
|
26
|
+
return await cls.__storage.save(dataframe) # type: ignore[attr-defined]
|
|
27
|
+
|
|
20
28
|
|
|
21
29
|
def find_protocol_impl(qual_type_name: str) -> Type:
|
|
22
30
|
mod_name, type_name = (
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/files.py
RENAMED
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
"""Support for `@dataframes` serialization to files
|
|
2
|
-
"""
|
|
1
|
+
"""Support for `@dataframes` serialization to files"""
|
|
3
2
|
|
|
4
3
|
import io
|
|
5
4
|
from importlib import import_module
|
|
6
|
-
from typing import
|
|
5
|
+
from typing import Generic, Optional, Type, TypeVar
|
|
7
6
|
from uuid import uuid4
|
|
8
7
|
|
|
9
8
|
import pandas as pd
|
|
@@ -59,48 +58,12 @@ class DatasetFileStorage(Generic[DataFrameT]):
|
|
|
59
58
|
async def load(self, dataset: Dataset) -> EventPayloadType:
|
|
60
59
|
"""Loads @dataframe annotated object using Dataset metadata"""
|
|
61
60
|
datatype: Type[DataFrameT] = find_dataframe_type(dataset.datatype)
|
|
62
|
-
data = await self.storage.get_file(
|
|
63
|
-
dataset.key, partition_key=dataset.partition_key
|
|
64
|
-
)
|
|
61
|
+
data = await self.storage.get_file(dataset.key, partition_key=dataset.partition_key)
|
|
65
62
|
if data is None:
|
|
66
63
|
raise FileNotFoundError(dataset.key)
|
|
67
64
|
df = pd.read_parquet(io.BytesIO(data), engine="pyarrow")
|
|
68
65
|
return datatype._from_df(df) # pylint: disable=protected-access
|
|
69
66
|
|
|
70
|
-
async def ser_wrapper(
|
|
71
|
-
self,
|
|
72
|
-
base_serialization: Callable,
|
|
73
|
-
data: Union[EventPayloadType, DataFrameT],
|
|
74
|
-
level: int,
|
|
75
|
-
) -> bytes:
|
|
76
|
-
"""Serialization wrapper that plugins-in into hopeit.engine
|
|
77
|
-
serialization when dataframes plugin is initialized
|
|
78
|
-
"""
|
|
79
|
-
if hasattr(data, "__dataframeobject__"):
|
|
80
|
-
data = await data._serialize() # type: ignore # pylint: disable=protected-access
|
|
81
|
-
if hasattr(data, "__dataframe__"):
|
|
82
|
-
data = await self.save(data) # type: ignore
|
|
83
|
-
return await base_serialization(data, level)
|
|
84
|
-
|
|
85
|
-
async def deser_wrapper(
|
|
86
|
-
self,
|
|
87
|
-
base_deserialization: Callable,
|
|
88
|
-
data: bytes,
|
|
89
|
-
datatype: Union[Type[EventPayloadType], Type[DataFrameT]],
|
|
90
|
-
) -> Union[EventPayloadType, DataFrameT]:
|
|
91
|
-
"""Deerialization wrapper that plugins-in into hopeit.engine
|
|
92
|
-
deserialization when dataframes plugin is initialized
|
|
93
|
-
"""
|
|
94
|
-
if hasattr(datatype, "__dataframeobject__"):
|
|
95
|
-
dataset = await base_deserialization(
|
|
96
|
-
data, datatype.__dataframeobject__.serialized_type # type: ignore
|
|
97
|
-
)
|
|
98
|
-
return await datatype._deserialize(dataset) # type: ignore # pylint: disable=protected-access
|
|
99
|
-
if hasattr(datatype, "__dataframe__"):
|
|
100
|
-
dataset = await base_deserialization(data, Dataset)
|
|
101
|
-
return await self.load(dataset)
|
|
102
|
-
return await base_deserialization(data, datatype)
|
|
103
|
-
|
|
104
67
|
|
|
105
68
|
def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
|
|
106
69
|
"""Returns dataframe class based on type name used during serialization"""
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""hopeit.engine dataframes plugin SETUP event.
|
|
2
|
+
|
|
3
|
+
This event executes when engine starts with dataframes plugin configuration file loaded,
|
|
4
|
+
and ensures that the engine will support serialization of `@dataframe`
|
|
5
|
+
types
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from hopeit.app.context import EventContext
|
|
9
|
+
from hopeit.app.logger import app_logger
|
|
10
|
+
from hopeit.dataframes.serialization.dataset import Dataset, find_protocol_impl
|
|
11
|
+
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
12
|
+
|
|
13
|
+
logger = app_logger()
|
|
14
|
+
|
|
15
|
+
__steps__ = ["setup"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def setup(payload: None, context: EventContext) -> None:
|
|
19
|
+
"""Setups serizaltion wrappers in hopeit.engine based on
|
|
20
|
+
`DatasetSerialization` settings configured in plugin configuration file
|
|
21
|
+
"""
|
|
22
|
+
logger.info(context, "Configuring Dataset serialization...")
|
|
23
|
+
settings: DatasetSerialization = context.settings(
|
|
24
|
+
key="dataset_serialization", datatype=DatasetSerialization
|
|
25
|
+
)
|
|
26
|
+
register_serialization(settings)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def register_serialization(settings: DatasetSerialization):
|
|
30
|
+
impl = find_protocol_impl(settings.protocol)
|
|
31
|
+
storage = impl(
|
|
32
|
+
protocol=settings.protocol,
|
|
33
|
+
location=settings.location,
|
|
34
|
+
partition_dateformat=settings.partition_dateformat,
|
|
35
|
+
)
|
|
36
|
+
setattr(Dataset, "_Dataset__storage", storage)
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.25.0
|
|
4
4
|
Summary: Hopeit Engine Dataframes Toolkit
|
|
5
5
|
Home-page: https://github.com/hopeit-git/hopeit.engine
|
|
6
6
|
Author: Leo Smerling and Pablo Canto
|
|
@@ -13,7 +13,6 @@ Project-URL: GitHub: repo, https://github.com/hopeit-git/hopeit.engine
|
|
|
13
13
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
14
|
Classifier: Intended Audience :: Developers
|
|
15
15
|
Classifier: Programming Language :: Python
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.9
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.10
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
@@ -24,9 +23,9 @@ Classifier: Operating System :: Microsoft :: Windows
|
|
|
24
23
|
Classifier: Topic :: Internet :: WWW/HTTP
|
|
25
24
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
26
25
|
Classifier: Framework :: AsyncIO
|
|
27
|
-
Requires-Python: >=3.
|
|
26
|
+
Requires-Python: >=3.9
|
|
28
27
|
Description-Content-Type: text/markdown
|
|
29
|
-
Requires-Dist: hopeit.engine[fs-storage]==0.
|
|
28
|
+
Requires-Dist: hopeit.engine[fs-storage]==0.25.0
|
|
30
29
|
Requires-Dist: pandas
|
|
31
30
|
Requires-Dist: numpy
|
|
32
31
|
Provides-Extra: pyarrow
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/SOURCES.txt
RENAMED
|
@@ -7,7 +7,6 @@ src/hopeit.dataframes.egg-info/requires.txt
|
|
|
7
7
|
src/hopeit.dataframes.egg-info/top_level.txt
|
|
8
8
|
src/hopeit/dataframes/__init__.py
|
|
9
9
|
src/hopeit/dataframes/dataframe.py
|
|
10
|
-
src/hopeit/dataframes/dataframeobject.py
|
|
11
10
|
src/hopeit/dataframes/py.typed
|
|
12
11
|
src/hopeit/dataframes/serialization/__init__.py
|
|
13
12
|
src/hopeit/dataframes/serialization/dataset.py
|
|
@@ -1,184 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
`@dataframeobject` annonation mixin to serialize a group of `@dataframe`s.
|
|
3
|
-
|
|
4
|
-
Datasets behaves as DataObject so they can be used as payload
|
|
5
|
-
for endpoints and streams.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from dataclasses import Field, dataclass, fields, make_dataclass
|
|
9
|
-
from typing import (
|
|
10
|
-
Any,
|
|
11
|
-
Callable,
|
|
12
|
-
ClassVar,
|
|
13
|
-
Dict,
|
|
14
|
-
Generic,
|
|
15
|
-
Optional,
|
|
16
|
-
Type,
|
|
17
|
-
TypeVar,
|
|
18
|
-
Union,
|
|
19
|
-
get_args,
|
|
20
|
-
get_origin,
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
from hopeit.dataframes.serialization.dataset import Dataset
|
|
24
|
-
from hopeit.dataobjects import (
|
|
25
|
-
DataObject,
|
|
26
|
-
StreamEventMixin,
|
|
27
|
-
StreamEventParams,
|
|
28
|
-
dataobject,
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
DataFrameObjectT = TypeVar("DataFrameObjectT")
|
|
32
|
-
NoneType = type(None)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@dataclass
|
|
36
|
-
class DataFrameObjectMetadata(Generic[DataObject]):
|
|
37
|
-
serialized_type: Type[DataObject]
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class DataFrameObjectMixin(Generic[DataFrameObjectT]):
|
|
41
|
-
"""
|
|
42
|
-
MixIn class to add functionality for `@dataframeobject`s
|
|
43
|
-
|
|
44
|
-
Do not use this class directly, instead use `@dataframeobject` class decorator.
|
|
45
|
-
"""
|
|
46
|
-
|
|
47
|
-
__storage: ClassVar[Any] = None # pylint: disable=invalid-name
|
|
48
|
-
|
|
49
|
-
def __init__(self) -> None:
|
|
50
|
-
self.__dataframeobject__: DataFrameObjectMetadata = None # type: ignore
|
|
51
|
-
raise NotImplementedError(
|
|
52
|
-
"DataFrameObjectMixin() should not be called directly. Use `@dataframeobject` annotation"
|
|
53
|
-
)
|
|
54
|
-
|
|
55
|
-
async def _serialize(self) -> Optional[DataObject]:
|
|
56
|
-
"""Saves internal `@dataframe`s using configured serialization protocol
|
|
57
|
-
and returns json-serialiable dataobject
|
|
58
|
-
"""
|
|
59
|
-
datasets = {}
|
|
60
|
-
for field in fields(self): # type: ignore
|
|
61
|
-
if _is_dataframe_field(field):
|
|
62
|
-
dataframe = getattr(self, field.name)
|
|
63
|
-
dataset = (
|
|
64
|
-
None if dataframe is None else await self.__storage.save(dataframe)
|
|
65
|
-
)
|
|
66
|
-
datasets[field.name] = dataset
|
|
67
|
-
else:
|
|
68
|
-
datasets[field.name] = getattr(self, field.name)
|
|
69
|
-
return self.__dataframeobject__.serialized_type(**datasets)
|
|
70
|
-
|
|
71
|
-
@classmethod
|
|
72
|
-
async def _deserialize(
|
|
73
|
-
cls, serialized: DataObject
|
|
74
|
-
) -> "DataFrameObjectMixin[DataFrameObjectT]":
|
|
75
|
-
"""From a serialized datframeobject, load inner `@dataframe` objects
|
|
76
|
-
and returns a `@dataframeobject` instance"""
|
|
77
|
-
dataframes = {}
|
|
78
|
-
for field in fields(cls): # type: ignore
|
|
79
|
-
if _is_dataframe_field(field):
|
|
80
|
-
dataset = getattr(serialized, field.name)
|
|
81
|
-
dataframe = (
|
|
82
|
-
None if dataset is None else await cls.__storage.load(dataset)
|
|
83
|
-
)
|
|
84
|
-
dataframes[field.name] = dataframe
|
|
85
|
-
else:
|
|
86
|
-
dataframes[field.name] = getattr(serialized, field.name)
|
|
87
|
-
return cls(**dataframes)
|
|
88
|
-
|
|
89
|
-
@classmethod
|
|
90
|
-
def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
|
|
91
|
-
schema = cls.__dataframeobject__.serialized_type.json_schema(*args, **kwargs)
|
|
92
|
-
schema[cls.__name__] = schema[cls.__dataframeobject__.serialized_type.__name__]
|
|
93
|
-
return schema
|
|
94
|
-
|
|
95
|
-
def to_json(self, *args, **kwargs) -> Dict[str, Any]:
|
|
96
|
-
raise RuntimeError(
|
|
97
|
-
f"`{type(self).__name__}` `@dataframeobject` cannot be converted to json directly. "
|
|
98
|
-
"i.e. use `return await DataFrames.serialize(obj)` to return it as a reponse."
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def _is_dataframe_field(field: Field) -> bool:
|
|
103
|
-
return any(
|
|
104
|
-
hasattr(field_type, "__dataframe__")
|
|
105
|
-
for field_type in [field.type, *get_args(field.type)]
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def _serialized_field_type(field: Field) -> Type[Any]:
|
|
110
|
-
"""Computes the `@dataobject` datatype used as a result
|
|
111
|
-
of serialized `@dataframeobject`
|
|
112
|
-
"""
|
|
113
|
-
if hasattr(field.type, "__dataframe__"):
|
|
114
|
-
return Dataset
|
|
115
|
-
if get_origin(field.type) is Union:
|
|
116
|
-
args = get_args(field.type)
|
|
117
|
-
if (
|
|
118
|
-
len(args) == 2
|
|
119
|
-
and any(hasattr(field_type, "__dataframe__") for field_type in args)
|
|
120
|
-
and any(field_type is NoneType for field_type in args)
|
|
121
|
-
):
|
|
122
|
-
return Optional[Dataset] # type: ignore
|
|
123
|
-
if _is_dataframe_field(field):
|
|
124
|
-
raise TypeError(
|
|
125
|
-
f"field {field.name}: only `DataFrameT` or `Optional[DataFrameT]` are supported"
|
|
126
|
-
)
|
|
127
|
-
return field.type
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def dataframeobject(
|
|
131
|
-
decorated_class=None,
|
|
132
|
-
) -> Callable[[Type], Type[DataFrameObjectMixin]]:
|
|
133
|
-
"""
|
|
134
|
-
Decorator for dataclasses intended to be used as dataframes.
|
|
135
|
-
"""
|
|
136
|
-
|
|
137
|
-
def add_dataframe_mixin(cls) -> Type[DataFrameObjectMixin]:
|
|
138
|
-
if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
|
|
139
|
-
amended_class = type(
|
|
140
|
-
cls.__name__,
|
|
141
|
-
(DataFrameObjectMixin,) + cls.__mro__,
|
|
142
|
-
dict(cls.__dict__),
|
|
143
|
-
)
|
|
144
|
-
return amended_class
|
|
145
|
-
return cls
|
|
146
|
-
|
|
147
|
-
def add_dataframeobject_metadata(cls):
|
|
148
|
-
serialized_fiels = [
|
|
149
|
-
(field.name, _serialized_field_type(field)) for field in fields(cls)
|
|
150
|
-
]
|
|
151
|
-
serialized_type = make_dataclass(cls.__name__ + "_", serialized_fiels)
|
|
152
|
-
serialized_type = dataobject(serialized_type, unsafe=True)
|
|
153
|
-
|
|
154
|
-
setattr(
|
|
155
|
-
cls,
|
|
156
|
-
"__dataframeobject__",
|
|
157
|
-
DataFrameObjectMetadata(
|
|
158
|
-
serialized_type=serialized_type,
|
|
159
|
-
),
|
|
160
|
-
)
|
|
161
|
-
|
|
162
|
-
def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool):
|
|
163
|
-
setattr(
|
|
164
|
-
cls,
|
|
165
|
-
"__data_object__",
|
|
166
|
-
{"unsafe": unsafe, "validate": validate, "schema": schema},
|
|
167
|
-
)
|
|
168
|
-
setattr(cls, "__stream_event__", StreamEventParams(None, None))
|
|
169
|
-
setattr(cls, "event_id", StreamEventMixin.event_id)
|
|
170
|
-
setattr(cls, "event_ts", StreamEventMixin.event_ts)
|
|
171
|
-
|
|
172
|
-
def wrap(cls) -> Type[DataFrameObjectMixin]:
|
|
173
|
-
if hasattr(cls, "__dataframeobject__"):
|
|
174
|
-
return cls
|
|
175
|
-
amended_class = add_dataframe_mixin(cls)
|
|
176
|
-
add_dataframeobject_metadata(amended_class)
|
|
177
|
-
add_dataobject_annotations(
|
|
178
|
-
amended_class, unsafe=False, validate=True, schema=True
|
|
179
|
-
)
|
|
180
|
-
return amended_class
|
|
181
|
-
|
|
182
|
-
if decorated_class is None:
|
|
183
|
-
return wrap
|
|
184
|
-
return wrap(decorated_class) # type: ignore
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
"""hopeit.engine dataframes plugin SETUP event.
|
|
2
|
-
|
|
3
|
-
This event executes when engine starts with dataframes plugin configuration file loaded,
|
|
4
|
-
and ensures that the engine will support serialization of `@dataframe` and `@dataframeobject`
|
|
5
|
-
types
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from functools import partial
|
|
9
|
-
|
|
10
|
-
from hopeit.app.context import EventContext
|
|
11
|
-
from hopeit.app.logger import app_logger
|
|
12
|
-
from hopeit.dataframes.dataframeobject import DataFrameObjectMixin
|
|
13
|
-
from hopeit.dataframes.serialization.dataset import find_protocol_impl
|
|
14
|
-
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
15
|
-
from hopeit.server import serialization
|
|
16
|
-
|
|
17
|
-
logger = app_logger()
|
|
18
|
-
|
|
19
|
-
__steps__ = ["register_serialization"]
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def register_serialization(payload: None, context: EventContext) -> None:
|
|
23
|
-
"""Setups serizaltion wrappers in hopeit.engine based on
|
|
24
|
-
`DataSerialization` settings configured in plugin configuration file
|
|
25
|
-
"""
|
|
26
|
-
logger.info(context, "Registering serialization methods...")
|
|
27
|
-
|
|
28
|
-
settings: DatasetSerialization = context.settings(
|
|
29
|
-
key="dataset_serialization", datatype=DatasetSerialization
|
|
30
|
-
)
|
|
31
|
-
impl = find_protocol_impl(settings.protocol)
|
|
32
|
-
|
|
33
|
-
storage = impl(
|
|
34
|
-
protocol=settings.protocol,
|
|
35
|
-
location=settings.location,
|
|
36
|
-
partition_dateformat=settings.partition_dateformat,
|
|
37
|
-
)
|
|
38
|
-
setattr(DataFrameObjectMixin, "_DataFrameObjectMixin__storage", storage)
|
|
39
|
-
|
|
40
|
-
serdeser_wrappers = {}
|
|
41
|
-
for (
|
|
42
|
-
serdeser,
|
|
43
|
-
methods,
|
|
44
|
-
) in serialization._SERDESER.items(): # pylint: disable=protected-access
|
|
45
|
-
serdeser_wrappers[serdeser] = (
|
|
46
|
-
partial(storage.ser_wrapper, methods[0]),
|
|
47
|
-
methods[1],
|
|
48
|
-
partial(storage.deser_wrapper, methods[2]),
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
for serdeser, methods in serdeser_wrappers.items():
|
|
52
|
-
serialization._SERDESER[serdeser] = methods # pylint: disable=protected-access
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/setup/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/top_level.txt
RENAMED
|
File without changes
|