hopeit.dataframes 0.24.2__tar.gz → 0.25.0b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/PKG-INFO +2 -2
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/__init__.py +55 -31
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/dataframe.py +22 -79
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/dataset.py +10 -2
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/files.py +1 -35
- hopeit_dataframes-0.25.0b2/src/hopeit/dataframes/setup/dataframes.py +36 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/PKG-INFO +2 -2
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/SOURCES.txt +0 -1
- hopeit_dataframes-0.25.0b2/src/hopeit.dataframes.egg-info/requires.txt +6 -0
- hopeit.dataframes-0.24.2/src/hopeit/dataframes/dataframeobject.py +0 -184
- hopeit.dataframes-0.24.2/src/hopeit/dataframes/setup/dataframes.py +0 -52
- hopeit.dataframes-0.24.2/src/hopeit.dataframes.egg-info/requires.txt +0 -6
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/README.md +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/setup.cfg +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/setup.py +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/py.typed +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/__init__.py +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/settings.py +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/setup/__init__.py +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
- {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.25.0b2
|
|
4
4
|
Summary: Hopeit Engine Dataframes Toolkit
|
|
5
5
|
Home-page: https://github.com/hopeit-git/hopeit.engine
|
|
6
6
|
Author: Leo Smerling and Pablo Canto
|
|
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
26
26
|
Classifier: Framework :: AsyncIO
|
|
27
27
|
Requires-Python: >=3.8
|
|
28
28
|
Description-Content-Type: text/markdown
|
|
29
|
-
Requires-Dist: hopeit.engine[fs-storage]==0.
|
|
29
|
+
Requires-Dist: hopeit.engine[fs-storage]==0.25.0b2
|
|
30
30
|
Requires-Dist: pandas
|
|
31
31
|
Requires-Dist: numpy
|
|
32
32
|
Provides-Extra: pyarrow
|
|
@@ -1,37 +1,74 @@
|
|
|
1
1
|
"""
|
|
2
2
|
hopeit.engine dataframes plugin entry point
|
|
3
3
|
|
|
4
|
-
This module exposes the
|
|
4
|
+
This module exposes the 2 main constructions to be used inside apps,
|
|
5
|
+
to extend @dataobject functionality supporting working with `pandas DataFrames`
|
|
5
6
|
`@dataframe` dataclass annotation
|
|
6
|
-
`@dataframeobject` dataclass annotation
|
|
7
7
|
`DataFrames` class to handle manipulation of dataframe/dataframeobjects
|
|
8
8
|
|
|
9
9
|
Usage:
|
|
10
10
|
```
|
|
11
|
-
from
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
16
|
+
from hopeit.dataframes import DataFrames, Dataset, dataframe
|
|
17
|
+
from hopeit.dataobjects import dataobject, dataclass
|
|
18
|
+
from hopeit.dataobjects.payload import Payload
|
|
12
19
|
|
|
13
20
|
@dataframe
|
|
14
21
|
@dataclass
|
|
15
|
-
class
|
|
22
|
+
class MyData:
|
|
16
23
|
field1: int
|
|
17
24
|
field2: str
|
|
18
25
|
...
|
|
19
26
|
|
|
20
|
-
@
|
|
27
|
+
@dataobject
|
|
21
28
|
@dataclass
|
|
22
29
|
class MyDataset:
|
|
23
30
|
dataset_name: str
|
|
24
|
-
example_data:
|
|
31
|
+
example_data: Dataset[MyData]
|
|
32
|
+
|
|
33
|
+
@dataobject
|
|
34
|
+
@dataclass
|
|
35
|
+
class MyWebResponse:
|
|
36
|
+
dataset_name: str
|
|
37
|
+
example_data: List[MyData.DataObject]
|
|
38
|
+
|
|
39
|
+
# This step is not needed if SETUP event is configured in app
|
|
40
|
+
DataFrames.setup(DatasetSerialization(
|
|
41
|
+
protocol="hopeit.dataframes.serialization.files.DatasetFileStorage",
|
|
42
|
+
location="/tmp/data",
|
|
43
|
+
partition_dateformat="%Y/%m/%d/%H/",
|
|
44
|
+
))
|
|
45
|
+
|
|
46
|
+
df = pd.DataFrame([ # Create or load a pandas DataFrame
|
|
47
|
+
{"field1": 1, "field2": "text1"},
|
|
48
|
+
{"field1": 2, "field2": "text2"},
|
|
49
|
+
])
|
|
50
|
+
|
|
51
|
+
my_data: MyData = DataFrames.from_df(MyData, df)
|
|
25
52
|
|
|
53
|
+
# return dataset after saving data to disk
|
|
54
|
+
my_dataset = MyDataset(
|
|
55
|
+
dataset_name="example",
|
|
56
|
+
example_data=await Dataset.save(my_data)
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
print(Payload.to_json(my_dataset))
|
|
26
60
|
|
|
27
|
-
|
|
61
|
+
my_data_again: MyData = await my_dataset.example_data.load()
|
|
28
62
|
|
|
29
|
-
|
|
63
|
+
print(DataFrames.df(my_data_again))
|
|
30
64
|
|
|
31
|
-
return
|
|
65
|
+
# return dataframe converted to list of dataobjects that can be directly converted to json
|
|
66
|
+
my_json_response = MyWebResponse(
|
|
32
67
|
dataset_name="example",
|
|
33
|
-
example_data=my_data
|
|
68
|
+
example_data=DataFrames.to_dataobjects(my_data)
|
|
34
69
|
)
|
|
70
|
+
|
|
71
|
+
print(Payload.to_json(my_json_response))
|
|
35
72
|
```
|
|
36
73
|
"""
|
|
37
74
|
|
|
@@ -40,35 +77,22 @@ from typing import Dict, Generic, Iterator, List, Type
|
|
|
40
77
|
import numpy as np
|
|
41
78
|
import pandas as pd
|
|
42
79
|
from hopeit.dataframes.dataframe import DataFrameT, dataframe
|
|
43
|
-
from hopeit.dataframes.
|
|
80
|
+
from hopeit.dataframes.serialization.dataset import Dataset
|
|
81
|
+
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
82
|
+
from hopeit.dataframes.setup.dataframes import register_serialization
|
|
44
83
|
from hopeit.dataobjects import DataObject
|
|
45
84
|
|
|
46
|
-
__all__ = ["DataFrames", "
|
|
85
|
+
__all__ = ["DataFrames", "Dataset", "dataframe"]
|
|
47
86
|
|
|
48
87
|
|
|
49
|
-
class DataFrames(Generic[DataFrameT,
|
|
88
|
+
class DataFrames(Generic[DataFrameT, DataObject]):
|
|
50
89
|
"""
|
|
51
90
|
Dataframes manipulation utilities methods
|
|
52
91
|
"""
|
|
53
92
|
|
|
54
93
|
@staticmethod
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
and converts to a `DataObject` json-compatible with pointers to saved
|
|
58
|
-
locations.
|
|
59
|
-
|
|
60
|
-
This method can be used to i.e. return `@dataframeobject`s as a JSON response
|
|
61
|
-
"""
|
|
62
|
-
return await obj._serialize() # type: ignore # pylint: disable=protected-access
|
|
63
|
-
|
|
64
|
-
@staticmethod
|
|
65
|
-
async def deserialize(
|
|
66
|
-
datatype: Type[DataFrameObjectT], dataobject: DataObject
|
|
67
|
-
) -> DataFrameObjectT:
|
|
68
|
-
"""Deserialize/load contents of serialized dataobject fields of a `@dataframeobject`
|
|
69
|
-
loading saved Dataset information for @dataframe fields
|
|
70
|
-
"""
|
|
71
|
-
return await datatype._deserialize(dataobject) # type: ignore # pylint: disable=protected-access
|
|
94
|
+
def setup(settings: DatasetSerialization):
|
|
95
|
+
register_serialization(settings)
|
|
72
96
|
|
|
73
97
|
@staticmethod
|
|
74
98
|
def from_df(
|
|
@@ -88,7 +112,7 @@ class DataFrames(Generic[DataFrameT, DataFrameObjectT, DataObject]):
|
|
|
88
112
|
|
|
89
113
|
@staticmethod
|
|
90
114
|
def from_dataobjects(
|
|
91
|
-
datatype: Type[DataFrameT], dataobjects: Iterator[
|
|
115
|
+
datatype: Type[DataFrameT], dataobjects: Iterator[DataObject]
|
|
92
116
|
) -> DataFrameT:
|
|
93
117
|
"""Converts standard json serializable `@dataobject`s to a single `@dataframe`"""
|
|
94
118
|
return datatype._from_dataobjects(dataobjects) # type: ignore # pylint: disable=protected-access
|
|
@@ -1,58 +1,31 @@
|
|
|
1
1
|
"""
|
|
2
2
|
DataFrames type abstractions.
|
|
3
|
-
|
|
4
|
-
Example:
|
|
5
|
-
|
|
6
|
-
from hopeit.dataobjects import dataclass # equivalent to `dataclasses.dataclass`
|
|
7
|
-
from hopeit.dataframes import dataframe
|
|
8
|
-
|
|
9
|
-
@dataframe
|
|
10
|
-
@dataclass
|
|
11
|
-
class MyObject:
|
|
12
|
-
name: str
|
|
13
|
-
number: int
|
|
14
3
|
"""
|
|
15
|
-
|
|
16
|
-
from dataclasses import Field, asdict, dataclass, fields, make_dataclass
|
|
4
|
+
import dataclasses
|
|
17
5
|
from datetime import date, datetime, timezone
|
|
18
|
-
from typing import Any, Callable, Dict, Generic, Iterator, List,
|
|
6
|
+
from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar
|
|
19
7
|
|
|
20
8
|
import numpy as np
|
|
21
9
|
import pandas as pd
|
|
22
|
-
from
|
|
10
|
+
from pydantic import create_model
|
|
11
|
+
from pydantic.fields import FieldInfo
|
|
12
|
+
|
|
23
13
|
from hopeit.dataobjects import (
|
|
24
14
|
DataObject,
|
|
25
15
|
StreamEventMixin,
|
|
26
16
|
StreamEventParams,
|
|
27
17
|
dataobject,
|
|
18
|
+
fields,
|
|
28
19
|
)
|
|
20
|
+
from hopeit.dataobjects.payload import Payload
|
|
29
21
|
|
|
30
22
|
DataFrameT = TypeVar("DataFrameT")
|
|
31
23
|
|
|
32
24
|
|
|
33
|
-
@dataclass
|
|
34
|
-
class DataFrameMetadata(
|
|
25
|
+
@dataclasses.dataclass
|
|
26
|
+
class DataFrameMetadata():
|
|
35
27
|
columns: List[str]
|
|
36
|
-
fields: Dict[str,
|
|
37
|
-
serialized_type: Type[DataObject]
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@dataclass
|
|
41
|
-
class DataFrameParams:
|
|
42
|
-
"""
|
|
43
|
-
Helper class used to access attributes in @dataframe
|
|
44
|
-
decorated objects, based on dot notation expressions
|
|
45
|
-
"""
|
|
46
|
-
|
|
47
|
-
datatypes: Optional[str]
|
|
48
|
-
|
|
49
|
-
@staticmethod
|
|
50
|
-
def extract_attr(obj, expr):
|
|
51
|
-
value = obj
|
|
52
|
-
for attr_name in expr.split("."):
|
|
53
|
-
if value:
|
|
54
|
-
value = getattr(value, attr_name)
|
|
55
|
-
return value
|
|
28
|
+
fields: Dict[str, FieldInfo]
|
|
56
29
|
|
|
57
30
|
|
|
58
31
|
class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
@@ -99,7 +72,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
99
72
|
|
|
100
73
|
@classmethod
|
|
101
74
|
def _from_dataobjects(cls, items: Iterator[DataObject]) -> DataFrameT:
|
|
102
|
-
return cls._from_df(pd.DataFrame(
|
|
75
|
+
return cls._from_df(pd.DataFrame(Payload.to_obj(item) for item in items)) # type: ignore[misc]
|
|
103
76
|
|
|
104
77
|
@classmethod
|
|
105
78
|
def _from_df_unsafe(cls, df: pd.DataFrame, **series: pd.Series) -> DataFrameT:
|
|
@@ -117,40 +90,10 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
117
90
|
|
|
118
91
|
def _to_dataobjects(self) -> List[DataObject]:
|
|
119
92
|
return [
|
|
120
|
-
self.
|
|
93
|
+
self.DataObject(**fields)
|
|
121
94
|
for fields in self.__df.to_dict(orient="records")
|
|
122
95
|
]
|
|
123
96
|
|
|
124
|
-
def to_json(self, *args, **kwargs) -> str:
|
|
125
|
-
raise NotImplementedError(
|
|
126
|
-
"Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
|
|
127
|
-
)
|
|
128
|
-
|
|
129
|
-
def to_dict(self, *args, **kwargs) -> Dict[str, Any]:
|
|
130
|
-
raise NotImplementedError(
|
|
131
|
-
"Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
|
|
132
|
-
)
|
|
133
|
-
|
|
134
|
-
@classmethod
|
|
135
|
-
def from_json(cls, *args, **kwargs) -> DataObject:
|
|
136
|
-
return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
|
|
137
|
-
|
|
138
|
-
@classmethod
|
|
139
|
-
def from_dict(
|
|
140
|
-
cls,
|
|
141
|
-
*args,
|
|
142
|
-
**kwargs,
|
|
143
|
-
) -> DataObject:
|
|
144
|
-
return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
|
|
145
|
-
|
|
146
|
-
@classmethod
|
|
147
|
-
def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
|
|
148
|
-
if cls.__data_object__["schema"]:
|
|
149
|
-
schema = cls.__dataframe__.serialized_type.json_schema(*args, **kwargs)
|
|
150
|
-
schema[cls.__name__] = schema[cls.__dataframe__.serialized_type.__name__]
|
|
151
|
-
return schema
|
|
152
|
-
return {}
|
|
153
|
-
|
|
154
97
|
def event_id(self, *args, **kwargs) -> str:
|
|
155
98
|
return ""
|
|
156
99
|
|
|
@@ -174,7 +117,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
|
|
|
174
117
|
|
|
175
118
|
def _coerce_datatypes(self, df: pd.DataFrame) -> Dict[str, pd.Series]:
|
|
176
119
|
return {
|
|
177
|
-
name: self.DATATYPE_MAPPING[field.
|
|
120
|
+
name: self.DATATYPE_MAPPING[field.annotation](df[name]) # type: ignore
|
|
178
121
|
for name, field in self.__dataframe__.fields.items()
|
|
179
122
|
}
|
|
180
123
|
|
|
@@ -193,7 +136,7 @@ def dataframe(
|
|
|
193
136
|
if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
|
|
194
137
|
amended_class = type(
|
|
195
138
|
cls.__name__,
|
|
196
|
-
(DataFrameMixin,
|
|
139
|
+
(DataFrameMixin, ) + cls.__mro__,
|
|
197
140
|
dict(cls.__dict__),
|
|
198
141
|
)
|
|
199
142
|
setattr(amended_class, "__init__", DataFrameMixin.__init_from_series__)
|
|
@@ -201,17 +144,17 @@ def dataframe(
|
|
|
201
144
|
return cls
|
|
202
145
|
|
|
203
146
|
def add_dataframe_metadata(cls):
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
147
|
+
serialized_fields = {k: (v.annotation, v) for k, v in fields(cls).items()}
|
|
148
|
+
dataobject_type = create_model(cls.__name__+"DataObject", **serialized_fields)
|
|
149
|
+
dataobject_type = dataobject(dataobject_type, unsafe=True)
|
|
207
150
|
|
|
151
|
+
setattr(cls, "DataObject", dataobject_type)
|
|
208
152
|
setattr(
|
|
209
153
|
cls,
|
|
210
154
|
"__dataframe__",
|
|
211
155
|
DataFrameMetadata(
|
|
212
|
-
columns=
|
|
213
|
-
fields=
|
|
214
|
-
serialized_type=serialized_type,
|
|
156
|
+
columns=list(fields(cls).keys()),
|
|
157
|
+
fields=dict(fields(cls).items()),
|
|
215
158
|
),
|
|
216
159
|
)
|
|
217
160
|
|
|
@@ -226,14 +169,14 @@ def dataframe(
|
|
|
226
169
|
setattr(cls, "event_ts", StreamEventMixin.event_ts)
|
|
227
170
|
|
|
228
171
|
def set_fields_optional(cls):
|
|
229
|
-
for field in fields(cls):
|
|
172
|
+
for _, field in fields(cls).items():
|
|
230
173
|
field.default = None
|
|
231
174
|
|
|
232
175
|
def wrap(cls) -> Type[DataFrameMixin]:
|
|
233
176
|
if hasattr(cls, "__dataframe__"):
|
|
234
177
|
return cls
|
|
178
|
+
add_dataframe_metadata(cls)
|
|
235
179
|
amended_class = add_dataframe_mixin(cls)
|
|
236
|
-
add_dataframe_metadata(amended_class)
|
|
237
180
|
add_dataobject_annotations(amended_class, unsafe, validate, schema)
|
|
238
181
|
set_fields_optional(amended_class)
|
|
239
182
|
return amended_class
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"""
|
|
3
3
|
|
|
4
4
|
from importlib import import_module
|
|
5
|
-
from typing import Type, TypeVar
|
|
5
|
+
from typing import Generic, Type, TypeVar
|
|
6
6
|
|
|
7
7
|
from hopeit.dataobjects import dataclass, dataobject
|
|
8
8
|
|
|
@@ -11,12 +11,20 @@ DataFrameT = TypeVar("DataFrameT")
|
|
|
11
11
|
|
|
12
12
|
@dataobject
|
|
13
13
|
@dataclass
|
|
14
|
-
class Dataset:
|
|
14
|
+
class Dataset(Generic[DataFrameT]):
|
|
15
|
+
"""Persisted representation of a @dataframe object"""
|
|
15
16
|
protocol: str
|
|
16
17
|
partition_key: str
|
|
17
18
|
key: str
|
|
18
19
|
datatype: str
|
|
19
20
|
|
|
21
|
+
async def load(self) -> DataFrameT:
|
|
22
|
+
return await self.__storage.load(self) # type: ignore[attr-defined]
|
|
23
|
+
|
|
24
|
+
@classmethod
|
|
25
|
+
async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
|
|
26
|
+
return await cls.__storage.save(dataframe) # type: ignore[attr-defined]
|
|
27
|
+
|
|
20
28
|
|
|
21
29
|
def find_protocol_impl(qual_type_name: str) -> Type:
|
|
22
30
|
mod_name, type_name = (
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/files.py
RENAMED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
import io
|
|
5
5
|
from importlib import import_module
|
|
6
|
-
from typing import
|
|
6
|
+
from typing import Generic, Optional, Type, TypeVar
|
|
7
7
|
from uuid import uuid4
|
|
8
8
|
|
|
9
9
|
import pandas as pd
|
|
@@ -67,40 +67,6 @@ class DatasetFileStorage(Generic[DataFrameT]):
|
|
|
67
67
|
df = pd.read_parquet(io.BytesIO(data), engine="pyarrow")
|
|
68
68
|
return datatype._from_df(df) # pylint: disable=protected-access
|
|
69
69
|
|
|
70
|
-
async def ser_wrapper(
|
|
71
|
-
self,
|
|
72
|
-
base_serialization: Callable,
|
|
73
|
-
data: Union[EventPayloadType, DataFrameT],
|
|
74
|
-
level: int,
|
|
75
|
-
) -> bytes:
|
|
76
|
-
"""Serialization wrapper that plugs into hopeit.engine
|
|
77
|
-
serialization when dataframes plugin is initialized
|
|
78
|
-
"""
|
|
79
|
-
if hasattr(data, "__dataframeobject__"):
|
|
80
|
-
data = await data._serialize() # type: ignore # pylint: disable=protected-access
|
|
81
|
-
if hasattr(data, "__dataframe__"):
|
|
82
|
-
data = await self.save(data) # type: ignore
|
|
83
|
-
return await base_serialization(data, level)
|
|
84
|
-
|
|
85
|
-
async def deser_wrapper(
|
|
86
|
-
self,
|
|
87
|
-
base_deserialization: Callable,
|
|
88
|
-
data: bytes,
|
|
89
|
-
datatype: Union[Type[EventPayloadType], Type[DataFrameT]],
|
|
90
|
-
) -> Union[EventPayloadType, DataFrameT]:
|
|
91
|
-
"""Deserialization wrapper that plugs into hopeit.engine
|
|
92
|
-
deserialization when dataframes plugin is initialized
|
|
93
|
-
"""
|
|
94
|
-
if hasattr(datatype, "__dataframeobject__"):
|
|
95
|
-
dataset = await base_deserialization(
|
|
96
|
-
data, datatype.__dataframeobject__.serialized_type # type: ignore
|
|
97
|
-
)
|
|
98
|
-
return await datatype._deserialize(dataset) # type: ignore # pylint: disable=protected-access
|
|
99
|
-
if hasattr(datatype, "__dataframe__"):
|
|
100
|
-
dataset = await base_deserialization(data, Dataset)
|
|
101
|
-
return await self.load(dataset)
|
|
102
|
-
return await base_deserialization(data, datatype)
|
|
103
|
-
|
|
104
70
|
|
|
105
71
|
def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
|
|
106
72
|
"""Returns dataframe class based on type name used during serialization"""
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""hopeit.engine dataframes plugin SETUP event.
|
|
2
|
+
|
|
3
|
+
This event executes when engine starts with dataframes plugin configuration file loaded,
|
|
4
|
+
and ensures that the engine will support serialization of `@dataframe` and `@dataframeobject`
|
|
5
|
+
types
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from hopeit.app.context import EventContext
|
|
9
|
+
from hopeit.app.logger import app_logger
|
|
10
|
+
from hopeit.dataframes.serialization.dataset import Dataset, find_protocol_impl
|
|
11
|
+
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
12
|
+
|
|
13
|
+
logger = app_logger()
|
|
14
|
+
|
|
15
|
+
__steps__ = ["setup"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def setup(payload: None, context: EventContext) -> None:
|
|
19
|
+
"""Sets up serialization wrappers in hopeit.engine based on
|
|
20
|
+
`DatasetSerialization` settings configured in plugin configuration file
|
|
21
|
+
"""
|
|
22
|
+
logger.info(context, "Configuring Dataset serialization...")
|
|
23
|
+
settings: DatasetSerialization = context.settings(
|
|
24
|
+
key="dataset_serialization", datatype=DatasetSerialization
|
|
25
|
+
)
|
|
26
|
+
register_serialization(settings)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def register_serialization(settings: DatasetSerialization):
|
|
30
|
+
impl = find_protocol_impl(settings.protocol)
|
|
31
|
+
storage = impl(
|
|
32
|
+
protocol=settings.protocol,
|
|
33
|
+
location=settings.location,
|
|
34
|
+
partition_dateformat=settings.partition_dateformat,
|
|
35
|
+
)
|
|
36
|
+
setattr(Dataset, "_Dataset__storage", storage)
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.25.0b2
|
|
4
4
|
Summary: Hopeit Engine Dataframes Toolkit
|
|
5
5
|
Home-page: https://github.com/hopeit-git/hopeit.engine
|
|
6
6
|
Author: Leo Smerling and Pablo Canto
|
|
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
26
26
|
Classifier: Framework :: AsyncIO
|
|
27
27
|
Requires-Python: >=3.8
|
|
28
28
|
Description-Content-Type: text/markdown
|
|
29
|
-
Requires-Dist: hopeit.engine[fs-storage]==0.
|
|
29
|
+
Requires-Dist: hopeit.engine[fs-storage]==0.25.0b2
|
|
30
30
|
Requires-Dist: pandas
|
|
31
31
|
Requires-Dist: numpy
|
|
32
32
|
Provides-Extra: pyarrow
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/SOURCES.txt
RENAMED
|
@@ -7,7 +7,6 @@ src/hopeit.dataframes.egg-info/requires.txt
|
|
|
7
7
|
src/hopeit.dataframes.egg-info/top_level.txt
|
|
8
8
|
src/hopeit/dataframes/__init__.py
|
|
9
9
|
src/hopeit/dataframes/dataframe.py
|
|
10
|
-
src/hopeit/dataframes/dataframeobject.py
|
|
11
10
|
src/hopeit/dataframes/py.typed
|
|
12
11
|
src/hopeit/dataframes/serialization/__init__.py
|
|
13
12
|
src/hopeit/dataframes/serialization/dataset.py
|
|
@@ -1,184 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
`@dataframeobject` annotation mixin to serialize a group of `@dataframe`s.
|
|
3
|
-
|
|
4
|
-
Datasets behaves as DataObject so they can be used as payload
|
|
5
|
-
for endpoints and streams.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from dataclasses import Field, dataclass, fields, make_dataclass
|
|
9
|
-
from typing import (
|
|
10
|
-
Any,
|
|
11
|
-
Callable,
|
|
12
|
-
ClassVar,
|
|
13
|
-
Dict,
|
|
14
|
-
Generic,
|
|
15
|
-
Optional,
|
|
16
|
-
Type,
|
|
17
|
-
TypeVar,
|
|
18
|
-
Union,
|
|
19
|
-
get_args,
|
|
20
|
-
get_origin,
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
from hopeit.dataframes.serialization.dataset import Dataset
|
|
24
|
-
from hopeit.dataobjects import (
|
|
25
|
-
DataObject,
|
|
26
|
-
StreamEventMixin,
|
|
27
|
-
StreamEventParams,
|
|
28
|
-
dataobject,
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
DataFrameObjectT = TypeVar("DataFrameObjectT")
|
|
32
|
-
NoneType = type(None)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
@dataclass
|
|
36
|
-
class DataFrameObjectMetadata(Generic[DataObject]):
|
|
37
|
-
serialized_type: Type[DataObject]
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
class DataFrameObjectMixin(Generic[DataFrameObjectT]):
|
|
41
|
-
"""
|
|
42
|
-
MixIn class to add functionality for `@dataframeobject`s
|
|
43
|
-
|
|
44
|
-
Do not use this class directly, instead use `@dataframeobject` class decorator.
|
|
45
|
-
"""
|
|
46
|
-
|
|
47
|
-
__storage: ClassVar[Any] = None # pylint: disable=invalid-name
|
|
48
|
-
|
|
49
|
-
def __init__(self) -> None:
|
|
50
|
-
self.__dataframeobject__: DataFrameObjectMetadata = None # type: ignore
|
|
51
|
-
raise NotImplementedError(
|
|
52
|
-
"DataFrameObjectMixin() should not be called directly. Use `@dataframeobject` annotation"
|
|
53
|
-
)
|
|
54
|
-
|
|
55
|
-
async def _serialize(self) -> Optional[DataObject]:
|
|
56
|
-
"""Saves internal `@dataframe`s using configured serialization protocol
|
|
57
|
-
and returns json-serializable dataobject
|
|
58
|
-
"""
|
|
59
|
-
datasets = {}
|
|
60
|
-
for field in fields(self): # type: ignore
|
|
61
|
-
if _is_dataframe_field(field):
|
|
62
|
-
dataframe = getattr(self, field.name)
|
|
63
|
-
dataset = (
|
|
64
|
-
None if dataframe is None else await self.__storage.save(dataframe)
|
|
65
|
-
)
|
|
66
|
-
datasets[field.name] = dataset
|
|
67
|
-
else:
|
|
68
|
-
datasets[field.name] = getattr(self, field.name)
|
|
69
|
-
return self.__dataframeobject__.serialized_type(**datasets)
|
|
70
|
-
|
|
71
|
-
@classmethod
|
|
72
|
-
async def _deserialize(
|
|
73
|
-
cls, serialized: DataObject
|
|
74
|
-
) -> "DataFrameObjectMixin[DataFrameObjectT]":
|
|
75
|
-
"""From a serialized dataframeobject, load inner `@dataframe` objects
|
|
76
|
-
and returns a `@dataframeobject` instance"""
|
|
77
|
-
dataframes = {}
|
|
78
|
-
for field in fields(cls): # type: ignore
|
|
79
|
-
if _is_dataframe_field(field):
|
|
80
|
-
dataset = getattr(serialized, field.name)
|
|
81
|
-
dataframe = (
|
|
82
|
-
None if dataset is None else await cls.__storage.load(dataset)
|
|
83
|
-
)
|
|
84
|
-
dataframes[field.name] = dataframe
|
|
85
|
-
else:
|
|
86
|
-
dataframes[field.name] = getattr(serialized, field.name)
|
|
87
|
-
return cls(**dataframes)
|
|
88
|
-
|
|
89
|
-
@classmethod
|
|
90
|
-
def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
|
|
91
|
-
schema = cls.__dataframeobject__.serialized_type.json_schema(*args, **kwargs)
|
|
92
|
-
schema[cls.__name__] = schema[cls.__dataframeobject__.serialized_type.__name__]
|
|
93
|
-
return schema
|
|
94
|
-
|
|
95
|
-
def to_json(self, *args, **kwargs) -> Dict[str, Any]:
|
|
96
|
-
raise RuntimeError(
|
|
97
|
-
f"`{type(self).__name__}` `@dataframeobject` cannot be converted to json directly. "
|
|
98
|
-
"i.e. use `return await DataFrames.serialize(obj)` to return it as a reponse."
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def _is_dataframe_field(field: Field) -> bool:
|
|
103
|
-
return any(
|
|
104
|
-
hasattr(field_type, "__dataframe__")
|
|
105
|
-
for field_type in [field.type, *get_args(field.type)]
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def _serialized_field_type(field: Field) -> Type[Any]:
|
|
110
|
-
"""Computes the `@dataobject` datatype used as a result
|
|
111
|
-
of serialized `@dataframeobject`
|
|
112
|
-
"""
|
|
113
|
-
if hasattr(field.type, "__dataframe__"):
|
|
114
|
-
return Dataset
|
|
115
|
-
if get_origin(field.type) is Union:
|
|
116
|
-
args = get_args(field.type)
|
|
117
|
-
if (
|
|
118
|
-
len(args) == 2
|
|
119
|
-
and any(hasattr(field_type, "__dataframe__") for field_type in args)
|
|
120
|
-
and any(field_type is NoneType for field_type in args)
|
|
121
|
-
):
|
|
122
|
-
return Optional[Dataset] # type: ignore
|
|
123
|
-
if _is_dataframe_field(field):
|
|
124
|
-
raise TypeError(
|
|
125
|
-
f"field {field.name}: only `DataFrameT` or `Optional[DataFrameT]` are supported"
|
|
126
|
-
)
|
|
127
|
-
return field.type
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def dataframeobject(
|
|
131
|
-
decorated_class=None,
|
|
132
|
-
) -> Callable[[Type], Type[DataFrameObjectMixin]]:
|
|
133
|
-
"""
|
|
134
|
-
Decorator for dataclasses intended to be used as dataframes.
|
|
135
|
-
"""
|
|
136
|
-
|
|
137
|
-
def add_dataframe_mixin(cls) -> Type[DataFrameObjectMixin]:
|
|
138
|
-
if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
|
|
139
|
-
amended_class = type(
|
|
140
|
-
cls.__name__,
|
|
141
|
-
(DataFrameObjectMixin,) + cls.__mro__,
|
|
142
|
-
dict(cls.__dict__),
|
|
143
|
-
)
|
|
144
|
-
return amended_class
|
|
145
|
-
return cls
|
|
146
|
-
|
|
147
|
-
def add_dataframeobject_metadata(cls):
|
|
148
|
-
serialized_fiels = [
|
|
149
|
-
(field.name, _serialized_field_type(field)) for field in fields(cls)
|
|
150
|
-
]
|
|
151
|
-
serialized_type = make_dataclass(cls.__name__ + "_", serialized_fiels)
|
|
152
|
-
serialized_type = dataobject(serialized_type, unsafe=True)
|
|
153
|
-
|
|
154
|
-
setattr(
|
|
155
|
-
cls,
|
|
156
|
-
"__dataframeobject__",
|
|
157
|
-
DataFrameObjectMetadata(
|
|
158
|
-
serialized_type=serialized_type,
|
|
159
|
-
),
|
|
160
|
-
)
|
|
161
|
-
|
|
162
|
-
def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool):
|
|
163
|
-
setattr(
|
|
164
|
-
cls,
|
|
165
|
-
"__data_object__",
|
|
166
|
-
{"unsafe": unsafe, "validate": validate, "schema": schema},
|
|
167
|
-
)
|
|
168
|
-
setattr(cls, "__stream_event__", StreamEventParams(None, None))
|
|
169
|
-
setattr(cls, "event_id", StreamEventMixin.event_id)
|
|
170
|
-
setattr(cls, "event_ts", StreamEventMixin.event_ts)
|
|
171
|
-
|
|
172
|
-
def wrap(cls) -> Type[DataFrameObjectMixin]:
|
|
173
|
-
if hasattr(cls, "__dataframeobject__"):
|
|
174
|
-
return cls
|
|
175
|
-
amended_class = add_dataframe_mixin(cls)
|
|
176
|
-
add_dataframeobject_metadata(amended_class)
|
|
177
|
-
add_dataobject_annotations(
|
|
178
|
-
amended_class, unsafe=False, validate=True, schema=True
|
|
179
|
-
)
|
|
180
|
-
return amended_class
|
|
181
|
-
|
|
182
|
-
if decorated_class is None:
|
|
183
|
-
return wrap
|
|
184
|
-
return wrap(decorated_class) # type: ignore
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
"""hopeit.engine dataframes plugin SETUP event.
|
|
2
|
-
|
|
3
|
-
This event executes when engine starts with dataframes plugin configuration file loaded,
|
|
4
|
-
and ensures that the engine will support serialization of `@dataframe` and `@dataframeobject`
|
|
5
|
-
types
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from functools import partial
|
|
9
|
-
|
|
10
|
-
from hopeit.app.context import EventContext
|
|
11
|
-
from hopeit.app.logger import app_logger
|
|
12
|
-
from hopeit.dataframes.dataframeobject import DataFrameObjectMixin
|
|
13
|
-
from hopeit.dataframes.serialization.dataset import find_protocol_impl
|
|
14
|
-
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
15
|
-
from hopeit.server import serialization
|
|
16
|
-
|
|
17
|
-
logger = app_logger()
|
|
18
|
-
|
|
19
|
-
__steps__ = ["register_serialization"]
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def register_serialization(payload: None, context: EventContext) -> None:
|
|
23
|
-
"""Sets up serialization wrappers in hopeit.engine based on
|
|
24
|
-
`DatasetSerialization` settings configured in plugin configuration file
|
|
25
|
-
"""
|
|
26
|
-
logger.info(context, "Registering serialization methods...")
|
|
27
|
-
|
|
28
|
-
settings: DatasetSerialization = context.settings(
|
|
29
|
-
key="dataset_serialization", datatype=DatasetSerialization
|
|
30
|
-
)
|
|
31
|
-
impl = find_protocol_impl(settings.protocol)
|
|
32
|
-
|
|
33
|
-
storage = impl(
|
|
34
|
-
protocol=settings.protocol,
|
|
35
|
-
location=settings.location,
|
|
36
|
-
partition_dateformat=settings.partition_dateformat,
|
|
37
|
-
)
|
|
38
|
-
setattr(DataFrameObjectMixin, "_DataFrameObjectMixin__storage", storage)
|
|
39
|
-
|
|
40
|
-
serdeser_wrappers = {}
|
|
41
|
-
for (
|
|
42
|
-
serdeser,
|
|
43
|
-
methods,
|
|
44
|
-
) in serialization._SERDESER.items(): # pylint: disable=protected-access
|
|
45
|
-
serdeser_wrappers[serdeser] = (
|
|
46
|
-
partial(storage.ser_wrapper, methods[0]),
|
|
47
|
-
methods[1],
|
|
48
|
-
partial(storage.deser_wrapper, methods[2]),
|
|
49
|
-
)
|
|
50
|
-
|
|
51
|
-
for serdeser, methods in serdeser_wrappers.items():
|
|
52
|
-
serialization._SERDESER[serdeser] = methods # pylint: disable=protected-access
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/setup/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/top_level.txt
RENAMED
|
File without changes
|