hopeit.dataframes 0.24.2__tar.gz → 0.25.0b2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/PKG-INFO +2 -2
  2. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/__init__.py +55 -31
  3. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/dataframe.py +22 -79
  4. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/dataset.py +10 -2
  5. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/files.py +1 -35
  6. hopeit_dataframes-0.25.0b2/src/hopeit/dataframes/setup/dataframes.py +36 -0
  7. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/PKG-INFO +2 -2
  8. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/SOURCES.txt +0 -1
  9. hopeit_dataframes-0.25.0b2/src/hopeit.dataframes.egg-info/requires.txt +6 -0
  10. hopeit.dataframes-0.24.2/src/hopeit/dataframes/dataframeobject.py +0 -184
  11. hopeit.dataframes-0.24.2/src/hopeit/dataframes/setup/dataframes.py +0 -52
  12. hopeit.dataframes-0.24.2/src/hopeit.dataframes.egg-info/requires.txt +0 -6
  13. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/README.md +0 -0
  14. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/setup.cfg +0 -0
  15. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/setup.py +0 -0
  16. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/py.typed +0 -0
  17. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/__init__.py +0 -0
  18. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/serialization/settings.py +0 -0
  19. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit/dataframes/setup/__init__.py +0 -0
  20. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
  21. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0b2}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.24.2
3
+ Version: 0.25.0b2
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
5
  Home-page: https://github.com/hopeit-git/hopeit.engine
6
6
  Author: Leo Smerling and Pablo Canto
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
26
  Classifier: Framework :: AsyncIO
27
27
  Requires-Python: >=3.8
28
28
  Description-Content-Type: text/markdown
29
- Requires-Dist: hopeit.engine[fs-storage]==0.24.2
29
+ Requires-Dist: hopeit.engine[fs-storage]==0.25.0b2
30
30
  Requires-Dist: pandas
31
31
  Requires-Dist: numpy
32
32
  Provides-Extra: pyarrow
@@ -1,37 +1,74 @@
1
1
  """
2
2
  hopeit.engine dataframes plugin entry point
3
3
 
4
- This module exposes the 3 main constructions to be used inside apps:
4
+ This module exposes the 2 main constructions to be used inside apps,
5
+ to extend @dataobject functionality to support working with `pandas DataFrames`
5
6
  `@dataframe` dataclass annotation
6
- `@dataframeobject` dataclass annotation
7
7
  `DataFrames` class to handle manipulation of dataframe/dataframeobjects
8
8
 
9
9
  Usage:
10
10
  ```
11
- from hopeit.dataframes import DataFrames, dataframe, dataframeobject
11
+ from typing import List
12
+
13
+ import pandas as pd
14
+
15
+ from hopeit.dataframes.serialization.settings import DatasetSerialization
16
+ from hopeit.dataframes import DataFrames, Dataset, dataframe
17
+ from hopeit.dataobjects import dataobject, dataclass
18
+ from hopeit.dataobjects.payload import Payload
12
19
 
13
20
  @dataframe
14
21
  @dataclass
15
- class MyDataFrame:
22
+ class MyData:
16
23
  field1: int
17
24
  field2: str
18
25
  ...
19
26
 
20
- @dataframeobject
27
+ @dataobject
21
28
  @dataclass
22
29
  class MyDataset:
23
30
  dataset_name: str
24
- example_data: MyDataFrame
31
+ example_data: Dataset[MyData]
32
+
33
+ @dataobject
34
+ @dataclass
35
+ class MyWebResponse:
36
+ dataset_name: str
37
+ example_data: List[MyData.DataObject]
38
+
39
+ # This step is not needed if SETUP event is configured in app
40
+ DataFrames.setup(DatasetSerialization(
41
+ protocol="hopeit.dataframes.serialization.files.DatasetFileStorage",
42
+ location="/tmp/data",
43
+ partition_dateformat="%Y/%m/%d/%H/",
44
+ ))
45
+
46
+ df = pd.DataFrame([ # Create or load a pandas DataFrame
47
+ {"field1": 1, "field2": "text1"},
48
+ {"field1": 2, "field2": "text2"},
49
+ ])
50
+
51
+ my_data: MyData = DataFrames.from_df(MyData, df)
25
52
 
53
+ # return dataset after saving data to disk
54
+ my_dataset = MyDataset(
55
+ dataset_name="example",
56
+ example_data=await Dataset.save(my_data)
57
+ )
58
+
59
+ print(Payload.to_json(my_dataset))
26
60
 
27
- df = pd.DataFrame(...) # create or load your pandas dataframe
61
+ my_data_again: MyData = await my_dataset.example_data.load()
28
62
 
29
- my_data = DataFrames.from_df(pd.DataFrame(..))
63
+ print(DataFrames.df(my_data_again))
30
64
 
31
- return MyDataSet(
65
+ # return dataframe converted to list of dataobjects that can be directly converted to json
66
+ my_json_response = MyWebResponse(
32
67
  dataset_name="example",
33
- example_data=my_data
68
+ example_data=DataFrames.to_dataobjects(my_data)
34
69
  )
70
+
71
+ print(Payload.to_json(my_json_response))
35
72
  ```
36
73
  """
37
74
 
@@ -40,35 +77,22 @@ from typing import Dict, Generic, Iterator, List, Type
40
77
  import numpy as np
41
78
  import pandas as pd
42
79
  from hopeit.dataframes.dataframe import DataFrameT, dataframe
43
- from hopeit.dataframes.dataframeobject import DataFrameObjectT, dataframeobject
80
+ from hopeit.dataframes.serialization.dataset import Dataset
81
+ from hopeit.dataframes.serialization.settings import DatasetSerialization
82
+ from hopeit.dataframes.setup.dataframes import register_serialization
44
83
  from hopeit.dataobjects import DataObject
45
84
 
46
- __all__ = ["DataFrames", "dataframe", "dataframeobject"]
85
+ __all__ = ["DataFrames", "Dataset", "dataframe"]
47
86
 
48
87
 
49
- class DataFrames(Generic[DataFrameT, DataFrameObjectT, DataObject]):
88
+ class DataFrames(Generic[DataFrameT, DataObject]):
50
89
  """
51
90
  Dataframes manipulation utilities methods
52
91
  """
53
92
 
54
93
  @staticmethod
55
- async def serialize(obj: DataFrameObjectT) -> DataObject:
56
- """Serialize/saves contents of dataframe fields of a `@dataframeobject`
57
- and converts to a `DataObject` json-compatible with pointers to saved
58
- locations.
59
-
60
- This method can be used to i.e. return `@dataframeobject`s as a JSON response
61
- """
62
- return await obj._serialize() # type: ignore # pylint: disable=protected-access
63
-
64
- @staticmethod
65
- async def deserialize(
66
- datatype: Type[DataFrameObjectT], dataobject: DataObject
67
- ) -> DataFrameObjectT:
68
- """Deserialize/load contents of serialized dataobject fields of a `@dataframeobject`
69
- loading saved Dataset information for @dataframe fields
70
- """
71
- return await datatype._deserialize(dataobject) # type: ignore # pylint: disable=protected-access
94
+ def setup(settings: DatasetSerialization):
95
+ register_serialization(settings)
72
96
 
73
97
  @staticmethod
74
98
  def from_df(
@@ -88,7 +112,7 @@ class DataFrames(Generic[DataFrameT, DataFrameObjectT, DataObject]):
88
112
 
89
113
  @staticmethod
90
114
  def from_dataobjects(
91
- datatype: Type[DataFrameT], dataobjects: Iterator[DataFrameObjectT]
115
+ datatype: Type[DataFrameT], dataobjects: Iterator[DataObject]
92
116
  ) -> DataFrameT:
93
117
  """Converts standard json serializable `@dataobject`s to a single `@dataframe`"""
94
118
  return datatype._from_dataobjects(dataobjects) # type: ignore # pylint: disable=protected-access
@@ -1,58 +1,31 @@
1
1
  """
2
2
  DataFrames type abstractions.
3
-
4
- Example:
5
-
6
- from hopeit.dataobjects import dataclass # equivalent to `dataclasses.dataclass`
7
- from hopeit.dataframes import dataframe
8
-
9
- @dataframe
10
- @dataclass
11
- class MyObject:
12
- name: str
13
- number: int
14
3
  """
15
-
16
- from dataclasses import Field, asdict, dataclass, fields, make_dataclass
4
+ import dataclasses
17
5
  from datetime import date, datetime, timezone
18
- from typing import Any, Callable, Dict, Generic, Iterator, List, Optional, Type, TypeVar
6
+ from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar
19
7
 
20
8
  import numpy as np
21
9
  import pandas as pd
22
- from dataclasses_jsonschema import JsonSchemaMixin
10
+ from pydantic import create_model
11
+ from pydantic.fields import FieldInfo
12
+
23
13
  from hopeit.dataobjects import (
24
14
  DataObject,
25
15
  StreamEventMixin,
26
16
  StreamEventParams,
27
17
  dataobject,
18
+ fields,
28
19
  )
20
+ from hopeit.dataobjects.payload import Payload
29
21
 
30
22
  DataFrameT = TypeVar("DataFrameT")
31
23
 
32
24
 
33
- @dataclass
34
- class DataFrameMetadata(Generic[DataObject]):
25
+ @dataclasses.dataclass
26
+ class DataFrameMetadata():
35
27
  columns: List[str]
36
- fields: Dict[str, Field]
37
- serialized_type: Type[DataObject]
38
-
39
-
40
- @dataclass
41
- class DataFrameParams:
42
- """
43
- Helper class used to access attributes in @dataframe
44
- decorated objects, based on dot notation expressions
45
- """
46
-
47
- datatypes: Optional[str]
48
-
49
- @staticmethod
50
- def extract_attr(obj, expr):
51
- value = obj
52
- for attr_name in expr.split("."):
53
- if value:
54
- value = getattr(value, attr_name)
55
- return value
28
+ fields: Dict[str, FieldInfo]
56
29
 
57
30
 
58
31
  class DataFrameMixin(Generic[DataFrameT, DataObject]):
@@ -99,7 +72,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
99
72
 
100
73
  @classmethod
101
74
  def _from_dataobjects(cls, items: Iterator[DataObject]) -> DataFrameT:
102
- return cls._from_df(pd.DataFrame(asdict(item) for item in items)) # type: ignore
75
+ return cls._from_df(pd.DataFrame(Payload.to_obj(item) for item in items)) # type: ignore[misc]
103
76
 
104
77
  @classmethod
105
78
  def _from_df_unsafe(cls, df: pd.DataFrame, **series: pd.Series) -> DataFrameT:
@@ -117,40 +90,10 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
117
90
 
118
91
  def _to_dataobjects(self) -> List[DataObject]:
119
92
  return [
120
- self.__dataframe__.serialized_type(**fields)
93
+ self.DataObject(**fields)
121
94
  for fields in self.__df.to_dict(orient="records")
122
95
  ]
123
96
 
124
- def to_json(self, *args, **kwargs) -> str:
125
- raise NotImplementedError(
126
- "Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
127
- )
128
-
129
- def to_dict(self, *args, **kwargs) -> Dict[str, Any]:
130
- raise NotImplementedError(
131
- "Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
132
- )
133
-
134
- @classmethod
135
- def from_json(cls, *args, **kwargs) -> DataObject:
136
- return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
137
-
138
- @classmethod
139
- def from_dict(
140
- cls,
141
- *args,
142
- **kwargs,
143
- ) -> DataObject:
144
- return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
145
-
146
- @classmethod
147
- def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
148
- if cls.__data_object__["schema"]:
149
- schema = cls.__dataframe__.serialized_type.json_schema(*args, **kwargs)
150
- schema[cls.__name__] = schema[cls.__dataframe__.serialized_type.__name__]
151
- return schema
152
- return {}
153
-
154
97
  def event_id(self, *args, **kwargs) -> str:
155
98
  return ""
156
99
 
@@ -174,7 +117,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
174
117
 
175
118
  def _coerce_datatypes(self, df: pd.DataFrame) -> Dict[str, pd.Series]:
176
119
  return {
177
- name: self.DATATYPE_MAPPING[field.type](df[name]) # type: ignore
120
+ name: self.DATATYPE_MAPPING[field.annotation](df[name]) # type: ignore
178
121
  for name, field in self.__dataframe__.fields.items()
179
122
  }
180
123
 
@@ -193,7 +136,7 @@ def dataframe(
193
136
  if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
194
137
  amended_class = type(
195
138
  cls.__name__,
196
- (DataFrameMixin, JsonSchemaMixin) + cls.__mro__,
139
+ (DataFrameMixin, ) + cls.__mro__,
197
140
  dict(cls.__dict__),
198
141
  )
199
142
  setattr(amended_class, "__init__", DataFrameMixin.__init_from_series__)
@@ -201,17 +144,17 @@ def dataframe(
201
144
  return cls
202
145
 
203
146
  def add_dataframe_metadata(cls):
204
- serialized_fiels = [(field.name, field.type) for field in fields(cls)]
205
- serialized_type = make_dataclass(cls.__name__ + "_", serialized_fiels)
206
- serialized_type = dataobject(serialized_type, unsafe=True)
147
+ serialized_fields = {k: (v.annotation, v) for k, v in fields(cls).items()}
148
+ dataobject_type = create_model(cls.__name__+"DataObject", **serialized_fields)
149
+ dataobject_type = dataobject(dataobject_type, unsafe=True)
207
150
 
151
+ setattr(cls, "DataObject", dataobject_type)
208
152
  setattr(
209
153
  cls,
210
154
  "__dataframe__",
211
155
  DataFrameMetadata(
212
- columns=[field.name for field in fields(cls)],
213
- fields={field.name: field for field in fields(cls)},
214
- serialized_type=serialized_type,
156
+ columns=list(fields(cls).keys()),
157
+ fields=dict(fields(cls).items()),
215
158
  ),
216
159
  )
217
160
 
@@ -226,14 +169,14 @@ def dataframe(
226
169
  setattr(cls, "event_ts", StreamEventMixin.event_ts)
227
170
 
228
171
  def set_fields_optional(cls):
229
- for field in fields(cls):
172
+ for _, field in fields(cls).items():
230
173
  field.default = None
231
174
 
232
175
  def wrap(cls) -> Type[DataFrameMixin]:
233
176
  if hasattr(cls, "__dataframe__"):
234
177
  return cls
178
+ add_dataframe_metadata(cls)
235
179
  amended_class = add_dataframe_mixin(cls)
236
- add_dataframe_metadata(amended_class)
237
180
  add_dataobject_annotations(amended_class, unsafe, validate, schema)
238
181
  set_fields_optional(amended_class)
239
182
  return amended_class
@@ -2,7 +2,7 @@
2
2
  """
3
3
 
4
4
  from importlib import import_module
5
- from typing import Type, TypeVar
5
+ from typing import Generic, Type, TypeVar
6
6
 
7
7
  from hopeit.dataobjects import dataclass, dataobject
8
8
 
@@ -11,12 +11,20 @@ DataFrameT = TypeVar("DataFrameT")
11
11
 
12
12
  @dataobject
13
13
  @dataclass
14
- class Dataset:
14
+ class Dataset(Generic[DataFrameT]):
15
+ """Persisted representation of a @dataframe object"""
15
16
  protocol: str
16
17
  partition_key: str
17
18
  key: str
18
19
  datatype: str
19
20
 
21
+ async def load(self) -> DataFrameT:
22
+ return await self.__storage.load(self) # type: ignore[attr-defined]
23
+
24
+ @classmethod
25
+ async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
26
+ return await cls.__storage.save(dataframe) # type: ignore[attr-defined]
27
+
20
28
 
21
29
  def find_protocol_impl(qual_type_name: str) -> Type:
22
30
  mod_name, type_name = (
@@ -3,7 +3,7 @@
3
3
 
4
4
  import io
5
5
  from importlib import import_module
6
- from typing import Callable, Generic, Optional, Type, TypeVar, Union
6
+ from typing import Generic, Optional, Type, TypeVar
7
7
  from uuid import uuid4
8
8
 
9
9
  import pandas as pd
@@ -67,40 +67,6 @@ class DatasetFileStorage(Generic[DataFrameT]):
67
67
  df = pd.read_parquet(io.BytesIO(data), engine="pyarrow")
68
68
  return datatype._from_df(df) # pylint: disable=protected-access
69
69
 
70
- async def ser_wrapper(
71
- self,
72
- base_serialization: Callable,
73
- data: Union[EventPayloadType, DataFrameT],
74
- level: int,
75
- ) -> bytes:
76
- """Serialization wrapper that plugins-in into hopeit.engine
77
- serialization when dataframes plugin is initialized
78
- """
79
- if hasattr(data, "__dataframeobject__"):
80
- data = await data._serialize() # type: ignore # pylint: disable=protected-access
81
- if hasattr(data, "__dataframe__"):
82
- data = await self.save(data) # type: ignore
83
- return await base_serialization(data, level)
84
-
85
- async def deser_wrapper(
86
- self,
87
- base_deserialization: Callable,
88
- data: bytes,
89
- datatype: Union[Type[EventPayloadType], Type[DataFrameT]],
90
- ) -> Union[EventPayloadType, DataFrameT]:
91
- """Deerialization wrapper that plugins-in into hopeit.engine
92
- deserialization when dataframes plugin is initialized
93
- """
94
- if hasattr(datatype, "__dataframeobject__"):
95
- dataset = await base_deserialization(
96
- data, datatype.__dataframeobject__.serialized_type # type: ignore
97
- )
98
- return await datatype._deserialize(dataset) # type: ignore # pylint: disable=protected-access
99
- if hasattr(datatype, "__dataframe__"):
100
- dataset = await base_deserialization(data, Dataset)
101
- return await self.load(dataset)
102
- return await base_deserialization(data, datatype)
103
-
104
70
 
105
71
  def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
106
72
  """Returns dataframe class based on type name used during serialization"""
@@ -0,0 +1,36 @@
1
+ """hopeit.engine dataframes plugin SETUP event.
2
+
3
+ This event executes when engine starts with dataframes plugin configuration file loaded,
4
+ and configures the storage that `Dataset` uses to save and load `@dataframe`
5
+ types
6
+ """
7
+
8
+ from hopeit.app.context import EventContext
9
+ from hopeit.app.logger import app_logger
10
+ from hopeit.dataframes.serialization.dataset import Dataset, find_protocol_impl
11
+ from hopeit.dataframes.serialization.settings import DatasetSerialization
12
+
13
+ logger = app_logger()
14
+
15
+ __steps__ = ["setup"]
16
+
17
+
18
+ def setup(payload: None, context: EventContext) -> None:
19
+ """Sets up Dataset serialization in hopeit.engine based on
20
+ `DatasetSerialization` settings configured in plugin configuration file
21
+ """
22
+ logger.info(context, "Configuring Dataset serialization...")
23
+ settings: DatasetSerialization = context.settings(
24
+ key="dataset_serialization", datatype=DatasetSerialization
25
+ )
26
+ register_serialization(settings)
27
+
28
+
29
+ def register_serialization(settings: DatasetSerialization):
30
+ impl = find_protocol_impl(settings.protocol)
31
+ storage = impl(
32
+ protocol=settings.protocol,
33
+ location=settings.location,
34
+ partition_dateformat=settings.partition_dateformat,
35
+ )
36
+ setattr(Dataset, "_Dataset__storage", storage)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.24.2
3
+ Version: 0.25.0b2
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
5
  Home-page: https://github.com/hopeit-git/hopeit.engine
6
6
  Author: Leo Smerling and Pablo Canto
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
26
  Classifier: Framework :: AsyncIO
27
27
  Requires-Python: >=3.8
28
28
  Description-Content-Type: text/markdown
29
- Requires-Dist: hopeit.engine[fs-storage]==0.24.2
29
+ Requires-Dist: hopeit.engine[fs-storage]==0.25.0b2
30
30
  Requires-Dist: pandas
31
31
  Requires-Dist: numpy
32
32
  Provides-Extra: pyarrow
@@ -7,7 +7,6 @@ src/hopeit.dataframes.egg-info/requires.txt
7
7
  src/hopeit.dataframes.egg-info/top_level.txt
8
8
  src/hopeit/dataframes/__init__.py
9
9
  src/hopeit/dataframes/dataframe.py
10
- src/hopeit/dataframes/dataframeobject.py
11
10
  src/hopeit/dataframes/py.typed
12
11
  src/hopeit/dataframes/serialization/__init__.py
13
12
  src/hopeit/dataframes/serialization/dataset.py
@@ -0,0 +1,6 @@
1
+ hopeit.engine[fs-storage]==0.25.0b2
2
+ pandas
3
+ numpy
4
+
5
+ [pyarrow]
6
+ pyarrow
@@ -1,184 +0,0 @@
1
- """
2
- `@dataframeobject` annonation mixin to serialize a group of `@dataframe`s.
3
-
4
- Datasets behaves as DataObject so they can be used as payload
5
- for endpoints and streams.
6
- """
7
-
8
- from dataclasses import Field, dataclass, fields, make_dataclass
9
- from typing import (
10
- Any,
11
- Callable,
12
- ClassVar,
13
- Dict,
14
- Generic,
15
- Optional,
16
- Type,
17
- TypeVar,
18
- Union,
19
- get_args,
20
- get_origin,
21
- )
22
-
23
- from hopeit.dataframes.serialization.dataset import Dataset
24
- from hopeit.dataobjects import (
25
- DataObject,
26
- StreamEventMixin,
27
- StreamEventParams,
28
- dataobject,
29
- )
30
-
31
- DataFrameObjectT = TypeVar("DataFrameObjectT")
32
- NoneType = type(None)
33
-
34
-
35
- @dataclass
36
- class DataFrameObjectMetadata(Generic[DataObject]):
37
- serialized_type: Type[DataObject]
38
-
39
-
40
- class DataFrameObjectMixin(Generic[DataFrameObjectT]):
41
- """
42
- MixIn class to add functionality for `@dataframeobject`s
43
-
44
- Do not use this class directly, instead use `@dataframeobject` class decorator.
45
- """
46
-
47
- __storage: ClassVar[Any] = None # pylint: disable=invalid-name
48
-
49
- def __init__(self) -> None:
50
- self.__dataframeobject__: DataFrameObjectMetadata = None # type: ignore
51
- raise NotImplementedError(
52
- "DataFrameObjectMixin() should not be called directly. Use `@dataframeobject` annotation"
53
- )
54
-
55
- async def _serialize(self) -> Optional[DataObject]:
56
- """Saves internal `@dataframe`s using configured serialization protocol
57
- and returns json-serialiable dataobject
58
- """
59
- datasets = {}
60
- for field in fields(self): # type: ignore
61
- if _is_dataframe_field(field):
62
- dataframe = getattr(self, field.name)
63
- dataset = (
64
- None if dataframe is None else await self.__storage.save(dataframe)
65
- )
66
- datasets[field.name] = dataset
67
- else:
68
- datasets[field.name] = getattr(self, field.name)
69
- return self.__dataframeobject__.serialized_type(**datasets)
70
-
71
- @classmethod
72
- async def _deserialize(
73
- cls, serialized: DataObject
74
- ) -> "DataFrameObjectMixin[DataFrameObjectT]":
75
- """From a serialized datframeobject, load inner `@dataframe` objects
76
- and returns a `@dataframeobject` instance"""
77
- dataframes = {}
78
- for field in fields(cls): # type: ignore
79
- if _is_dataframe_field(field):
80
- dataset = getattr(serialized, field.name)
81
- dataframe = (
82
- None if dataset is None else await cls.__storage.load(dataset)
83
- )
84
- dataframes[field.name] = dataframe
85
- else:
86
- dataframes[field.name] = getattr(serialized, field.name)
87
- return cls(**dataframes)
88
-
89
- @classmethod
90
- def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
91
- schema = cls.__dataframeobject__.serialized_type.json_schema(*args, **kwargs)
92
- schema[cls.__name__] = schema[cls.__dataframeobject__.serialized_type.__name__]
93
- return schema
94
-
95
- def to_json(self, *args, **kwargs) -> Dict[str, Any]:
96
- raise RuntimeError(
97
- f"`{type(self).__name__}` `@dataframeobject` cannot be converted to json directly. "
98
- "i.e. use `return await DataFrames.serialize(obj)` to return it as a reponse."
99
- )
100
-
101
-
102
- def _is_dataframe_field(field: Field) -> bool:
103
- return any(
104
- hasattr(field_type, "__dataframe__")
105
- for field_type in [field.type, *get_args(field.type)]
106
- )
107
-
108
-
109
- def _serialized_field_type(field: Field) -> Type[Any]:
110
- """Computes the `@dataobject` datatype used as a result
111
- of serialized `@dataframeobject`
112
- """
113
- if hasattr(field.type, "__dataframe__"):
114
- return Dataset
115
- if get_origin(field.type) is Union:
116
- args = get_args(field.type)
117
- if (
118
- len(args) == 2
119
- and any(hasattr(field_type, "__dataframe__") for field_type in args)
120
- and any(field_type is NoneType for field_type in args)
121
- ):
122
- return Optional[Dataset] # type: ignore
123
- if _is_dataframe_field(field):
124
- raise TypeError(
125
- f"field {field.name}: only `DataFrameT` or `Optional[DataFrameT]` are supported"
126
- )
127
- return field.type
128
-
129
-
130
- def dataframeobject(
131
- decorated_class=None,
132
- ) -> Callable[[Type], Type[DataFrameObjectMixin]]:
133
- """
134
- Decorator for dataclasses intended to be used as dataframes.
135
- """
136
-
137
- def add_dataframe_mixin(cls) -> Type[DataFrameObjectMixin]:
138
- if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
139
- amended_class = type(
140
- cls.__name__,
141
- (DataFrameObjectMixin,) + cls.__mro__,
142
- dict(cls.__dict__),
143
- )
144
- return amended_class
145
- return cls
146
-
147
- def add_dataframeobject_metadata(cls):
148
- serialized_fiels = [
149
- (field.name, _serialized_field_type(field)) for field in fields(cls)
150
- ]
151
- serialized_type = make_dataclass(cls.__name__ + "_", serialized_fiels)
152
- serialized_type = dataobject(serialized_type, unsafe=True)
153
-
154
- setattr(
155
- cls,
156
- "__dataframeobject__",
157
- DataFrameObjectMetadata(
158
- serialized_type=serialized_type,
159
- ),
160
- )
161
-
162
- def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool):
163
- setattr(
164
- cls,
165
- "__data_object__",
166
- {"unsafe": unsafe, "validate": validate, "schema": schema},
167
- )
168
- setattr(cls, "__stream_event__", StreamEventParams(None, None))
169
- setattr(cls, "event_id", StreamEventMixin.event_id)
170
- setattr(cls, "event_ts", StreamEventMixin.event_ts)
171
-
172
- def wrap(cls) -> Type[DataFrameObjectMixin]:
173
- if hasattr(cls, "__dataframeobject__"):
174
- return cls
175
- amended_class = add_dataframe_mixin(cls)
176
- add_dataframeobject_metadata(amended_class)
177
- add_dataobject_annotations(
178
- amended_class, unsafe=False, validate=True, schema=True
179
- )
180
- return amended_class
181
-
182
- if decorated_class is None:
183
- return wrap
184
- return wrap(decorated_class) # type: ignore
@@ -1,52 +0,0 @@
1
- """hopeit.engine dataframes plugin SETUP event.
2
-
3
- This event executes when engine starts with dataframes plugin configuration file loaded,
4
- and ensures that the engine will support serialization of `@dataframe` and `@dataframeobject`
5
- types
6
- """
7
-
8
- from functools import partial
9
-
10
- from hopeit.app.context import EventContext
11
- from hopeit.app.logger import app_logger
12
- from hopeit.dataframes.dataframeobject import DataFrameObjectMixin
13
- from hopeit.dataframes.serialization.dataset import find_protocol_impl
14
- from hopeit.dataframes.serialization.settings import DatasetSerialization
15
- from hopeit.server import serialization
16
-
17
- logger = app_logger()
18
-
19
- __steps__ = ["register_serialization"]
20
-
21
-
22
- def register_serialization(payload: None, context: EventContext) -> None:
23
- """Setups serizaltion wrappers in hopeit.engine based on
24
- `DataSerialization` settings configured in plugin configuration file
25
- """
26
- logger.info(context, "Registering serialization methods...")
27
-
28
- settings: DatasetSerialization = context.settings(
29
- key="dataset_serialization", datatype=DatasetSerialization
30
- )
31
- impl = find_protocol_impl(settings.protocol)
32
-
33
- storage = impl(
34
- protocol=settings.protocol,
35
- location=settings.location,
36
- partition_dateformat=settings.partition_dateformat,
37
- )
38
- setattr(DataFrameObjectMixin, "_DataFrameObjectMixin__storage", storage)
39
-
40
- serdeser_wrappers = {}
41
- for (
42
- serdeser,
43
- methods,
44
- ) in serialization._SERDESER.items(): # pylint: disable=protected-access
45
- serdeser_wrappers[serdeser] = (
46
- partial(storage.ser_wrapper, methods[0]),
47
- methods[1],
48
- partial(storage.deser_wrapper, methods[2]),
49
- )
50
-
51
- for serdeser, methods in serdeser_wrappers.items():
52
- serialization._SERDESER[serdeser] = methods # pylint: disable=protected-access
@@ -1,6 +0,0 @@
1
- hopeit.engine[fs-storage]==0.24.2
2
- pandas
3
- numpy
4
-
5
- [pyarrow]
6
- pyarrow