hopeit.dataframes 0.24.2__tar.gz → 0.25.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21) hide show
  1. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/PKG-INFO +3 -4
  2. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/setup.py +1 -2
  3. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/__init__.py +55 -31
  4. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/dataframe.py +40 -84
  5. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/dataset.py +12 -4
  6. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/files.py +3 -40
  7. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/settings.py +1 -2
  8. hopeit_dataframes-0.25.0/src/hopeit/dataframes/setup/dataframes.py +36 -0
  9. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/PKG-INFO +3 -4
  10. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/SOURCES.txt +0 -1
  11. hopeit_dataframes-0.25.0/src/hopeit.dataframes.egg-info/requires.txt +6 -0
  12. hopeit.dataframes-0.24.2/src/hopeit/dataframes/dataframeobject.py +0 -184
  13. hopeit.dataframes-0.24.2/src/hopeit/dataframes/setup/dataframes.py +0 -52
  14. hopeit.dataframes-0.24.2/src/hopeit.dataframes.egg-info/requires.txt +0 -6
  15. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/README.md +0 -0
  16. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/setup.cfg +0 -0
  17. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/py.typed +0 -0
  18. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/serialization/__init__.py +0 -0
  19. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit/dataframes/setup/__init__.py +0 -0
  20. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
  21. {hopeit.dataframes-0.24.2 → hopeit_dataframes-0.25.0}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.24.2
3
+ Version: 0.25.0
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
5
  Home-page: https://github.com/hopeit-git/hopeit.engine
6
6
  Author: Leo Smerling and Pablo Canto
@@ -13,7 +13,6 @@ Project-URL: GitHub: repo, https://github.com/hopeit-git/hopeit.engine
13
13
  Classifier: License :: OSI Approved :: Apache Software License
14
14
  Classifier: Intended Audience :: Developers
15
15
  Classifier: Programming Language :: Python
16
- Classifier: Programming Language :: Python :: 3.8
17
16
  Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
@@ -24,9 +23,9 @@ Classifier: Operating System :: Microsoft :: Windows
24
23
  Classifier: Topic :: Internet :: WWW/HTTP
25
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
25
  Classifier: Framework :: AsyncIO
27
- Requires-Python: >=3.8
26
+ Requires-Python: >=3.9
28
27
  Description-Content-Type: text/markdown
29
- Requires-Dist: hopeit.engine[fs-storage]==0.24.2
28
+ Requires-Dist: hopeit.engine[fs-storage]==0.25.0
30
29
  Requires-Dist: pandas
31
30
  Requires-Dist: numpy
32
31
  Provides-Extra: pyarrow
@@ -18,7 +18,6 @@ setuptools.setup(
18
18
  "License :: OSI Approved :: Apache Software License",
19
19
  "Intended Audience :: Developers",
20
20
  "Programming Language :: Python",
21
- "Programming Language :: Python :: 3.8",
22
21
  "Programming Language :: Python :: 3.9",
23
22
  "Programming Language :: Python :: 3.10",
24
23
  "Programming Language :: Python :: 3.11",
@@ -46,7 +45,7 @@ setuptools.setup(
46
45
  package_data={
47
46
  "hopeit.dataframes": ["py.typed"],
48
47
  },
49
- python_requires=">=3.8",
48
+ python_requires=">=3.9",
50
49
  install_requires=[
51
50
  f"hopeit.engine[fs-storage]=={version['ENGINE_VERSION']}",
52
51
  "pandas",
@@ -1,37 +1,74 @@
1
1
  """
2
2
  hopeit.engine dataframes plugin entry point
3
3
 
4
- This module exposes the 3 main constructions to be used inside apps:
4
+ This module exposes the 2 main constructions to be used inside apps,
5
+ to extend @dataobject functionality to support working with `pandas DataFrames`
5
6
  `@dataframe` dataclass annotation
6
- `@dataframeobject` dataclass annotation
7
7
  `DataFrames` class to handle manipulation of dataframe/dataframeobjects
8
8
 
9
9
  Usage:
10
10
  ```
11
- from hopeit.dataframes import DataFrames, dataframe, dataframeobject
11
+ from typing import List
12
+
13
+ import pandas as pd
14
+
15
+ from hopeit.dataframes.serialization.settings import DatasetSerialization
16
+ from hopeit.dataframes import DataFrames, Dataset, dataframe
17
+ from hopeit.dataobjects import dataobject, dataclass
18
+ from hopeit.dataobjects.payload import Payload
12
19
 
13
20
  @dataframe
14
21
  @dataclass
15
- class MyDataFrame:
22
+ class MyData:
16
23
  field1: int
17
24
  field2: str
18
25
  ...
19
26
 
20
- @dataframeobject
27
+ @dataobject
21
28
  @dataclass
22
29
  class MyDataset:
23
30
  dataset_name: str
24
- example_data: MyDataFrame
31
+ example_data: Dataset[MyData]
32
+
33
+ @dataobject
34
+ @dataclass
35
+ class MyWebResponse:
36
+ dataset_name: str
37
+ example_data: List[MyData.DataObject]
38
+
39
+ # This step is not needed if SETUP event is configured in app
40
+ DataFrames.setup(DatasetSerialization(
41
+ protocol="hopeit.dataframes.serialization.files.DatasetFileStorage",
42
+ location="/tmp/data",
43
+ partition_dateformat="%Y/%m/%d/%H/",
44
+ ))
45
+
46
+ df = pd.DataFrame([ # Create or load a pandas DataFrame
47
+ {"field1": 1, "field2": "text1"},
48
+ {"field1": 2, "field2": "text2"},
49
+ ])
50
+
51
+ my_data: MyData = DataFrames.from_df(MyData, df)
25
52
 
53
+ # return dataset after saving data to disk
54
+ my_dataset = MyDataset(
55
+ dataset_name="example",
56
+ example_data=await Dataset.save(my_data)
57
+ )
58
+
59
+ print(Payload.to_json(my_dataset))
26
60
 
27
- df = pd.DataFrame(...) # create or load your pandas dataframe
61
+ my_data_again: MyData = await my_dataset.example_data.load()
28
62
 
29
- my_data = DataFrames.from_df(pd.DataFrame(..))
63
+ print(DataFrames.df(my_data_again))
30
64
 
31
- return MyDataSet(
65
+ # return dataframe converted to list of dataobjects that can be directly converted to json
66
+ my_json_response = MyWebResponse(
32
67
  dataset_name="example",
33
- example_data=my_data
68
+ example_data=DataFrames.to_dataobjects(my_data)
34
69
  )
70
+
71
+ print(Payload.to_json(my_json_response))
35
72
  ```
36
73
  """
37
74
 
@@ -40,35 +77,22 @@ from typing import Dict, Generic, Iterator, List, Type
40
77
  import numpy as np
41
78
  import pandas as pd
42
79
  from hopeit.dataframes.dataframe import DataFrameT, dataframe
43
- from hopeit.dataframes.dataframeobject import DataFrameObjectT, dataframeobject
80
+ from hopeit.dataframes.serialization.dataset import Dataset
81
+ from hopeit.dataframes.serialization.settings import DatasetSerialization
82
+ from hopeit.dataframes.setup.dataframes import register_serialization
44
83
  from hopeit.dataobjects import DataObject
45
84
 
46
- __all__ = ["DataFrames", "dataframe", "dataframeobject"]
85
+ __all__ = ["DataFrames", "Dataset", "dataframe"]
47
86
 
48
87
 
49
- class DataFrames(Generic[DataFrameT, DataFrameObjectT, DataObject]):
88
+ class DataFrames(Generic[DataFrameT, DataObject]):
50
89
  """
51
90
  Dataframes manipulation utilities methods
52
91
  """
53
92
 
54
93
  @staticmethod
55
- async def serialize(obj: DataFrameObjectT) -> DataObject:
56
- """Serialize/saves contents of dataframe fields of a `@dataframeobject`
57
- and converts to a `DataObject` json-compatible with pointers to saved
58
- locations.
59
-
60
- This method can be used to i.e. return `@dataframeobject`s as a JSON response
61
- """
62
- return await obj._serialize() # type: ignore # pylint: disable=protected-access
63
-
64
- @staticmethod
65
- async def deserialize(
66
- datatype: Type[DataFrameObjectT], dataobject: DataObject
67
- ) -> DataFrameObjectT:
68
- """Deserialize/load contents of serialized dataobject fields of a `@dataframeobject`
69
- loading saved Dataset information for @dataframe fields
70
- """
71
- return await datatype._deserialize(dataobject) # type: ignore # pylint: disable=protected-access
94
+ def setup(settings: DatasetSerialization):
95
+ register_serialization(settings)
72
96
 
73
97
  @staticmethod
74
98
  def from_df(
@@ -88,7 +112,7 @@ class DataFrames(Generic[DataFrameT, DataFrameObjectT, DataObject]):
88
112
 
89
113
  @staticmethod
90
114
  def from_dataobjects(
91
- datatype: Type[DataFrameT], dataobjects: Iterator[DataFrameObjectT]
115
+ datatype: Type[DataFrameT], dataobjects: Iterator[DataObject]
92
116
  ) -> DataFrameT:
93
117
  """Converts standard json serializable `@dataobject`s to a single `@dataframe`"""
94
118
  return datatype._from_dataobjects(dataobjects) # type: ignore # pylint: disable=protected-access
@@ -1,58 +1,49 @@
1
1
  """
2
2
  DataFrames type abstractions.
3
-
4
- Example:
5
-
6
- from hopeit.dataobjects import dataclass # equivalent to `dataclasses.dataclass`
7
- from hopeit.dataframes import dataframe
8
-
9
- @dataframe
10
- @dataclass
11
- class MyObject:
12
- name: str
13
- number: int
14
3
  """
15
4
 
16
- from dataclasses import Field, asdict, dataclass, fields, make_dataclass
5
+ import dataclasses
17
6
  from datetime import date, datetime, timezone
18
- from typing import Any, Callable, Dict, Generic, Iterator, List, Optional, Type, TypeVar
7
+ from functools import partial
8
+ from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar
19
9
 
20
10
  import numpy as np
21
11
  import pandas as pd
22
- from dataclasses_jsonschema import JsonSchemaMixin
12
+ from pydantic import create_model
13
+ from pydantic.fields import FieldInfo
14
+
23
15
  from hopeit.dataobjects import (
24
16
  DataObject,
25
17
  StreamEventMixin,
26
18
  StreamEventParams,
27
19
  dataobject,
20
+ fields,
28
21
  )
22
+ from hopeit.dataobjects.payload import Payload
29
23
 
30
24
  DataFrameT = TypeVar("DataFrameT")
31
25
 
32
26
 
33
- @dataclass
34
- class DataFrameMetadata(Generic[DataObject]):
27
+ @dataclasses.dataclass
28
+ class DataFrameMetadata:
35
29
  columns: List[str]
36
- fields: Dict[str, Field]
37
- serialized_type: Type[DataObject]
30
+ fields: Dict[str, FieldInfo]
38
31
 
39
32
 
40
- @dataclass
41
- class DataFrameParams:
42
- """
43
- Helper class used to access attributes in @dataframe
44
- decorated objects, based on dot notation expressions
45
- """
33
+ # Functions to do type coercion
34
+ def _series_to_int(x: pd.Series) -> pd.Series:
35
+ return x.astype(np.int64)
46
36
 
47
- datatypes: Optional[str]
48
37
 
49
- @staticmethod
50
- def extract_attr(obj, expr):
51
- value = obj
52
- for attr_name in expr.split("."):
53
- if value:
54
- value = getattr(value, attr_name)
55
- return value
38
+ def _series_to_float(x: pd.Series) -> pd.Series:
39
+ return x.astype(np.float64)
40
+
41
+
42
+ def _series_to_str(x: pd.Series) -> pd.Series:
43
+ return x.astype(str)
44
+
45
+
46
+ _series_to_utc_datetime = partial(pd.to_datetime, utc=True)
56
47
 
57
48
 
58
49
  class DataFrameMixin(Generic[DataFrameT, DataObject]):
@@ -63,11 +54,11 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
63
54
  """
64
55
 
65
56
  DATATYPE_MAPPING = {
66
- int: lambda x: x.astype(np.int64),
67
- float: lambda x: x.astype(np.float64),
68
- str: lambda x: x.astype(object),
57
+ int: _series_to_int,
58
+ float: _series_to_float,
59
+ str: _series_to_str,
69
60
  date: pd.to_datetime,
70
- datetime: pd.to_datetime,
61
+ datetime: _series_to_utc_datetime,
71
62
  }
72
63
 
73
64
  def __init__(self) -> None:
@@ -78,9 +69,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
78
69
  raise NotImplementedError # must use @dataframe decorator # pragma: no cover
79
70
 
80
71
  @staticmethod
81
- def __init_from_series__(
82
- self, **series: pd.Series
83
- ): # pylint: disable=bad-staticmethod-argument
72
+ def __init_from_series__(self, **series: pd.Series): # pylint: disable=bad-staticmethod-argument
84
73
  df = pd.DataFrame(series)
85
74
  df.index.name = None # Removes index name to avoid colisions with series name
86
75
  if self.__data_object__["validate"]:
@@ -99,7 +88,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
99
88
 
100
89
  @classmethod
101
90
  def _from_dataobjects(cls, items: Iterator[DataObject]) -> DataFrameT:
102
- return cls._from_df(pd.DataFrame(asdict(item) for item in items)) # type: ignore
91
+ return cls._from_df(pd.DataFrame(Payload.to_obj(item) for item in items)) # type: ignore[misc]
103
92
 
104
93
  @classmethod
105
94
  def _from_df_unsafe(cls, df: pd.DataFrame, **series: pd.Series) -> DataFrameT:
@@ -116,40 +105,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
116
105
  return self._from_df(self.__df[key])
117
106
 
118
107
  def _to_dataobjects(self) -> List[DataObject]:
119
- return [
120
- self.__dataframe__.serialized_type(**fields)
121
- for fields in self.__df.to_dict(orient="records")
122
- ]
123
-
124
- def to_json(self, *args, **kwargs) -> str:
125
- raise NotImplementedError(
126
- "Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
127
- )
128
-
129
- def to_dict(self, *args, **kwargs) -> Dict[str, Any]:
130
- raise NotImplementedError(
131
- "Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
132
- )
133
-
134
- @classmethod
135
- def from_json(cls, *args, **kwargs) -> DataObject:
136
- return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
137
-
138
- @classmethod
139
- def from_dict(
140
- cls,
141
- *args,
142
- **kwargs,
143
- ) -> DataObject:
144
- return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
145
-
146
- @classmethod
147
- def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
148
- if cls.__data_object__["schema"]:
149
- schema = cls.__dataframe__.serialized_type.json_schema(*args, **kwargs)
150
- schema[cls.__name__] = schema[cls.__dataframe__.serialized_type.__name__]
151
- return schema
152
- return {}
108
+ return [self.DataObject(**fields) for fields in self.__df.to_dict(orient="records")]
153
109
 
154
110
  def event_id(self, *args, **kwargs) -> str:
155
111
  return ""
@@ -174,7 +130,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
174
130
 
175
131
  def _coerce_datatypes(self, df: pd.DataFrame) -> Dict[str, pd.Series]:
176
132
  return {
177
- name: self.DATATYPE_MAPPING[field.type](df[name]) # type: ignore
133
+ name: self.DATATYPE_MAPPING[field.annotation](df[name]) # type: ignore
178
134
  for name, field in self.__dataframe__.fields.items()
179
135
  }
180
136
 
@@ -193,7 +149,7 @@ def dataframe(
193
149
  if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
194
150
  amended_class = type(
195
151
  cls.__name__,
196
- (DataFrameMixin, JsonSchemaMixin) + cls.__mro__,
152
+ (DataFrameMixin,) + cls.__mro__,
197
153
  dict(cls.__dict__),
198
154
  )
199
155
  setattr(amended_class, "__init__", DataFrameMixin.__init_from_series__)
@@ -201,17 +157,17 @@ def dataframe(
201
157
  return cls
202
158
 
203
159
  def add_dataframe_metadata(cls):
204
- serialized_fiels = [(field.name, field.type) for field in fields(cls)]
205
- serialized_type = make_dataclass(cls.__name__ + "_", serialized_fiels)
206
- serialized_type = dataobject(serialized_type, unsafe=True)
160
+ serialized_fields = {k: (v.annotation, v) for k, v in fields(cls).items()}
161
+ dataobject_type = create_model(cls.__name__ + "DataObject", **serialized_fields)
162
+ dataobject_type = dataobject(dataobject_type, unsafe=True)
207
163
 
164
+ setattr(cls, "DataObject", dataobject_type)
208
165
  setattr(
209
166
  cls,
210
167
  "__dataframe__",
211
168
  DataFrameMetadata(
212
- columns=[field.name for field in fields(cls)],
213
- fields={field.name: field for field in fields(cls)},
214
- serialized_type=serialized_type,
169
+ columns=list(fields(cls).keys()),
170
+ fields=dict(fields(cls).items()),
215
171
  ),
216
172
  )
217
173
 
@@ -226,14 +182,14 @@ def dataframe(
226
182
  setattr(cls, "event_ts", StreamEventMixin.event_ts)
227
183
 
228
184
  def set_fields_optional(cls):
229
- for field in fields(cls):
185
+ for _, field in fields(cls).items():
230
186
  field.default = None
231
187
 
232
188
  def wrap(cls) -> Type[DataFrameMixin]:
233
189
  if hasattr(cls, "__dataframe__"):
234
190
  return cls
191
+ add_dataframe_metadata(cls)
235
192
  amended_class = add_dataframe_mixin(cls)
236
- add_dataframe_metadata(amended_class)
237
193
  add_dataobject_annotations(amended_class, unsafe, validate, schema)
238
194
  set_fields_optional(amended_class)
239
195
  return amended_class
@@ -1,8 +1,7 @@
1
- """Dataset objects definition, used as a result of serialized dataframes
2
- """
1
+ """Dataset objects definition, used as a result of serialized dataframes"""
3
2
 
4
3
  from importlib import import_module
5
- from typing import Type, TypeVar
4
+ from typing import Generic, Type, TypeVar
6
5
 
7
6
  from hopeit.dataobjects import dataclass, dataobject
8
7
 
@@ -11,12 +10,21 @@ DataFrameT = TypeVar("DataFrameT")
11
10
 
12
11
  @dataobject
13
12
  @dataclass
14
- class Dataset:
13
+ class Dataset(Generic[DataFrameT]):
14
+ """Persisted representation of a @dataframe object"""
15
+
15
16
  protocol: str
16
17
  partition_key: str
17
18
  key: str
18
19
  datatype: str
19
20
 
21
+ async def load(self) -> DataFrameT:
22
+ return await self.__storage.load(self) # type: ignore[attr-defined]
23
+
24
+ @classmethod
25
+ async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
26
+ return await cls.__storage.save(dataframe) # type: ignore[attr-defined]
27
+
20
28
 
21
29
  def find_protocol_impl(qual_type_name: str) -> Type:
22
30
  mod_name, type_name = (
@@ -1,9 +1,8 @@
1
- """Support for `@dataframes` serialization to files
2
- """
1
+ """Support for `@dataframes` serialization to files"""
3
2
 
4
3
  import io
5
4
  from importlib import import_module
6
- from typing import Callable, Generic, Optional, Type, TypeVar, Union
5
+ from typing import Generic, Optional, Type, TypeVar
7
6
  from uuid import uuid4
8
7
 
9
8
  import pandas as pd
@@ -59,48 +58,12 @@ class DatasetFileStorage(Generic[DataFrameT]):
59
58
  async def load(self, dataset: Dataset) -> EventPayloadType:
60
59
  """Loads @dataframe annotated object using Dataset metadata"""
61
60
  datatype: Type[DataFrameT] = find_dataframe_type(dataset.datatype)
62
- data = await self.storage.get_file(
63
- dataset.key, partition_key=dataset.partition_key
64
- )
61
+ data = await self.storage.get_file(dataset.key, partition_key=dataset.partition_key)
65
62
  if data is None:
66
63
  raise FileNotFoundError(dataset.key)
67
64
  df = pd.read_parquet(io.BytesIO(data), engine="pyarrow")
68
65
  return datatype._from_df(df) # pylint: disable=protected-access
69
66
 
70
- async def ser_wrapper(
71
- self,
72
- base_serialization: Callable,
73
- data: Union[EventPayloadType, DataFrameT],
74
- level: int,
75
- ) -> bytes:
76
- """Serialization wrapper that plugins-in into hopeit.engine
77
- serialization when dataframes plugin is initialized
78
- """
79
- if hasattr(data, "__dataframeobject__"):
80
- data = await data._serialize() # type: ignore # pylint: disable=protected-access
81
- if hasattr(data, "__dataframe__"):
82
- data = await self.save(data) # type: ignore
83
- return await base_serialization(data, level)
84
-
85
- async def deser_wrapper(
86
- self,
87
- base_deserialization: Callable,
88
- data: bytes,
89
- datatype: Union[Type[EventPayloadType], Type[DataFrameT]],
90
- ) -> Union[EventPayloadType, DataFrameT]:
91
- """Deerialization wrapper that plugins-in into hopeit.engine
92
- deserialization when dataframes plugin is initialized
93
- """
94
- if hasattr(datatype, "__dataframeobject__"):
95
- dataset = await base_deserialization(
96
- data, datatype.__dataframeobject__.serialized_type # type: ignore
97
- )
98
- return await datatype._deserialize(dataset) # type: ignore # pylint: disable=protected-access
99
- if hasattr(datatype, "__dataframe__"):
100
- dataset = await base_deserialization(data, Dataset)
101
- return await self.load(dataset)
102
- return await base_deserialization(data, datatype)
103
-
104
67
 
105
68
  def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
106
69
  """Returns dataframe class based on type name used during serialization"""
@@ -1,5 +1,4 @@
1
- """Support for plugin configuration
2
- """
1
+ """Support for plugin configuration"""
3
2
 
4
3
  from typing import Optional
5
4
 
@@ -0,0 +1,36 @@
1
+ """hopeit.engine dataframes plugin SETUP event.
2
+
3
+ This event executes when engine starts with dataframes plugin configuration file loaded,
4
+ and ensures that the engine will support serialization of `@dataframe` and `Dataset`
5
+ types
6
+ """
7
+
8
+ from hopeit.app.context import EventContext
9
+ from hopeit.app.logger import app_logger
10
+ from hopeit.dataframes.serialization.dataset import Dataset, find_protocol_impl
11
+ from hopeit.dataframes.serialization.settings import DatasetSerialization
12
+
13
+ logger = app_logger()
14
+
15
+ __steps__ = ["setup"]
16
+
17
+
18
+ def setup(payload: None, context: EventContext) -> None:
19
+ """Sets up serialization wrappers in hopeit.engine based on
20
+ `DatasetSerialization` settings configured in plugin configuration file
21
+ """
22
+ logger.info(context, "Configuring Dataset serialization...")
23
+ settings: DatasetSerialization = context.settings(
24
+ key="dataset_serialization", datatype=DatasetSerialization
25
+ )
26
+ register_serialization(settings)
27
+
28
+
29
+ def register_serialization(settings: DatasetSerialization):
30
+ impl = find_protocol_impl(settings.protocol)
31
+ storage = impl(
32
+ protocol=settings.protocol,
33
+ location=settings.location,
34
+ partition_dateformat=settings.partition_dateformat,
35
+ )
36
+ setattr(Dataset, "_Dataset__storage", storage)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.24.2
3
+ Version: 0.25.0
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
5
  Home-page: https://github.com/hopeit-git/hopeit.engine
6
6
  Author: Leo Smerling and Pablo Canto
@@ -13,7 +13,6 @@ Project-URL: GitHub: repo, https://github.com/hopeit-git/hopeit.engine
13
13
  Classifier: License :: OSI Approved :: Apache Software License
14
14
  Classifier: Intended Audience :: Developers
15
15
  Classifier: Programming Language :: Python
16
- Classifier: Programming Language :: Python :: 3.8
17
16
  Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
@@ -24,9 +23,9 @@ Classifier: Operating System :: Microsoft :: Windows
24
23
  Classifier: Topic :: Internet :: WWW/HTTP
25
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
25
  Classifier: Framework :: AsyncIO
27
- Requires-Python: >=3.8
26
+ Requires-Python: >=3.9
28
27
  Description-Content-Type: text/markdown
29
- Requires-Dist: hopeit.engine[fs-storage]==0.24.2
28
+ Requires-Dist: hopeit.engine[fs-storage]==0.25.0
30
29
  Requires-Dist: pandas
31
30
  Requires-Dist: numpy
32
31
  Provides-Extra: pyarrow
@@ -7,7 +7,6 @@ src/hopeit.dataframes.egg-info/requires.txt
7
7
  src/hopeit.dataframes.egg-info/top_level.txt
8
8
  src/hopeit/dataframes/__init__.py
9
9
  src/hopeit/dataframes/dataframe.py
10
- src/hopeit/dataframes/dataframeobject.py
11
10
  src/hopeit/dataframes/py.typed
12
11
  src/hopeit/dataframes/serialization/__init__.py
13
12
  src/hopeit/dataframes/serialization/dataset.py
@@ -0,0 +1,6 @@
1
+ hopeit.engine[fs-storage]==0.25.0
2
+ pandas
3
+ numpy
4
+
5
+ [pyarrow]
6
+ pyarrow
@@ -1,184 +0,0 @@
1
- """
2
- `@dataframeobject` annonation mixin to serialize a group of `@dataframe`s.
3
-
4
- Datasets behaves as DataObject so they can be used as payload
5
- for endpoints and streams.
6
- """
7
-
8
- from dataclasses import Field, dataclass, fields, make_dataclass
9
- from typing import (
10
- Any,
11
- Callable,
12
- ClassVar,
13
- Dict,
14
- Generic,
15
- Optional,
16
- Type,
17
- TypeVar,
18
- Union,
19
- get_args,
20
- get_origin,
21
- )
22
-
23
- from hopeit.dataframes.serialization.dataset import Dataset
24
- from hopeit.dataobjects import (
25
- DataObject,
26
- StreamEventMixin,
27
- StreamEventParams,
28
- dataobject,
29
- )
30
-
31
- DataFrameObjectT = TypeVar("DataFrameObjectT")
32
- NoneType = type(None)
33
-
34
-
35
- @dataclass
36
- class DataFrameObjectMetadata(Generic[DataObject]):
37
- serialized_type: Type[DataObject]
38
-
39
-
40
- class DataFrameObjectMixin(Generic[DataFrameObjectT]):
41
- """
42
- MixIn class to add functionality for `@dataframeobject`s
43
-
44
- Do not use this class directly, instead use `@dataframeobject` class decorator.
45
- """
46
-
47
- __storage: ClassVar[Any] = None # pylint: disable=invalid-name
48
-
49
- def __init__(self) -> None:
50
- self.__dataframeobject__: DataFrameObjectMetadata = None # type: ignore
51
- raise NotImplementedError(
52
- "DataFrameObjectMixin() should not be called directly. Use `@dataframeobject` annotation"
53
- )
54
-
55
- async def _serialize(self) -> Optional[DataObject]:
56
- """Saves internal `@dataframe`s using configured serialization protocol
57
- and returns json-serialiable dataobject
58
- """
59
- datasets = {}
60
- for field in fields(self): # type: ignore
61
- if _is_dataframe_field(field):
62
- dataframe = getattr(self, field.name)
63
- dataset = (
64
- None if dataframe is None else await self.__storage.save(dataframe)
65
- )
66
- datasets[field.name] = dataset
67
- else:
68
- datasets[field.name] = getattr(self, field.name)
69
- return self.__dataframeobject__.serialized_type(**datasets)
70
-
71
- @classmethod
72
- async def _deserialize(
73
- cls, serialized: DataObject
74
- ) -> "DataFrameObjectMixin[DataFrameObjectT]":
75
- """From a serialized datframeobject, load inner `@dataframe` objects
76
- and returns a `@dataframeobject` instance"""
77
- dataframes = {}
78
- for field in fields(cls): # type: ignore
79
- if _is_dataframe_field(field):
80
- dataset = getattr(serialized, field.name)
81
- dataframe = (
82
- None if dataset is None else await cls.__storage.load(dataset)
83
- )
84
- dataframes[field.name] = dataframe
85
- else:
86
- dataframes[field.name] = getattr(serialized, field.name)
87
- return cls(**dataframes)
88
-
89
- @classmethod
90
- def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
91
- schema = cls.__dataframeobject__.serialized_type.json_schema(*args, **kwargs)
92
- schema[cls.__name__] = schema[cls.__dataframeobject__.serialized_type.__name__]
93
- return schema
94
-
95
- def to_json(self, *args, **kwargs) -> Dict[str, Any]:
96
- raise RuntimeError(
97
- f"`{type(self).__name__}` `@dataframeobject` cannot be converted to json directly. "
98
- "i.e. use `return await DataFrames.serialize(obj)` to return it as a reponse."
99
- )
100
-
101
-
102
- def _is_dataframe_field(field: Field) -> bool:
103
- return any(
104
- hasattr(field_type, "__dataframe__")
105
- for field_type in [field.type, *get_args(field.type)]
106
- )
107
-
108
-
109
- def _serialized_field_type(field: Field) -> Type[Any]:
110
- """Computes the `@dataobject` datatype used as a result
111
- of serialized `@dataframeobject`
112
- """
113
- if hasattr(field.type, "__dataframe__"):
114
- return Dataset
115
- if get_origin(field.type) is Union:
116
- args = get_args(field.type)
117
- if (
118
- len(args) == 2
119
- and any(hasattr(field_type, "__dataframe__") for field_type in args)
120
- and any(field_type is NoneType for field_type in args)
121
- ):
122
- return Optional[Dataset] # type: ignore
123
- if _is_dataframe_field(field):
124
- raise TypeError(
125
- f"field {field.name}: only `DataFrameT` or `Optional[DataFrameT]` are supported"
126
- )
127
- return field.type
128
-
129
-
130
- def dataframeobject(
131
- decorated_class=None,
132
- ) -> Callable[[Type], Type[DataFrameObjectMixin]]:
133
- """
134
- Decorator for dataclasses intended to be used as dataframes.
135
- """
136
-
137
- def add_dataframe_mixin(cls) -> Type[DataFrameObjectMixin]:
138
- if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
139
- amended_class = type(
140
- cls.__name__,
141
- (DataFrameObjectMixin,) + cls.__mro__,
142
- dict(cls.__dict__),
143
- )
144
- return amended_class
145
- return cls
146
-
147
- def add_dataframeobject_metadata(cls):
148
- serialized_fiels = [
149
- (field.name, _serialized_field_type(field)) for field in fields(cls)
150
- ]
151
- serialized_type = make_dataclass(cls.__name__ + "_", serialized_fiels)
152
- serialized_type = dataobject(serialized_type, unsafe=True)
153
-
154
- setattr(
155
- cls,
156
- "__dataframeobject__",
157
- DataFrameObjectMetadata(
158
- serialized_type=serialized_type,
159
- ),
160
- )
161
-
162
- def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool):
163
- setattr(
164
- cls,
165
- "__data_object__",
166
- {"unsafe": unsafe, "validate": validate, "schema": schema},
167
- )
168
- setattr(cls, "__stream_event__", StreamEventParams(None, None))
169
- setattr(cls, "event_id", StreamEventMixin.event_id)
170
- setattr(cls, "event_ts", StreamEventMixin.event_ts)
171
-
172
- def wrap(cls) -> Type[DataFrameObjectMixin]:
173
- if hasattr(cls, "__dataframeobject__"):
174
- return cls
175
- amended_class = add_dataframe_mixin(cls)
176
- add_dataframeobject_metadata(amended_class)
177
- add_dataobject_annotations(
178
- amended_class, unsafe=False, validate=True, schema=True
179
- )
180
- return amended_class
181
-
182
- if decorated_class is None:
183
- return wrap
184
- return wrap(decorated_class) # type: ignore
@@ -1,52 +0,0 @@
1
- """hopeit.engine dataframes plugin SETUP event.
2
-
3
- This event executes when engine starts with dataframes plugin configuration file loaded,
4
- and ensures that the engine will support serialization of `@dataframe` and `@dataframeobject`
5
- types
6
- """
7
-
8
- from functools import partial
9
-
10
- from hopeit.app.context import EventContext
11
- from hopeit.app.logger import app_logger
12
- from hopeit.dataframes.dataframeobject import DataFrameObjectMixin
13
- from hopeit.dataframes.serialization.dataset import find_protocol_impl
14
- from hopeit.dataframes.serialization.settings import DatasetSerialization
15
- from hopeit.server import serialization
16
-
17
- logger = app_logger()
18
-
19
- __steps__ = ["register_serialization"]
20
-
21
-
22
- def register_serialization(payload: None, context: EventContext) -> None:
23
- """Setups serizaltion wrappers in hopeit.engine based on
24
- `DataSerialization` settings configured in plugin configuration file
25
- """
26
- logger.info(context, "Registering serialization methods...")
27
-
28
- settings: DatasetSerialization = context.settings(
29
- key="dataset_serialization", datatype=DatasetSerialization
30
- )
31
- impl = find_protocol_impl(settings.protocol)
32
-
33
- storage = impl(
34
- protocol=settings.protocol,
35
- location=settings.location,
36
- partition_dateformat=settings.partition_dateformat,
37
- )
38
- setattr(DataFrameObjectMixin, "_DataFrameObjectMixin__storage", storage)
39
-
40
- serdeser_wrappers = {}
41
- for (
42
- serdeser,
43
- methods,
44
- ) in serialization._SERDESER.items(): # pylint: disable=protected-access
45
- serdeser_wrappers[serdeser] = (
46
- partial(storage.ser_wrapper, methods[0]),
47
- methods[1],
48
- partial(storage.deser_wrapper, methods[2]),
49
- )
50
-
51
- for serdeser, methods in serdeser_wrappers.items():
52
- serialization._SERDESER[serdeser] = methods # pylint: disable=protected-access
@@ -1,6 +0,0 @@
1
- hopeit.engine[fs-storage]==0.24.2
2
- pandas
3
- numpy
4
-
5
- [pyarrow]
6
- pyarrow