hopeit.dataframes 0.24.1__tar.gz → 0.25.0b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/PKG-INFO +2 -2
  2. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/dataframe.py +51 -47
  3. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/dataframeobject.py +39 -36
  4. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit.dataframes.egg-info/PKG-INFO +2 -2
  5. hopeit_dataframes-0.25.0b1/src/hopeit.dataframes.egg-info/requires.txt +6 -0
  6. hopeit.dataframes-0.24.1/src/hopeit.dataframes.egg-info/requires.txt +0 -6
  7. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/README.md +0 -0
  8. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/setup.cfg +0 -0
  9. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/setup.py +0 -0
  10. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/__init__.py +0 -0
  11. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/py.typed +0 -0
  12. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/serialization/__init__.py +0 -0
  13. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/serialization/dataset.py +0 -0
  14. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/serialization/files.py +0 -0
  15. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/serialization/settings.py +0 -0
  16. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/setup/__init__.py +0 -0
  17. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit/dataframes/setup/dataframes.py +0 -0
  18. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit.dataframes.egg-info/SOURCES.txt +0 -0
  19. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
  20. {hopeit.dataframes-0.24.1 → hopeit_dataframes-0.25.0b1}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.24.1
3
+ Version: 0.25.0b1
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
5
  Home-page: https://github.com/hopeit-git/hopeit.engine
6
6
  Author: Leo Smerling and Pablo Canto
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
26
  Classifier: Framework :: AsyncIO
27
27
  Requires-Python: >=3.8
28
28
  Description-Content-Type: text/markdown
29
- Requires-Dist: hopeit.engine[fs-storage]==0.24.1
29
+ Requires-Dist: hopeit.engine[fs-storage]==0.25.0b1
30
30
  Requires-Dist: pandas
31
31
  Requires-Dist: numpy
32
32
  Provides-Extra: pyarrow
@@ -12,32 +12,35 @@ Example:
12
12
  name: str
13
13
  number: int
14
14
  """
15
-
16
- from dataclasses import Field, asdict, dataclass, fields, make_dataclass
15
+ import dataclasses
17
16
  from datetime import date, datetime, timezone
18
17
  from typing import Any, Callable, Dict, Generic, Iterator, List, Optional, Type, TypeVar
19
18
 
20
19
  import numpy as np
21
20
  import pandas as pd
22
- from dataclasses_jsonschema import JsonSchemaMixin
21
+ from pydantic import create_model
22
+ from pydantic.fields import FieldInfo
23
+
23
24
  from hopeit.dataobjects import (
24
25
  DataObject,
25
26
  StreamEventMixin,
26
27
  StreamEventParams,
27
28
  dataobject,
29
+ fields,
28
30
  )
31
+ from hopeit.dataobjects.payload import Payload
29
32
 
30
33
  DataFrameT = TypeVar("DataFrameT")
31
34
 
32
35
 
33
- @dataclass
36
+ @dataclasses.dataclass
34
37
  class DataFrameMetadata(Generic[DataObject]):
35
38
  columns: List[str]
36
- fields: Dict[str, Field]
39
+ fields: Dict[str, FieldInfo]
37
40
  serialized_type: Type[DataObject]
38
41
 
39
42
 
40
- @dataclass
43
+ @dataclasses.dataclass
41
44
  class DataFrameParams:
42
45
  """
43
46
  Helper class used to access attributes in @dataframe
@@ -81,9 +84,10 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
81
84
  def __init_from_series__(
82
85
  self, **series: pd.Series
83
86
  ): # pylint: disable=bad-staticmethod-argument
84
- if self.__data_object__["validate"]:
85
- series = self._coerce_datatypes(series)
86
87
  df = pd.DataFrame(series)
88
+ df.index.name = None # Removes index name to avoid collisions with series name
89
+ if self.__data_object__["validate"]:
90
+ df = pd.DataFrame(self._coerce_datatypes(df))
87
91
  setattr(self, "__df", df[self.__dataframe__.columns])
88
92
 
89
93
  @classmethod
@@ -98,7 +102,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
98
102
 
99
103
  @classmethod
100
104
  def _from_dataobjects(cls, items: Iterator[DataObject]) -> DataFrameT:
101
- return cls._from_df(pd.DataFrame(asdict(item) for item in items)) # type: ignore
105
+ return cls._from_df(pd.DataFrame(Payload.to_obj(item) for item in items)) # type: ignore[misc]
102
106
 
103
107
  @classmethod
104
108
  def _from_df_unsafe(cls, df: pd.DataFrame, **series: pd.Series) -> DataFrameT:
@@ -120,35 +124,35 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
120
124
  for fields in self.__df.to_dict(orient="records")
121
125
  ]
122
126
 
123
- def to_json(self, *args, **kwargs) -> str:
124
- raise NotImplementedError(
125
- "Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
126
- )
127
-
128
- def to_dict(self, *args, **kwargs) -> Dict[str, Any]:
129
- raise NotImplementedError(
130
- "Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
131
- )
132
-
133
- @classmethod
134
- def from_json(cls, *args, **kwargs) -> DataObject:
135
- return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
136
-
137
- @classmethod
138
- def from_dict(
139
- cls,
140
- *args,
141
- **kwargs,
142
- ) -> DataObject:
143
- return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
144
-
145
- @classmethod
146
- def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
147
- if cls.__data_object__["schema"]:
148
- schema = cls.__dataframe__.serialized_type.json_schema(*args, **kwargs)
149
- schema[cls.__name__] = schema[cls.__dataframe__.serialized_type.__name__]
150
- return schema
151
- return {}
127
+ # def to_json(self, *args, **kwargs) -> str:
128
+ # raise NotImplementedError(
129
+ # "Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
130
+ # )
131
+
132
+ # def to_dict(self, *args, **kwargs) -> Dict[str, Any]:
133
+ # raise NotImplementedError(
134
+ # "Dataframe must be used inside `@dataobject(unsafe=True)` to be used as an output"
135
+ # )
136
+
137
+ # @classmethod
138
+ # def from_json(cls, *args, **kwargs) -> DataObject:
139
+ # return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
140
+
141
+ # @classmethod
142
+ # def from_dict(
143
+ # cls,
144
+ # *args,
145
+ # **kwargs,
146
+ # ) -> DataObject:
147
+ # return cls.__dataframe__.serialized_type.from_dict(*args, **kwargs)
148
+
149
+ # @classmethod
150
+ # def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
151
+ # if cls.__data_object__["schema"]:
152
+ # schema = cls.__dataframe__.serialized_type.json_schema(*args, **kwargs)
153
+ # schema[cls.__name__] = schema[cls.__dataframe__.serialized_type.__name__]
154
+ # return schema
155
+ # return {}
152
156
 
153
157
  def event_id(self, *args, **kwargs) -> str:
154
158
  return ""
@@ -171,9 +175,9 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
171
175
  else:
172
176
  object.__setattr__(self, name, value)
173
177
 
174
- def _coerce_datatypes(self, series: Dict[str, pd.Series]) -> Dict[str, pd.Series]:
178
+ def _coerce_datatypes(self, df: pd.DataFrame) -> Dict[str, pd.Series]:
175
179
  return {
176
- name: self.DATATYPE_MAPPING[field.type](series[name]) # type: ignore
180
+ name: self.DATATYPE_MAPPING[field.annotation](df[name]) # type: ignore
177
181
  for name, field in self.__dataframe__.fields.items()
178
182
  }
179
183
 
@@ -192,7 +196,7 @@ def dataframe(
192
196
  if hasattr(cls, "__annotations__") and hasattr(cls, "__dataclass_fields__"):
193
197
  amended_class = type(
194
198
  cls.__name__,
195
- (DataFrameMixin, JsonSchemaMixin) + cls.__mro__,
199
+ (DataFrameMixin, ) + cls.__mro__,
196
200
  dict(cls.__dict__),
197
201
  )
198
202
  setattr(amended_class, "__init__", DataFrameMixin.__init_from_series__)
@@ -200,16 +204,16 @@ def dataframe(
200
204
  return cls
201
205
 
202
206
  def add_dataframe_metadata(cls):
203
- serialized_fiels = [(field.name, field.type) for field in fields(cls)]
204
- serialized_type = make_dataclass(cls.__name__ + "_", serialized_fiels)
207
+ serialized_fields = {k: (v.annotation, v) for k, v in fields(cls).items()}
208
+ serialized_type = create_model(cls.__name__+"_", **serialized_fields)
205
209
  serialized_type = dataobject(serialized_type, unsafe=True)
206
210
 
207
211
  setattr(
208
212
  cls,
209
213
  "__dataframe__",
210
214
  DataFrameMetadata(
211
- columns=[field.name for field in fields(cls)],
212
- fields={field.name: field for field in fields(cls)},
215
+ columns=list(fields(cls).keys()),
216
+ fields=dict(fields(cls).items()),
213
217
  serialized_type=serialized_type,
214
218
  ),
215
219
  )
@@ -225,14 +229,14 @@ def dataframe(
225
229
  setattr(cls, "event_ts", StreamEventMixin.event_ts)
226
230
 
227
231
  def set_fields_optional(cls):
228
- for field in fields(cls):
232
+ for _, field in fields(cls).items():
229
233
  field.default = None
230
234
 
231
235
  def wrap(cls) -> Type[DataFrameMixin]:
232
236
  if hasattr(cls, "__dataframe__"):
233
237
  return cls
238
+ add_dataframe_metadata(cls)
234
239
  amended_class = add_dataframe_mixin(cls)
235
- add_dataframe_metadata(amended_class)
236
240
  add_dataobject_annotations(amended_class, unsafe, validate, schema)
237
241
  set_fields_optional(amended_class)
238
242
  return amended_class
@@ -5,7 +5,7 @@ Datasets behaves as DataObject so they can be used as payload
5
5
  for endpoints and streams.
6
6
  """
7
7
 
8
- from dataclasses import Field, dataclass, fields, make_dataclass
8
+ import dataclasses
9
9
  from typing import (
10
10
  Any,
11
11
  Callable,
@@ -20,19 +20,23 @@ from typing import (
20
20
  get_origin,
21
21
  )
22
22
 
23
+ from pydantic import TypeAdapter, create_model
24
+ from pydantic.fields import FieldInfo
25
+
23
26
  from hopeit.dataframes.serialization.dataset import Dataset
24
27
  from hopeit.dataobjects import (
25
28
  DataObject,
26
29
  StreamEventMixin,
27
30
  StreamEventParams,
28
31
  dataobject,
32
+ fields,
29
33
  )
30
34
 
31
35
  DataFrameObjectT = TypeVar("DataFrameObjectT")
32
36
  NoneType = type(None)
33
37
 
34
38
 
35
- @dataclass
39
+ @dataclasses.dataclass
36
40
  class DataFrameObjectMetadata(Generic[DataObject]):
37
41
  serialized_type: Type[DataObject]
38
42
 
@@ -57,15 +61,15 @@ class DataFrameObjectMixin(Generic[DataFrameObjectT]):
57
61
  and returns json-serializable dataobject
58
62
  """
59
63
  datasets = {}
60
- for field in fields(self): # type: ignore
61
- if _is_dataframe_field(field):
62
- dataframe = getattr(self, field.name)
64
+ for field_name, field in fields(self).items(): # type: ignore[arg-type]
65
+ if Dataset in {field.annotation, *get_args(field.annotation)}:
66
+ dataframe = getattr(self, field_name)
63
67
  dataset = (
64
68
  None if dataframe is None else await self.__storage.save(dataframe)
65
69
  )
66
- datasets[field.name] = dataset
70
+ datasets[field_name] = dataset
67
71
  else:
68
- datasets[field.name] = getattr(self, field.name)
72
+ datasets[field_name] = getattr(self, field_name)
69
73
  return self.__dataframeobject__.serialized_type(**datasets)
70
74
 
71
75
  @classmethod
@@ -75,45 +79,44 @@ class DataFrameObjectMixin(Generic[DataFrameObjectT]):
75
79
  """From a serialized datframeobject, load inner `@dataframe` objects
76
80
  and returns a `@dataframeobject` instance"""
77
81
  dataframes = {}
78
- for field in fields(cls): # type: ignore
79
- if _is_dataframe_field(field):
80
- dataset = getattr(serialized, field.name)
82
+ for field_name, field in fields(cls).items(): # type: ignore[type-var]
83
+ if Dataset in {field.annotation, *get_args(field.annotation)}:
84
+ dataset = getattr(serialized, field_name)
81
85
  dataframe = (
82
86
  None if dataset is None else await cls.__storage.load(dataset)
83
87
  )
84
- dataframes[field.name] = dataframe
88
+ dataframes[field_name] = dataframe
85
89
  else:
86
- dataframes[field.name] = getattr(serialized, field.name)
90
+ dataframes[field_name] = getattr(serialized, field_name)
87
91
  return cls(**dataframes)
88
92
 
89
93
  @classmethod
90
94
  def json_schema(cls, *args, **kwargs) -> Dict[str, Any]:
91
- schema = cls.__dataframeobject__.serialized_type.json_schema(*args, **kwargs)
92
- schema[cls.__name__] = schema[cls.__dataframeobject__.serialized_type.__name__]
95
+ schema = TypeAdapter(cls.__dataframeobject__.serialized_type).json_schema(*args, **kwargs)
93
96
  return schema
94
97
 
95
- def to_json(self, *args, **kwargs) -> Dict[str, Any]:
96
- raise RuntimeError(
97
- f"`{type(self).__name__}` `@dataframeobject` cannot be converted to json directly. "
98
- "i.e. use `return await DataFrames.serialize(obj)` to return it as a reponse."
99
- )
98
+ # def to_json(self, *args, **kwargs) -> Dict[str, Any]:
99
+ # raise RuntimeError(
100
+ # f"`{type(self).__name__}` `@dataframeobject` cannot be converted to json directly. "
101
+ # "i.e. use `return await DataFrames.serialize(obj)` to return it as a response."
102
+ # )
100
103
 
101
104
 
102
- def _is_dataframe_field(field: Field) -> bool:
105
+ def _is_dataframe_field(field: FieldInfo) -> bool:
103
106
  return any(
104
107
  hasattr(field_type, "__dataframe__")
105
- for field_type in [field.type, *get_args(field.type)]
108
+ for field_type in [field.annotation, *get_args(field.annotation)]
106
109
  )
107
110
 
108
111
 
109
- def _serialized_field_type(field: Field) -> Type[Any]:
112
+ def _serialized_field_type(field_name: str, field: FieldInfo) -> Optional[Type[Any]]:
110
113
  """Computes the `@dataobject` datatype used as a result
111
114
  of serialized `@dataframeobject`
112
115
  """
113
- if hasattr(field.type, "__dataframe__"):
116
+ if hasattr(field.annotation, "__dataframe__"):
114
117
  return Dataset
115
- if get_origin(field.type) is Union:
116
- args = get_args(field.type)
118
+ if get_origin(field.annotation) is Union:
119
+ args = get_args(field.annotation)
117
120
  if (
118
121
  len(args) == 2
119
122
  and any(hasattr(field_type, "__dataframe__") for field_type in args)
@@ -122,9 +125,9 @@ def _serialized_field_type(field: Field) -> Type[Any]:
122
125
  return Optional[Dataset] # type: ignore
123
126
  if _is_dataframe_field(field):
124
127
  raise TypeError(
125
- f"field {field.name}: only `DataFrameT` or `Optional[DataFrameT]` are supported"
128
+ f"field {field_name}: only `DataFrameT` or `Optional[DataFrameT]` are supported"
126
129
  )
127
- return field.type
130
+ return field.annotation
128
131
 
129
132
 
130
133
  def dataframeobject(
@@ -145,12 +148,12 @@ def dataframeobject(
145
148
  return cls
146
149
 
147
150
  def add_dataframeobject_metadata(cls):
148
- serialized_fiels = [
149
- (field.name, _serialized_field_type(field)) for field in fields(cls)
150
- ]
151
- serialized_type = make_dataclass(cls.__name__ + "_", serialized_fiels)
151
+ serialized_fields = {
152
+ field_name: (_serialized_field_type(field_name, field_info), field_info)
153
+ for field_name, field_info in fields(cls).items()
154
+ }
155
+ serialized_type = create_model(cls.__name__+"_", **serialized_fields)
152
156
  serialized_type = dataobject(serialized_type, unsafe=True)
153
-
154
157
  setattr(
155
158
  cls,
156
159
  "__dataframeobject__",
@@ -159,11 +162,11 @@ def dataframeobject(
159
162
  ),
160
163
  )
161
164
 
162
- def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool):
165
+ def add_dataobject_annotations(cls, unsafe: bool, schema: bool):
163
166
  setattr(
164
167
  cls,
165
168
  "__data_object__",
166
- {"unsafe": unsafe, "validate": validate, "schema": schema},
169
+ {"unsafe": unsafe, "schema": schema},
167
170
  )
168
171
  setattr(cls, "__stream_event__", StreamEventParams(None, None))
169
172
  setattr(cls, "event_id", StreamEventMixin.event_id)
@@ -172,10 +175,10 @@ def dataframeobject(
172
175
  def wrap(cls) -> Type[DataFrameObjectMixin]:
173
176
  if hasattr(cls, "__dataframeobject__"):
174
177
  return cls
178
+ add_dataframeobject_metadata(cls)
175
179
  amended_class = add_dataframe_mixin(cls)
176
- add_dataframeobject_metadata(amended_class)
177
180
  add_dataobject_annotations(
178
- amended_class, unsafe=False, validate=True, schema=True
181
+ amended_class, unsafe=False, schema=True
179
182
  )
180
183
  return amended_class
181
184
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.24.1
3
+ Version: 0.25.0b1
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
5
  Home-page: https://github.com/hopeit-git/hopeit.engine
6
6
  Author: Leo Smerling and Pablo Canto
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
26
  Classifier: Framework :: AsyncIO
27
27
  Requires-Python: >=3.8
28
28
  Description-Content-Type: text/markdown
29
- Requires-Dist: hopeit.engine[fs-storage]==0.24.1
29
+ Requires-Dist: hopeit.engine[fs-storage]==0.25.0b1
30
30
  Requires-Dist: pandas
31
31
  Requires-Dist: numpy
32
32
  Provides-Extra: pyarrow
@@ -0,0 +1,6 @@
1
+ hopeit.engine[fs-storage]==0.25.0b1
2
+ pandas
3
+ numpy
4
+
5
+ [pyarrow]
6
+ pyarrow
@@ -1,6 +0,0 @@
1
- hopeit.engine[fs-storage]==0.24.1
2
- pandas
3
- numpy
4
-
5
- [pyarrow]
6
- pyarrow