hopeit.dataframes 0.24.0__tar.gz → 0.24.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20) hide show
  1. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/PKG-INFO +2 -2
  2. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/setup.py +2 -0
  3. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/src/hopeit/dataframes/dataframe.py +5 -4
  4. hopeit.dataframes-0.24.2/src/hopeit/dataframes/serialization/__init__.py +0 -0
  5. hopeit.dataframes-0.24.2/src/hopeit/dataframes/serialization/dataset.py +28 -0
  6. hopeit.dataframes-0.24.2/src/hopeit/dataframes/serialization/files.py +116 -0
  7. hopeit.dataframes-0.24.2/src/hopeit/dataframes/serialization/settings.py +14 -0
  8. hopeit.dataframes-0.24.2/src/hopeit/dataframes/setup/__init__.py +0 -0
  9. hopeit.dataframes-0.24.2/src/hopeit/dataframes/setup/dataframes.py +52 -0
  10. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/src/hopeit.dataframes.egg-info/PKG-INFO +2 -2
  11. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/src/hopeit.dataframes.egg-info/SOURCES.txt +7 -1
  12. hopeit.dataframes-0.24.2/src/hopeit.dataframes.egg-info/requires.txt +6 -0
  13. hopeit.dataframes-0.24.0/src/hopeit.dataframes.egg-info/requires.txt +0 -6
  14. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/README.md +0 -0
  15. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/setup.cfg +0 -0
  16. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/src/hopeit/dataframes/__init__.py +0 -0
  17. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/src/hopeit/dataframes/dataframeobject.py +0 -0
  18. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/src/hopeit/dataframes/py.typed +0 -0
  19. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
  20. {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.2}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.24.0
3
+ Version: 0.24.2
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
5
  Home-page: https://github.com/hopeit-git/hopeit.engine
6
6
  Author: Leo Smerling and Pablo Canto
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
26
  Classifier: Framework :: AsyncIO
27
27
  Requires-Python: >=3.8
28
28
  Description-Content-Type: text/markdown
29
- Requires-Dist: hopeit.engine[fs-storage]==0.24.0
29
+ Requires-Dist: hopeit.engine[fs-storage]==0.24.2
30
30
  Requires-Dist: pandas
31
31
  Requires-Dist: numpy
32
32
  Provides-Extra: pyarrow
@@ -39,6 +39,8 @@ setuptools.setup(
39
39
  package_dir={"": "src"},
40
40
  packages=[
41
41
  "hopeit.dataframes",
42
+ "hopeit.dataframes.serialization",
43
+ "hopeit.dataframes.setup",
42
44
  ],
43
45
  include_package_data=True,
44
46
  package_data={
@@ -81,9 +81,10 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
81
81
  def __init_from_series__(
82
82
  self, **series: pd.Series
83
83
  ): # pylint: disable=bad-staticmethod-argument
84
- if self.__data_object__["validate"]:
85
- series = self._coerce_datatypes(series)
86
84
  df = pd.DataFrame(series)
85
+ df.index.name = None # Removes index name to avoid collisions with series name
86
+ if self.__data_object__["validate"]:
87
+ df = pd.DataFrame(self._coerce_datatypes(df))
87
88
  setattr(self, "__df", df[self.__dataframe__.columns])
88
89
 
89
90
  @classmethod
@@ -171,9 +172,9 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
171
172
  else:
172
173
  object.__setattr__(self, name, value)
173
174
 
174
- def _coerce_datatypes(self, series: Dict[str, pd.Series]) -> Dict[str, pd.Series]:
175
+ def _coerce_datatypes(self, df: pd.DataFrame) -> Dict[str, pd.Series]:
175
176
  return {
176
- name: self.DATATYPE_MAPPING[field.type](series[name]) # type: ignore
177
+ name: self.DATATYPE_MAPPING[field.type](df[name]) # type: ignore
177
178
  for name, field in self.__dataframe__.fields.items()
178
179
  }
179
180
 
@@ -0,0 +1,28 @@
1
+ """Dataset objects definition, used as a result of serialized dataframes
2
+ """
3
+
4
+ from importlib import import_module
5
+ from typing import Type, TypeVar
6
+
7
+ from hopeit.dataobjects import dataclass, dataobject
8
+
9
+ DataFrameT = TypeVar("DataFrameT")
10
+
11
+
12
+ @dataobject
13
+ @dataclass
14
+ class Dataset:
15
+ protocol: str
16
+ partition_key: str
17
+ key: str
18
+ datatype: str
19
+
20
+
21
+ def find_protocol_impl(qual_type_name: str) -> Type:
22
+ mod_name, type_name = (
23
+ ".".join(qual_type_name.split(".")[:-1]),
24
+ qual_type_name.split(".")[-1],
25
+ )
26
+ module = import_module(mod_name)
27
+ datatype = getattr(module, type_name)
28
+ return datatype
@@ -0,0 +1,116 @@
1
+ """Support for `@dataframes` serialization to files
2
+ """
3
+
4
+ import io
5
+ from importlib import import_module
6
+ from typing import Callable, Generic, Optional, Type, TypeVar, Union
7
+ from uuid import uuid4
8
+
9
+ import pandas as pd
10
+
11
+ try:
12
+ import pyarrow # type: ignore # noqa # pylint: disable=unused-import
13
+ except ImportError as e:
14
+ raise ImportError(
15
+ "`pyarrow` needs to be installed to use `DatasetFileStorage`",
16
+ "Run `pip install hopeit.dataframes[pyarrow]`",
17
+ ) from e
18
+
19
+ from hopeit.dataframes.dataframe import DataFrameMixin
20
+ from hopeit.dataframes.serialization.dataset import Dataset
21
+ from hopeit.dataobjects import EventPayloadType
22
+ from hopeit.fs_storage import FileStorage
23
+
24
+ DataFrameT = TypeVar("DataFrameT", bound=DataFrameMixin)
25
+
26
+
27
+ class DatasetFileStorage(Generic[DataFrameT]):
28
+ """Support to store dataframes as files,
29
+ using pandas parquet format support in combination
30
+ with `hopeit.engine` file storage plugins
31
+ """
32
+
33
+ def __init__(self, *, location: str, partition_dateformat: Optional[str], **kwargs):
34
+ self.storage: FileStorage = FileStorage(
35
+ path=location, partition_dateformat=partition_dateformat
36
+ )
37
+
38
+ async def save(self, dataframe: DataFrameT) -> Dataset:
39
+ """Saves @dataframe annotated object as parquet to file system
40
+ and returns Dataset metadata to be used for retrieval
41
+ """
42
+ datatype = type(dataframe)
43
+ key = f"{datatype.__qualname__.lower()}_{uuid4()}.parquet"
44
+ data = io.BytesIO(
45
+ dataframe._df.to_parquet( # pylint: disable=protected-access
46
+ engine="pyarrow"
47
+ )
48
+ )
49
+ location = await self.storage.store_file(file_name=key, value=data)
50
+ partition_key = self.storage.partition_key(location)
51
+
52
+ return Dataset(
53
+ protocol=f"{__name__}.{type(self).__name__}",
54
+ partition_key=partition_key,
55
+ key=key,
56
+ datatype=f"{datatype.__module__}.{datatype.__qualname__}",
57
+ )
58
+
59
+ async def load(self, dataset: Dataset) -> EventPayloadType:
60
+ """Loads @dataframe annotated object using Dataset metadata"""
61
+ datatype: Type[DataFrameT] = find_dataframe_type(dataset.datatype)
62
+ data = await self.storage.get_file(
63
+ dataset.key, partition_key=dataset.partition_key
64
+ )
65
+ if data is None:
66
+ raise FileNotFoundError(dataset.key)
67
+ df = pd.read_parquet(io.BytesIO(data), engine="pyarrow")
68
+ return datatype._from_df(df) # pylint: disable=protected-access
69
+
70
+ async def ser_wrapper(
71
+ self,
72
+ base_serialization: Callable,
73
+ data: Union[EventPayloadType, DataFrameT],
74
+ level: int,
75
+ ) -> bytes:
76
+ """Serialization wrapper that plugs into hopeit.engine
77
+ serialization when dataframes plugin is initialized
78
+ """
79
+ if hasattr(data, "__dataframeobject__"):
80
+ data = await data._serialize() # type: ignore # pylint: disable=protected-access
81
+ if hasattr(data, "__dataframe__"):
82
+ data = await self.save(data) # type: ignore
83
+ return await base_serialization(data, level)
84
+
85
+ async def deser_wrapper(
86
+ self,
87
+ base_deserialization: Callable,
88
+ data: bytes,
89
+ datatype: Union[Type[EventPayloadType], Type[DataFrameT]],
90
+ ) -> Union[EventPayloadType, DataFrameT]:
91
+ """Deserialization wrapper that plugs into hopeit.engine
92
+ deserialization when dataframes plugin is initialized
93
+ """
94
+ if hasattr(datatype, "__dataframeobject__"):
95
+ dataset = await base_deserialization(
96
+ data, datatype.__dataframeobject__.serialized_type # type: ignore
97
+ )
98
+ return await datatype._deserialize(dataset) # type: ignore # pylint: disable=protected-access
99
+ if hasattr(datatype, "__dataframe__"):
100
+ dataset = await base_deserialization(data, Dataset)
101
+ return await self.load(dataset)
102
+ return await base_deserialization(data, datatype)
103
+
104
+
105
+ def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
106
+ """Returns dataframe class based on type name used during serialization"""
107
+ mod_name, type_name = (
108
+ ".".join(qual_type_name.split(".")[:-1]),
109
+ qual_type_name.split(".")[-1],
110
+ )
111
+ module = import_module(mod_name)
112
+ datatype = getattr(module, type_name)
113
+ assert hasattr(
114
+ datatype, "__dataframe__"
115
+ ), f"Type {qual_type_name} must be annotated with `@dataframe`."
116
+ return datatype
@@ -0,0 +1,14 @@
1
+ """Support for plugin configuration
2
+ """
3
+
4
+ from typing import Optional
5
+
6
+ from hopeit.dataobjects import dataclass, dataobject
7
+
8
+
9
+ @dataobject
10
+ @dataclass
11
+ class DatasetSerialization:
12
+ protocol: str
13
+ location: str
14
+ partition_dateformat: Optional[str] = None
@@ -0,0 +1,52 @@
1
+ """hopeit.engine dataframes plugin SETUP event.
2
+
3
+ This event executes when engine starts with dataframes plugin configuration file loaded,
4
+ and ensures that the engine will support serialization of `@dataframe` and `@dataframeobject`
5
+ types
6
+ """
7
+
8
+ from functools import partial
9
+
10
+ from hopeit.app.context import EventContext
11
+ from hopeit.app.logger import app_logger
12
+ from hopeit.dataframes.dataframeobject import DataFrameObjectMixin
13
+ from hopeit.dataframes.serialization.dataset import find_protocol_impl
14
+ from hopeit.dataframes.serialization.settings import DatasetSerialization
15
+ from hopeit.server import serialization
16
+
17
+ logger = app_logger()
18
+
19
+ __steps__ = ["register_serialization"]
20
+
21
+
22
+ def register_serialization(payload: None, context: EventContext) -> None:
23
+ """Sets up serialization wrappers in hopeit.engine based on
24
+ `DatasetSerialization` settings configured in plugin configuration file
25
+ """
26
+ logger.info(context, "Registering serialization methods...")
27
+
28
+ settings: DatasetSerialization = context.settings(
29
+ key="dataset_serialization", datatype=DatasetSerialization
30
+ )
31
+ impl = find_protocol_impl(settings.protocol)
32
+
33
+ storage = impl(
34
+ protocol=settings.protocol,
35
+ location=settings.location,
36
+ partition_dateformat=settings.partition_dateformat,
37
+ )
38
+ setattr(DataFrameObjectMixin, "_DataFrameObjectMixin__storage", storage)
39
+
40
+ serdeser_wrappers = {}
41
+ for (
42
+ serdeser,
43
+ methods,
44
+ ) in serialization._SERDESER.items(): # pylint: disable=protected-access
45
+ serdeser_wrappers[serdeser] = (
46
+ partial(storage.ser_wrapper, methods[0]),
47
+ methods[1],
48
+ partial(storage.deser_wrapper, methods[2]),
49
+ )
50
+
51
+ for serdeser, methods in serdeser_wrappers.items():
52
+ serialization._SERDESER[serdeser] = methods # pylint: disable=protected-access
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.24.0
3
+ Version: 0.24.2
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
5
  Home-page: https://github.com/hopeit-git/hopeit.engine
6
6
  Author: Leo Smerling and Pablo Canto
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
26
  Classifier: Framework :: AsyncIO
27
27
  Requires-Python: >=3.8
28
28
  Description-Content-Type: text/markdown
29
- Requires-Dist: hopeit.engine[fs-storage]==0.24.0
29
+ Requires-Dist: hopeit.engine[fs-storage]==0.24.2
30
30
  Requires-Dist: pandas
31
31
  Requires-Dist: numpy
32
32
  Provides-Extra: pyarrow
@@ -8,4 +8,10 @@ src/hopeit.dataframes.egg-info/top_level.txt
8
8
  src/hopeit/dataframes/__init__.py
9
9
  src/hopeit/dataframes/dataframe.py
10
10
  src/hopeit/dataframes/dataframeobject.py
11
- src/hopeit/dataframes/py.typed
11
+ src/hopeit/dataframes/py.typed
12
+ src/hopeit/dataframes/serialization/__init__.py
13
+ src/hopeit/dataframes/serialization/dataset.py
14
+ src/hopeit/dataframes/serialization/files.py
15
+ src/hopeit/dataframes/serialization/settings.py
16
+ src/hopeit/dataframes/setup/__init__.py
17
+ src/hopeit/dataframes/setup/dataframes.py
@@ -0,0 +1,6 @@
1
+ hopeit.engine[fs-storage]==0.24.2
2
+ pandas
3
+ numpy
4
+
5
+ [pyarrow]
6
+ pyarrow
@@ -1,6 +0,0 @@
1
- hopeit.engine[fs-storage]==0.24.0
2
- pandas
3
- numpy
4
-
5
- [pyarrow]
6
- pyarrow