hopeit.dataframes 0.24.0__tar.gz → 0.24.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/PKG-INFO +2 -2
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/setup.py +2 -0
- hopeit.dataframes-0.24.1/src/hopeit/dataframes/serialization/__init__.py +0 -0
- hopeit.dataframes-0.24.1/src/hopeit/dataframes/serialization/dataset.py +28 -0
- hopeit.dataframes-0.24.1/src/hopeit/dataframes/serialization/files.py +116 -0
- hopeit.dataframes-0.24.1/src/hopeit/dataframes/serialization/settings.py +14 -0
- hopeit.dataframes-0.24.1/src/hopeit/dataframes/setup/__init__.py +0 -0
- hopeit.dataframes-0.24.1/src/hopeit/dataframes/setup/dataframes.py +52 -0
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit.dataframes.egg-info/PKG-INFO +2 -2
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit.dataframes.egg-info/SOURCES.txt +7 -1
- hopeit.dataframes-0.24.1/src/hopeit.dataframes.egg-info/requires.txt +6 -0
- hopeit.dataframes-0.24.0/src/hopeit.dataframes.egg-info/requires.txt +0 -6
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/README.md +0 -0
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/setup.cfg +0 -0
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit/dataframes/__init__.py +0 -0
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit/dataframes/dataframe.py +0 -0
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit/dataframes/dataframeobject.py +0 -0
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit/dataframes/py.typed +0 -0
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
- {hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.24.0
|
|
3
|
+
Version: 0.24.1
|
|
4
4
|
Summary: Hopeit Engine Dataframes Toolkit
|
|
5
5
|
Home-page: https://github.com/hopeit-git/hopeit.engine
|
|
6
6
|
Author: Leo Smerling and Pablo Canto
|
|
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
26
26
|
Classifier: Framework :: AsyncIO
|
|
27
27
|
Requires-Python: >=3.8
|
|
28
28
|
Description-Content-Type: text/markdown
|
|
29
|
-
Requires-Dist: hopeit.engine[fs-storage]==0.24.0
|
|
29
|
+
Requires-Dist: hopeit.engine[fs-storage]==0.24.1
|
|
30
30
|
Requires-Dist: pandas
|
|
31
31
|
Requires-Dist: numpy
|
|
32
32
|
Provides-Extra: pyarrow
|
|
File without changes
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Dataset objects definition, used as a result of serialized dataframes
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
from importlib import import_module
|
|
5
|
+
from typing import Type, TypeVar
|
|
6
|
+
|
|
7
|
+
from hopeit.dataobjects import dataclass, dataobject
|
|
8
|
+
|
|
9
|
+
DataFrameT = TypeVar("DataFrameT")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataobject
@dataclass
class Dataset:
    """Metadata describing a dataframe that was serialized to storage.

    Field meanings below reflect how `DatasetFileStorage.save` populates
    them; other storage implementations may use them differently.
    """

    # Qualified name (module + class name) of the storage implementation
    protocol: str
    # Partition key reported by the storage for the stored file
    partition_key: str
    # Name of the stored file
    key: str
    # Qualified name (module + qualname) of the saved dataframe class
    datatype: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def find_protocol_impl(qual_type_name: str) -> Type:
    """Resolve a fully qualified dotted name to the object it names.

    The text before the last dot is imported as a module and the text
    after it is looked up as an attribute of that module.

    Args:
        qual_type_name: dotted name such as `package.module.TypeName`.

    Returns:
        The attribute found in the imported module.

    Raises:
        ValueError: if `qual_type_name` has no module part.
        ImportError: if the module part cannot be imported.
        AttributeError: if the module has no attribute with that name.
    """
    mod_name, _, type_name = qual_type_name.rpartition(".")
    if not mod_name:
        # Without this guard `import_module("")` fails with the opaque
        # "Empty module name" ValueError; keep the type, improve the message.
        raise ValueError(
            "Expected a fully qualified name like `package.module.TypeName`, "
            f"got {qual_type_name!r}"
        )
    module = import_module(mod_name)
    return getattr(module, type_name)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Support for `@dataframes` serialization to files
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
import io
|
|
5
|
+
from importlib import import_module
|
|
6
|
+
from typing import Callable, Generic, Optional, Type, TypeVar, Union
|
|
7
|
+
from uuid import uuid4
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
|
|
11
|
+
try:
    import pyarrow  # type: ignore # noqa # pylint: disable=unused-import
except ImportError as e:
    # Build ONE message string: passing two positional arguments makes
    # ImportError render them as a tuple instead of readable text.
    raise ImportError(
        "`pyarrow` needs to be installed to use `DatasetFileStorage`. "
        "Run `pip install hopeit.dataframes[pyarrow]`"
    ) from e
|
|
18
|
+
|
|
19
|
+
from hopeit.dataframes.dataframe import DataFrameMixin
|
|
20
|
+
from hopeit.dataframes.serialization.dataset import Dataset
|
|
21
|
+
from hopeit.dataobjects import EventPayloadType
|
|
22
|
+
from hopeit.fs_storage import FileStorage
|
|
23
|
+
|
|
24
|
+
DataFrameT = TypeVar("DataFrameT", bound=DataFrameMixin)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class DatasetFileStorage(Generic[DataFrameT]):
    """Parquet-backed file storage for `@dataframe` objects.

    Dataframes are written and read as parquet through pandas (pyarrow
    engine), using a `hopeit.engine` `FileStorage` for file access.
    """

    def __init__(self, *, location: str, partition_dateformat: Optional[str], **kwargs):
        # Extra keyword arguments are accepted but not used here.
        self.storage: FileStorage = FileStorage(
            path=location,
            partition_dateformat=partition_dateformat,
        )

    async def save(self, dataframe: DataFrameT) -> Dataset:
        """Write a `@dataframe` object as a parquet file and describe it.

        The file name combines the lowercased class qualname with a random
        UUID. The returned `Dataset` holds what `load` needs to read it back.
        """
        df_type = type(dataframe)
        file_name = f"{df_type.__qualname__.lower()}_{uuid4()}.parquet"
        parquet_bytes = dataframe._df.to_parquet(  # pylint: disable=protected-access
            engine="pyarrow"
        )
        stored_at = await self.storage.store_file(
            file_name=file_name, value=io.BytesIO(parquet_bytes)
        )
        return Dataset(
            protocol=f"{__name__}.{type(self).__name__}",
            partition_key=self.storage.partition_key(stored_at),
            key=file_name,
            datatype=f"{df_type.__module__}.{df_type.__qualname__}",
        )

    async def load(self, dataset: Dataset) -> EventPayloadType:
        """Rebuild a `@dataframe` object from the file a `Dataset` points to.

        Raises `FileNotFoundError` when the storage returns no data for
        the dataset key.
        """
        # Resolve the target class first, so a bad type name fails before any I/O.
        df_type: Type[DataFrameT] = find_dataframe_type(dataset.datatype)
        raw = await self.storage.get_file(
            dataset.key, partition_key=dataset.partition_key
        )
        if raw is None:
            raise FileNotFoundError(dataset.key)
        frame = pd.read_parquet(io.BytesIO(raw), engine="pyarrow")
        return df_type._from_df(frame)  # pylint: disable=protected-access

    async def ser_wrapper(
        self,
        base_serialization: Callable,
        data: Union[EventPayloadType, DataFrameT],
        level: int,
    ) -> bytes:
        """Serialization wrapper plugged into hopeit.engine serialization
        when the dataframes plugin is initialized.

        Dataframe objects are first converted through `_serialize`;
        dataframes are saved and replaced by their `Dataset`. Whatever
        results is passed to `base_serialization`.
        """
        payload = data
        if hasattr(payload, "__dataframeobject__"):
            payload = await payload._serialize()  # type: ignore # pylint: disable=protected-access
        # Checked after the step above on purpose: its result may itself be a dataframe.
        if hasattr(payload, "__dataframe__"):
            payload = await self.save(payload)  # type: ignore
        return await base_serialization(payload, level)

    async def deser_wrapper(
        self,
        base_deserialization: Callable,
        data: bytes,
        datatype: Union[Type[EventPayloadType], Type[DataFrameT]],
    ) -> Union[EventPayloadType, DataFrameT]:
        """Deserialization wrapper plugged into hopeit.engine deserialization
        when the dataframes plugin is initialized.

        Dataframe object types and dataframe types are rebuilt from their
        serialized form; any other type goes straight to `base_deserialization`.
        """
        if hasattr(datatype, "__dataframeobject__"):
            serialized_type = datatype.__dataframeobject__.serialized_type  # type: ignore
            stored = await base_deserialization(data, serialized_type)
            return await datatype._deserialize(stored)  # type: ignore # pylint: disable=protected-access
        if hasattr(datatype, "__dataframe__"):
            stored = await base_deserialization(data, Dataset)
            return await self.load(stored)
        return await base_deserialization(data, datatype)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
    """Returns dataframe class based on type name used during serialization

    The text before the last dot of `qual_type_name` is imported as a
    module and the text after it is looked up as an attribute of that module.

    Raises:
        AssertionError: if the resolved object has no `__dataframe__`
            attribute, i.e. it was not annotated with `@dataframe`.
    """
    mod_name, _, type_name = qual_type_name.rpartition(".")
    datatype = getattr(import_module(mod_name), type_name)
    if not hasattr(datatype, "__dataframe__"):
        # Raised explicitly instead of using `assert`, which is stripped under
        # `python -O`; the exception type and message are kept for callers.
        raise AssertionError(
            f"Type {qual_type_name} must be annotated with `@dataframe`."
        )
    return datatype
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Support for plugin configuration
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from hopeit.dataobjects import dataclass, dataobject
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataobject
@dataclass
class DatasetSerialization:
    """Plugin settings that select and configure the dataset storage
    implementation used to serialize dataframes.
    """

    # Qualified name (module + class name) of the storage implementation to load
    protocol: str
    # Location passed to the storage implementation
    location: str
    # Optional partition date format passed to the storage implementation
    partition_dateformat: Optional[str] = None
|
|
File without changes
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""hopeit.engine dataframes plugin SETUP event.
|
|
2
|
+
|
|
3
|
+
This event executes when engine starts with dataframes plugin configuration file loaded,
|
|
4
|
+
and ensures that the engine will support serialization of `@dataframe` and `@dataframeobject`
|
|
5
|
+
types
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from functools import partial
|
|
9
|
+
|
|
10
|
+
from hopeit.app.context import EventContext
|
|
11
|
+
from hopeit.app.logger import app_logger
|
|
12
|
+
from hopeit.dataframes.dataframeobject import DataFrameObjectMixin
|
|
13
|
+
from hopeit.dataframes.serialization.dataset import find_protocol_impl
|
|
14
|
+
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
15
|
+
from hopeit.server import serialization
|
|
16
|
+
|
|
17
|
+
logger = app_logger()
|
|
18
|
+
|
|
19
|
+
__steps__ = ["register_serialization"]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def register_serialization(payload: None, context: EventContext) -> None:
    """Sets up serialization wrappers in hopeit.engine based on
    `DatasetSerialization` settings configured in plugin configuration file

    The storage implementation named in the settings is instantiated,
    attached to `DataFrameObjectMixin`, and used to wrap the first and third
    method of every entry registered in `serialization._SERDESER`.
    """
    logger.info(context, "Registering serialization methods...")

    config: DatasetSerialization = context.settings(
        key="dataset_serialization", datatype=DatasetSerialization
    )
    storage_class = find_protocol_impl(config.protocol)
    storage = storage_class(
        protocol=config.protocol,
        location=config.location,
        partition_dateformat=config.partition_dateformat,
    )
    # Sets the name-mangled private attribute `__storage` of the mixin class.
    setattr(DataFrameObjectMixin, "_DataFrameObjectMixin__storage", storage)

    registry = serialization._SERDESER  # pylint: disable=protected-access
    # Build all wrapped entries first, then write them back in a second pass.
    wrapped = {
        name: (
            partial(storage.ser_wrapper, funcs[0]),
            funcs[1],
            partial(storage.deser_wrapper, funcs[2]),
        )
        for name, funcs in registry.items()
    }
    for name, funcs in wrapped.items():
        registry[name] = funcs
|
{hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit.dataframes.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.24.0
|
|
3
|
+
Version: 0.24.1
|
|
4
4
|
Summary: Hopeit Engine Dataframes Toolkit
|
|
5
5
|
Home-page: https://github.com/hopeit-git/hopeit.engine
|
|
6
6
|
Author: Leo Smerling and Pablo Canto
|
|
@@ -26,7 +26,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
26
26
|
Classifier: Framework :: AsyncIO
|
|
27
27
|
Requires-Python: >=3.8
|
|
28
28
|
Description-Content-Type: text/markdown
|
|
29
|
-
Requires-Dist: hopeit.engine[fs-storage]==0.24.0
|
|
29
|
+
Requires-Dist: hopeit.engine[fs-storage]==0.24.1
|
|
30
30
|
Requires-Dist: pandas
|
|
31
31
|
Requires-Dist: numpy
|
|
32
32
|
Provides-Extra: pyarrow
|
{hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit.dataframes.egg-info/SOURCES.txt
RENAMED
|
@@ -8,4 +8,10 @@ src/hopeit.dataframes.egg-info/top_level.txt
|
|
|
8
8
|
src/hopeit/dataframes/__init__.py
|
|
9
9
|
src/hopeit/dataframes/dataframe.py
|
|
10
10
|
src/hopeit/dataframes/dataframeobject.py
|
|
11
|
-
src/hopeit/dataframes/py.typed
|
|
11
|
+
src/hopeit/dataframes/py.typed
|
|
12
|
+
src/hopeit/dataframes/serialization/__init__.py
|
|
13
|
+
src/hopeit/dataframes/serialization/dataset.py
|
|
14
|
+
src/hopeit/dataframes/serialization/files.py
|
|
15
|
+
src/hopeit/dataframes/serialization/settings.py
|
|
16
|
+
src/hopeit/dataframes/setup/__init__.py
|
|
17
|
+
src/hopeit/dataframes/setup/dataframes.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit/dataframes/dataframeobject.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hopeit.dataframes-0.24.0 → hopeit.dataframes-0.24.1}/src/hopeit.dataframes.egg-info/top_level.txt
RENAMED
|
File without changes
|