hopeit.dataframes 0.25.3__tar.gz → 0.25.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/PKG-INFO +2 -2
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/__init__.py +2 -1
- hopeit_dataframes-0.25.4/src/hopeit/dataframes/datablocks.py +140 -0
- hopeit_dataframes-0.25.4/src/hopeit/dataframes/serialization/dataset.py +93 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/serialization/files.py +25 -21
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit.dataframes.egg-info/PKG-INFO +2 -2
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit.dataframes.egg-info/SOURCES.txt +1 -0
- hopeit_dataframes-0.25.4/src/hopeit.dataframes.egg-info/requires.txt +6 -0
- hopeit_dataframes-0.25.3/src/hopeit/dataframes/serialization/dataset.py +0 -48
- hopeit_dataframes-0.25.3/src/hopeit.dataframes.egg-info/requires.txt +0 -6
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/README.md +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/pyproject.toml +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/setup.cfg +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/setup.py +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/dataframe.py +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/py.typed +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/serialization/__init__.py +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/serialization/py.typed +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/serialization/settings.py +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/setup/__init__.py +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/setup/dataframes.py +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/setup/py.typed +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
- {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.25.3
|
|
3
|
+
Version: 0.25.4
|
|
4
4
|
Summary: Hopeit Engine Dataframes Toolkit
|
|
5
5
|
Author-email: Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
|
|
6
6
|
License: Apache 2
|
|
@@ -24,7 +24,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
24
24
|
Classifier: Framework :: AsyncIO
|
|
25
25
|
Requires-Python: >=3.9
|
|
26
26
|
Description-Content-Type: text/plain
|
|
27
|
-
Requires-Dist: hopeit.engine[fs-storage]==0.25.3
|
|
27
|
+
Requires-Dist: hopeit.engine[fs-storage]==0.25.4
|
|
28
28
|
Requires-Dist: pandas
|
|
29
29
|
Requires-Dist: numpy
|
|
30
30
|
Provides-Extra: pyarrow
|
|
@@ -77,12 +77,13 @@ from typing import Dict, Generic, Iterator, List, Type
|
|
|
77
77
|
import numpy as np
|
|
78
78
|
import pandas as pd
|
|
79
79
|
from hopeit.dataframes.dataframe import DataFrameT, dataframe
|
|
80
|
+
from hopeit.dataframes.datablocks import DataBlocks
|
|
80
81
|
from hopeit.dataframes.serialization.dataset import Dataset
|
|
81
82
|
from hopeit.dataframes.serialization.settings import DatasetSerialization
|
|
82
83
|
from hopeit.dataframes.setup.dataframes import register_serialization
|
|
83
84
|
from hopeit.dataobjects import DataObject
|
|
84
85
|
|
|
85
|
-
__all__ = ["DataFrames", "Dataset", "dataframe"]
|
|
86
|
+
__all__ = ["DataBlocks", "DataFrames", "Dataset", "dataframe"]
|
|
86
87
|
|
|
87
88
|
|
|
88
89
|
class DataFrames(Generic[DataFrameT, DataObject]):
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from typing import Generic, Optional, Type, TypeVar, get_args, get_origin
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from hopeit.dataobjects import fields
|
|
5
|
+
|
|
6
|
+
from hopeit.dataframes.serialization.dataset import Dataset, DatasetLoadError, find_dataframe_type
|
|
7
|
+
|
|
8
|
+
DataBlockType = TypeVar("DataBlockType")
|
|
9
|
+
DataBlockItemType = TypeVar("DataBlockItemType")
|
|
10
|
+
DataFrameType = TypeVar("DataFrameType")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TempDataBlock(Generic[DataBlockType, DataBlockItemType]):
|
|
14
|
+
def __init__(self, datatype: Type[DataBlockType], df: pd.DataFrame):
|
|
15
|
+
self.datatype = datatype
|
|
16
|
+
self.df = df
|
|
17
|
+
|
|
18
|
+
@classmethod
|
|
19
|
+
def from_dataobjects(
|
|
20
|
+
cls, datatype: Type[DataBlockType], items: list[DataBlockItemType]
|
|
21
|
+
) -> "TempDataBlock[DataBlockType, DataBlockItemType]":
|
|
22
|
+
result_df: Optional[pd.DataFrame] = None
|
|
23
|
+
for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
|
|
24
|
+
if get_origin(field_info.annotation) is Dataset:
|
|
25
|
+
block_items = (getattr(item, field_name) for item in items)
|
|
26
|
+
block_type = get_args(field_info.annotation)[0]
|
|
27
|
+
block = block_type._from_dataobjects(block_items)
|
|
28
|
+
block_df = block._df
|
|
29
|
+
else:
|
|
30
|
+
block_df = pd.DataFrame({field_name: [getattr(item, field_name) for item in items]})
|
|
31
|
+
|
|
32
|
+
if result_df is None:
|
|
33
|
+
result_df = block_df
|
|
34
|
+
else:
|
|
35
|
+
# Skips duplicated column names to they are included only once
|
|
36
|
+
result_df = result_df.join(
|
|
37
|
+
block_df[[col for col in block_df.columns if col not in result_df.columns]]
|
|
38
|
+
)
|
|
39
|
+
assert result_df is not None
|
|
40
|
+
return cls(datatype, result_df)
|
|
41
|
+
|
|
42
|
+
def to_dataobjects(
|
|
43
|
+
self, item_type: Type[DataBlockItemType], *, normalize_null_values: bool = False
|
|
44
|
+
) -> list[DataBlockItemType]:
|
|
45
|
+
keys: list[str] = []
|
|
46
|
+
entries: list[list] = []
|
|
47
|
+
for field_name, field_info in fields(self.datatype).items(): # type: ignore[type-var]
|
|
48
|
+
if get_origin(field_info.annotation) is Dataset:
|
|
49
|
+
block_type = get_args(field_info.annotation)[0]
|
|
50
|
+
keys.append(field_name)
|
|
51
|
+
dataframe = block_type._from_df(self.df)
|
|
52
|
+
entries.append(
|
|
53
|
+
dataframe._to_dataobjects(normalize_null_values=normalize_null_values)
|
|
54
|
+
)
|
|
55
|
+
else:
|
|
56
|
+
keys.append(field_name)
|
|
57
|
+
entries.append(self.df[field_name].to_list())
|
|
58
|
+
|
|
59
|
+
return [
|
|
60
|
+
item_type(**{field_name: entry[i] for i, field_name in enumerate(keys)})
|
|
61
|
+
for entry in zip(*entries)
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
66
|
+
@classmethod
|
|
67
|
+
async def df(cls, datablock: DataBlockType, select: Optional[list[str]] = None) -> pd.DataFrame:
|
|
68
|
+
keys = [
|
|
69
|
+
field_name
|
|
70
|
+
for field_name, field_info in fields(datablock).items() # type: ignore[arg-type]
|
|
71
|
+
if get_origin(field_info.annotation) is Dataset
|
|
72
|
+
and (select is None or field_name in select)
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
# Filter/validate selected field names using saved schema,
|
|
76
|
+
# generates a single field for every common/duplicated field in the datasets
|
|
77
|
+
field_names = list(
|
|
78
|
+
dict.fromkeys(
|
|
79
|
+
[
|
|
80
|
+
field_name
|
|
81
|
+
for key in keys
|
|
82
|
+
for field_name in getattr(datablock, key).schema["properties"].keys()
|
|
83
|
+
]
|
|
84
|
+
)
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
# Load data from first dataset (datablock uses a single file for all datasets)
|
|
88
|
+
dataset: Dataset = getattr(datablock, keys[0])
|
|
89
|
+
result_df = await DataBlocks._load_datablock_df(dataset, field_names)
|
|
90
|
+
|
|
91
|
+
# Add missing optional fields using class schema (allows schema evolution)
|
|
92
|
+
cls._adapt_to_schema(datablock, keys, result_df)
|
|
93
|
+
|
|
94
|
+
# Adding constant value fields
|
|
95
|
+
for field_name, field_info in fields(datablock).items(): # type: ignore[arg-type]
|
|
96
|
+
if get_origin(field_info.annotation) is not Dataset:
|
|
97
|
+
result_df[field_name] = getattr(datablock, field_name) # type: ignore[index]
|
|
98
|
+
|
|
99
|
+
return result_df
|
|
100
|
+
|
|
101
|
+
@staticmethod
|
|
102
|
+
async def from_df(
|
|
103
|
+
datatype: Type[DataBlockType],
|
|
104
|
+
df: pd.DataFrame,
|
|
105
|
+
**kwargs, # Non-Dataset field values for DataBlockType
|
|
106
|
+
) -> DataBlockType:
|
|
107
|
+
blocks = {}
|
|
108
|
+
block_dataset = await Dataset._save_df(df, datatype)
|
|
109
|
+
for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
|
|
110
|
+
if get_origin(field_info.annotation) is Dataset:
|
|
111
|
+
block_type = get_args(field_info.annotation)[0]
|
|
112
|
+
blocks[field_name] = block_dataset._adapt(block_type)
|
|
113
|
+
else:
|
|
114
|
+
blocks[field_name] = kwargs[field_name]
|
|
115
|
+
|
|
116
|
+
return datatype(**blocks)
|
|
117
|
+
|
|
118
|
+
@staticmethod
|
|
119
|
+
def default(datatype: Type[DataBlockType]) -> DataBlockType:
|
|
120
|
+
return datatype(**{field_name: [] for field_name in list(fields(datatype))}) # type: ignore[type-var]
|
|
121
|
+
|
|
122
|
+
@staticmethod
|
|
123
|
+
async def _load_datablock_df(
|
|
124
|
+
dataset: Dataset, columns: Optional[list[str]] = None
|
|
125
|
+
) -> pd.DataFrame:
|
|
126
|
+
try:
|
|
127
|
+
return await dataset._load_df(columns)
|
|
128
|
+
except (RuntimeError, IOError, KeyError) as e:
|
|
129
|
+
raise DatasetLoadError(
|
|
130
|
+
f"Error {type(e).__name__}: {e} loading datablock of type {dataset.datatype} "
|
|
131
|
+
f"at location {dataset.partition_key}/{dataset.key}"
|
|
132
|
+
) from e
|
|
133
|
+
|
|
134
|
+
@classmethod
|
|
135
|
+
def _adapt_to_schema(cls, datablock: DataBlockType, keys: list[str], df: pd.DataFrame):
|
|
136
|
+
for key in keys:
|
|
137
|
+
datatype = find_dataframe_type(getattr(datablock, key).datatype) # type: ignore[var-annotated]
|
|
138
|
+
valid_df = datatype._from_df(df)._df
|
|
139
|
+
for col in valid_df.columns:
|
|
140
|
+
df[col] = valid_df[col]
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Dataset objects definition, used as a result of serialized dataframes"""
|
|
2
|
+
|
|
3
|
+
from importlib import import_module
|
|
4
|
+
from typing import Any, Dict, Generic, Optional, Type, TypeVar
|
|
5
|
+
|
|
6
|
+
from hopeit.dataobjects import dataclass, dataobject, field
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from pydantic import TypeAdapter
|
|
9
|
+
|
|
10
|
+
DataFrameT = TypeVar("DataFrameT")
|
|
11
|
+
GenericDataFrameT = TypeVar("GenericDataFrameT")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DatasetLoadError(Exception):
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DatasetConvertError(Exception):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataobject
|
|
23
|
+
@dataclass
|
|
24
|
+
class Dataset(Generic[DataFrameT]):
|
|
25
|
+
"""Persisted representation of a @dataframe object"""
|
|
26
|
+
|
|
27
|
+
protocol: str
|
|
28
|
+
partition_key: str
|
|
29
|
+
key: str
|
|
30
|
+
datatype: str
|
|
31
|
+
schema: Dict[str, Any] = field(default_factory=dict)
|
|
32
|
+
|
|
33
|
+
@classmethod
|
|
34
|
+
async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
|
|
35
|
+
return await cls.__storage.save(dataframe) # type: ignore[attr-defined]
|
|
36
|
+
|
|
37
|
+
async def load(self) -> DataFrameT:
|
|
38
|
+
try:
|
|
39
|
+
df = await self._load_df()
|
|
40
|
+
return self._convert(df)
|
|
41
|
+
except (RuntimeError, IOError, KeyError) as e:
|
|
42
|
+
raise DatasetLoadError(
|
|
43
|
+
f"Error {type(e).__name__}: {e} loading dataset of type {self.datatype} "
|
|
44
|
+
f"at location {self.partition_key}/{self.key}"
|
|
45
|
+
) from e
|
|
46
|
+
|
|
47
|
+
async def _load_df(self, columns: Optional[list[str]] = None) -> pd.DataFrame:
|
|
48
|
+
return await self.__storage.load_df(self, columns) # type: ignore[attr-defined]
|
|
49
|
+
|
|
50
|
+
def _convert(self, df: pd.DataFrame) -> DataFrameT:
|
|
51
|
+
"""Converts loaded pandas Dataframe to @dataframe annotated object using Dataset metadata"""
|
|
52
|
+
datatype: Type[DataFrameT] = find_dataframe_type(self.datatype)
|
|
53
|
+
return datatype._from_df(df) # type: ignore[attr-defined]
|
|
54
|
+
|
|
55
|
+
def _adapt(self, datatype: DataFrameT) -> "Dataset[DataFrameT]":
|
|
56
|
+
"""Adapts a more generic dataset that contains combined fields to be type specific"""
|
|
57
|
+
return Dataset(
|
|
58
|
+
protocol=self.protocol,
|
|
59
|
+
partition_key=self.partition_key,
|
|
60
|
+
key=self.key,
|
|
61
|
+
datatype=f"{datatype.__module__}.{datatype.__qualname__}", # type: ignore[attr-defined]
|
|
62
|
+
schema=TypeAdapter(datatype).json_schema(),
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
@classmethod
|
|
66
|
+
async def _save_df(
|
|
67
|
+
cls, df: pd.DataFrame, datatype: Type[GenericDataFrameT]
|
|
68
|
+
) -> "Dataset[GenericDataFrameT]":
|
|
69
|
+
return await cls.__storage.save_df(df, datatype) # type: ignore[attr-defined]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def find_protocol_impl(qual_type_name: str) -> Type:
|
|
73
|
+
mod_name, type_name = (
|
|
74
|
+
".".join(qual_type_name.split(".")[:-1]),
|
|
75
|
+
qual_type_name.split(".")[-1],
|
|
76
|
+
)
|
|
77
|
+
module = import_module(mod_name)
|
|
78
|
+
datatype = getattr(module, type_name)
|
|
79
|
+
return datatype
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
|
|
83
|
+
"""Returns dataframe class based on type name used during serialization"""
|
|
84
|
+
mod_name, type_name = (
|
|
85
|
+
".".join(qual_type_name.split(".")[:-1]),
|
|
86
|
+
qual_type_name.split(".")[-1],
|
|
87
|
+
)
|
|
88
|
+
module = import_module(mod_name)
|
|
89
|
+
datatype = getattr(module, type_name)
|
|
90
|
+
assert hasattr(
|
|
91
|
+
datatype, "__dataframe__"
|
|
92
|
+
), f"Type {qual_type_name} must be annotated with `@dataframe`."
|
|
93
|
+
return datatype
|
{hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/serialization/files.py
RENAMED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""Support for `@dataframes` serialization to files"""
|
|
2
2
|
|
|
3
3
|
import io
|
|
4
|
-
from importlib import import_module
|
|
5
4
|
from typing import Generic, Optional, Type, TypeVar
|
|
6
5
|
from uuid import uuid4
|
|
7
6
|
|
|
@@ -18,7 +17,7 @@ except ImportError as e:
|
|
|
18
17
|
|
|
19
18
|
from hopeit.dataframes.dataframe import DataFrameMixin
|
|
20
19
|
from hopeit.dataframes.serialization.dataset import Dataset
|
|
21
|
-
from hopeit.dataobjects import
|
|
20
|
+
from hopeit.dataobjects import DataObject
|
|
22
21
|
from hopeit.fs_storage import FileStorage
|
|
23
22
|
|
|
24
23
|
DataFrameT = TypeVar("DataFrameT", bound=DataFrameMixin)
|
|
@@ -57,25 +56,30 @@ class DatasetFileStorage(Generic[DataFrameT]):
|
|
|
57
56
|
schema=TypeAdapter(datatype).json_schema(),
|
|
58
57
|
)
|
|
59
58
|
|
|
60
|
-
async def
|
|
61
|
-
"""
|
|
62
|
-
|
|
59
|
+
async def save_df(self, df: pd.DataFrame, datatype: Type[DataObject]) -> Dataset:
|
|
60
|
+
"""Saves pandas df object as parquet to file system
|
|
61
|
+
and returns Dataset metadata to be used when retrieval
|
|
62
|
+
is handled externally
|
|
63
|
+
"""
|
|
64
|
+
key = f"{datatype.__qualname__.lower()}_{uuid4()}.parquet"
|
|
65
|
+
data = io.BytesIO(
|
|
66
|
+
df.to_parquet( # pylint: disable=protected-access
|
|
67
|
+
engine="pyarrow"
|
|
68
|
+
)
|
|
69
|
+
)
|
|
70
|
+
location = await self.storage.store_file(file_name=key, value=data)
|
|
71
|
+
partition_key = self.storage.partition_key(location)
|
|
72
|
+
|
|
73
|
+
return Dataset(
|
|
74
|
+
protocol=f"{__name__}.{type(self).__name__}",
|
|
75
|
+
partition_key=partition_key,
|
|
76
|
+
key=key,
|
|
77
|
+
datatype=f"{datatype.__module__}.{datatype.__qualname__}",
|
|
78
|
+
schema=TypeAdapter(datatype).json_schema(),
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
async def load_df(self, dataset: Dataset, columns: Optional[list[str]] = None) -> pd.DataFrame:
|
|
63
82
|
data = await self.storage.get_file(dataset.key, partition_key=dataset.partition_key)
|
|
64
83
|
if data is None:
|
|
65
84
|
raise FileNotFoundError(dataset.key)
|
|
66
|
-
|
|
67
|
-
return datatype._from_df(df) # pylint: disable=protected-access
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
|
|
71
|
-
"""Returns dataframe class based on type name used during serialization"""
|
|
72
|
-
mod_name, type_name = (
|
|
73
|
-
".".join(qual_type_name.split(".")[:-1]),
|
|
74
|
-
qual_type_name.split(".")[-1],
|
|
75
|
-
)
|
|
76
|
-
module = import_module(mod_name)
|
|
77
|
-
datatype = getattr(module, type_name)
|
|
78
|
-
assert hasattr(
|
|
79
|
-
datatype, "__dataframe__"
|
|
80
|
-
), f"Type {qual_type_name} must be annotated with `@dataframe`."
|
|
81
|
-
return datatype
|
|
85
|
+
return pd.read_parquet(io.BytesIO(data), engine="pyarrow", columns=columns)
|
{hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit.dataframes.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.25.3
|
|
3
|
+
Version: 0.25.4
|
|
4
4
|
Summary: Hopeit Engine Dataframes Toolkit
|
|
5
5
|
Author-email: Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
|
|
6
6
|
License: Apache 2
|
|
@@ -24,7 +24,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
24
24
|
Classifier: Framework :: AsyncIO
|
|
25
25
|
Requires-Python: >=3.9
|
|
26
26
|
Description-Content-Type: text/plain
|
|
27
|
-
Requires-Dist: hopeit.engine[fs-storage]==0.25.3
|
|
27
|
+
Requires-Dist: hopeit.engine[fs-storage]==0.25.4
|
|
28
28
|
Requires-Dist: pandas
|
|
29
29
|
Requires-Dist: numpy
|
|
30
30
|
Provides-Extra: pyarrow
|
{hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit.dataframes.egg-info/SOURCES.txt
RENAMED
|
@@ -7,6 +7,7 @@ src/hopeit.dataframes.egg-info/dependency_links.txt
|
|
|
7
7
|
src/hopeit.dataframes.egg-info/requires.txt
|
|
8
8
|
src/hopeit.dataframes.egg-info/top_level.txt
|
|
9
9
|
src/hopeit/dataframes/__init__.py
|
|
10
|
+
src/hopeit/dataframes/datablocks.py
|
|
10
11
|
src/hopeit/dataframes/dataframe.py
|
|
11
12
|
src/hopeit/dataframes/py.typed
|
|
12
13
|
src/hopeit/dataframes/serialization/__init__.py
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
"""Dataset objects definition, used as a result of serialized dataframes"""
|
|
2
|
-
|
|
3
|
-
from importlib import import_module
|
|
4
|
-
from typing import Any, Dict, Generic, Type, TypeVar
|
|
5
|
-
|
|
6
|
-
from hopeit.dataobjects import dataclass, dataobject, field
|
|
7
|
-
|
|
8
|
-
DataFrameT = TypeVar("DataFrameT")
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class DatasetLoadError(Exception):
|
|
12
|
-
pass
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@dataobject
|
|
16
|
-
@dataclass
|
|
17
|
-
class Dataset(Generic[DataFrameT]):
|
|
18
|
-
"""Persisted representation of a @dataframe object"""
|
|
19
|
-
|
|
20
|
-
protocol: str
|
|
21
|
-
partition_key: str
|
|
22
|
-
key: str
|
|
23
|
-
datatype: str
|
|
24
|
-
schema: Dict[str, Any] = field(default_factory=dict)
|
|
25
|
-
|
|
26
|
-
async def load(self) -> DataFrameT:
|
|
27
|
-
try:
|
|
28
|
-
dataframe = await self.__storage.load(self) # type: ignore[attr-defined]
|
|
29
|
-
return dataframe
|
|
30
|
-
except (RuntimeError, IOError, KeyError) as e:
|
|
31
|
-
raise DatasetLoadError(
|
|
32
|
-
f"Error {type(e).__name__}: {e} loading dataset of type {self.datatype} "
|
|
33
|
-
f"at location {self.partition_key}/{self.key}"
|
|
34
|
-
) from e
|
|
35
|
-
|
|
36
|
-
@classmethod
|
|
37
|
-
async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
|
|
38
|
-
return await cls.__storage.save(dataframe) # type: ignore[attr-defined]
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def find_protocol_impl(qual_type_name: str) -> Type:
|
|
42
|
-
mod_name, type_name = (
|
|
43
|
-
".".join(qual_type_name.split(".")[:-1]),
|
|
44
|
-
qual_type_name.split(".")[-1],
|
|
45
|
-
)
|
|
46
|
-
module = import_module(mod_name)
|
|
47
|
-
datatype = getattr(module, type_name)
|
|
48
|
-
return datatype
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/serialization/py.typed
RENAMED
|
File without changes
|
|
File without changes
|
{hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/setup/__init__.py
RENAMED
|
File without changes
|
{hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit/dataframes/setup/dataframes.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hopeit_dataframes-0.25.3 → hopeit_dataframes-0.25.4}/src/hopeit.dataframes.egg-info/top_level.txt
RENAMED
|
File without changes
|