hopeit.dataframes 0.25.4__tar.gz → 0.26.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/PKG-INFO +13 -13
  2. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/pyproject.toml +18 -12
  3. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit/dataframes/__init__.py +5 -13
  4. hopeit_dataframes-0.26.0/src/hopeit/dataframes/datablocks.py +294 -0
  5. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit/dataframes/dataframe.py +5 -4
  6. hopeit_dataframes-0.26.0/src/hopeit/dataframes/serialization/dataset.py +118 -0
  7. hopeit_dataframes-0.26.0/src/hopeit/dataframes/serialization/files.py +213 -0
  8. hopeit_dataframes-0.26.0/src/hopeit/dataframes/serialization/protocol.py +28 -0
  9. hopeit_dataframes-0.26.0/src/hopeit/dataframes/serialization/settings.py +34 -0
  10. hopeit_dataframes-0.26.0/src/hopeit/dataframes/setup/dataframes.py +30 -0
  11. hopeit_dataframes-0.26.0/src/hopeit/dataframes/setup/register_database.py +26 -0
  12. hopeit_dataframes-0.26.0/src/hopeit/dataframes/setup/registry.py +70 -0
  13. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit.dataframes.egg-info/PKG-INFO +13 -13
  14. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit.dataframes.egg-info/SOURCES.txt +4 -2
  15. hopeit_dataframes-0.26.0/src/hopeit.dataframes.egg-info/requires.txt +5 -0
  16. hopeit_dataframes-0.25.4/setup.py +0 -25
  17. hopeit_dataframes-0.25.4/src/hopeit/dataframes/datablocks.py +0 -140
  18. hopeit_dataframes-0.25.4/src/hopeit/dataframes/serialization/dataset.py +0 -93
  19. hopeit_dataframes-0.25.4/src/hopeit/dataframes/serialization/files.py +0 -85
  20. hopeit_dataframes-0.25.4/src/hopeit/dataframes/serialization/settings.py +0 -13
  21. hopeit_dataframes-0.25.4/src/hopeit/dataframes/setup/dataframes.py +0 -36
  22. hopeit_dataframes-0.25.4/src/hopeit.dataframes.egg-info/requires.txt +0 -6
  23. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/README.md +0 -0
  24. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/setup.cfg +0 -0
  25. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit/dataframes/py.typed +0 -0
  26. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit/dataframes/serialization/__init__.py +0 -0
  27. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit/dataframes/serialization/py.typed +0 -0
  28. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit/dataframes/setup/__init__.py +0 -0
  29. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit/dataframes/setup/py.typed +0 -0
  30. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
  31. {hopeit_dataframes-0.25.4 → hopeit_dataframes-0.26.0}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
@@ -1,8 +1,8 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: hopeit.dataframes
3
- Version: 0.25.4
4
- Summary: Hopeit Engine Dataframes Toolkit
5
- Author-email: Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
3
+ Version: 0.26.0
4
+ Summary: Hopeit Engine Dataframes for Pandas
5
+ Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
6
6
  License: Apache 2
7
7
  Project-URL: Homepage, https://github.com/hopeit-git/hopeit.engine
8
8
  Project-URL: CI: GitHub Actions, https://github.com/hopeit-git/hopeit.engine/actions?query=workflow
@@ -12,23 +12,23 @@ Project-URL: GitHub: repo, https://github.com/hopeit-git/hopeit.engine
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Intended Audience :: Developers
14
14
  Classifier: Programming Language :: Python
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
- Classifier: Development Status :: 4 - Beta
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Development Status :: 5 - Production/Stable
19
20
  Classifier: Operating System :: POSIX :: Linux
20
21
  Classifier: Operating System :: MacOS :: MacOS X
21
22
  Classifier: Operating System :: Microsoft :: Windows
22
23
  Classifier: Topic :: Internet :: WWW/HTTP
23
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
25
  Classifier: Framework :: AsyncIO
25
- Requires-Python: >=3.9
26
- Description-Content-Type: text/plain
27
- Requires-Dist: hopeit.engine[fs-storage]==0.25.4
28
- Requires-Dist: pandas
29
- Requires-Dist: numpy
30
- Provides-Extra: pyarrow
31
- Requires-Dist: pyarrow; extra == "pyarrow"
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: hopeit.engine>=0.26.0
28
+ Requires-Dist: hopeit.fs-storage>=0.26.0
29
+ Requires-Dist: pandas>=2.2.3
30
+ Requires-Dist: pyarrow>=19.0.1
31
+ Requires-Dist: numpy>=1.26.4
32
32
 
33
33
  # hopeit.engine dataframes plugin
34
34
 
@@ -4,10 +4,22 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "hopeit.dataframes"
7
- description = "Hopeit Engine Dataframes Toolkit"
8
- dynamic = ["version", "readme", "dependencies", "optional-dependencies"]
7
+ version = "0.26.0"
8
+
9
+ description = "Hopeit Engine Dataframes for Pandas"
10
+ dynamic = ["readme"]
11
+
12
+ dependencies = [
13
+ "hopeit.engine>=0.26.0",
14
+ "hopeit.fs-storage>=0.26.0",
15
+ "pandas>=2.2.3",
16
+ "pyarrow>=19.0.1",
17
+ "numpy>=1.26.4"
18
+ ]
19
+
9
20
  license = { text = "Apache 2" }
10
21
  authors = [
22
+ { name = "Leo Smerling & Pablo Canto", email = "contact@hopeit.com.ar" },
11
23
  { name = "Leo Smerling", email = "contact@hopeit.com.ar" },
12
24
  { name = "Pablo Canto", email = "contact@hopeit.com.ar" },
13
25
  ]
@@ -15,10 +27,11 @@ classifiers = [
15
27
  "License :: OSI Approved :: Apache Software License",
16
28
  "Intended Audience :: Developers",
17
29
  "Programming Language :: Python",
18
- "Programming Language :: Python :: 3.9",
19
30
  "Programming Language :: Python :: 3.10",
20
31
  "Programming Language :: Python :: 3.11",
21
- "Development Status :: 4 - Beta",
32
+ "Programming Language :: Python :: 3.12",
33
+ "Programming Language :: Python :: 3.13",
34
+ "Development Status :: 5 - Production/Stable",
22
35
  "Operating System :: POSIX :: Linux",
23
36
  "Operating System :: MacOS :: MacOS X",
24
37
  "Operating System :: Microsoft :: Windows",
@@ -26,8 +39,6 @@ classifiers = [
26
39
  "Topic :: Software Development :: Libraries :: Python Modules",
27
40
  "Framework :: AsyncIO",
28
41
  ]
29
- requires-python = ">=3.9"
30
-
31
42
 
32
43
  [project.urls]
33
44
  "Homepage" = "https://github.com/hopeit-git/hopeit.engine"
@@ -42,10 +53,5 @@ include-package-data = true
42
53
  [tool.setuptools.packages.find]
43
54
  where = ["src"]
44
55
 
45
- [tool.setuptools.package-data]
46
- "hopeit.dataframes" = ["py.typed"]
47
- "hopeit.dataframes.serialization" = ["py.typed"]
48
- "hopeit.dataframes.setup" = ["py.typed"]
49
-
50
56
  [tool.setuptools.dynamic]
51
- readme = { file = ["README.md"], content-type = "text/plain" }
57
+ readme = { file = ["README.md"], content-type = "text/markdown" }
@@ -36,13 +36,11 @@ class MyWebResponse:
36
36
  dataset_name: str
37
37
  example_data: List[MyData.DataObject]
38
38
 
39
- # This step is not needed if SETUP event is configured in app
40
- DataFrames.setup(DatasetSerialization(
41
- protocol="hopeit.dataframes.serialization.files.DatasetFileStorage",
42
- location="/tmp/data",
43
- partition_dateformat="%Y/%m/%d/%H/",
44
- ))
39
+ # Initialization: this step is not needed if SETUP event is configured in app
40
+ settings = DataframesSettings(...) # settings example in `plugin-config.json`
41
+ await registry.init_registry(settings)
45
42
 
43
+ # Usage
46
44
  df = pd.DataFrame([ # Create or load a pandas DataFrame
47
45
  {"field1": 1, "field2": "text1"},
48
46
  {"field1": 2, "field2": "text2"},
@@ -77,10 +75,8 @@ from typing import Dict, Generic, Iterator, List, Type
77
75
  import numpy as np
78
76
  import pandas as pd
79
77
  from hopeit.dataframes.dataframe import DataFrameT, dataframe
80
- from hopeit.dataframes.datablocks import DataBlocks
81
78
  from hopeit.dataframes.serialization.dataset import Dataset
82
- from hopeit.dataframes.serialization.settings import DatasetSerialization
83
- from hopeit.dataframes.setup.dataframes import register_serialization
79
+ from hopeit.dataframes.datablocks import DataBlocks
84
80
  from hopeit.dataobjects import DataObject
85
81
 
86
82
  __all__ = ["DataBlocks", "DataFrames", "Dataset", "dataframe"]
@@ -91,10 +87,6 @@ class DataFrames(Generic[DataFrameT, DataObject]):
91
87
  Dataframes manipulation utilities methods
92
88
  """
93
89
 
94
- @staticmethod
95
- def setup(settings: DatasetSerialization):
96
- register_serialization(settings)
97
-
98
90
  @staticmethod
99
91
  def from_df(
100
92
  datatype: Type[DataFrameT], df: pd.DataFrame, **series: Dict[str, pd.Series]
@@ -0,0 +1,294 @@
1
+ """
2
+ DataBlocks is a utility that allows users of the dataframes plugin to create dataobjects
3
+ that contain combined properties with one or multiple Datasets but can be manipulated
4
+ and saved as a single flat pandas DataFrame.
5
+ """
6
+
7
+ from datetime import datetime
8
+ from typing import AsyncGenerator, Generic, Optional, Type, TypeVar, get_args, get_origin
9
+
10
+ import pandas as pd
11
+ from hopeit.dataobjects import dataobject, dataclass, fields
12
+
13
+ from hopeit.dataframes.serialization.dataset import Dataset, DatasetLoadError
14
+ from hopeit.dataframes.serialization.protocol import find_dataframe_type
15
+ from hopeit.dataframes.setup.registry import get_dataset_storage
16
+
17
+ DataBlockType = TypeVar("DataBlockType")
18
+ DataBlockItemType = TypeVar("DataBlockItemType")
19
+ DataFrameType = TypeVar("DataFrameType")
20
+
21
+
22
+ @dataobject
23
+ @dataclass
24
+ class DataBlockMetadata:
25
+ partition_dt: Optional[datetime] = None
26
+ database_key: Optional[str] = None
27
+ group_key: Optional[str] = None
28
+ collection: Optional[str] = None
29
+
30
+ @classmethod
31
+ def default(cls) -> "DataBlockMetadata":
32
+ return cls()
33
+
34
+
35
+ @dataobject
36
+ @dataclass
37
+ class DataBlockQuery:
38
+ from_partition_dt: datetime
39
+ to_partition_dt: datetime
40
+ select: list[str] | None = None
41
+
42
+
43
+ class TempDataBlock(Generic[DataBlockType, DataBlockItemType]):
44
+ """
45
+ TempDataBlock allows converting a pandas DataFrame to and from dataobjects
46
+ using DataBlockType and DataBlockItemType schemas. So from a flat pandas
47
+ dataframe, an object containing subsections of the data can be created.
48
+ """
49
+
50
+ def __init__(self, datatype: Type[DataBlockType], df: pd.DataFrame) -> None:
51
+ self.datatype = datatype
52
+ self.df = df
53
+
54
+ @classmethod
55
+ def from_dataobjects(
56
+ cls, datatype: Type[DataBlockType], items: list[DataBlockItemType]
57
+ ) -> "TempDataBlock[DataBlockType, DataBlockItemType]":
58
+ result_df: Optional[pd.DataFrame] = None
59
+ for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
60
+ if get_origin(field_info.annotation) is Dataset:
61
+ block_items = (getattr(item, field_name) for item in items)
62
+ block_type = get_args(field_info.annotation)[0]
63
+ block = block_type._from_dataobjects(block_items)
64
+ block_df = block._df
65
+ else:
66
+ block_df = pd.DataFrame({field_name: [getattr(item, field_name) for item in items]})
67
+
68
+ if result_df is None:
69
+ result_df = block_df
70
+ else:
71
+ # Skips duplicated column names so they are included only once
72
+ result_df = result_df.join(
73
+ block_df[[col for col in block_df.columns if col not in result_df.columns]]
74
+ )
75
+ assert result_df is not None
76
+ return cls(datatype, result_df)
77
+
78
+ def to_dataobjects(
79
+ self, item_type: Type[DataBlockItemType], *, normalize_null_values: bool = False
80
+ ) -> list[DataBlockItemType]:
81
+ keys: list[str] = []
82
+ entries: list[list] = []
83
+ for field_name, field_info in fields(self.datatype).items(): # type: ignore[type-var]
84
+ if get_origin(field_info.annotation) is Dataset:
85
+ block_type = get_args(field_info.annotation)[0]
86
+ keys.append(field_name)
87
+ dataframe = block_type._from_df(self.df)
88
+ entries.append(
89
+ dataframe._to_dataobjects(normalize_null_values=normalize_null_values)
90
+ )
91
+ else:
92
+ keys.append(field_name)
93
+ entries.append(self.df[field_name].to_list())
94
+
95
+ return [
96
+ item_type(**{field_name: entry[i] for i, field_name in enumerate(keys)})
97
+ for entry in zip(*entries)
98
+ ]
99
+
100
+
101
+ class DataBlocks(Generic[DataBlockType, DataFrameType]):
102
+ """
103
+ DataBlocks is a utility class that allows users to create dataobjects containing multiple Datasets.
104
+ These dataobjects can be converted and saved as a single pandas DataFrame.
105
+ """
106
+
107
+ @classmethod
108
+ async def load(
109
+ cls,
110
+ datablock: DataBlockType,
111
+ *,
112
+ select: Optional[list[str]] = None,
113
+ database_key: Optional[str] = None,
114
+ ) -> pd.DataFrame:
115
+ """
116
+ Converts a DataBlockType object to a pandas DataFrame, by reading the underlying Dataset(s) and
117
+ putting all the fields defined in the DataBlockType in a flat pandas DataFrame.
118
+
119
+ Args:
120
+ datablock (DataBlockType): The data block to convert.
121
+ select (Optional[list[str]]): Optional list of field names to select.
122
+ database_key (Optional[str]): Optional database key for loading data.
123
+
124
+ Returns:
125
+ pd.DataFrame: The resulting pandas DataFrame.
126
+ """
127
+ keys = [
128
+ field_name
129
+ for field_name, field_info in fields(datablock).items() # type: ignore[arg-type]
130
+ if get_origin(field_info.annotation) is Dataset
131
+ and (select is None or field_name in select)
132
+ ]
133
+
134
+ # Filter/validate selected field names using saved schema,
135
+ # generates a single field for every common/duplicated field in the datasets
136
+ field_names = list(
137
+ dict.fromkeys(
138
+ [
139
+ field_name
140
+ for key in keys
141
+ for field_name in getattr(datablock, key).schema["properties"].keys()
142
+ ]
143
+ )
144
+ )
145
+
146
+ # Load data from first dataset (datablock uses a single file for all datasets)
147
+ dataset: Dataset = getattr(datablock, keys[0])
148
+ storage = await get_dataset_storage(database_key)
149
+ result_df = await DataBlocks._load_datablock_df(storage, dataset, field_names, database_key)
150
+
151
+ # Add missing optional fields using class schema (allows schema evolution)
152
+ cls._adapt_to_schema(datablock, keys, result_df)
153
+
154
+ # Adding constant value fields
155
+ for field_name, field_info in fields(datablock).items(): # type: ignore[arg-type]
156
+ if get_origin(field_info.annotation) is not Dataset:
157
+ result_df[field_name] = getattr(datablock, field_name) # type: ignore[index]
158
+
159
+ return result_df
160
+
161
+ @staticmethod
162
+ async def save(
163
+ datatype: Type[DataBlockType],
164
+ df: pd.DataFrame,
165
+ metadata: DataBlockMetadata | None = None,
166
+ **kwargs, # Non-Dataset field values for DataBlockType
167
+ ) -> DataBlockType:
168
+ """
169
+ Creates a DataBlockType object from a pandas DataFrame, by saving the pandas Dataframe to a single
170
+ location, usually a file, and returning a dataobject with Datasets that reference the saved data.
171
+ The returned DataBlock can be retrieved in one shot using `DataBlocks.load` to get back a flat pandas
172
+ DataFrame, or each of the individual Datasets can be loaded independently.
173
+
174
+ Args:
175
+ datatype (Type[DataBlockType]): The type of the data block.
176
+ df (pd.DataFrame): The pandas DataFrame to convert.
177
+ metadata (Optional[DataBlockMetadata]): Optional metadata for the data block.
178
+ **kwargs: Additional non-Dataset field values for the DataBlockType.
179
+
180
+ Returns:
181
+ DataBlockType: The resulting data block.
182
+ """
183
+ if metadata is None:
184
+ metadata = DataBlockMetadata.default()
185
+
186
+ storage = await get_dataset_storage(metadata.database_key)
187
+
188
+ block_dataset = await Dataset._save_df(
189
+ storage,
190
+ df,
191
+ datatype,
192
+ database_key=metadata.database_key,
193
+ partition_dt=metadata.partition_dt,
194
+ group_key=metadata.group_key,
195
+ collection=metadata.collection,
196
+ save_schema=True, # Required for datablocks
197
+ )
198
+
199
+ blocks = {}
200
+ for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
201
+ if get_origin(field_info.annotation) is Dataset:
202
+ block_type = get_args(field_info.annotation)[0]
203
+ blocks[field_name] = block_dataset._adapt(block_type)
204
+ else:
205
+ blocks[field_name] = kwargs[field_name]
206
+
207
+ return datatype(**blocks)
208
+
209
+ @staticmethod
210
+ def default(datatype: Type[DataBlockType]) -> DataBlockType:
211
+ return datatype(**{field_name: [] for field_name in list(fields(datatype))}) # type: ignore[type-var]
212
+
213
+ @classmethod
214
+ async def load_batch(
215
+ cls,
216
+ datatype: Type[DataBlockType],
217
+ query: DataBlockQuery,
218
+ metadata: DataBlockMetadata | None = None,
219
+ **kwargs, # Non-Dataset field values for DataBlockType
220
+ ) -> AsyncGenerator[pd.DataFrame, None]:
221
+ if metadata is None:
222
+ metadata = DataBlockMetadata.default()
223
+
224
+ storage = await get_dataset_storage(metadata.database_key)
225
+
226
+ async for block_dataset in storage._get_batch( # type: ignore[attr-defined]
227
+ datatype,
228
+ database_key=metadata.database_key,
229
+ from_partition_dt=query.from_partition_dt,
230
+ to_partition_dt=query.to_partition_dt,
231
+ group_key=metadata.group_key,
232
+ collection=metadata.collection,
233
+ ):
234
+ dataset_types = [
235
+ (field_name, get_args(field_info.annotation)[0])
236
+ for field_name, field_info in fields(datatype).items() # type: ignore[type-var]
237
+ if get_origin(field_info.annotation) is Dataset
238
+ and (query.select is None or field_name in query.select)
239
+ ]
240
+ field_names = list(
241
+ dict.fromkeys(
242
+ [
243
+ field_name
244
+ for _, dataset_type in dataset_types
245
+ for field_name, _ in fields(dataset_type).items()
246
+ ]
247
+ )
248
+ )
249
+ result_df = await DataBlocks._load_datablock_df(
250
+ storage, block_dataset, field_names, metadata.database_key
251
+ )
252
+
253
+ # Adding constant value fields
254
+ for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
255
+ if get_origin(field_info.annotation) is not Dataset:
256
+ result_df[field_name] = kwargs.get(field_name)
257
+
258
+ yield result_df
259
+
260
+ @staticmethod
261
+ def _get_datablock_keys(
262
+ datablocktype: Type[DataBlockType],
263
+ *,
264
+ select: Optional[list[str]] = None,
265
+ ) -> list[str]:
266
+ return [
267
+ field_name
268
+ for field_name, field_info in fields(datablocktype).items() # type: ignore[type-var]
269
+ if get_origin(field_info.annotation) is Dataset
270
+ and (select is None or field_name in select)
271
+ ]
272
+
273
+ @staticmethod
274
+ async def _load_datablock_df(
275
+ storage: object,
276
+ dataset: Dataset,
277
+ columns: Optional[list[str]] = None,
278
+ database_key: Optional[str] = None,
279
+ ) -> pd.DataFrame:
280
+ try:
281
+ return await dataset._load_df(storage, columns)
282
+ except (RuntimeError, IOError, KeyError) as e:
283
+ raise DatasetLoadError(
284
+ f"Error {type(e).__name__}: {e} loading datablock of type {dataset.datatype} "
285
+ f"at location {dataset.partition_key}/{dataset.key}"
286
+ ) from e
287
+
288
+ @classmethod
289
+ def _adapt_to_schema(cls, datablock: DataBlockType, keys: list[str], df: pd.DataFrame) -> None:
290
+ for key in keys:
291
+ datatype = find_dataframe_type(getattr(datablock, key).datatype) # type: ignore[var-annotated]
292
+ valid_df = datatype._from_df(df)._df
293
+ for col in valid_df.columns:
294
+ df[col] = valid_df[col]
@@ -124,7 +124,7 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
124
124
  raise NotImplementedError # must use @dataframe decorator # pragma: no cover
125
125
 
126
126
  @staticmethod
127
- def __init_from_series__(self, **series: pd.Series): # pylint: disable=bad-staticmethod-argument
127
+ def __init_from_series__(self, **series: pd.Series) -> None: # pylint: disable=bad-staticmethod-argument
128
128
  df = pd.DataFrame(series)
129
129
  df.index.name = None # Removes index name to avoid collisions with series name
130
130
  if self.__data_object__["validate"]:
@@ -240,9 +240,10 @@ def dataframe(
240
240
  return amended_class
241
241
  return cls
242
242
 
243
- def add_dataframe_metadata(cls):
243
+ def add_dataframe_metadata(cls) -> None:
244
244
  serialized_fields = {k: (v.annotation, v) for k, v in fields(cls).items()}
245
- dataobject_type = create_model(cls.__name__ + "DataObject", **serialized_fields)
245
+ dataobject_name = str(cls.__name__) + "DataObject"
246
+ dataobject_type = create_model(dataobject_name, **serialized_fields) # type: ignore[call-overload]
246
247
  dataobject_type = dataobject(dataobject_type, unsafe=True)
247
248
 
248
249
  setattr(cls, "DataObject", dataobject_type)
@@ -255,7 +256,7 @@ def dataframe(
255
256
  ),
256
257
  )
257
258
 
258
- def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool):
259
+ def add_dataobject_annotations(cls, unsafe: bool, validate: bool, schema: bool) -> None:
259
260
  setattr(
260
261
  cls,
261
262
  "__data_object__",
@@ -0,0 +1,118 @@
1
+ """Dataset objects definition, used as a result of serialized dataframes"""
2
+
3
+ from datetime import datetime
4
+ from typing import Any, Dict, Generic, Optional, Type, TypeVar
5
+
6
+ from hopeit.dataobjects import dataclass, dataobject
7
+ import pandas as pd
8
+ from pydantic import TypeAdapter
9
+
10
+ from hopeit.dataframes.setup.registry import get_dataset_storage
11
+ from hopeit.dataframes.serialization.protocol import find_dataframe_type
12
+
13
+ DataFrameT = TypeVar("DataFrameT")
14
+ GenericDataFrameT = TypeVar("GenericDataFrameT")
15
+
16
+
17
+ class DatasetLoadError(Exception):
18
+ pass
19
+
20
+
21
+ class DatasetConvertError(Exception):
22
+ pass
23
+
24
+
25
+ @dataobject
26
+ @dataclass
27
+ class Dataset(Generic[DataFrameT]):
28
+ """Persisted representation of a @dataframe object"""
29
+
30
+ protocol: str
31
+ partition_key: str
32
+ key: str
33
+ datatype: str
34
+ partition_dt: Optional[datetime] = None
35
+ database_key: Optional[str] = None
36
+ group_key: Optional[str] = None
37
+ collection: Optional[str] = None
38
+ schema: Optional[Dict[str, Any]] = None
39
+
40
+ @classmethod
41
+ async def save(
42
+ cls,
43
+ dataframe: DataFrameT,
44
+ *,
45
+ partition_dt: Optional[datetime] = None,
46
+ database_key: Optional[str] = None,
47
+ group_key: Optional[str] = None,
48
+ collection: Optional[str] = None,
49
+ save_schema: bool = False,
50
+ ) -> "Dataset[DataFrameT]":
51
+ storage = await get_dataset_storage(database_key)
52
+ return await storage.save( # type: ignore[attr-defined]
53
+ dataframe,
54
+ partition_dt=partition_dt,
55
+ database_key=database_key,
56
+ group_key=group_key,
57
+ collection=collection,
58
+ save_schema=save_schema,
59
+ )
60
+
61
+ @classmethod
62
+ async def load(
63
+ cls, dataset: "Dataset[DataFrameT]", database_key: Optional[str] = None
64
+ ) -> DataFrameT:
65
+ try:
66
+ storage = await get_dataset_storage(database_key)
67
+ df = await dataset._load_df(storage)
68
+ return dataset._convert(df)
69
+ except (RuntimeError, IOError, KeyError) as e:
70
+ raise DatasetLoadError(
71
+ f"Error {type(e).__name__}: {e} loading dataset of type {dataset.datatype} "
72
+ f"at location {dataset.partition_key}/{dataset.key}"
73
+ ) from e
74
+
75
+ async def _load_df(self, storage: object, columns: Optional[list[str]] = None) -> pd.DataFrame:
76
+ return await storage.load_df(self, columns) # type: ignore[attr-defined]
77
+
78
+ def _convert(self, df: pd.DataFrame) -> DataFrameT:
79
+ """Converts loaded pandas Dataframe to @dataframe annotated object using Dataset metadata"""
80
+ datatype: Type[DataFrameT] = find_dataframe_type(self.datatype)
81
+ return datatype._from_df(df) # type: ignore[attr-defined]
82
+
83
+ def _adapt(self, datatype: DataFrameT) -> "Dataset[DataFrameT]":
84
+ """Adapts a more generic dataset that contains combined fields to be type specific"""
85
+ return Dataset(
86
+ protocol=self.protocol,
87
+ partition_key=self.partition_key,
88
+ key=self.key,
89
+ datatype=f"{datatype.__module__}.{datatype.__qualname__}", # type: ignore[attr-defined]
90
+ partition_dt=self.partition_dt,
91
+ database_key=self.database_key,
92
+ group_key=self.group_key,
93
+ collection=self.collection,
94
+ schema=TypeAdapter(datatype).json_schema() if self.schema else None,
95
+ )
96
+
97
+ @classmethod
98
+ async def _save_df(
99
+ cls,
100
+ storage: object,
101
+ df: pd.DataFrame,
102
+ datatype: Type[GenericDataFrameT],
103
+ *,
104
+ partition_dt: Optional[datetime],
105
+ database_key: Optional[str],
106
+ group_key: Optional[str],
107
+ collection: Optional[str],
108
+ save_schema: bool,
109
+ ) -> "Dataset[GenericDataFrameT]":
110
+ return await storage.save_df( # type: ignore[attr-defined]
111
+ df,
112
+ datatype,
113
+ partition_dt=partition_dt,
114
+ database_key=database_key,
115
+ group_key=group_key,
116
+ collection=collection,
117
+ save_schema=save_schema,
118
+ )