hopeit.dataframes 0.26.3__tar.gz → 0.26.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/PKG-INFO +3 -3
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/pyproject.toml +3 -3
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/__init__.py +9 -2
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/datablocks.py +57 -47
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/dataframe.py +7 -2
- hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas/numpy_mock.py +3 -0
- hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas/pandas_mock.py +13 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/dataset.py +6 -1
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/files.py +3 -4
- hopeit_dataframes-0.26.5/src/hopeit/dataframes/setup/__init__.py +0 -0
- hopeit_dataframes-0.26.5/src/hopeit/dataframes/setup/py.typed +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/PKG-INFO +3 -3
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/SOURCES.txt +4 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/requires.txt +2 -2
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/README.md +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/setup.cfg +0 -0
- {hopeit_dataframes-0.26.3/src/hopeit/dataframes/serialization → hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas}/__init__.py +0 -0
- {hopeit_dataframes-0.26.3/src/hopeit/dataframes → hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas}/py.typed +0 -0
- {hopeit_dataframes-0.26.3/src/hopeit/dataframes/serialization → hopeit_dataframes-0.26.5/src/hopeit/dataframes}/py.typed +0 -0
- {hopeit_dataframes-0.26.3/src/hopeit/dataframes/setup → hopeit_dataframes-0.26.5/src/hopeit/dataframes/serialization}/__init__.py +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/protocol.py +0 -0
- {hopeit_dataframes-0.26.3/src/hopeit/dataframes/setup → hopeit_dataframes-0.26.5/src/hopeit/dataframes/serialization}/py.typed +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/settings.py +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/setup/dataframes.py +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/setup/register_database.py +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/setup/registry.py +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
- {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.26.3
|
|
3
|
+
Version: 0.26.5
|
|
4
4
|
Summary: Hopeit Engine Dataframes for Pandas
|
|
5
5
|
Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
|
|
6
6
|
License: Apache 2
|
|
@@ -24,8 +24,8 @@ Classifier: Topic :: Internet :: WWW/HTTP
|
|
|
24
24
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
25
|
Classifier: Framework :: AsyncIO
|
|
26
26
|
Description-Content-Type: text/markdown
|
|
27
|
-
Requires-Dist: hopeit.engine>=0.26.
|
|
28
|
-
Requires-Dist: hopeit.fs-storage>=0.26.
|
|
27
|
+
Requires-Dist: hopeit.engine>=0.26.5
|
|
28
|
+
Requires-Dist: hopeit.fs-storage>=0.26.5
|
|
29
29
|
Provides-Extra: pandas
|
|
30
30
|
Requires-Dist: pandas>=2.2.3; extra == "pandas"
|
|
31
31
|
Requires-Dist: pyarrow>=19.0.1; extra == "pandas"
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hopeit.dataframes"
|
|
7
|
-
version = "0.26.3"
|
|
7
|
+
version = "0.26.5"
|
|
8
8
|
|
|
9
9
|
description = "Hopeit Engine Dataframes for Pandas"
|
|
10
10
|
dynamic = ["readme"]
|
|
@@ -33,8 +33,8 @@ classifiers = [
|
|
|
33
33
|
]
|
|
34
34
|
|
|
35
35
|
dependencies = [
|
|
36
|
-
"hopeit.engine>=0.26.
|
|
37
|
-
"hopeit.fs-storage>=0.26.
|
|
36
|
+
"hopeit.engine>=0.26.5",
|
|
37
|
+
"hopeit.fs-storage>=0.26.5"
|
|
38
38
|
]
|
|
39
39
|
|
|
40
40
|
[project.optional-dependencies]
|
|
@@ -72,8 +72,15 @@ print(Payload.to_json(my_json_response))
|
|
|
72
72
|
|
|
73
73
|
from typing import Dict, Generic, Iterator, List, Type
|
|
74
74
|
|
|
75
|
-
|
|
76
|
-
import
|
|
75
|
+
try:
|
|
76
|
+
import numpy as np
|
|
77
|
+
import pandas as pd
|
|
78
|
+
except ImportError:
|
|
79
|
+
# Supports using `@dataframe` annotation for dataobjects definitions
|
|
80
|
+
# without installing pandas and numpy. Useful for API-only projects.
|
|
81
|
+
import hopeit.dataframes.pandas.numpy_mock as np # type: ignore[no-redef]
|
|
82
|
+
import hopeit.dataframes.pandas.pandas_mock as pd # type: ignore[no-redef]
|
|
83
|
+
|
|
77
84
|
from hopeit.dataframes.dataframe import DataFrameT, dataframe
|
|
78
85
|
from hopeit.dataframes.serialization.dataset import Dataset
|
|
79
86
|
from hopeit.dataframes.datablocks import DataBlocks
|
|
@@ -7,11 +7,14 @@ and saved as a single flat pandas DataFrame.
|
|
|
7
7
|
from datetime import datetime
|
|
8
8
|
from typing import AsyncGenerator, Generic, Optional, Type, TypeVar, get_args, get_origin
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
try:
|
|
11
|
+
import pandas as pd
|
|
12
|
+
except ImportError:
|
|
13
|
+
import hopeit.dataframes.pandas.pandas_mock as pd # type: ignore[no-redef]
|
|
14
|
+
|
|
11
15
|
from hopeit.dataobjects import dataobject, dataclass, fields
|
|
12
16
|
|
|
13
17
|
from hopeit.dataframes.serialization.dataset import Dataset, DatasetLoadError
|
|
14
|
-
from hopeit.dataframes.serialization.protocol import find_dataframe_type
|
|
15
18
|
from hopeit.dataframes.setup.registry import get_dataset_storage
|
|
16
19
|
|
|
17
20
|
DataBlockType = TypeVar("DataBlockType")
|
|
@@ -110,6 +113,7 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
|
110
113
|
datablock: DataBlockType,
|
|
111
114
|
*,
|
|
112
115
|
select: Optional[list[str]] = None,
|
|
116
|
+
schema_validation: bool = True,
|
|
113
117
|
database_key: Optional[str] = None,
|
|
114
118
|
) -> pd.DataFrame:
|
|
115
119
|
"""
|
|
@@ -124,37 +128,25 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
|
124
128
|
Returns:
|
|
125
129
|
pd.DataFrame: The resulting pandas DataFrame.
|
|
126
130
|
"""
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
for field_name, field_info in fields(datablock).items() # type: ignore[arg-type]
|
|
130
|
-
if get_origin(field_info.annotation) is Dataset
|
|
131
|
-
and (select is None or field_name in select)
|
|
132
|
-
]
|
|
133
|
-
|
|
134
|
-
# Filter/validate selected field names using saved schema,
|
|
135
|
-
# generates a single field for every common/duplicated field in the datasets
|
|
136
|
-
field_names = list(
|
|
137
|
-
dict.fromkeys(
|
|
138
|
-
[
|
|
139
|
-
field_name
|
|
140
|
-
for key in keys
|
|
141
|
-
for field_name in getattr(datablock, key).schema["properties"].keys()
|
|
142
|
-
]
|
|
143
|
-
)
|
|
144
|
-
)
|
|
131
|
+
dataset_types = cls._get_dataset_types(type(datablock), select=select)
|
|
132
|
+
field_names = cls._get_field_names(dataset_types)
|
|
145
133
|
|
|
146
134
|
# Load data from first dataset (datablock uses a single file for all datasets)
|
|
147
|
-
dataset: Dataset = getattr(datablock,
|
|
135
|
+
dataset: Dataset = getattr(datablock, dataset_types[0][0])
|
|
148
136
|
storage = await get_dataset_storage(database_key)
|
|
149
|
-
result_df = await DataBlocks._load_datablock_df(
|
|
137
|
+
result_df = await DataBlocks._load_datablock_df(
|
|
138
|
+
storage, dataset, columns=None, database_key=database_key
|
|
139
|
+
)
|
|
150
140
|
|
|
151
141
|
# Enfore datatypes and add missing optional fields using class schema (allows schema evolution)
|
|
152
|
-
|
|
142
|
+
if schema_validation:
|
|
143
|
+
cls._adapt_to_schema(dataset_types, result_df)
|
|
144
|
+
result_df = result_df[field_names]
|
|
153
145
|
|
|
154
|
-
# Adding constant value fields
|
|
146
|
+
# Adding constant value fields from serialized datablock
|
|
155
147
|
for field_name, field_info in fields(datablock).items(): # type: ignore[arg-type]
|
|
156
148
|
if get_origin(field_info.annotation) is not Dataset:
|
|
157
|
-
result_df[field_name] = getattr(datablock, field_name) # type: ignore[index]
|
|
149
|
+
result_df.loc[:, field_name] = getattr(datablock, field_name) # type: ignore[index]
|
|
158
150
|
|
|
159
151
|
return result_df
|
|
160
152
|
|
|
@@ -206,6 +198,29 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
|
206
198
|
|
|
207
199
|
return datatype(**blocks)
|
|
208
200
|
|
|
201
|
+
@staticmethod
|
|
202
|
+
def _get_dataset_types(
|
|
203
|
+
datatype: Type[DataBlockType], *, select: list[str] | None = None
|
|
204
|
+
) -> list[tuple[str, DataFrameType]]:
|
|
205
|
+
return [
|
|
206
|
+
(field_name, get_args(field_info.annotation)[0])
|
|
207
|
+
for field_name, field_info in fields(datatype).items() # type: ignore[type-var]
|
|
208
|
+
if get_origin(field_info.annotation) is Dataset
|
|
209
|
+
and (select is None or field_name in select)
|
|
210
|
+
]
|
|
211
|
+
|
|
212
|
+
@staticmethod
|
|
213
|
+
def _get_field_names(dataset_types: list[tuple[str, DataFrameType]]) -> list[str]:
|
|
214
|
+
return list(
|
|
215
|
+
dict.fromkeys(
|
|
216
|
+
[
|
|
217
|
+
field_name
|
|
218
|
+
for _, dataset_type in dataset_types
|
|
219
|
+
for field_name, _ in fields(dataset_type).items() # type: ignore[arg-type]
|
|
220
|
+
]
|
|
221
|
+
)
|
|
222
|
+
)
|
|
223
|
+
|
|
209
224
|
@staticmethod
|
|
210
225
|
def default(datatype: Type[DataBlockType]) -> DataBlockType:
|
|
211
226
|
return datatype(**{field_name: [] for field_name in list(fields(datatype))}) # type: ignore[type-var]
|
|
@@ -216,6 +231,7 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
|
216
231
|
datatype: Type[DataBlockType],
|
|
217
232
|
query: DataBlockQuery,
|
|
218
233
|
metadata: DataBlockMetadata | None = None,
|
|
234
|
+
schema_validation: bool = True,
|
|
219
235
|
**kwargs, # Non-Dataset field values for DataBlockType
|
|
220
236
|
) -> AsyncGenerator[pd.DataFrame, None]:
|
|
221
237
|
if metadata is None:
|
|
@@ -223,6 +239,9 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
|
223
239
|
|
|
224
240
|
storage = await get_dataset_storage(metadata.database_key)
|
|
225
241
|
|
|
242
|
+
dataset_types = cls._get_dataset_types(datatype, select=query.select)
|
|
243
|
+
field_names = cls._get_field_names(dataset_types)
|
|
244
|
+
|
|
226
245
|
async for block_dataset in storage._get_batch( # type: ignore[attr-defined]
|
|
227
246
|
datatype,
|
|
228
247
|
database_key=metadata.database_key,
|
|
@@ -231,29 +250,19 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
|
231
250
|
group_key=metadata.group_key,
|
|
232
251
|
collection=metadata.collection,
|
|
233
252
|
):
|
|
234
|
-
dataset_types = [
|
|
235
|
-
(field_name, get_args(field_info.annotation)[0])
|
|
236
|
-
for field_name, field_info in fields(datatype).items() # type: ignore[type-var]
|
|
237
|
-
if get_origin(field_info.annotation) is Dataset
|
|
238
|
-
and (query.select is None or field_name in query.select)
|
|
239
|
-
]
|
|
240
|
-
field_names = list(
|
|
241
|
-
dict.fromkeys(
|
|
242
|
-
[
|
|
243
|
-
field_name
|
|
244
|
-
for _, dataset_type in dataset_types
|
|
245
|
-
for field_name, _ in fields(dataset_type).items()
|
|
246
|
-
]
|
|
247
|
-
)
|
|
248
|
-
)
|
|
249
253
|
result_df = await DataBlocks._load_datablock_df(
|
|
250
|
-
storage, block_dataset,
|
|
254
|
+
storage, block_dataset, columns=None, database_key=metadata.database_key
|
|
251
255
|
)
|
|
252
256
|
|
|
253
|
-
#
|
|
257
|
+
# Enfore datatypes and add missing optional fields using class schema (allows schema evolution)
|
|
258
|
+
if schema_validation:
|
|
259
|
+
cls._adapt_to_schema(dataset_types, result_df)
|
|
260
|
+
result_df = result_df[field_names]
|
|
261
|
+
|
|
262
|
+
# Adding constant value fields from kwargs
|
|
254
263
|
for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
|
|
255
264
|
if get_origin(field_info.annotation) is not Dataset:
|
|
256
|
-
result_df[field_name] = kwargs.get(field_name)
|
|
265
|
+
result_df.loc[:, field_name] = kwargs.get(field_name)
|
|
257
266
|
|
|
258
267
|
yield result_df
|
|
259
268
|
|
|
@@ -286,9 +295,10 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
|
|
|
286
295
|
) from e
|
|
287
296
|
|
|
288
297
|
@classmethod
|
|
289
|
-
def _adapt_to_schema(
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
298
|
+
def _adapt_to_schema(
|
|
299
|
+
cls, dataset_types: list[tuple[str, DataFrameType]], df: pd.DataFrame
|
|
300
|
+
) -> None:
|
|
301
|
+
for _, datatype in dataset_types:
|
|
302
|
+
valid_df = datatype._from_df(df)._df # type: ignore[attr-defined]
|
|
293
303
|
for col in valid_df.columns:
|
|
294
304
|
df[col] = valid_df[col]
|
|
@@ -6,8 +6,13 @@ import dataclasses
|
|
|
6
6
|
from datetime import date, datetime, timezone
|
|
7
7
|
from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar, Union
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
import
|
|
9
|
+
try:
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
except ImportError:
|
|
13
|
+
import hopeit.dataframes.pandas.numpy_mock as np # type: ignore[no-redef]
|
|
14
|
+
import hopeit.dataframes.pandas.pandas_mock as pd # type: ignore[no-redef]
|
|
15
|
+
|
|
11
16
|
from pydantic import create_model
|
|
12
17
|
from pydantic.fields import FieldInfo
|
|
13
18
|
|
{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/dataset.py
RENAMED
|
@@ -4,7 +4,12 @@ from datetime import datetime
|
|
|
4
4
|
from typing import Any, Dict, Generic, Optional, Type, TypeVar
|
|
5
5
|
|
|
6
6
|
from hopeit.dataobjects import dataclass, dataobject
|
|
7
|
-
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import pandas as pd
|
|
10
|
+
except ImportError:
|
|
11
|
+
import hopeit.dataframes.pandas.pandas_mock as pd # type: ignore[no-redef]
|
|
12
|
+
|
|
8
13
|
from pydantic import TypeAdapter
|
|
9
14
|
|
|
10
15
|
from hopeit.dataframes.setup.registry import get_dataset_storage
|
{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/files.py
RENAMED
|
@@ -8,16 +8,15 @@ from uuid import uuid4
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
|
|
10
10
|
import aiofiles
|
|
11
|
-
import pandas as pd
|
|
12
|
-
import pyarrow
|
|
13
11
|
from pydantic import TypeAdapter
|
|
14
12
|
|
|
15
13
|
try:
|
|
14
|
+
import pandas as pd
|
|
16
15
|
import pyarrow # type: ignore # noqa # pylint: disable=unused-import
|
|
17
16
|
except ImportError as e:
|
|
18
17
|
raise ImportError(
|
|
19
|
-
"`pyarrow` needs to be installed to use `DatasetFileStorage`",
|
|
20
|
-
"Run `pip install hopeit.dataframes[
|
|
18
|
+
"`pandas` and `pyarrow` needs to be installed to use `DatasetFileStorage`",
|
|
19
|
+
"Run `pip install hopeit.dataframes[pandas]`",
|
|
21
20
|
) from e
|
|
22
21
|
|
|
23
22
|
from hopeit.dataframes.dataframe import DataFrameMixin
|
|
File without changes
|
|
File without changes
|
{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hopeit.dataframes
|
|
3
|
-
Version: 0.26.3
|
|
3
|
+
Version: 0.26.5
|
|
4
4
|
Summary: Hopeit Engine Dataframes for Pandas
|
|
5
5
|
Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
|
|
6
6
|
License: Apache 2
|
|
@@ -24,8 +24,8 @@ Classifier: Topic :: Internet :: WWW/HTTP
|
|
|
24
24
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
25
|
Classifier: Framework :: AsyncIO
|
|
26
26
|
Description-Content-Type: text/markdown
|
|
27
|
-
Requires-Dist: hopeit.engine>=0.26.
|
|
28
|
-
Requires-Dist: hopeit.fs-storage>=0.26.
|
|
27
|
+
Requires-Dist: hopeit.engine>=0.26.5
|
|
28
|
+
Requires-Dist: hopeit.fs-storage>=0.26.5
|
|
29
29
|
Provides-Extra: pandas
|
|
30
30
|
Requires-Dist: pandas>=2.2.3; extra == "pandas"
|
|
31
31
|
Requires-Dist: pyarrow>=19.0.1; extra == "pandas"
|
{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/SOURCES.txt
RENAMED
|
@@ -9,6 +9,10 @@ src/hopeit/dataframes/__init__.py
|
|
|
9
9
|
src/hopeit/dataframes/datablocks.py
|
|
10
10
|
src/hopeit/dataframes/dataframe.py
|
|
11
11
|
src/hopeit/dataframes/py.typed
|
|
12
|
+
src/hopeit/dataframes/pandas/__init__.py
|
|
13
|
+
src/hopeit/dataframes/pandas/numpy_mock.py
|
|
14
|
+
src/hopeit/dataframes/pandas/pandas_mock.py
|
|
15
|
+
src/hopeit/dataframes/pandas/py.typed
|
|
12
16
|
src/hopeit/dataframes/serialization/__init__.py
|
|
13
17
|
src/hopeit/dataframes/serialization/dataset.py
|
|
14
18
|
src/hopeit/dataframes/serialization/files.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/setup/dataframes.py
RENAMED
|
File without changes
|
|
File without changes
|
{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/setup/registry.py
RENAMED
|
File without changes
|
|
File without changes
|
{hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/top_level.txt
RENAMED
|
File without changes
|