hopeit.dataframes 0.26.3__tar.gz → 0.26.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/PKG-INFO +3 -3
  2. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/pyproject.toml +3 -3
  3. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/__init__.py +9 -2
  4. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/datablocks.py +57 -47
  5. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/dataframe.py +7 -2
  6. hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas/numpy_mock.py +3 -0
  7. hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas/pandas_mock.py +13 -0
  8. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/dataset.py +6 -1
  9. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/files.py +3 -4
  10. hopeit_dataframes-0.26.5/src/hopeit/dataframes/setup/__init__.py +0 -0
  11. hopeit_dataframes-0.26.5/src/hopeit/dataframes/setup/py.typed +0 -0
  12. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/PKG-INFO +3 -3
  13. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/SOURCES.txt +4 -0
  14. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/requires.txt +2 -2
  15. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/README.md +0 -0
  16. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/setup.cfg +0 -0
  17. {hopeit_dataframes-0.26.3/src/hopeit/dataframes/serialization → hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas}/__init__.py +0 -0
  18. {hopeit_dataframes-0.26.3/src/hopeit/dataframes → hopeit_dataframes-0.26.5/src/hopeit/dataframes/pandas}/py.typed +0 -0
  19. {hopeit_dataframes-0.26.3/src/hopeit/dataframes/serialization → hopeit_dataframes-0.26.5/src/hopeit/dataframes}/py.typed +0 -0
  20. {hopeit_dataframes-0.26.3/src/hopeit/dataframes/setup → hopeit_dataframes-0.26.5/src/hopeit/dataframes/serialization}/__init__.py +0 -0
  21. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/protocol.py +0 -0
  22. {hopeit_dataframes-0.26.3/src/hopeit/dataframes/setup → hopeit_dataframes-0.26.5/src/hopeit/dataframes/serialization}/py.typed +0 -0
  23. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/serialization/settings.py +0 -0
  24. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/setup/dataframes.py +0 -0
  25. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/setup/register_database.py +0 -0
  26. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit/dataframes/setup/registry.py +0 -0
  27. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
  28. {hopeit_dataframes-0.26.3 → hopeit_dataframes-0.26.5}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hopeit.dataframes
3
- Version: 0.26.3
3
+ Version: 0.26.5
4
4
  Summary: Hopeit Engine Dataframes for Pandas
5
5
  Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
6
6
  License: Apache 2
@@ -24,8 +24,8 @@ Classifier: Topic :: Internet :: WWW/HTTP
24
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
25
  Classifier: Framework :: AsyncIO
26
26
  Description-Content-Type: text/markdown
27
- Requires-Dist: hopeit.engine>=0.26.3
28
- Requires-Dist: hopeit.fs-storage>=0.26.3
27
+ Requires-Dist: hopeit.engine>=0.26.5
28
+ Requires-Dist: hopeit.fs-storage>=0.26.5
29
29
  Provides-Extra: pandas
30
30
  Requires-Dist: pandas>=2.2.3; extra == "pandas"
31
31
  Requires-Dist: pyarrow>=19.0.1; extra == "pandas"
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "hopeit.dataframes"
7
- version = "0.26.3"
7
+ version = "0.26.5"
8
8
 
9
9
  description = "Hopeit Engine Dataframes for Pandas"
10
10
  dynamic = ["readme"]
@@ -33,8 +33,8 @@ classifiers = [
33
33
  ]
34
34
 
35
35
  dependencies = [
36
- "hopeit.engine>=0.26.3",
37
- "hopeit.fs-storage>=0.26.3"
36
+ "hopeit.engine>=0.26.5",
37
+ "hopeit.fs-storage>=0.26.5"
38
38
  ]
39
39
 
40
40
  [project.optional-dependencies]
@@ -72,8 +72,15 @@ print(Payload.to_json(my_json_response))
72
72
 
73
73
  from typing import Dict, Generic, Iterator, List, Type
74
74
 
75
- import numpy as np
76
- import pandas as pd
75
+ try:
76
+ import numpy as np
77
+ import pandas as pd
78
+ except ImportError:
79
+ # Supports using `@dataframe` annotation for dataobjects definitions
80
+ # without installing pandas and numpy. Useful for API-only projects.
81
+ import hopeit.dataframes.pandas.numpy_mock as np # type: ignore[no-redef]
82
+ import hopeit.dataframes.pandas.pandas_mock as pd # type: ignore[no-redef]
83
+
77
84
  from hopeit.dataframes.dataframe import DataFrameT, dataframe
78
85
  from hopeit.dataframes.serialization.dataset import Dataset
79
86
  from hopeit.dataframes.datablocks import DataBlocks
@@ -7,11 +7,14 @@ and saved as a single flat pandas DataFrame.
7
7
  from datetime import datetime
8
8
  from typing import AsyncGenerator, Generic, Optional, Type, TypeVar, get_args, get_origin
9
9
 
10
- import pandas as pd
10
+ try:
11
+ import pandas as pd
12
+ except ImportError:
13
+ import hopeit.dataframes.pandas.pandas_mock as pd # type: ignore[no-redef]
14
+
11
15
  from hopeit.dataobjects import dataobject, dataclass, fields
12
16
 
13
17
  from hopeit.dataframes.serialization.dataset import Dataset, DatasetLoadError
14
- from hopeit.dataframes.serialization.protocol import find_dataframe_type
15
18
  from hopeit.dataframes.setup.registry import get_dataset_storage
16
19
 
17
20
  DataBlockType = TypeVar("DataBlockType")
@@ -110,6 +113,7 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
110
113
  datablock: DataBlockType,
111
114
  *,
112
115
  select: Optional[list[str]] = None,
116
+ schema_validation: bool = True,
113
117
  database_key: Optional[str] = None,
114
118
  ) -> pd.DataFrame:
115
119
  """
@@ -124,37 +128,25 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
124
128
  Returns:
125
129
  pd.DataFrame: The resulting pandas DataFrame.
126
130
  """
127
- keys = [
128
- field_name
129
- for field_name, field_info in fields(datablock).items() # type: ignore[arg-type]
130
- if get_origin(field_info.annotation) is Dataset
131
- and (select is None or field_name in select)
132
- ]
133
-
134
- # Filter/validate selected field names using saved schema,
135
- # generates a single field for every common/duplicated field in the datasets
136
- field_names = list(
137
- dict.fromkeys(
138
- [
139
- field_name
140
- for key in keys
141
- for field_name in getattr(datablock, key).schema["properties"].keys()
142
- ]
143
- )
144
- )
131
+ dataset_types = cls._get_dataset_types(type(datablock), select=select)
132
+ field_names = cls._get_field_names(dataset_types)
145
133
 
146
134
  # Load data from first dataset (datablock uses a single file for all datasets)
147
- dataset: Dataset = getattr(datablock, keys[0])
135
+ dataset: Dataset = getattr(datablock, dataset_types[0][0])
148
136
  storage = await get_dataset_storage(database_key)
149
- result_df = await DataBlocks._load_datablock_df(storage, dataset, field_names, database_key)
137
+ result_df = await DataBlocks._load_datablock_df(
138
+ storage, dataset, columns=None, database_key=database_key
139
+ )
150
140
 
151
141
  # Enforce datatypes and add missing optional fields using class schema (allows schema evolution)
152
- cls._adapt_to_schema(datablock, keys, result_df)
142
+ if schema_validation:
143
+ cls._adapt_to_schema(dataset_types, result_df)
144
+ result_df = result_df[field_names]
153
145
 
154
- # Adding constant value fields
146
+ # Adding constant value fields from serialized datablock
155
147
  for field_name, field_info in fields(datablock).items(): # type: ignore[arg-type]
156
148
  if get_origin(field_info.annotation) is not Dataset:
157
- result_df[field_name] = getattr(datablock, field_name) # type: ignore[index]
149
+ result_df.loc[:, field_name] = getattr(datablock, field_name) # type: ignore[index]
158
150
 
159
151
  return result_df
160
152
 
@@ -206,6 +198,29 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
206
198
 
207
199
  return datatype(**blocks)
208
200
 
201
+ @staticmethod
202
+ def _get_dataset_types(
203
+ datatype: Type[DataBlockType], *, select: list[str] | None = None
204
+ ) -> list[tuple[str, DataFrameType]]:
205
+ return [
206
+ (field_name, get_args(field_info.annotation)[0])
207
+ for field_name, field_info in fields(datatype).items() # type: ignore[type-var]
208
+ if get_origin(field_info.annotation) is Dataset
209
+ and (select is None or field_name in select)
210
+ ]
211
+
212
+ @staticmethod
213
+ def _get_field_names(dataset_types: list[tuple[str, DataFrameType]]) -> list[str]:
214
+ return list(
215
+ dict.fromkeys(
216
+ [
217
+ field_name
218
+ for _, dataset_type in dataset_types
219
+ for field_name, _ in fields(dataset_type).items() # type: ignore[arg-type]
220
+ ]
221
+ )
222
+ )
223
+
209
224
  @staticmethod
210
225
  def default(datatype: Type[DataBlockType]) -> DataBlockType:
211
226
  return datatype(**{field_name: [] for field_name in list(fields(datatype))}) # type: ignore[type-var]
@@ -216,6 +231,7 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
216
231
  datatype: Type[DataBlockType],
217
232
  query: DataBlockQuery,
218
233
  metadata: DataBlockMetadata | None = None,
234
+ schema_validation: bool = True,
219
235
  **kwargs, # Non-Dataset field values for DataBlockType
220
236
  ) -> AsyncGenerator[pd.DataFrame, None]:
221
237
  if metadata is None:
@@ -223,6 +239,9 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
223
239
 
224
240
  storage = await get_dataset_storage(metadata.database_key)
225
241
 
242
+ dataset_types = cls._get_dataset_types(datatype, select=query.select)
243
+ field_names = cls._get_field_names(dataset_types)
244
+
226
245
  async for block_dataset in storage._get_batch( # type: ignore[attr-defined]
227
246
  datatype,
228
247
  database_key=metadata.database_key,
@@ -231,29 +250,19 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
231
250
  group_key=metadata.group_key,
232
251
  collection=metadata.collection,
233
252
  ):
234
- dataset_types = [
235
- (field_name, get_args(field_info.annotation)[0])
236
- for field_name, field_info in fields(datatype).items() # type: ignore[type-var]
237
- if get_origin(field_info.annotation) is Dataset
238
- and (query.select is None or field_name in query.select)
239
- ]
240
- field_names = list(
241
- dict.fromkeys(
242
- [
243
- field_name
244
- for _, dataset_type in dataset_types
245
- for field_name, _ in fields(dataset_type).items()
246
- ]
247
- )
248
- )
249
253
  result_df = await DataBlocks._load_datablock_df(
250
- storage, block_dataset, field_names, metadata.database_key
254
+ storage, block_dataset, columns=None, database_key=metadata.database_key
251
255
  )
252
256
 
253
- # Adding constant value fields
257
+ # Enforce datatypes and add missing optional fields using class schema (allows schema evolution)
258
+ if schema_validation:
259
+ cls._adapt_to_schema(dataset_types, result_df)
260
+ result_df = result_df[field_names]
261
+
262
+ # Adding constant value fields from kwargs
254
263
  for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
255
264
  if get_origin(field_info.annotation) is not Dataset:
256
- result_df[field_name] = kwargs.get(field_name)
265
+ result_df.loc[:, field_name] = kwargs.get(field_name)
257
266
 
258
267
  yield result_df
259
268
 
@@ -286,9 +295,10 @@ class DataBlocks(Generic[DataBlockType, DataFrameType]):
286
295
  ) from e
287
296
 
288
297
  @classmethod
289
- def _adapt_to_schema(cls, datablock: DataBlockType, keys: list[str], df: pd.DataFrame) -> None:
290
- for key in keys:
291
- datatype = find_dataframe_type(getattr(datablock, key).datatype) # type: ignore[var-annotated]
292
- valid_df = datatype._from_df(df)._df
298
+ def _adapt_to_schema(
299
+ cls, dataset_types: list[tuple[str, DataFrameType]], df: pd.DataFrame
300
+ ) -> None:
301
+ for _, datatype in dataset_types:
302
+ valid_df = datatype._from_df(df)._df # type: ignore[attr-defined]
293
303
  for col in valid_df.columns:
294
304
  df[col] = valid_df[col]
@@ -6,8 +6,13 @@ import dataclasses
6
6
  from datetime import date, datetime, timezone
7
7
  from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar, Union
8
8
 
9
- import numpy as np
10
- import pandas as pd
9
+ try:
10
+ import numpy as np
11
+ import pandas as pd
12
+ except ImportError:
13
+ import hopeit.dataframes.pandas.numpy_mock as np # type: ignore[no-redef]
14
+ import hopeit.dataframes.pandas.pandas_mock as pd # type: ignore[no-redef]
15
+
11
16
  from pydantic import create_model
12
17
  from pydantic.fields import FieldInfo
13
18
 
@@ -0,0 +1,3 @@
1
+ from typing import Any
2
+
3
+ ndarray = Any
@@ -0,0 +1,13 @@
1
+ class DataFrame:
2
+ def __init__(*args, **kwargs):
3
+ pass
4
+
5
+
6
+ class Series:
7
+ def __init__(*args, **kwargs):
8
+ pass
9
+
10
+
11
+ class Timestamp:
12
+ def __init__(*args, **kwargs):
13
+ pass
@@ -4,7 +4,12 @@ from datetime import datetime
4
4
  from typing import Any, Dict, Generic, Optional, Type, TypeVar
5
5
 
6
6
  from hopeit.dataobjects import dataclass, dataobject
7
- import pandas as pd
7
+
8
+ try:
9
+ import pandas as pd
10
+ except ImportError:
11
+ import hopeit.dataframes.pandas.pandas_mock as pd # type: ignore[no-redef]
12
+
8
13
  from pydantic import TypeAdapter
9
14
 
10
15
  from hopeit.dataframes.setup.registry import get_dataset_storage
@@ -8,16 +8,15 @@ from uuid import uuid4
8
8
  from pathlib import Path
9
9
 
10
10
  import aiofiles
11
- import pandas as pd
12
- import pyarrow
13
11
  from pydantic import TypeAdapter
14
12
 
15
13
  try:
14
+ import pandas as pd
16
15
  import pyarrow # type: ignore # noqa # pylint: disable=unused-import
17
16
  except ImportError as e:
18
17
  raise ImportError(
19
- "`pyarrow` needs to be installed to use `DatasetFileStorage`",
20
- "Run `pip install hopeit.dataframes[pyarrow]`",
18
+ "`pandas` and `pyarrow` needs to be installed to use `DatasetFileStorage`",
19
+ "Run `pip install hopeit.dataframes[pandas]`",
21
20
  ) from e
22
21
 
23
22
  from hopeit.dataframes.dataframe import DataFrameMixin
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hopeit.dataframes
3
- Version: 0.26.3
3
+ Version: 0.26.5
4
4
  Summary: Hopeit Engine Dataframes for Pandas
5
5
  Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
6
6
  License: Apache 2
@@ -24,8 +24,8 @@ Classifier: Topic :: Internet :: WWW/HTTP
24
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
25
  Classifier: Framework :: AsyncIO
26
26
  Description-Content-Type: text/markdown
27
- Requires-Dist: hopeit.engine>=0.26.3
28
- Requires-Dist: hopeit.fs-storage>=0.26.3
27
+ Requires-Dist: hopeit.engine>=0.26.5
28
+ Requires-Dist: hopeit.fs-storage>=0.26.5
29
29
  Provides-Extra: pandas
30
30
  Requires-Dist: pandas>=2.2.3; extra == "pandas"
31
31
  Requires-Dist: pyarrow>=19.0.1; extra == "pandas"
@@ -9,6 +9,10 @@ src/hopeit/dataframes/__init__.py
9
9
  src/hopeit/dataframes/datablocks.py
10
10
  src/hopeit/dataframes/dataframe.py
11
11
  src/hopeit/dataframes/py.typed
12
+ src/hopeit/dataframes/pandas/__init__.py
13
+ src/hopeit/dataframes/pandas/numpy_mock.py
14
+ src/hopeit/dataframes/pandas/pandas_mock.py
15
+ src/hopeit/dataframes/pandas/py.typed
12
16
  src/hopeit/dataframes/serialization/__init__.py
13
17
  src/hopeit/dataframes/serialization/dataset.py
14
18
  src/hopeit/dataframes/serialization/files.py
@@ -1,5 +1,5 @@
1
- hopeit.engine>=0.26.3
2
- hopeit.fs-storage>=0.26.3
1
+ hopeit.engine>=0.26.5
2
+ hopeit.fs-storage>=0.26.5
3
3
 
4
4
  [pandas]
5
5
  pandas>=2.2.3