hopeit.dataframes 0.25.0b9__tar.gz → 0.25.1__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (23)
  1. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/PKG-INFO +5 -6
  2. hopeit_dataframes-0.25.1/pyproject.toml +51 -0
  3. hopeit_dataframes-0.25.1/setup.py +25 -0
  4. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit/dataframes/dataframe.py +70 -15
  5. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit/dataframes/serialization/dataset.py +15 -3
  6. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit/dataframes/serialization/files.py +2 -0
  7. hopeit_dataframes-0.25.1/src/hopeit/dataframes/setup/__init__.py +0 -0
  8. hopeit_dataframes-0.25.1/src/hopeit/dataframes/setup/py.typed +0 -0
  9. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit.dataframes.egg-info/PKG-INFO +5 -6
  10. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit.dataframes.egg-info/SOURCES.txt +4 -1
  11. hopeit_dataframes-0.25.1/src/hopeit.dataframes.egg-info/requires.txt +6 -0
  12. hopeit_dataframes-0.25.0b9/setup.py +0 -58
  13. hopeit_dataframes-0.25.0b9/src/hopeit.dataframes.egg-info/requires.txt +0 -6
  14. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/README.md +0 -0
  15. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/setup.cfg +0 -0
  16. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit/dataframes/__init__.py +0 -0
  17. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit/dataframes/py.typed +0 -0
  18. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit/dataframes/serialization/__init__.py +0 -0
  19. /hopeit_dataframes-0.25.0b9/src/hopeit/dataframes/setup/__init__.py → /hopeit_dataframes-0.25.1/src/hopeit/dataframes/serialization/py.typed +0 -0
  20. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit/dataframes/serialization/settings.py +0 -0
  21. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit/dataframes/setup/dataframes.py +0 -0
  22. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
  23. {hopeit_dataframes-0.25.0b9 → hopeit_dataframes-0.25.1}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
@@ -1,11 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.25.0b9
3
+ Version: 0.25.1
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
- Home-page: https://github.com/hopeit-git/hopeit.engine
6
- Author: Leo Smerling and Pablo Canto
7
- Author-email: contact@hopeit.com.ar
5
+ Author-email: Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
8
6
  License: Apache 2
7
+ Project-URL: Homepage, https://github.com/hopeit-git/hopeit.engine
9
8
  Project-URL: CI: GitHub Actions, https://github.com/hopeit-git/hopeit.engine/actions?query=workflow
10
9
  Project-URL: Docs: RTD, https://hopeitengine.readthedocs.io/en/latest/
11
10
  Project-URL: GitHub: issues, https://github.com/hopeit-git/hopeit.engine/issues
@@ -24,8 +23,8 @@ Classifier: Topic :: Internet :: WWW/HTTP
24
23
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
24
  Classifier: Framework :: AsyncIO
26
25
  Requires-Python: >=3.9
27
- Description-Content-Type: text/markdown
28
- Requires-Dist: hopeit.engine[fs-storage]==0.25.0b9
26
+ Description-Content-Type: text/plain
27
+ Requires-Dist: hopeit.engine[fs-storage]==0.25.1
29
28
  Requires-Dist: pandas
30
29
  Requires-Dist: numpy
31
30
  Provides-Extra: pyarrow
@@ -0,0 +1,51 @@
1
+ [build-system]
2
+ requires = ["setuptools >= 64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "hopeit.dataframes"
7
+ description = "Hopeit Engine Dataframes Toolkit"
8
+ dynamic = ["version", "readme", "dependencies", "optional-dependencies"]
9
+ license = { text = "Apache 2" }
10
+ authors = [
11
+ { name = "Leo Smerling", email = "contact@hopeit.com.ar" },
12
+ { name = "Pablo Canto", email = "contact@hopeit.com.ar" },
13
+ ]
14
+ classifiers = [
15
+ "License :: OSI Approved :: Apache Software License",
16
+ "Intended Audience :: Developers",
17
+ "Programming Language :: Python",
18
+ "Programming Language :: Python :: 3.9",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Development Status :: 4 - Beta",
22
+ "Operating System :: POSIX :: Linux",
23
+ "Operating System :: MacOS :: MacOS X",
24
+ "Operating System :: Microsoft :: Windows",
25
+ "Topic :: Internet :: WWW/HTTP",
26
+ "Topic :: Software Development :: Libraries :: Python Modules",
27
+ "Framework :: AsyncIO",
28
+ ]
29
+ requires-python = ">=3.9"
30
+
31
+
32
+ [project.urls]
33
+ "Homepage" = "https://github.com/hopeit-git/hopeit.engine"
34
+ "CI: GitHub Actions" = "https://github.com/hopeit-git/hopeit.engine/actions?query=workflow"
35
+ "Docs: RTD" = "https://hopeitengine.readthedocs.io/en/latest/"
36
+ "GitHub: issues" = "https://github.com/hopeit-git/hopeit.engine/issues"
37
+ "GitHub: repo" = "https://github.com/hopeit-git/hopeit.engine"
38
+
39
+ [tool.setuptools]
40
+ include-package-data = true
41
+
42
+ [tool.setuptools.packages.find]
43
+ where = ["src"]
44
+
45
+ [tool.setuptools.package-data]
46
+ "hopeit.dataframes" = ["py.typed"]
47
+ "hopeit.dataframes.serialization" = ["py.typed"]
48
+ "hopeit.dataframes.setup" = ["py.typed"]
49
+
50
+ [tool.setuptools.dynamic]
51
+ readme = { file = ["README.md"], content-type = "text/plain" }
@@ -0,0 +1,25 @@
1
+ from setuptools import setup
2
+ from os import environ
3
+
4
+ version = {}
5
+ try:
6
+ with open("../../../engine/src/hopeit/server/version.py") as fp:
7
+ exec(fp.read(), version)
8
+ ENGINE_VERSION = version["ENGINE_VERSION"]
9
+ except FileNotFoundError:
10
+ ENGINE_VERSION = environ.get("ENGINE_VERSION")
11
+
12
+ if not ENGINE_VERSION:
13
+ raise RuntimeError("ENGINE_VERSION is not specified.")
14
+
15
+ setup(
16
+ version=ENGINE_VERSION,
17
+ install_requires=[
18
+ f"hopeit.engine[fs-storage]=={ENGINE_VERSION}",
19
+ "pandas",
20
+ "numpy",
21
+ ],
22
+ extras_require={
23
+ "pyarrow": ["pyarrow"],
24
+ },
25
+ )
@@ -4,8 +4,7 @@ DataFrames type abstractions.
4
4
 
5
5
  import dataclasses
6
6
  from datetime import date, datetime, timezone
7
- from functools import partial
8
- from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar
7
+ from typing import Any, Callable, Dict, Generic, Iterator, List, Type, TypeVar, Union
9
8
 
10
9
  import numpy as np
11
10
  import pandas as pd
@@ -20,6 +19,7 @@ from hopeit.dataobjects import (
20
19
  fields,
21
20
  )
22
21
  from hopeit.dataobjects.payload import Payload
22
+ from pydantic_core import PydanticUndefined
23
23
 
24
24
  DataFrameT = TypeVar("DataFrameT")
25
25
 
@@ -31,19 +31,55 @@ class DataFrameMetadata:
31
31
 
32
32
 
33
33
  # Functions to do type coercion
34
- def _series_to_int(x: pd.Series) -> pd.Series:
34
+ def _series_to_int(field_name: str, x: pd.Series) -> pd.Series:
35
+ if x.isnull().values.any(): # type: ignore[union-attr]
36
+ raise ValueError(f"Field `{field_name}` is not nullable")
35
37
  return x.astype(np.int64)
36
38
 
37
39
 
38
- def _series_to_float(x: pd.Series) -> pd.Series:
40
+ def _series_to_float(field_name: str, x: pd.Series) -> pd.Series:
41
+ if x.isnull().values.any(): # type: ignore[union-attr]
42
+ raise ValueError(f"Field `{field_name}` is not nullable")
39
43
  return x.astype(np.float64)
40
44
 
41
45
 
42
- def _series_to_str(x: pd.Series) -> pd.Series:
46
+ def _series_to_str(field_name: str, x: pd.Series) -> pd.Series:
47
+ if x.isnull().values.any(): # type: ignore[union-attr]
48
+ raise ValueError(f"Field `{field_name}` is not nullable")
43
49
  return x.astype(str)
44
50
 
45
51
 
46
- _series_to_utc_datetime = partial(pd.to_datetime, utc=True)
52
+ # Functions to do type coercion
53
+ def _series_to_int_nullable(_field_name: str, x: pd.Series) -> pd.Series:
54
+ return x[x.notna()].astype(np.int64)
55
+
56
+
57
+ def _series_to_float_nullable(_field_name: str, x: pd.Series) -> pd.Series:
58
+ return x[x.notna()].astype(np.float64)
59
+
60
+
61
+ def _series_to_str_nullable(_field_name: str, x: pd.Series) -> pd.Series:
62
+ return x[x.notna()].astype(str)
63
+
64
+
65
+ def _series_to_datetime(field_name: str, x: pd.Series) -> pd.Series:
66
+ if x.isnull().values.any(): # type: ignore[union-attr]
67
+ raise ValueError(f"Field `{field_name}` is not nullable")
68
+ return pd.to_datetime(x)
69
+
70
+
71
+ def _series_to_utc_datetime(field_name: str, x: pd.Series) -> pd.Series:
72
+ if x.isnull().values.any(): # type: ignore[union-attr]
73
+ raise ValueError(f"Field `{field_name}` is not nullable")
74
+ return pd.to_datetime(x, utc=True)
75
+
76
+
77
+ def _series_to_datetime_nullable(_field_name: str, x: pd.Series) -> pd.Series:
78
+ return pd.to_datetime(x)
79
+
80
+
81
+ def _series_to_utc_datetime_nullable(_field_name: str, x: pd.Series) -> pd.Series:
82
+ return pd.to_datetime(x, utc=True)
47
83
 
48
84
 
49
85
  class DataFrameMixin(Generic[DataFrameT, DataObject]):
@@ -57,11 +93,16 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
57
93
  int: _series_to_int,
58
94
  float: _series_to_float,
59
95
  str: _series_to_str,
60
- date: pd.to_datetime,
96
+ date: _series_to_datetime,
61
97
  datetime: _series_to_utc_datetime,
98
+ Union[int, None]: _series_to_int_nullable,
99
+ Union[float, None]: _series_to_float_nullable,
100
+ Union[str, None]: _series_to_str_nullable,
101
+ Union[date, None]: _series_to_datetime_nullable,
102
+ Union[datetime, None]: _series_to_utc_datetime_nullable,
62
103
  }
63
104
 
64
- def __init__(self) -> None:
105
+ def __init__(self, **series: pd.Series) -> None:
65
106
  # Fields added here only to allow mypy to provide correct type hints
66
107
  self.__data_object__: Dict[str, Any] = {}
67
108
  self.__dataframe__: DataFrameMetadata = None # type: ignore
@@ -128,9 +169,28 @@ class DataFrameMixin(Generic[DataFrameT, DataObject]):
128
169
  else:
129
170
  object.__setattr__(self, name, value)
130
171
 
131
- def _coerce_datatypes(self, df: pd.DataFrame) -> Dict[str, pd.Series]:
172
+ def _get_series(
173
+ self,
174
+ df: pd.DataFrame,
175
+ field_name: str,
176
+ field_info: FieldInfo,
177
+ ) -> pd.Series:
178
+ try:
179
+ return df[field_name]
180
+ except KeyError:
181
+ default_value = field_info.get_default()
182
+ if default_value is not PydanticUndefined:
183
+ return pd.Series([default_value] * len(df))
184
+ raise
185
+
186
+ def _coerce_datatypes(
187
+ self,
188
+ df: pd.DataFrame,
189
+ ) -> Dict[str, pd.Series]:
132
190
  return {
133
- name: self.DATATYPE_MAPPING[field.annotation](df[name]) # type: ignore
191
+ name: self.DATATYPE_MAPPING[field.annotation]( # type: ignore[index, operator]
192
+ name, self._get_series(df, name, field)
193
+ )
134
194
  for name, field in self.__dataframe__.fields.items()
135
195
  }
136
196
 
@@ -181,17 +241,12 @@ def dataframe(
181
241
  setattr(cls, "event_id", StreamEventMixin.event_id)
182
242
  setattr(cls, "event_ts", StreamEventMixin.event_ts)
183
243
 
184
- def set_fields_optional(cls):
185
- for _, field in fields(cls).items():
186
- field.default = None
187
-
188
244
  def wrap(cls) -> Type[DataFrameMixin]:
189
245
  if hasattr(cls, "__dataframe__"):
190
246
  return cls
191
247
  add_dataframe_metadata(cls)
192
248
  amended_class = add_dataframe_mixin(cls)
193
249
  add_dataobject_annotations(amended_class, unsafe, validate, schema)
194
- set_fields_optional(amended_class)
195
250
  return amended_class
196
251
 
197
252
  if decorated_class is None:
@@ -1,13 +1,17 @@
1
1
  """Dataset objects definition, used as a result of serialized dataframes"""
2
2
 
3
3
  from importlib import import_module
4
- from typing import Generic, Type, TypeVar
4
+ from typing import Any, Dict, Generic, Type, TypeVar
5
5
 
6
- from hopeit.dataobjects import dataclass, dataobject
6
+ from hopeit.dataobjects import dataclass, dataobject, field
7
7
 
8
8
  DataFrameT = TypeVar("DataFrameT")
9
9
 
10
10
 
11
+ class DatasetLoadError(Exception):
12
+ pass
13
+
14
+
11
15
  @dataobject
12
16
  @dataclass
13
17
  class Dataset(Generic[DataFrameT]):
@@ -17,9 +21,17 @@ class Dataset(Generic[DataFrameT]):
17
21
  partition_key: str
18
22
  key: str
19
23
  datatype: str
24
+ schema: Dict[str, Any] = field(default_factory=dict)
20
25
 
21
26
  async def load(self) -> DataFrameT:
22
- return await self.__storage.load(self) # type: ignore[attr-defined]
27
+ try:
28
+ dataframe = await self.__storage.load(self) # type: ignore[attr-defined]
29
+ return dataframe
30
+ except (RuntimeError, IOError, KeyError) as e:
31
+ raise DatasetLoadError(
32
+ f"Error {type(e).__name__}: {e} loading dataset of type {self.datatype} "
33
+ f"at location {self.partition_key}/{self.key}"
34
+ ) from e
23
35
 
24
36
  @classmethod
25
37
  async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
@@ -6,6 +6,7 @@ from typing import Generic, Optional, Type, TypeVar
6
6
  from uuid import uuid4
7
7
 
8
8
  import pandas as pd
9
+ from pydantic import TypeAdapter
9
10
 
10
11
  try:
11
12
  import pyarrow # type: ignore # noqa # pylint: disable=unused-import
@@ -53,6 +54,7 @@ class DatasetFileStorage(Generic[DataFrameT]):
53
54
  partition_key=partition_key,
54
55
  key=key,
55
56
  datatype=f"{datatype.__module__}.{datatype.__qualname__}",
57
+ schema=TypeAdapter(datatype).json_schema(),
56
58
  )
57
59
 
58
60
  async def load(self, dataset: Dataset) -> EventPayloadType:
@@ -1,11 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: hopeit.dataframes
3
- Version: 0.25.0b9
3
+ Version: 0.25.1
4
4
  Summary: Hopeit Engine Dataframes Toolkit
5
- Home-page: https://github.com/hopeit-git/hopeit.engine
6
- Author: Leo Smerling and Pablo Canto
7
- Author-email: contact@hopeit.com.ar
5
+ Author-email: Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
8
6
  License: Apache 2
7
+ Project-URL: Homepage, https://github.com/hopeit-git/hopeit.engine
9
8
  Project-URL: CI: GitHub Actions, https://github.com/hopeit-git/hopeit.engine/actions?query=workflow
10
9
  Project-URL: Docs: RTD, https://hopeitengine.readthedocs.io/en/latest/
11
10
  Project-URL: GitHub: issues, https://github.com/hopeit-git/hopeit.engine/issues
@@ -24,8 +23,8 @@ Classifier: Topic :: Internet :: WWW/HTTP
24
23
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
24
  Classifier: Framework :: AsyncIO
26
25
  Requires-Python: >=3.9
27
- Description-Content-Type: text/markdown
28
- Requires-Dist: hopeit.engine[fs-storage]==0.25.0b9
26
+ Description-Content-Type: text/plain
27
+ Requires-Dist: hopeit.engine[fs-storage]==0.25.1
29
28
  Requires-Dist: pandas
30
29
  Requires-Dist: numpy
31
30
  Provides-Extra: pyarrow
@@ -1,4 +1,5 @@
1
1
  README.md
2
+ pyproject.toml
2
3
  setup.py
3
4
  src/hopeit.dataframes.egg-info/PKG-INFO
4
5
  src/hopeit.dataframes.egg-info/SOURCES.txt
@@ -11,6 +12,8 @@ src/hopeit/dataframes/py.typed
11
12
  src/hopeit/dataframes/serialization/__init__.py
12
13
  src/hopeit/dataframes/serialization/dataset.py
13
14
  src/hopeit/dataframes/serialization/files.py
15
+ src/hopeit/dataframes/serialization/py.typed
14
16
  src/hopeit/dataframes/serialization/settings.py
15
17
  src/hopeit/dataframes/setup/__init__.py
16
- src/hopeit/dataframes/setup/dataframes.py
18
+ src/hopeit/dataframes/setup/dataframes.py
19
+ src/hopeit/dataframes/setup/py.typed
@@ -0,0 +1,6 @@
1
+ hopeit.engine[fs-storage]==0.25.1
2
+ pandas
3
+ numpy
4
+
5
+ [pyarrow]
6
+ pyarrow
@@ -1,58 +0,0 @@
1
- import setuptools
2
-
3
- version = {}
4
- with open("../../../engine/src/hopeit/server/version.py") as fp:
5
- exec(fp.read(), version)
6
-
7
- setuptools.setup(
8
- name="hopeit.dataframes",
9
- version=version["ENGINE_VERSION"],
10
- description="Hopeit Engine Dataframes Toolkit",
11
- license="Apache 2",
12
- long_description=open("README.md").read(),
13
- long_description_content_type="text/markdown",
14
- author="Leo Smerling and Pablo Canto",
15
- author_email="contact@hopeit.com.ar",
16
- url="https://github.com/hopeit-git/hopeit.engine",
17
- classifiers=[
18
- "License :: OSI Approved :: Apache Software License",
19
- "Intended Audience :: Developers",
20
- "Programming Language :: Python",
21
- "Programming Language :: Python :: 3.9",
22
- "Programming Language :: Python :: 3.10",
23
- "Programming Language :: Python :: 3.11",
24
- "Development Status :: 4 - Beta",
25
- "Operating System :: POSIX :: Linux",
26
- "Operating System :: MacOS :: MacOS X",
27
- "Operating System :: Microsoft :: Windows",
28
- "Topic :: Internet :: WWW/HTTP",
29
- "Topic :: Software Development :: Libraries :: Python Modules",
30
- "Framework :: AsyncIO",
31
- ],
32
- project_urls={
33
- "CI: GitHub Actions": "https://github.com/hopeit-git/hopeit.engine/actions?query=workflow", # noqa
34
- "Docs: RTD": "https://hopeitengine.readthedocs.io/en/latest/",
35
- "GitHub: issues": "https://github.com/hopeit-git/hopeit.engine/issues",
36
- "GitHub: repo": "https://github.com/hopeit-git/hopeit.engine",
37
- },
38
- package_dir={"": "src"},
39
- packages=[
40
- "hopeit.dataframes",
41
- "hopeit.dataframes.serialization",
42
- "hopeit.dataframes.setup",
43
- ],
44
- include_package_data=True,
45
- package_data={
46
- "hopeit.dataframes": ["py.typed"],
47
- },
48
- python_requires=">=3.9",
49
- install_requires=[
50
- f"hopeit.engine[fs-storage]=={version['ENGINE_VERSION']}",
51
- "pandas",
52
- "numpy",
53
- ],
54
- extras_require={
55
- "pyarrow": ["pyarrow"],
56
- },
57
- entry_points={},
58
- )
@@ -1,6 +0,0 @@
1
- hopeit.engine[fs-storage]==0.25.0b9
2
- pandas
3
- numpy
4
-
5
- [pyarrow]
6
- pyarrow