hopeit.dataframes 0.25.3__tar.gz → 0.26.0rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/PKG-INFO +12 -13
  2. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/pyproject.toml +17 -12
  3. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/__init__.py +2 -1
  4. hopeit_dataframes-0.26.0rc0/src/hopeit/dataframes/datablocks.py +140 -0
  5. hopeit_dataframes-0.26.0rc0/src/hopeit/dataframes/serialization/dataset.py +93 -0
  6. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/serialization/files.py +25 -21
  7. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit.dataframes.egg-info/PKG-INFO +12 -13
  8. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit.dataframes.egg-info/SOURCES.txt +1 -1
  9. hopeit_dataframes-0.26.0rc0/src/hopeit.dataframes.egg-info/requires.txt +4 -0
  10. hopeit_dataframes-0.25.3/setup.py +0 -25
  11. hopeit_dataframes-0.25.3/src/hopeit/dataframes/serialization/dataset.py +0 -48
  12. hopeit_dataframes-0.25.3/src/hopeit.dataframes.egg-info/requires.txt +0 -6
  13. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/README.md +0 -0
  14. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/setup.cfg +0 -0
  15. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/dataframe.py +0 -0
  16. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/py.typed +0 -0
  17. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/serialization/__init__.py +0 -0
  18. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/serialization/py.typed +0 -0
  19. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/serialization/settings.py +0 -0
  20. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/setup/__init__.py +0 -0
  21. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/setup/dataframes.py +0 -0
  22. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit/dataframes/setup/py.typed +0 -0
  23. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit.dataframes.egg-info/dependency_links.txt +0 -0
  24. {hopeit_dataframes-0.25.3 → hopeit_dataframes-0.26.0rc0}/src/hopeit.dataframes.egg-info/top_level.txt +0 -0
@@ -1,8 +1,8 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: hopeit.dataframes
3
- Version: 0.25.3
4
- Summary: Hopeit Engine Dataframes Toolkit
5
- Author-email: Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
3
+ Version: 0.26.0rc0
4
+ Summary: Hopeit Engine Dataframes for Pandas
5
+ Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
6
6
  License: Apache 2
7
7
  Project-URL: Homepage, https://github.com/hopeit-git/hopeit.engine
8
8
  Project-URL: CI: GitHub Actions, https://github.com/hopeit-git/hopeit.engine/actions?query=workflow
@@ -12,23 +12,22 @@ Project-URL: GitHub: repo, https://github.com/hopeit-git/hopeit.engine
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Intended Audience :: Developers
14
14
  Classifier: Programming Language :: Python
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
- Classifier: Development Status :: 4 - Beta
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Development Status :: 5 - Production/Stable
19
20
  Classifier: Operating System :: POSIX :: Linux
20
21
  Classifier: Operating System :: MacOS :: MacOS X
21
22
  Classifier: Operating System :: Microsoft :: Windows
22
23
  Classifier: Topic :: Internet :: WWW/HTTP
23
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
25
  Classifier: Framework :: AsyncIO
25
- Requires-Python: >=3.9
26
- Description-Content-Type: text/plain
27
- Requires-Dist: hopeit.engine[fs-storage]==0.25.3
28
- Requires-Dist: pandas
29
- Requires-Dist: numpy
30
- Provides-Extra: pyarrow
31
- Requires-Dist: pyarrow; extra == "pyarrow"
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: hopeit.engine>=0.26.0rc0
28
+ Requires-Dist: hopeit.fs-storage>=0.26.0rc0
29
+ Requires-Dist: pandas>=2.2.3
30
+ Requires-Dist: numpy>=1.26.4
32
31
 
33
32
  # hopeit.engine dataframes plugin
34
33
 
@@ -4,10 +4,21 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "hopeit.dataframes"
7
- description = "Hopeit Engine Dataframes Toolkit"
8
- dynamic = ["version", "readme", "dependencies", "optional-dependencies"]
7
+ version = "0.26.0rc0"
8
+
9
+ description = "Hopeit Engine Dataframes for Pandas"
10
+ dynamic = ["readme"]
11
+
12
+ dependencies = [
13
+ "hopeit.engine>=0.26.0rc0",
14
+ "hopeit.fs-storage>=0.26.0rc0",
15
+ "pandas>=2.2.3",
16
+ "numpy>=1.26.4"
17
+ ]
18
+
9
19
  license = { text = "Apache 2" }
10
20
  authors = [
21
+ { name = "Leo Smerling & Pablo Canto", email = "contact@hopeit.com.ar" },
11
22
  { name = "Leo Smerling", email = "contact@hopeit.com.ar" },
12
23
  { name = "Pablo Canto", email = "contact@hopeit.com.ar" },
13
24
  ]
@@ -15,10 +26,11 @@ classifiers = [
15
26
  "License :: OSI Approved :: Apache Software License",
16
27
  "Intended Audience :: Developers",
17
28
  "Programming Language :: Python",
18
- "Programming Language :: Python :: 3.9",
19
29
  "Programming Language :: Python :: 3.10",
20
30
  "Programming Language :: Python :: 3.11",
21
- "Development Status :: 4 - Beta",
31
+ "Programming Language :: Python :: 3.12",
32
+ "Programming Language :: Python :: 3.13",
33
+ "Development Status :: 5 - Production/Stable",
22
34
  "Operating System :: POSIX :: Linux",
23
35
  "Operating System :: MacOS :: MacOS X",
24
36
  "Operating System :: Microsoft :: Windows",
@@ -26,8 +38,6 @@ classifiers = [
26
38
  "Topic :: Software Development :: Libraries :: Python Modules",
27
39
  "Framework :: AsyncIO",
28
40
  ]
29
- requires-python = ">=3.9"
30
-
31
41
 
32
42
  [project.urls]
33
43
  "Homepage" = "https://github.com/hopeit-git/hopeit.engine"
@@ -42,10 +52,5 @@ include-package-data = true
42
52
  [tool.setuptools.packages.find]
43
53
  where = ["src"]
44
54
 
45
- [tool.setuptools.package-data]
46
- "hopeit.dataframes" = ["py.typed"]
47
- "hopeit.dataframes.serialization" = ["py.typed"]
48
- "hopeit.dataframes.setup" = ["py.typed"]
49
-
50
55
  [tool.setuptools.dynamic]
51
- readme = { file = ["README.md"], content-type = "text/plain" }
56
+ readme = { file = ["README.md"], content-type = "text/markdown" }
@@ -77,12 +77,13 @@ from typing import Dict, Generic, Iterator, List, Type
77
77
  import numpy as np
78
78
  import pandas as pd
79
79
  from hopeit.dataframes.dataframe import DataFrameT, dataframe
80
+ from hopeit.dataframes.datablocks import DataBlocks
80
81
  from hopeit.dataframes.serialization.dataset import Dataset
81
82
  from hopeit.dataframes.serialization.settings import DatasetSerialization
82
83
  from hopeit.dataframes.setup.dataframes import register_serialization
83
84
  from hopeit.dataobjects import DataObject
84
85
 
85
- __all__ = ["DataFrames", "Dataset", "dataframe"]
86
+ __all__ = ["DataBlocks", "DataFrames", "Dataset", "dataframe"]
86
87
 
87
88
 
88
89
  class DataFrames(Generic[DataFrameT, DataObject]):
@@ -0,0 +1,140 @@
1
+ from typing import Generic, Optional, Type, TypeVar, get_args, get_origin
2
+
3
+ import pandas as pd
4
+ from hopeit.dataobjects import fields
5
+
6
+ from hopeit.dataframes.serialization.dataset import Dataset, DatasetLoadError, find_dataframe_type
7
+
8
+ DataBlockType = TypeVar("DataBlockType")
9
+ DataBlockItemType = TypeVar("DataBlockItemType")
10
+ DataFrameType = TypeVar("DataFrameType")
11
+
12
+
13
+ class TempDataBlock(Generic[DataBlockType, DataBlockItemType]):
14
+ def __init__(self, datatype: Type[DataBlockType], df: pd.DataFrame):
15
+ self.datatype = datatype
16
+ self.df = df
17
+
18
+ @classmethod
19
+ def from_dataobjects(
20
+ cls, datatype: Type[DataBlockType], items: list[DataBlockItemType]
21
+ ) -> "TempDataBlock[DataBlockType, DataBlockItemType]":
22
+ result_df: Optional[pd.DataFrame] = None
23
+ for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
24
+ if get_origin(field_info.annotation) is Dataset:
25
+ block_items = (getattr(item, field_name) for item in items)
26
+ block_type = get_args(field_info.annotation)[0]
27
+ block = block_type._from_dataobjects(block_items)
28
+ block_df = block._df
29
+ else:
30
+ block_df = pd.DataFrame({field_name: [getattr(item, field_name) for item in items]})
31
+
32
+ if result_df is None:
33
+ result_df = block_df
34
+ else:
35
+ # Skips duplicated column names to they are included only once
36
+ result_df = result_df.join(
37
+ block_df[[col for col in block_df.columns if col not in result_df.columns]]
38
+ )
39
+ assert result_df is not None
40
+ return cls(datatype, result_df)
41
+
42
+ def to_dataobjects(
43
+ self, item_type: Type[DataBlockItemType], *, normalize_null_values: bool = False
44
+ ) -> list[DataBlockItemType]:
45
+ keys: list[str] = []
46
+ entries: list[list] = []
47
+ for field_name, field_info in fields(self.datatype).items(): # type: ignore[type-var]
48
+ if get_origin(field_info.annotation) is Dataset:
49
+ block_type = get_args(field_info.annotation)[0]
50
+ keys.append(field_name)
51
+ dataframe = block_type._from_df(self.df)
52
+ entries.append(
53
+ dataframe._to_dataobjects(normalize_null_values=normalize_null_values)
54
+ )
55
+ else:
56
+ keys.append(field_name)
57
+ entries.append(self.df[field_name].to_list())
58
+
59
+ return [
60
+ item_type(**{field_name: entry[i] for i, field_name in enumerate(keys)})
61
+ for entry in zip(*entries)
62
+ ]
63
+
64
+
65
+ class DataBlocks(Generic[DataBlockType, DataFrameType]):
66
+ @classmethod
67
+ async def df(cls, datablock: DataBlockType, select: Optional[list[str]] = None) -> pd.DataFrame:
68
+ keys = [
69
+ field_name
70
+ for field_name, field_info in fields(datablock).items() # type: ignore[arg-type]
71
+ if get_origin(field_info.annotation) is Dataset
72
+ and (select is None or field_name in select)
73
+ ]
74
+
75
+ # Filter/validate selected field names using saved schema,
76
+ # generates a single field for every common/duplicated field in the datasets
77
+ field_names = list(
78
+ dict.fromkeys(
79
+ [
80
+ field_name
81
+ for key in keys
82
+ for field_name in getattr(datablock, key).schema["properties"].keys()
83
+ ]
84
+ )
85
+ )
86
+
87
+ # Load data from first dataset (datablock uses a single file for all datasets)
88
+ dataset: Dataset = getattr(datablock, keys[0])
89
+ result_df = await DataBlocks._load_datablock_df(dataset, field_names)
90
+
91
+ # Add missing optional fields using class schema (allows schema evolution)
92
+ cls._adapt_to_schema(datablock, keys, result_df)
93
+
94
+ # Adding constant value fields
95
+ for field_name, field_info in fields(datablock).items(): # type: ignore[arg-type]
96
+ if get_origin(field_info.annotation) is not Dataset:
97
+ result_df[field_name] = getattr(datablock, field_name) # type: ignore[index]
98
+
99
+ return result_df
100
+
101
+ @staticmethod
102
+ async def from_df(
103
+ datatype: Type[DataBlockType],
104
+ df: pd.DataFrame,
105
+ **kwargs, # Non-Dataset field values for DataBlockType
106
+ ) -> DataBlockType:
107
+ blocks = {}
108
+ block_dataset = await Dataset._save_df(df, datatype)
109
+ for field_name, field_info in fields(datatype).items(): # type: ignore[type-var]
110
+ if get_origin(field_info.annotation) is Dataset:
111
+ block_type = get_args(field_info.annotation)[0]
112
+ blocks[field_name] = block_dataset._adapt(block_type)
113
+ else:
114
+ blocks[field_name] = kwargs[field_name]
115
+
116
+ return datatype(**blocks)
117
+
118
+ @staticmethod
119
+ def default(datatype: Type[DataBlockType]) -> DataBlockType:
120
+ return datatype(**{field_name: [] for field_name in list(fields(datatype))}) # type: ignore[type-var]
121
+
122
+ @staticmethod
123
+ async def _load_datablock_df(
124
+ dataset: Dataset, columns: Optional[list[str]] = None
125
+ ) -> pd.DataFrame:
126
+ try:
127
+ return await dataset._load_df(columns)
128
+ except (RuntimeError, IOError, KeyError) as e:
129
+ raise DatasetLoadError(
130
+ f"Error {type(e).__name__}: {e} loading datablock of type {dataset.datatype} "
131
+ f"at location {dataset.partition_key}/{dataset.key}"
132
+ ) from e
133
+
134
+ @classmethod
135
+ def _adapt_to_schema(cls, datablock: DataBlockType, keys: list[str], df: pd.DataFrame):
136
+ for key in keys:
137
+ datatype = find_dataframe_type(getattr(datablock, key).datatype) # type: ignore[var-annotated]
138
+ valid_df = datatype._from_df(df)._df
139
+ for col in valid_df.columns:
140
+ df[col] = valid_df[col]
@@ -0,0 +1,93 @@
1
+ """Dataset objects definition, used as a result of serialized dataframes"""
2
+
3
+ from importlib import import_module
4
+ from typing import Any, Dict, Generic, Optional, Type, TypeVar
5
+
6
+ from hopeit.dataobjects import dataclass, dataobject, field
7
+ import pandas as pd
8
+ from pydantic import TypeAdapter
9
+
10
+ DataFrameT = TypeVar("DataFrameT")
11
+ GenericDataFrameT = TypeVar("GenericDataFrameT")
12
+
13
+
14
+ class DatasetLoadError(Exception):
15
+ pass
16
+
17
+
18
+ class DatasetConvertError(Exception):
19
+ pass
20
+
21
+
22
+ @dataobject
23
+ @dataclass
24
+ class Dataset(Generic[DataFrameT]):
25
+ """Persisted representation of a @dataframe object"""
26
+
27
+ protocol: str
28
+ partition_key: str
29
+ key: str
30
+ datatype: str
31
+ schema: Dict[str, Any] = field(default_factory=dict)
32
+
33
+ @classmethod
34
+ async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
35
+ return await cls.__storage.save(dataframe) # type: ignore[attr-defined]
36
+
37
+ async def load(self) -> DataFrameT:
38
+ try:
39
+ df = await self._load_df()
40
+ return self._convert(df)
41
+ except (RuntimeError, IOError, KeyError) as e:
42
+ raise DatasetLoadError(
43
+ f"Error {type(e).__name__}: {e} loading dataset of type {self.datatype} "
44
+ f"at location {self.partition_key}/{self.key}"
45
+ ) from e
46
+
47
+ async def _load_df(self, columns: Optional[list[str]] = None) -> pd.DataFrame:
48
+ return await self.__storage.load_df(self, columns) # type: ignore[attr-defined]
49
+
50
+ def _convert(self, df: pd.DataFrame) -> DataFrameT:
51
+ """Converts loaded pandas Dataframe to @dataframe annotated object using Dataset metadata"""
52
+ datatype: Type[DataFrameT] = find_dataframe_type(self.datatype)
53
+ return datatype._from_df(df) # type: ignore[attr-defined]
54
+
55
+ def _adapt(self, datatype: DataFrameT) -> "Dataset[DataFrameT]":
56
+ """Adapts a more generic dataset that contains combined fields to be type specific"""
57
+ return Dataset(
58
+ protocol=self.protocol,
59
+ partition_key=self.partition_key,
60
+ key=self.key,
61
+ datatype=f"{datatype.__module__}.{datatype.__qualname__}", # type: ignore[attr-defined]
62
+ schema=TypeAdapter(datatype).json_schema(),
63
+ )
64
+
65
+ @classmethod
66
+ async def _save_df(
67
+ cls, df: pd.DataFrame, datatype: Type[GenericDataFrameT]
68
+ ) -> "Dataset[GenericDataFrameT]":
69
+ return await cls.__storage.save_df(df, datatype) # type: ignore[attr-defined]
70
+
71
+
72
+ def find_protocol_impl(qual_type_name: str) -> Type:
73
+ mod_name, type_name = (
74
+ ".".join(qual_type_name.split(".")[:-1]),
75
+ qual_type_name.split(".")[-1],
76
+ )
77
+ module = import_module(mod_name)
78
+ datatype = getattr(module, type_name)
79
+ return datatype
80
+
81
+
82
+ def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
83
+ """Returns dataframe class based on type name used during serialization"""
84
+ mod_name, type_name = (
85
+ ".".join(qual_type_name.split(".")[:-1]),
86
+ qual_type_name.split(".")[-1],
87
+ )
88
+ module = import_module(mod_name)
89
+ datatype = getattr(module, type_name)
90
+ assert hasattr(datatype, "__dataframe__"), (
91
+ f"Type {qual_type_name} must be annotated with `@dataframe`."
92
+ )
93
+ return datatype
@@ -1,7 +1,6 @@
1
1
  """Support for `@dataframes` serialization to files"""
2
2
 
3
3
  import io
4
- from importlib import import_module
5
4
  from typing import Generic, Optional, Type, TypeVar
6
5
  from uuid import uuid4
7
6
 
@@ -18,7 +17,7 @@ except ImportError as e:
18
17
 
19
18
  from hopeit.dataframes.dataframe import DataFrameMixin
20
19
  from hopeit.dataframes.serialization.dataset import Dataset
21
- from hopeit.dataobjects import EventPayloadType
20
+ from hopeit.dataobjects import DataObject
22
21
  from hopeit.fs_storage import FileStorage
23
22
 
24
23
  DataFrameT = TypeVar("DataFrameT", bound=DataFrameMixin)
@@ -57,25 +56,30 @@ class DatasetFileStorage(Generic[DataFrameT]):
57
56
  schema=TypeAdapter(datatype).json_schema(),
58
57
  )
59
58
 
60
- async def load(self, dataset: Dataset) -> EventPayloadType:
61
- """Loads @dataframe annotated object using Dataset metadata"""
62
- datatype: Type[DataFrameT] = find_dataframe_type(dataset.datatype)
59
+ async def save_df(self, df: pd.DataFrame, datatype: Type[DataObject]) -> Dataset:
60
+ """Saves pandas df object as parquet to file system
61
+ and returns Dataset metadata to be used when retrieval
62
+ is handled externally
63
+ """
64
+ key = f"{datatype.__qualname__.lower()}_{uuid4()}.parquet"
65
+ data = io.BytesIO(
66
+ df.to_parquet( # pylint: disable=protected-access
67
+ engine="pyarrow"
68
+ )
69
+ )
70
+ location = await self.storage.store_file(file_name=key, value=data)
71
+ partition_key = self.storage.partition_key(location)
72
+
73
+ return Dataset(
74
+ protocol=f"{__name__}.{type(self).__name__}",
75
+ partition_key=partition_key,
76
+ key=key,
77
+ datatype=f"{datatype.__module__}.{datatype.__qualname__}",
78
+ schema=TypeAdapter(datatype).json_schema(),
79
+ )
80
+
81
+ async def load_df(self, dataset: Dataset, columns: Optional[list[str]] = None) -> pd.DataFrame:
63
82
  data = await self.storage.get_file(dataset.key, partition_key=dataset.partition_key)
64
83
  if data is None:
65
84
  raise FileNotFoundError(dataset.key)
66
- df = pd.read_parquet(io.BytesIO(data), engine="pyarrow")
67
- return datatype._from_df(df) # pylint: disable=protected-access
68
-
69
-
70
- def find_dataframe_type(qual_type_name: str) -> Type[DataFrameT]:
71
- """Returns dataframe class based on type name used during serialization"""
72
- mod_name, type_name = (
73
- ".".join(qual_type_name.split(".")[:-1]),
74
- qual_type_name.split(".")[-1],
75
- )
76
- module = import_module(mod_name)
77
- datatype = getattr(module, type_name)
78
- assert hasattr(
79
- datatype, "__dataframe__"
80
- ), f"Type {qual_type_name} must be annotated with `@dataframe`."
81
- return datatype
85
+ return pd.read_parquet(io.BytesIO(data), engine="pyarrow", columns=columns)
@@ -1,8 +1,8 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: hopeit.dataframes
3
- Version: 0.25.3
4
- Summary: Hopeit Engine Dataframes Toolkit
5
- Author-email: Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
3
+ Version: 0.26.0rc0
4
+ Summary: Hopeit Engine Dataframes for Pandas
5
+ Author-email: Leo Smerling & Pablo Canto <contact@hopeit.com.ar>, Leo Smerling <contact@hopeit.com.ar>, Pablo Canto <contact@hopeit.com.ar>
6
6
  License: Apache 2
7
7
  Project-URL: Homepage, https://github.com/hopeit-git/hopeit.engine
8
8
  Project-URL: CI: GitHub Actions, https://github.com/hopeit-git/hopeit.engine/actions?query=workflow
@@ -12,23 +12,22 @@ Project-URL: GitHub: repo, https://github.com/hopeit-git/hopeit.engine
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
13
  Classifier: Intended Audience :: Developers
14
14
  Classifier: Programming Language :: Python
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
- Classifier: Development Status :: 4 - Beta
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Development Status :: 5 - Production/Stable
19
20
  Classifier: Operating System :: POSIX :: Linux
20
21
  Classifier: Operating System :: MacOS :: MacOS X
21
22
  Classifier: Operating System :: Microsoft :: Windows
22
23
  Classifier: Topic :: Internet :: WWW/HTTP
23
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
25
  Classifier: Framework :: AsyncIO
25
- Requires-Python: >=3.9
26
- Description-Content-Type: text/plain
27
- Requires-Dist: hopeit.engine[fs-storage]==0.25.3
28
- Requires-Dist: pandas
29
- Requires-Dist: numpy
30
- Provides-Extra: pyarrow
31
- Requires-Dist: pyarrow; extra == "pyarrow"
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: hopeit.engine>=0.26.0rc0
28
+ Requires-Dist: hopeit.fs-storage>=0.26.0rc0
29
+ Requires-Dist: pandas>=2.2.3
30
+ Requires-Dist: numpy>=1.26.4
32
31
 
33
32
  # hopeit.engine dataframes plugin
34
33
 
@@ -1,12 +1,12 @@
1
1
  README.md
2
2
  pyproject.toml
3
- setup.py
4
3
  src/hopeit.dataframes.egg-info/PKG-INFO
5
4
  src/hopeit.dataframes.egg-info/SOURCES.txt
6
5
  src/hopeit.dataframes.egg-info/dependency_links.txt
7
6
  src/hopeit.dataframes.egg-info/requires.txt
8
7
  src/hopeit.dataframes.egg-info/top_level.txt
9
8
  src/hopeit/dataframes/__init__.py
9
+ src/hopeit/dataframes/datablocks.py
10
10
  src/hopeit/dataframes/dataframe.py
11
11
  src/hopeit/dataframes/py.typed
12
12
  src/hopeit/dataframes/serialization/__init__.py
@@ -0,0 +1,4 @@
1
+ hopeit.engine>=0.26.0rc0
2
+ hopeit.fs-storage>=0.26.0rc0
3
+ pandas>=2.2.3
4
+ numpy>=1.26.4
@@ -1,25 +0,0 @@
1
- from setuptools import setup
2
- from os import environ
3
-
4
- version = {}
5
- try:
6
- with open("../../../engine/src/hopeit/server/version.py") as fp:
7
- exec(fp.read(), version)
8
- ENGINE_VERSION = version["ENGINE_VERSION"]
9
- except FileNotFoundError:
10
- ENGINE_VERSION = environ.get("ENGINE_VERSION")
11
-
12
- if not ENGINE_VERSION:
13
- raise RuntimeError("ENGINE_VERSION is not specified.")
14
-
15
- setup(
16
- version=ENGINE_VERSION,
17
- install_requires=[
18
- f"hopeit.engine[fs-storage]=={ENGINE_VERSION}",
19
- "pandas",
20
- "numpy",
21
- ],
22
- extras_require={
23
- "pyarrow": ["pyarrow"],
24
- },
25
- )
@@ -1,48 +0,0 @@
1
- """Dataset objects definition, used as a result of serialized dataframes"""
2
-
3
- from importlib import import_module
4
- from typing import Any, Dict, Generic, Type, TypeVar
5
-
6
- from hopeit.dataobjects import dataclass, dataobject, field
7
-
8
- DataFrameT = TypeVar("DataFrameT")
9
-
10
-
11
- class DatasetLoadError(Exception):
12
- pass
13
-
14
-
15
- @dataobject
16
- @dataclass
17
- class Dataset(Generic[DataFrameT]):
18
- """Persisted representation of a @dataframe object"""
19
-
20
- protocol: str
21
- partition_key: str
22
- key: str
23
- datatype: str
24
- schema: Dict[str, Any] = field(default_factory=dict)
25
-
26
- async def load(self) -> DataFrameT:
27
- try:
28
- dataframe = await self.__storage.load(self) # type: ignore[attr-defined]
29
- return dataframe
30
- except (RuntimeError, IOError, KeyError) as e:
31
- raise DatasetLoadError(
32
- f"Error {type(e).__name__}: {e} loading dataset of type {self.datatype} "
33
- f"at location {self.partition_key}/{self.key}"
34
- ) from e
35
-
36
- @classmethod
37
- async def save(cls, dataframe: DataFrameT) -> "Dataset[DataFrameT]":
38
- return await cls.__storage.save(dataframe) # type: ignore[attr-defined]
39
-
40
-
41
- def find_protocol_impl(qual_type_name: str) -> Type:
42
- mod_name, type_name = (
43
- ".".join(qual_type_name.split(".")[:-1]),
44
- qual_type_name.split(".")[-1],
45
- )
46
- module = import_module(mod_name)
47
- datatype = getattr(module, type_name)
48
- return datatype
@@ -1,6 +0,0 @@
1
- hopeit.engine[fs-storage]==0.25.3
2
- pandas
3
- numpy
4
-
5
- [pyarrow]
6
- pyarrow