workbench 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- workbench/algorithms/dataframe/proximity.py +11 -4
- workbench/api/__init__.py +2 -1
- workbench/api/df_store.py +17 -108
- workbench/api/feature_set.py +48 -11
- workbench/api/model.py +1 -1
- workbench/api/parameter_store.py +3 -52
- workbench/core/artifacts/__init__.py +11 -2
- workbench/core/artifacts/artifact.py +5 -5
- workbench/core/artifacts/df_store_core.py +114 -0
- workbench/core/artifacts/endpoint_core.py +261 -78
- workbench/core/artifacts/feature_set_core.py +69 -1
- workbench/core/artifacts/model_core.py +48 -14
- workbench/core/artifacts/parameter_store_core.py +98 -0
- workbench/core/transforms/features_to_model/features_to_model.py +50 -33
- workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
- workbench/core/views/view.py +2 -2
- workbench/model_scripts/chemprop/chemprop.template +933 -0
- workbench/model_scripts/chemprop/generated_model_script.py +933 -0
- workbench/model_scripts/chemprop/requirements.txt +11 -0
- workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
- workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
- workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
- workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
- workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
- workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
- workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
- workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
- workbench/model_scripts/pytorch_model/generated_model_script.py +365 -173
- workbench/model_scripts/pytorch_model/pytorch.template +362 -170
- workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
- workbench/model_scripts/script_generation.py +10 -7
- workbench/model_scripts/uq_models/generated_model_script.py +43 -27
- workbench/model_scripts/uq_models/mapie.template +40 -24
- workbench/model_scripts/xgb_model/generated_model_script.py +36 -7
- workbench/model_scripts/xgb_model/xgb_model.template +36 -7
- workbench/repl/workbench_shell.py +14 -5
- workbench/resources/open_source_api.key +1 -1
- workbench/scripts/endpoint_test.py +162 -0
- workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
- workbench/utils/chemprop_utils.py +761 -0
- workbench/utils/pytorch_utils.py +527 -0
- workbench/utils/xgboost_model_utils.py +10 -5
- workbench/web_interface/components/model_plot.py +7 -1
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/METADATA +3 -3
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/RECORD +49 -43
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +2 -1
- workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
- workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
- workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
- workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
workbench/algorithms/dataframe/proximity.py
CHANGED

@@ -69,6 +69,7 @@ class Proximity:
         top_percent: float = 1.0,
         min_delta: Optional[float] = None,
         k_neighbors: int = 4,
+        only_coincident: bool = False,
     ) -> pd.DataFrame:
         """
        Find compounds with steep target gradients (data quality issues and activity cliffs).
@@ -81,6 +82,7 @@ class Proximity:
             top_percent: Percentage of compounds with steepest gradients to return (e.g., 1.0 = top 1%)
             min_delta: Minimum absolute target difference to consider. If None, defaults to target_range/100
             k_neighbors: Number of neighbors to use for median calculation (default: 4)
+            only_coincident: If True, only consider compounds that are coincident (default: False)
 
         Returns:
             DataFrame of compounds with steepest gradients, sorted by gradient (descending)
@@ -99,10 +101,15 @@ class Proximity:
             min_delta = self.target_range / 100.0 if self.target_range > 0 else 0.0
         candidates = candidates[candidates["nn_target_diff"] >= min_delta]
 
-        #
-
-
-
+        # Filter based on mode
+        if only_coincident:
+            # Only keep coincident points (nn_distance ~= 0)
+            candidates = candidates[candidates["nn_distance"] < epsilon].copy()
+        else:
+            # Get top X% by initial gradient
+            percentile = 100 - top_percent
+            threshold = np.percentile(candidates["gradient"], percentile)
+            candidates = candidates[candidates["gradient"] >= threshold].copy()
 
         # Phase 2: Verify with k-neighbor median to filter out cases where nearest neighbor is the outlier
         results = []
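The new `only_coincident` flag switches Phase 1 between two filters: keep only coincident points, or keep the steepest-gradient tail. A minimal sketch of that branch, assuming a `candidates` DataFrame that already carries the `nn_distance` and `gradient` columns from Phase 1 (column names are from the hunk above; the sample data and `epsilon` value are made up):

```python
import numpy as np
import pandas as pd

# Made-up candidates; the real ones come from the Proximity model's Phase 1
candidates = pd.DataFrame({
    "nn_distance": [0.0, 0.0, 0.8, 1.5],
    "gradient": [5.2, 3.1, 9.7, 0.4],
})
epsilon = 1e-8         # assumed tolerance for "coincident" points
only_coincident = False
top_percent = 1.0      # keep the top 1% by gradient

if only_coincident:
    # Only keep coincident points (nn_distance ~= 0)
    candidates = candidates[candidates["nn_distance"] < epsilon].copy()
else:
    # Keep the top X% by initial gradient
    threshold = np.percentile(candidates["gradient"], 100 - top_percent)
    candidates = candidates[candidates["gradient"] >= threshold].copy()

print(candidates)  # survivors then go to the k-neighbor median check (Phase 2)
```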
workbench/api/__init__.py
CHANGED

@@ -14,7 +14,7 @@ These class provide high-level APIs for the Workbench package, offering easy acc
 
 from .data_source import DataSource
 from .feature_set import FeatureSet
-from .model import Model, ModelType
+from .model import Model, ModelType, ModelFramework
 from .endpoint import Endpoint
 from .meta import Meta
 from .parameter_store import ParameterStore
@@ -25,6 +25,7 @@ __all__ = [
     "FeatureSet",
     "Model",
     "ModelType",
+    "ModelFramework",
     "Endpoint",
     "Meta",
     "ParameterStore",
workbench/api/df_store.py
CHANGED

@@ -1,35 +1,32 @@
 """DFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
 
-from datetime import datetime
 from typing import Union
-import logging
-import pandas as pd
 
 # Workbench Imports
-from workbench.core.cloud_platform.aws.aws_df_store import AWSDFStore
+from workbench.core.artifacts.df_store_core import DFStoreCore
 
 
-class DFStore(AWSDFStore):
+class DFStore(DFStoreCore):
     """DFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy
 
-
-
-
+    Common Usage:
+        ```python
+        df_store = DFStore()
 
-
-
+        # List Data
+        df_store.list()
 
-
-
-
+        # Add DataFrame
+        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        df_store.upsert("/test/my_data", df)
 
-
-
-
+        # Retrieve DataFrame
+        df = df_store.get("/test/my_data")
+        print(df)
 
-
-
-
+        # Delete Data
+        df_store.delete("/test/my_data")
+        ```
     """
 
     def __init__(self, path_prefix: Union[str, None] = None):
@@ -38,101 +35,13 @@ class DFStore(AWSDFStore):
         Args:
             path_prefix (Union[str, None], optional): Add a path prefix to storage locations (Defaults to None)
         """
-        self.log = logging.getLogger("workbench")
-
-        # Initialize the SuperClass
         super().__init__(path_prefix=path_prefix)
 
-    def list(self, include_cache: bool = False) -> list:
-        """List all the objects in the data_store prefix.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the list (Defaults to False).
-
-        Returns:
-            list: A list of all the objects in the data_store prefix.
-        """
-        return super().list(include_cache=include_cache)
-
-    def summary(self, include_cache: bool = False) -> pd.DataFrame:
-        """Return a nicely formatted summary of object locations, sizes (in MB), and modified dates.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the summary (Defaults to False).
-
-        Returns:
-            pd.DataFrame: A formatted DataFrame with the summary details.
-        """
-        return super().summary(include_cache=include_cache)
-
-    def details(self, include_cache: bool = False) -> pd.DataFrame:
-        """Return a DataFrame with detailed metadata for all objects in the data_store prefix.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the details (Defaults to False).
-
-        Returns:
-            pd.DataFrame: A DataFrame with detailed metadata for all objects in the data_store prefix.
-        """
-        return super().details(include_cache=include_cache)
-
-    def check(self, location: str) -> bool:
-        """Check if a DataFrame exists at the specified location
-
-        Args:
-            location (str): The location of the data to check.
-
-        Returns:
-            bool: True if the data exists, False otherwise.
-        """
-        return super().check(location)
-
-    def get(self, location: str) -> Union[pd.DataFrame, None]:
-        """Retrieve a DataFrame from AWS S3.
-
-        Args:
-            location (str): The location of the data to retrieve.
-
-        Returns:
-            pd.DataFrame: The retrieved DataFrame or None if not found.
-        """
-        _df = super().get(location)
-        if _df is None:
-            self.log.error(f"Dataframe not found at location: {location}")
-        return _df
-
-    def upsert(self, location: str, data: Union[pd.DataFrame, pd.Series]):
-        """Insert or update a DataFrame or Series in the AWS S3.
-
-        Args:
-            location (str): The location of the data.
-            data (Union[pd.DataFrame, pd.Series]): The data to be stored.
-        """
-        super().upsert(location, data)
-
-    def last_modified(self, location: str) -> Union[datetime, None]:
-        """Get the last modified date of the DataFrame at the specified location.
-
-        Args:
-            location (str): The location of the data to check.
-
-        Returns:
-            Union[datetime, None]: The last modified date of the DataFrame or None if not found.
-        """
-        return super().last_modified(location)
-
-    def delete(self, location: str):
-        """Delete a DataFrame from the AWS S3.
-
-        Args:
-            location (str): The location of the data to delete.
-        """
-        super().delete(location)
-
 
 if __name__ == "__main__":
     """Exercise the DFStore Class"""
     import time
+    import pandas as pd
 
     # Create a DFStore manager
     df_store = DFStore()
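Nearly all of the lines removed here are pass-through overrides that only called `super()`. Once `DFStore` inherits from `DFStoreCore` directly, Python's normal attribute lookup reaches the parent implementations, so the boilerplate can simply be deleted. A self-contained sketch of why that deletion is behavior-preserving (the `Base` class below is a stand-in, not the workbench API):

```python
class Base:
    def list(self, include_cache: bool = False) -> list:
        # Stand-in for the real storage-backed implementation
        return ["a", "b"] if include_cache else ["a"]

class StoreBefore(Base):
    # Pass-through wrapper of the kind this diff deletes
    def list(self, include_cache: bool = False) -> list:
        return super().list(include_cache=include_cache)

class StoreAfter(Base):
    # No override: attribute lookup falls through to Base.list
    pass

assert StoreBefore().list() == StoreAfter().list() == ["a"]
```

The one removed override with real behavior was `get()`, which also logged an error when nothing was found; presumably that logging now lives in `DFStoreCore` (whose `__init__`, shown later in this diff, sets up the same "workbench" logger) or was dropped.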
workbench/api/feature_set.py
CHANGED

@@ -12,7 +12,7 @@ import pandas as pd
 from workbench.core.artifacts.artifact import Artifact
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
 from workbench.core.transforms.features_to_model.features_to_model import FeaturesToModel
-from workbench.api.model import Model, ModelType
+from workbench.api.model import Model, ModelType, ModelFramework
 
 
 class FeatureSet(FeatureSetCore):
@@ -58,10 +58,7 @@ class FeatureSet(FeatureSetCore):
             include_aws_columns (bool): Include the AWS columns in the DataFrame (default: False)
 
         Returns:
-            pd.DataFrame: A DataFrame of
-
-        Note:
-            Obviously this is not recommended for large datasets :)
+            pd.DataFrame: A DataFrame of all the data from this FeatureSet up to the limit
         """
 
         # Get the table associated with the data
@@ -79,10 +76,11 @@ class FeatureSet(FeatureSetCore):
         self,
         name: str,
         model_type: ModelType,
+        model_framework: ModelFramework = ModelFramework.XGBOOST,
         tags: list = None,
         description: str = None,
         feature_list: list = None,
-        target_column: str = None,
+        target_column: Union[str, list[str]] = None,
         model_class: str = None,
         model_import_str: str = None,
         custom_script: Union[str, Path] = None,
@@ -98,11 +96,12 @@ class FeatureSet(FeatureSetCore):
 
             name (str): The name of the Model to create
             model_type (ModelType): The type of model to create (See workbench.model.ModelType)
+            model_framework (ModelFramework, optional): The framework to use for the model (default: XGBOOST)
             tags (list, optional): Set the tags for the model. If not given tags will be generated.
             description (str, optional): Set the description for the model. If not give a description is generated.
             feature_list (list, optional): Set the feature list for the model. If not given a feature list is generated.
-            target_column (str, optional):
-            model_class (str, optional): Model class to use (e.g. "KMeans",
+            target_column (str or list[str], optional): Target column(s) for the model (use None for unsupervised model)
+            model_class (str, optional): Model class to use (e.g. "KMeans", default: None)
             model_import_str (str, optional): The import for the model (e.g. "from sklearn.cluster import KMeans")
             custom_script (str, optional): The custom script to use for the model (default: None)
             training_image (str, optional): The training image to use (default: "training")
@@ -128,8 +127,8 @@ class FeatureSet(FeatureSetCore):
         # Create the Model Tags
         tags = [name] if tags is None else tags
 
-        # If the
-        if
+        # If the model framework is PyTorch or ChemProp, ensure we set the training and inference images
+        if model_framework in (ModelFramework.PYTORCH_TABULAR, ModelFramework.CHEMPROP):
             training_image = "pytorch_training"
             inference_image = "pytorch_inference"
 
@@ -138,6 +137,7 @@ class FeatureSet(FeatureSetCore):
             feature_name=self.name,
             model_name=name,
             model_type=model_type,
+            model_framework=model_framework,
             model_class=model_class,
             model_import_str=model_import_str,
             custom_script=custom_script,
@@ -154,6 +154,24 @@ class FeatureSet(FeatureSetCore):
         # Return the Model
         return Model(name)
 
+    def prox_model(self, target: str, features: list) -> "Proximity":  # noqa: F821
+        """Create a local Proximity Model for this Model
+
+        Args:
+            target (str): The target column name
+            features (list): The list of feature column names
+
+        Returns:
+            Proximity: A local Proximity Model
+        """
+        from workbench.algorithms.dataframe.proximity import Proximity  # noqa: F401 (avoid circular import)
+
+        # Create the Proximity Model from the full FeatureSet dataframe
+        full_df = self.pull_dataframe()
+
+        # Create and return the Proximity Model
+        return Proximity(full_df, self.id_column, features, target, track_columns=features)
+
 
 if __name__ == "__main__":
     """Exercise the FeatureSet Class"""
@@ -164,5 +182,24 @@ if __name__ == "__main__":
     pprint(my_features.summary())
     pprint(my_features.details())
 
+    # Pull the full DataFrame
+    df = my_features.pull_dataframe()
+    print(df.head())
+
+    # Create a Proximity Model from the FeatureSet
+    features = ["height", "weight", "age", "iq_score", "likes_dogs", "food"]
+    my_prox = my_features.prox_model(target="salary", features=features)
+    neighbors = my_prox.neighbors(42)
+    print("Neighbors for ID 42:")
+    print(neighbors)
+
     # Create a Model from the FeatureSet
-
+    """
+    my_model = my_features.to_model(
+        name="test-model",
+        model_type=ModelType.REGRESSOR,
+        target_column="salary",
+        feature_list=features
+    )
+    pprint(my_model.summary())
+    """
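`to_model()` now accepts a `model_framework` and a multi-column target. A hedged usage sketch, assuming a configured Workbench environment; the FeatureSet name and column names below are placeholders, not values from this diff:

```python
from workbench.api import FeatureSet, ModelType, ModelFramework

fs = FeatureSet("my_features")  # placeholder name

# New keyword argument; XGBOOST remains the default framework
model = fs.to_model(
    name="my-regression-model",
    model_type=ModelType.REGRESSOR,
    model_framework=ModelFramework.CHEMPROP,  # routes to the pytorch_training/pytorch_inference images
    target_column="my_target",                # may now also be a list[str]
)

# The new prox_model() helper builds a local Proximity model over the full FeatureSet
prox = fs.prox_model(target="my_target", features=["feat_1", "feat_2"])
```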
workbench/api/model.py
CHANGED

@@ -7,7 +7,7 @@ Dashboard UI, which provides additional model details and performance metrics
 
 # Workbench Imports
 from workbench.core.artifacts.artifact import Artifact
-from workbench.core.artifacts.model_core import ModelCore, ModelType  # noqa: F401
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework  # noqa: F401
 from workbench.core.transforms.model_to_endpoint.model_to_endpoint import ModelToEndpoint
 from workbench.api.endpoint import Endpoint
 from workbench.utils.model_utils import proximity_model_local, uq_model
workbench/api/parameter_store.py
CHANGED

@@ -1,13 +1,10 @@
 """ParameterStore: Manages Workbench parameters in a Cloud Based Parameter Store."""
 
-from typing import Union
-import logging
-
 # Workbench Imports
-from workbench.core.cloud_platform.aws.aws_parameter_store import AWSParameterStore
+from workbench.core.artifacts.parameter_store_core import ParameterStoreCore
 
 
-class ParameterStore(AWSParameterStore):
+class ParameterStore(ParameterStoreCore):
     """ParameterStore: Manages Workbench parameters in a Cloud Based Parameter Store.
 
     Common Usage:
@@ -43,56 +40,10 @@ class ParameterStore(AWSParameterStore):
 
     def __init__(self):
         """ParameterStore Init Method"""
-        self.log = logging.getLogger("workbench")
 
-        # Initialize
+        # Initialize parent class
         super().__init__()
 
-    def list(self, prefix: str = None) -> list:
-        """List all parameters in the AWS Parameter Store, optionally filtering by a prefix.
-
-        Args:
-            prefix (str, optional): A prefix to filter the parameters by. Defaults to None.
-
-        Returns:
-            list: A list of parameter names and details.
-        """
-        return super().list(prefix=prefix)
-
-    def get(self, name: str, warn: bool = True, decrypt: bool = True) -> Union[str, list, dict, None]:
-        """Retrieve a parameter value from the AWS Parameter Store.
-
-        Args:
-            name (str): The name of the parameter to retrieve.
-            warn (bool): Whether to log a warning if the parameter is not found.
-            decrypt (bool): Whether to decrypt secure string parameters.
-
-        Returns:
-            Union[str, list, dict, None]: The value of the parameter or None if not found.
-        """
-        return super().get(name=name, warn=warn, decrypt=decrypt)
-
-    def upsert(self, name: str, value):
-        """Insert or update a parameter in the AWS Parameter Store.
-
-        Args:
-            name (str): The name of the parameter.
-            value (str | list | dict): The value of the parameter.
-        """
-        super().upsert(name=name, value=value)
-
-    def delete(self, name: str):
-        """Delete a parameter from the AWS Parameter Store.
-
-        Args:
-            name (str): The name of the parameter to delete.
-        """
-        super().delete(name=name)
-
-    def __repr__(self):
-        """Return a string representation of the ParameterStore object."""
-        return super().__repr__()
-
 
 if __name__ == "__main__":
     """Exercise the ParameterStore Class"""
|
@@ -15,7 +15,16 @@ from .artifact import Artifact
|
|
|
15
15
|
from .athena_source import AthenaSource
|
|
16
16
|
from .data_source_abstract import DataSourceAbstract
|
|
17
17
|
from .feature_set_core import FeatureSetCore
|
|
18
|
-
from .model_core import ModelCore, ModelType
|
|
18
|
+
from .model_core import ModelCore, ModelType, ModelFramework
|
|
19
19
|
from .endpoint_core import EndpointCore
|
|
20
20
|
|
|
21
|
-
__all__ = [
|
|
21
|
+
__all__ = [
|
|
22
|
+
"Artifact",
|
|
23
|
+
"AthenaSource",
|
|
24
|
+
"DataSourceAbstract",
|
|
25
|
+
"FeatureSetCore",
|
|
26
|
+
"ModelCore",
|
|
27
|
+
"ModelType",
|
|
28
|
+
"ModelFramework",
|
|
29
|
+
"EndpointCore",
|
|
30
|
+
]
|
|
workbench/core/artifacts/artifact.py
CHANGED

@@ -8,8 +8,8 @@ from typing import Union
 
 # Workbench Imports
 from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
-from workbench.core.cloud_platform.aws.aws_parameter_store import AWSParameterStore
-from workbench.core.cloud_platform.aws.aws_df_store import AWSDFStore
+from workbench.core.artifacts.parameter_store_core import ParameterStoreCore
+from workbench.core.artifacts.df_store_core import DFStoreCore
 from workbench.utils.aws_utils import dict_to_aws_tags
 from workbench.utils.config_manager import ConfigManager, FatalConfigError
 from workbench.core.cloud_platform.cloud_meta import CloudMeta
@@ -48,11 +48,11 @@ class Artifact(ABC):
     tag_delimiter = "::"
 
     # Grab our Dataframe Cache Storage
-    df_cache =
+    df_cache = DFStoreCore(path_prefix="/workbench/dataframe_cache")
 
     # Artifact may want to use the Parameter Store or Dataframe Store
-    param_store =
-    df_store =
+    param_store = ParameterStoreCore()
+    df_store = DFStoreCore()
 
     def __init__(self, name: str, use_cached_meta: bool = False):
         """Initialize the Artifact Base Class
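`df_cache`, `param_store`, and `df_store` are class attributes on `Artifact`, so they are constructed once when the class body executes and shared by every artifact subclass. A minimal illustration of that sharing, using stand-in classes rather than the workbench ones:

```python
class Store:
    instances = 0

    def __init__(self) -> None:
        Store.instances += 1

class Artifact:
    # Constructed once, at class-definition time
    shared_store = Store()

class ModelArtifact(Artifact):
    pass

class EndpointArtifact(Artifact):
    pass

# Both subclasses see the same object; no extra Store was built
assert ModelArtifact.shared_store is EndpointArtifact.shared_store
assert Store.instances == 1
```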
workbench/core/artifacts/df_store_core.py
ADDED

@@ -0,0 +1,114 @@
+"""DFStoreCore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
+
+import logging
+from typing import Union
+
+# Workbench Imports
+from workbench.utils.config_manager import ConfigManager
+from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+
+# Workbench Bridges Import
+from workbench_bridges.api import DFStore as BridgesDFStore
+
+
+class DFStoreCore(BridgesDFStore):
+    """DFStoreCore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy
+
+    Common Usage:
+        ```python
+        df_store = DFStoreCore()
+
+        # List Data
+        df_store.list()
+
+        # Add DataFrame
+        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        df_store.upsert("/test/my_data", df)
+
+        # Retrieve DataFrame
+        df = df_store.get("/test/my_data")
+        print(df)
+
+        # Delete Data
+        df_store.delete("/test/my_data")
+        ```
+    """
+
+    def __init__(self, path_prefix: Union[str, None] = None):
+        """DFStoreCore Init Method
+
+        Args:
+            path_prefix (Union[str, None], optional): Add a path prefix to storage locations (Defaults to None)
+        """
+        # Get config from workbench's systems
+        bucket = ConfigManager().get_config("WORKBENCH_BUCKET")
+        session = AWSAccountClamp().boto3_session
+
+        # Initialize parent with workbench config
+        super().__init__(path_prefix=path_prefix, s3_bucket=bucket, boto3_session=session)
+        self.log = logging.getLogger("workbench")
+
+
+if __name__ == "__main__":
+    """Exercise the DFStoreCore Class"""
+    import time
+    import pandas as pd
+
+    # Create a DFStoreCore manager
+    df_store = DFStoreCore()
+
+    # Details of the Dataframe Store
+    print("Detailed Data...")
+    print(df_store.details())
+
+    # Add a new DataFrame
+    my_df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+    df_store.upsert("/testing/test_data", my_df)
+
+    # Get the DataFrame
+    print(f"Getting data 'test_data':\n{df_store.get('/testing/test_data')}")
+
+    # Now let's test adding a Series
+    series = pd.Series([1, 2, 3, 4], name="Series")
+    df_store.upsert("/testing/test_series", series)
+    print(f"Getting data 'test_series':\n{df_store.get('/testing/test_series')}")
+
+    # Summary of the data
+    print("Summary Data...")
+    print(df_store.summary())
+
+    # Repr of the DFStoreCore object
+    print("DFStoreCore Object:")
+    print(df_store)
+
+    # Check if the data exists
+    print("Check if data exists...")
+    print(df_store.check("/testing/test_data"))
+    print(df_store.check("/testing/test_series"))
+
+    # Time the check
+    start_time = time.time()
+    print(df_store.check("/testing/test_data"))
+    print("--- Check %s seconds ---" % (time.time() - start_time))
+
+    # Now delete the test data
+    df_store.delete("/testing/test_data")
+    df_store.delete("/testing/test_series")
+
+    # Check if the data exists
+    print("Check if data exists...")
+    print(df_store.check("/testing/test_data"))
+    print(df_store.check("/testing/test_series"))
+
+    # Add a bunch of dataframes and then test recursive delete
+    for i in range(10):
+        df_store.upsert(f"/testing/data_{i}", pd.DataFrame({"A": [1, 2], "B": [3, 4]}))
+    print("Before Recursive Delete:")
+    print(df_store.summary())
+    df_store.delete_recursive("/testing")
+    print("After Recursive Delete:")
+    print(df_store.summary())
+
+    # Get a non-existent DataFrame
+    print("Getting non-existent data...")
+    print(df_store.get("/testing/no_where"))