workbench-0.8.162-py3-none-any.whl → workbench-0.8.220-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of workbench might be problematic.

Files changed (147)
  1. workbench/algorithms/dataframe/__init__.py +1 -2
  2. workbench/algorithms/dataframe/compound_dataset_overlap.py +321 -0
  3. workbench/algorithms/dataframe/feature_space_proximity.py +168 -75
  4. workbench/algorithms/dataframe/fingerprint_proximity.py +422 -86
  5. workbench/algorithms/dataframe/projection_2d.py +44 -21
  6. workbench/algorithms/dataframe/proximity.py +259 -305
  7. workbench/algorithms/graph/light/proximity_graph.py +14 -12
  8. workbench/algorithms/models/cleanlab_model.py +382 -0
  9. workbench/algorithms/models/noise_model.py +388 -0
  10. workbench/algorithms/sql/outliers.py +3 -3
  11. workbench/api/__init__.py +5 -1
  12. workbench/api/compound.py +1 -1
  13. workbench/api/df_store.py +17 -108
  14. workbench/api/endpoint.py +18 -5
  15. workbench/api/feature_set.py +121 -15
  16. workbench/api/meta.py +5 -2
  17. workbench/api/meta_model.py +289 -0
  18. workbench/api/model.py +55 -21
  19. workbench/api/monitor.py +1 -16
  20. workbench/api/parameter_store.py +3 -52
  21. workbench/cached/cached_model.py +4 -4
  22. workbench/core/artifacts/__init__.py +11 -2
  23. workbench/core/artifacts/artifact.py +16 -8
  24. workbench/core/artifacts/data_capture_core.py +355 -0
  25. workbench/core/artifacts/df_store_core.py +114 -0
  26. workbench/core/artifacts/endpoint_core.py +382 -253
  27. workbench/core/artifacts/feature_set_core.py +249 -45
  28. workbench/core/artifacts/model_core.py +135 -80
  29. workbench/core/artifacts/monitor_core.py +33 -248
  30. workbench/core/artifacts/parameter_store_core.py +98 -0
  31. workbench/core/cloud_platform/aws/aws_account_clamp.py +50 -1
  32. workbench/core/cloud_platform/aws/aws_meta.py +12 -5
  33. workbench/core/cloud_platform/aws/aws_session.py +4 -4
  34. workbench/core/pipelines/pipeline_executor.py +1 -1
  35. workbench/core/transforms/data_to_features/light/molecular_descriptors.py +4 -4
  36. workbench/core/transforms/features_to_model/features_to_model.py +62 -40
  37. workbench/core/transforms/model_to_endpoint/model_to_endpoint.py +76 -15
  38. workbench/core/transforms/pandas_transforms/pandas_to_features.py +38 -2
  39. workbench/core/views/training_view.py +113 -42
  40. workbench/core/views/view.py +53 -3
  41. workbench/core/views/view_utils.py +4 -4
  42. workbench/model_script_utils/model_script_utils.py +339 -0
  43. workbench/model_script_utils/pytorch_utils.py +405 -0
  44. workbench/model_script_utils/uq_harness.py +278 -0
  45. workbench/model_scripts/chemprop/chemprop.template +649 -0
  46. workbench/model_scripts/chemprop/generated_model_script.py +649 -0
  47. workbench/model_scripts/chemprop/model_script_utils.py +339 -0
  48. workbench/model_scripts/chemprop/requirements.txt +3 -0
  49. workbench/model_scripts/custom_models/chem_info/fingerprints.py +175 -0
  50. workbench/model_scripts/custom_models/chem_info/mol_descriptors.py +483 -0
  51. workbench/model_scripts/custom_models/chem_info/mol_standardize.py +450 -0
  52. workbench/model_scripts/custom_models/chem_info/molecular_descriptors.py +7 -9
  53. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  54. workbench/model_scripts/custom_models/proximity/feature_space_proximity.py +194 -0
  55. workbench/model_scripts/custom_models/proximity/feature_space_proximity.template +8 -10
  56. workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template +7 -8
  57. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +20 -21
  58. workbench/model_scripts/custom_models/uq_models/feature_space_proximity.py +194 -0
  59. workbench/model_scripts/custom_models/uq_models/gaussian_process.template +5 -11
  60. workbench/model_scripts/custom_models/uq_models/ngboost.template +30 -18
  61. workbench/model_scripts/custom_models/uq_models/requirements.txt +1 -3
  62. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +15 -17
  63. workbench/model_scripts/meta_model/generated_model_script.py +209 -0
  64. workbench/model_scripts/meta_model/meta_model.template +209 -0
  65. workbench/model_scripts/pytorch_model/generated_model_script.py +444 -500
  66. workbench/model_scripts/pytorch_model/model_script_utils.py +339 -0
  67. workbench/model_scripts/pytorch_model/pytorch.template +440 -496
  68. workbench/model_scripts/pytorch_model/pytorch_utils.py +405 -0
  69. workbench/model_scripts/pytorch_model/requirements.txt +1 -1
  70. workbench/model_scripts/pytorch_model/uq_harness.py +278 -0
  71. workbench/model_scripts/scikit_learn/generated_model_script.py +7 -12
  72. workbench/model_scripts/scikit_learn/scikit_learn.template +4 -9
  73. workbench/model_scripts/script_generation.py +20 -11
  74. workbench/model_scripts/uq_models/generated_model_script.py +248 -0
  75. workbench/model_scripts/xgb_model/generated_model_script.py +372 -404
  76. workbench/model_scripts/xgb_model/model_script_utils.py +339 -0
  77. workbench/model_scripts/xgb_model/uq_harness.py +278 -0
  78. workbench/model_scripts/xgb_model/xgb_model.template +369 -401
  79. workbench/repl/workbench_shell.py +28 -19
  80. workbench/resources/open_source_api.key +1 -1
  81. workbench/scripts/endpoint_test.py +162 -0
  82. workbench/scripts/lambda_test.py +73 -0
  83. workbench/scripts/meta_model_sim.py +35 -0
  84. workbench/scripts/ml_pipeline_batch.py +137 -0
  85. workbench/scripts/ml_pipeline_sqs.py +186 -0
  86. workbench/scripts/monitor_cloud_watch.py +20 -100
  87. workbench/scripts/training_test.py +85 -0
  88. workbench/utils/aws_utils.py +4 -3
  89. workbench/utils/chem_utils/__init__.py +0 -0
  90. workbench/utils/chem_utils/fingerprints.py +175 -0
  91. workbench/utils/chem_utils/misc.py +194 -0
  92. workbench/utils/chem_utils/mol_descriptors.py +483 -0
  93. workbench/utils/chem_utils/mol_standardize.py +450 -0
  94. workbench/utils/chem_utils/mol_tagging.py +348 -0
  95. workbench/utils/chem_utils/projections.py +219 -0
  96. workbench/utils/chem_utils/salts.py +256 -0
  97. workbench/utils/chem_utils/sdf.py +292 -0
  98. workbench/utils/chem_utils/toxicity.py +250 -0
  99. workbench/utils/chem_utils/vis.py +253 -0
  100. workbench/utils/chemprop_utils.py +141 -0
  101. workbench/utils/cloudwatch_handler.py +1 -1
  102. workbench/utils/cloudwatch_utils.py +137 -0
  103. workbench/utils/config_manager.py +3 -7
  104. workbench/utils/endpoint_utils.py +5 -7
  105. workbench/utils/license_manager.py +2 -6
  106. workbench/utils/meta_model_simulator.py +499 -0
  107. workbench/utils/metrics_utils.py +256 -0
  108. workbench/utils/model_utils.py +278 -79
  109. workbench/utils/monitor_utils.py +44 -62
  110. workbench/utils/pandas_utils.py +3 -3
  111. workbench/utils/pytorch_utils.py +87 -0
  112. workbench/utils/shap_utils.py +11 -57
  113. workbench/utils/workbench_logging.py +0 -3
  114. workbench/utils/workbench_sqs.py +1 -1
  115. workbench/utils/xgboost_local_crossfold.py +267 -0
  116. workbench/utils/xgboost_model_utils.py +127 -219
  117. workbench/web_interface/components/model_plot.py +14 -2
  118. workbench/web_interface/components/plugin_unit_test.py +5 -2
  119. workbench/web_interface/components/plugins/dashboard_status.py +3 -1
  120. workbench/web_interface/components/plugins/generated_compounds.py +1 -1
  121. workbench/web_interface/components/plugins/model_details.py +38 -74
  122. workbench/web_interface/components/plugins/scatter_plot.py +6 -10
  123. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/METADATA +31 -9
  124. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/RECORD +128 -96
  125. workbench-0.8.220.dist-info/entry_points.txt +11 -0
  126. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/licenses/LICENSE +1 -1
  127. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  128. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  129. workbench/model_scripts/custom_models/chem_info/local_utils.py +0 -769
  130. workbench/model_scripts/custom_models/chem_info/tautomerize.py +0 -83
  131. workbench/model_scripts/custom_models/meta_endpoints/example.py +0 -53
  132. workbench/model_scripts/custom_models/proximity/generated_model_script.py +0 -138
  133. workbench/model_scripts/custom_models/proximity/proximity.py +0 -384
  134. workbench/model_scripts/custom_models/uq_models/generated_model_script.py +0 -393
  135. workbench/model_scripts/custom_models/uq_models/mapie_xgb.template +0 -203
  136. workbench/model_scripts/custom_models/uq_models/meta_uq.template +0 -273
  137. workbench/model_scripts/custom_models/uq_models/proximity.py +0 -384
  138. workbench/model_scripts/ensemble_xgb/generated_model_script.py +0 -279
  139. workbench/model_scripts/quant_regression/quant_regression.template +0 -279
  140. workbench/model_scripts/quant_regression/requirements.txt +0 -1
  141. workbench/utils/chem_utils.py +0 -1556
  142. workbench/utils/execution_environment.py +0 -211
  143. workbench/utils/fast_inference.py +0 -167
  144. workbench/utils/resource_utils.py +0 -39
  145. workbench-0.8.162.dist-info/entry_points.txt +0 -5
  146. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/WHEEL +0 -0
  147. {workbench-0.8.162.dist-info → workbench-0.8.220.dist-info}/top_level.txt +0 -0
--- a/workbench/core/cloud_platform/aws/aws_df_store.py
+++ /dev/null
@@ -1,404 +0,0 @@
-"""AWSDFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy"""
-
-from datetime import datetime
-from typing import Union
-import logging
-import awswrangler as wr
-import pandas as pd
-import re
-from urllib.parse import urlparse
-
-# Workbench Imports
-from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
-from workbench.utils.config_manager import ConfigManager
-from workbench.utils.aws_utils import not_found_returns_none
-
-
-class AWSDFStore:
-    """AWSDFStore: Fast/efficient storage of DataFrames using AWS S3/Parquet/Snappy
-
-    Common Usage:
-        ```python
-        df_store = AWSDFStore()
-
-        # List Data
-        df_store.list()
-
-        # Add DataFrame
-        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
-        df_store.upsert("/test/my_data", df)
-
-        # Retrieve DataFrame
-        df = df_store.get("/test/my_data")
-        print(df)
-
-        # Delete Data
-        df_store.delete("/test/my_data")
-        ```
-    """
-
-    def __init__(self, path_prefix: Union[str, None] = None):
-        """AWSDFStore Init Method
-
-        Args:
-            path_prefix (Union[str, None], optional): Path prefix for storage locations (Defaults to None)
-        """
-        self.log = logging.getLogger("workbench")
-        self._base_prefix = "df_store/"
-        self.path_prefix = self._base_prefix + path_prefix if path_prefix else self._base_prefix
-        self.path_prefix = re.sub(r"/+", "/", self.path_prefix)  # Collapse slashes
-
-        # Get the Workbench Bucket
-        config = ConfigManager()
-        self.workbench_bucket = config.get_config("WORKBENCH_BUCKET")
-
-        # Get the S3 Client
-        self.boto3_session = AWSAccountClamp().boto3_session
-        self.s3_client = self.boto3_session.client("s3")
-
-    def list(self, include_cache: bool = False) -> list:
-        """List all objects in the data_store prefix
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the list (Defaults to False)
-
-        Returns:
-            list: A list of all the objects in the data_store prefix.
-        """
-        df = self.summary(include_cache=include_cache)
-        return df["location"].tolist()
-
-    def last_modified(self, location: str) -> Union[datetime, None]:
-        """Return the last modified date of a graph.
-
-        Args:
-            location (str): Logical location of the graph.
-
-        Returns:
-            Union[datetime, None]: Last modified datetime or None if not found.
-        """
-        s3_uri = self._generate_s3_uri(location)
-        bucket, key = self._parse_s3_uri(s3_uri)
-
-        try:
-            response = self.s3_client.head_object(Bucket=bucket, Key=key)
-            return response["LastModified"]
-        except self.s3_client.exceptions.ClientError:
-            return None
-
-    def summary(self, include_cache: bool = False) -> pd.DataFrame:
-        """Return a nicely formatted summary of object locations, sizes (in MB), and modified dates.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the summary (Defaults to False)
-        """
-        df = self.details(include_cache=include_cache)
-
-        # Create a formatted DataFrame
-        formatted_df = pd.DataFrame(
-            {
-                "location": df["location"],
-                "size (MB)": (df["size"] / (1024 * 1024)).round(2),  # Convert size to MB
-                "modified": pd.to_datetime(df["modified"]).dt.strftime("%Y-%m-%d %H:%M:%S"),  # Format date
-            }
-        )
-        return formatted_df
-
-    def details(self, include_cache: bool = False) -> pd.DataFrame:
-        """Return detailed metadata for all objects, optionally excluding the specified prefix.
-
-        Args:
-            include_cache (bool, optional): Include cache objects in the details (Defaults to False)
-        """
-        try:
-            response = self.s3_client.list_objects_v2(Bucket=self.workbench_bucket, Prefix=self.path_prefix)
-            if "Contents" not in response:
-                return pd.DataFrame(columns=["location", "s3_file", "size", "modified"])
-
-            # Collect details for each object
-            data = []
-            for obj in response["Contents"]:
-                full_key = obj["Key"]
-
-                # Reverse logic: Strip the bucket/prefix in the front and .parquet in the end
-                location = full_key.replace(f"{self.path_prefix}", "/").split(".parquet")[0]
-                s3_file = f"s3://{self.workbench_bucket}/{full_key}"
-                size = obj["Size"]
-                modified = obj["LastModified"]
-                data.append([location, s3_file, size, modified])
-
-            # Create the DataFrame
-            df = pd.DataFrame(data, columns=["location", "s3_file", "size", "modified"])
-
-            # Apply the exclude_prefix filter if set
-            cache_prefix = "/workbench/dataframe_cache/"
-            if not include_cache:
-                df = df[~df["location"].str.startswith(cache_prefix)]
-
-            return df
-
-        except Exception as e:
-            self.log.error(f"Failed to get object details: {e}")
-            return pd.DataFrame(columns=["location", "s3_file", "size", "created", "modified"])
-
-    def check(self, location: str) -> bool:
-        """Check if a DataFrame exists at the specified location
-
-        Args:
-            location (str): The location of the data to check.
-
-        Returns:
-            bool: True if the data exists, False otherwise.
-        """
-        # Generate the specific S3 prefix for the target location
-        s3_prefix = f"{self.path_prefix}/{location}.parquet/"
-        s3_prefix = re.sub(r"/+", "/", s3_prefix)  # Collapse slashes
-
-        # Use list_objects_v2 to check if any objects exist under this specific prefix
-        response = self.s3_client.list_objects_v2(Bucket=self.workbench_bucket, Prefix=s3_prefix, MaxKeys=1)
-        return "Contents" in response
-
-    @not_found_returns_none
-    def get(self, location: str) -> Union[pd.DataFrame, None]:
-        """Retrieve a DataFrame from AWS S3.
-
-        Args:
-            location (str): The location of the data to retrieve.
-
-        Returns:
-            pd.DataFrame: The retrieved DataFrame or None if not found.
-        """
-        s3_uri = self._generate_s3_uri(location)
-        return wr.s3.read_parquet(s3_uri)
-
-    def upsert(self, location: str, data: Union[pd.DataFrame, pd.Series]):
-        """Insert or update a DataFrame or Series in the AWS S3.
-
-        Args:
-            location (str): The location of the data.
-            data (Union[pd.DataFrame, pd.Series]): The data to be stored.
-        """
-        # Check if the data is a Pandas Series, convert it to a DataFrame
-        if isinstance(data, pd.Series):
-            data = data.to_frame()
-
-        # Ensure data is a DataFrame
-        if not isinstance(data, pd.DataFrame):
-            raise ValueError("Only Pandas DataFrame or Series objects are supported.")
-
-        # Convert object columns to string type to avoid PyArrow type inference issues.
-        data = self.type_convert_before_parquet(data)
-
-        # Update/Insert the DataFrame to S3
-        s3_uri = self._generate_s3_uri(location)
-        try:
-            wr.s3.to_parquet(df=data, path=s3_uri, dataset=True, mode="overwrite", index=True)
-            self.log.info(f"Dataframe cached {s3_uri}...")
-        except Exception as e:
-            self.log.error(f"Failed to cache dataframe '{s3_uri}': {e}")
-            raise
-
-    @staticmethod
-    def type_convert_before_parquet(df: pd.DataFrame) -> pd.DataFrame:
-        # Convert object columns to string type to avoid PyArrow type inference issues.
-        df = df.copy()
-        object_cols = df.select_dtypes(include=["object"]).columns
-        df[object_cols] = df[object_cols].astype("str")
-        return df
-
-    def delete(self, location: str):
-        """Delete a DataFrame from the AWS S3.
-
-        Args:
-            location (str): The location of the data to delete.
-        """
-        s3_uri = self._generate_s3_uri(location)
-
-        # Check if the folder (prefix) exists in S3
-        if not wr.s3.list_objects(s3_uri):
-            self.log.info(f"Data '{location}' does not exist in S3...")
-            return
-
-        # Delete the data from S3
-        try:
-            wr.s3.delete_objects(s3_uri)
-            self.log.info(f"Data '{location}' deleted successfully from S3.")
-        except Exception as e:
-            self.log.error(f"Failed to delete data '{location}': {e}")
-
-    def delete_recursive(self, location: str):
-        """Recursively delete all data under the specified location in AWS S3.
-
-        Args:
-            location (str): The location prefix of the data to delete.
-        """
-        # Construct the full prefix for S3
-        s3_prefix = re.sub(r"/+", "/", f"{self.path_prefix}/{location}")  # Collapse slashes
-        s3_prefix = s3_prefix.rstrip("/") + "/"  # Ensure the prefix ends with a slash
-
-        # List all objects under the given prefix
-        try:
-            response = self.s3_client.list_objects_v2(Bucket=self.workbench_bucket, Prefix=s3_prefix)
-            if "Contents" not in response:
-                self.log.info(f"No data found under '{s3_prefix}' to delete.")
-                return
-
-            # Gather all keys to delete
-            keys = [{"Key": obj["Key"]} for obj in response["Contents"]]
-            response = self.s3_client.delete_objects(Bucket=self.workbench_bucket, Delete={"Objects": keys})
-            for response in response.get("Deleted", []):
-                self.log.info(f"Deleted: {response['Key']}")
-
-        except Exception as e:
-            self.log.error(f"Failed to delete data recursively at '{location}': {e}")
-
-    def list_subfiles(self, prefix: str) -> list:
-        """Return a list of file locations with the given prefix.
-
-        Args:
-            prefix (str, optional): Only include files with the given prefix
-
-        Returns:
-            list: List of file locations (paths)
-        """
-        try:
-            full_prefix = f"{self.path_prefix}{prefix.lstrip('/')}"
-            response = self.s3_client.list_objects_v2(Bucket=self.workbench_bucket, Prefix=full_prefix)
-            if "Contents" not in response:
-                return []
-
-            locations = []
-            for obj in response["Contents"]:
-                full_key = obj["Key"]
-                location = full_key.replace(f"{self.path_prefix}", "/").split(".parquet")[0]
-                locations.append(location)
-            return locations
-
-        except Exception as e:
-            self.log.error(f"Failed to list subfiles: {e}")
-            return []
-
-    def _generate_s3_uri(self, location: str) -> str:
-        """Generate the S3 URI for the given location."""
-        s3_path = f"{self.workbench_bucket}/{self.path_prefix}/{location}.parquet"
-        return f"s3://{re.sub(r'/+', '/', s3_path)}"
-
-    def _parse_s3_uri(self, s3_uri: str) -> tuple:
-        """Parse an S3 URI into bucket and key."""
-        parsed = urlparse(s3_uri)
-        if parsed.scheme != "s3":
-            raise ValueError(f"Invalid S3 URI: {s3_uri}")
-        return parsed.netloc, parsed.path.lstrip("/")
-
-    def __repr__(self):
-        """Return a string representation of the AWSDFStore object."""
-        # Use the summary() method and format it to align columns for printing
-        summary_df = self.summary()
-
-        # Sanity check: If there are no objects, return a message
-        if summary_df.empty:
-            return "AWSDFStore: No data objects found in the store."
-
-        # Dynamically compute the max length of the 'location' column and add 5 spaces for padding
-        max_location_len = summary_df["location"].str.len().max() + 2
-        summary_df["location"] = summary_df["location"].str.ljust(max_location_len)
-
-        # Format the size column to include (MB) and ensure 3 spaces between size and date
-        summary_df["size (MB)"] = summary_df["size (MB)"].apply(lambda x: f"{x:.2f} MB")
-
-        # Enclose the modified date in parentheses and ensure 3 spaces between size and date
-        summary_df["modified"] = summary_df["modified"].apply(lambda x: f" ({x})")
-
-        # Convert the DataFrame to a string, remove headers, and return
-        return summary_df.to_string(index=False, header=False)
-
-
-if __name__ == "__main__":
-    """Exercise the AWSDFStore Class"""
-    import time
-
-    # Create a AWSDFStore manager
-    df_store = AWSDFStore()
-
-    # Details of the Dataframe Store
-    print("Detailed Data...")
-    print(df_store.details())
-
-    # List all objects
-    print("List Data...")
-    print(df_store.list())
-
-    # Add a new DataFrame
-    my_df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
-    df_store.upsert("/testing/test_data", my_df)
-
-    # Check the last modified date
-    print("Last Modified Date:")
-    print(df_store.last_modified("/testing/test_data"))
-
-    # Get the DataFrame
-    print(f"Getting data 'test_data':\n{df_store.get('/testing/test_data')}")
-
-    # Now let's test adding a Series
-    series = pd.Series([1, 2, 3, 4], name="Series")
-    df_store.upsert("/testing/test_series", series)
-    print(f"Getting data 'test_series':\n{df_store.get('/testing/test_series')}")
-
-    # Summary of the data
-    print("Summary Data...")
-    print(df_store.summary())
-
-    # Repr of the AWSDFStore object
-    print("AWSDFStore Object:")
-    print(df_store)
-
-    # Check if the data exists
-    print("Check if data exists...")
-    print(df_store.check("/testing/test_data"))
-    print(df_store.check("/testing/test_series"))
-
-    # Time the check
-    start_time = time.time()
-    print(df_store.check("/testing/test_data"))
-    print("--- Check %s seconds ---" % (time.time() - start_time))
-
-    # Test list_subfiles
-    print("List Subfiles:")
-    print(df_store.list_subfiles("/testing"))
-
-    # Now delete the test data
-    df_store.delete("/testing/test_data")
-    df_store.delete("/testing/test_series")
-
-    # Check if the data exists
-    print("Check if data exists...")
-    print(df_store.check("/testing/test_data"))
-    print(df_store.check("/testing/test_series"))
-
-    # Add a bunch of dataframes and then test recursive delete
-    for i in range(10):
-        df_store.upsert(f"/testing/data_{i}", pd.DataFrame({"A": [1, 2], "B": [3, 4]}))
-    print("Before Recursive Delete:")
-    print(df_store.summary())
-    df_store.delete_recursive("/testing")
-    print("After Recursive Delete:")
-    print(df_store.summary())
-
-    # Get a non-existent DataFrame
-    print("Getting non-existent data...")
-    print(df_store.get("/testing/no_where"))
-
-    # Test path_prefix
-    df_store = AWSDFStore(path_prefix="/super/test")
-    print(df_store.path_prefix)
-    df_store.upsert("test_data", my_df)
-    print(df_store.get("test_data"))
-    print(df_store.summary())
-    df_store.delete("test_data")
-    print(df_store.summary())
-
-    # Test columns with Spaces in them
-    my_df = pd.DataFrame({"My A": [1, 2], "My B": [3, 4]})
-    df_store.upsert("/testing/test_data", my_df)
-    my_df = df_store.get("/testing/test_data")
-    print(my_df)
--- a/workbench/core/cloud_platform/aws/aws_parameter_store.py
+++ /dev/null
@@ -1,280 +0,0 @@
-"""AWSParameterStore: Manages Workbench parameters in AWS Systems Manager Parameter Store."""
-
-from typing import Union
-import logging
-import json
-import zlib
-import base64
-from botocore.exceptions import ClientError
-
-# Workbench Imports
-from workbench.core.cloud_platform.aws.aws_session import AWSSession
-from workbench.utils.json_utils import CustomEncoder
-
-
-class AWSParameterStore:
-    """AWSParameterStore: Manages Workbench parameters in AWS Systems Manager Parameter Store.
-
-    Common Usage:
-        ```python
-        params = AWSParameterStore()
-
-        # List Parameters
-        params.list()
-
-        ['/workbench/abalone_info',
-         '/workbench/my_data',
-         '/workbench/test',
-         '/workbench/pipelines/my_pipeline']
-
-        # Add Key
-        params.upsert("key", "value")
-        value = params.get("key")
-
-        # Add any data (lists, dictionaries, etc..)
-        my_data = {"key": "value", "number": 4.2, "list": [1,2,3]}
-        params.upsert("my_data", my_data)
-
-        # Retrieve data
-        return_value = params.get("my_data")
-        pprint(return_value)
-
-        {'key': 'value', 'list': [1, 2, 3], 'number': 4.2}
-
-        # Delete parameters
-        param_store.delete("my_data")
-        ```
-    """
-
-    def __init__(self):
-        """AWSParameterStore Init Method"""
-        self.log = logging.getLogger("workbench")
-
-        # Initialize a Workbench Session (to assume the Workbench ExecutionRole)
-        self.boto3_session = AWSSession().boto3_session
-
-        # Create a Systems Manager (SSM) client for Parameter Store operations
-        self.ssm_client = self.boto3_session.client("ssm")
-
-    def list(self, prefix: str = None) -> list:
-        """List all parameters in the AWS Parameter Store, optionally filtering by a prefix.
-
-        Args:
-            prefix (str, optional): A prefix to filter the parameters by. Defaults to None.
-
-        Returns:
-            list: A list of parameter names and details.
-        """
-        try:
-            # Set up parameters for the query
-            params = {"MaxResults": 50}
-
-            # If a prefix is provided, add the 'ParameterFilters' for optimization
-            if prefix:
-                params["ParameterFilters"] = [{"Key": "Name", "Option": "BeginsWith", "Values": [prefix]}]
-
-            # Initialize the list to collect parameter names
-            all_parameters = []
-
-            # Make the initial call to describe parameters
-            response = self.ssm_client.describe_parameters(**params)
-
-            # Aggregate the names from the initial response
-            all_parameters.extend(param["Name"] for param in response["Parameters"])
-
-            # Continue to paginate if there's a NextToken
-            while "NextToken" in response:
-                # Update the parameters with the NextToken for subsequent calls
-                params["NextToken"] = response["NextToken"]
-                response = self.ssm_client.describe_parameters(**params)
-
-                # Aggregate the names from the subsequent responses
-                all_parameters.extend(param["Name"] for param in response["Parameters"])
-
-        except Exception as e:
-            self.log.error(f"Failed to list parameters: {e}")
-            return []
-
-        # Return the aggregated list of parameter names
-        return all_parameters
-
-    def get(self, name: str, warn: bool = True, decrypt: bool = True) -> Union[str, list, dict, None]:
-        """Retrieve a parameter value from the AWS Parameter Store.
-
-        Args:
-            name (str): The name of the parameter to retrieve.
-            warn (bool): Whether to log a warning if the parameter is not found.
-            decrypt (bool): Whether to decrypt secure string parameters.
-
-        Returns:
-            Union[str, list, dict, None]: The value of the parameter or None if not found.
-        """
-        try:
-            # Retrieve the parameter from Parameter Store
-            response = self.ssm_client.get_parameter(Name=name, WithDecryption=decrypt)
-            value = response["Parameter"]["Value"]
-
-            # Auto-detect and decompress if needed
-            if value.startswith("COMPRESSED:"):
-                # Base64 decode and decompress
-                self.log.important(f"Decompressing parameter '{name}'...")
-                compressed_value = base64.b64decode(value[len("COMPRESSED:") :])
-                value = zlib.decompress(compressed_value).decode("utf-8")
-
-            # Attempt to parse the value back to its original type
-            try:
-                parsed_value = json.loads(value)
-                return parsed_value
-            except (json.JSONDecodeError, TypeError):
-                # If parsing fails, return the value as is "hope for the best"
-                return value
-
-        except ClientError as e:
-            if e.response["Error"]["Code"] == "ParameterNotFound":
-                if warn:
-                    self.log.warning(f"Parameter '{name}' not found")
-            else:
-                self.log.error(f"Failed to get parameter '{name}': {e}")
-            return None
-
-    def upsert(self, name: str, value, precision: int = 3):
-        """Insert or update a parameter in the AWS Parameter Store.
-
-        Args:
-            name (str): The name of the parameter.
-            value (str | list | dict): The value of the parameter.
-            precision (int): The precision for float values in the JSON encoding.
-        """
-        try:
-            # Convert to JSON and check if compression is needed
-            json_value = json.dumps(value, cls=CustomEncoder, precision=precision)
-            if len(json_value) <= 4096:
-                # Store normally if under 4KB
-                self._store_parameter(name, json_value)
-                return
-
-            # Need compression - log warning
-            self.log.important(
-                f"Parameter {name} exceeds 4KB ({len(json_value)} bytes): compressing and reducing precision..."
-            )
-
-            # Try compression with precision reduction
-            compressed_value = self._compress_value(value)
-
-            if len(compressed_value) <= 4096:
-                self._store_parameter(name, compressed_value)
-                return
-
-            # Try clipping the data
-            clipped_value = self._clip_data(value)
-            compressed_clipped = self._compress_value(clipped_value)
-
-            if len(compressed_clipped) <= 4096:
-                self.log.warning(
-                    f"Parameter {name} data clipped to 100 items/elements: ({len(compressed_clipped)} bytes)"
-                )
-                self._store_parameter(name, compressed_clipped)
-                return
-
-            # Still too large - give up
-            self._handle_oversized_data(name, len(compressed_clipped))
-
-        except Exception as e:
-            self.log.critical(f"Failed to add/update parameter '{name}': {e}")
-            raise
-
-    @staticmethod
-    def _compress_value(value) -> str:
-        """Compress a value with precision reduction."""
-        json_value = json.dumps(value, cls=CustomEncoder, precision=3)
-        compressed = zlib.compress(json_value.encode("utf-8"), level=9)
-        return "COMPRESSED:" + base64.b64encode(compressed).decode("utf-8")
-
-    @staticmethod
-    def _clip_data(value):
-        """Clip data to reduce size, clip to first 100 items/elements."""
-        if isinstance(value, dict):
-            return dict(list(value.items())[:100])
-        elif isinstance(value, list):
-            return value[:100]
-        return value
-
-    def _store_parameter(self, name: str, value: str):
-        """Store parameter in AWS Parameter Store."""
-        self.ssm_client.put_parameter(Name=name, Value=value, Type="String", Overwrite=True)
-        self.log.info(f"Parameter '{name}' added/updated successfully.")
-
-    def _handle_oversized_data(self, name: str, size: int):
-        """Handle data that's too large even after compression and clipping."""
-        doc_link = "https://supercowpowers.github.io/workbench/api_classes/df_store"
-        self.log.error(f"Compressed size {size} bytes, cannot store > 4KB")
-        self.log.error(f"For larger data use the DFStore() class ({doc_link})")
-
-    def delete(self, name: str):
-        """Delete a parameter from the AWS Parameter Store.
-
-        Args:
-            name (str): The name of the parameter to delete.
-        """
-        try:
-            # Delete the parameter from Parameter Store
-            self.ssm_client.delete_parameter(Name=name)
-            self.log.info(f"Parameter '{name}' deleted successfully.")
-        except Exception as e:
-            self.log.error(f"Failed to delete parameter '{name}': {e}")
-
-    def delete_recursive(self, prefix: str):
-        """Delete all parameters with a given prefix from the AWS Parameter Store.
-
-        Args:
-            prefix (str): The prefix of the parameters to delete.
-        """
-        # List all parameters with the given prefix
-        parameters = self.list(prefix=prefix)
-        for param in parameters:
-            self.delete(param)
-
-    def __repr__(self):
-        """Return a string representation of the AWSParameterStore object."""
-        return "\n".join(self.list())
-
-
-if __name__ == "__main__":
-    """Exercise the AWSParameterStore Class"""
-
-    # Create a AWSParameterStore manager
-    param_store = AWSParameterStore()
-
-    # List the parameters
-    print("Listing Parameters...")
-    print(param_store.list())
-
-    # Add a new parameter
-    param_store.upsert("/workbench/test", "value")
-
-    # Get the parameter
-    print(f"Getting parameter 'test': {param_store.get('/workbench/test')}")
-
-    # Add a dictionary as a parameter
-    sample_dict = {"key": "str_value", "awesome_value": 4.2}
-    param_store.upsert("/workbench/my_data", sample_dict)
-
-    # Retrieve the parameter as a dictionary
-    retrieved_value = param_store.get("/workbench/my_data")
-    print("Retrieved value:", retrieved_value)
-
-    # List the parameters
-    print("Listing Parameters...")
-    print(param_store.list())
-
-    # List the parameters with a prefix
-    print("Listing Parameters with prefix '/workbench':")
-    print(param_store.list("/workbench"))
-
-    # Delete the parameters
-    param_store.delete("/workbench/test")
-    param_store.delete("/workbench/my_data")
-
-    # Out of scope tests
-    param_store.upsert("test", "value")
-    param_store.delete("test")