workbench 0.8.198__py3-none-any.whl → 0.8.203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. workbench/algorithms/dataframe/proximity.py +11 -4
  2. workbench/api/__init__.py +2 -1
  3. workbench/api/df_store.py +17 -108
  4. workbench/api/feature_set.py +48 -11
  5. workbench/api/model.py +1 -1
  6. workbench/api/parameter_store.py +3 -52
  7. workbench/core/artifacts/__init__.py +11 -2
  8. workbench/core/artifacts/artifact.py +5 -5
  9. workbench/core/artifacts/df_store_core.py +114 -0
  10. workbench/core/artifacts/endpoint_core.py +261 -78
  11. workbench/core/artifacts/feature_set_core.py +69 -1
  12. workbench/core/artifacts/model_core.py +48 -14
  13. workbench/core/artifacts/parameter_store_core.py +98 -0
  14. workbench/core/transforms/features_to_model/features_to_model.py +50 -33
  15. workbench/core/transforms/pandas_transforms/pandas_to_features.py +11 -2
  16. workbench/core/views/view.py +2 -2
  17. workbench/model_scripts/chemprop/chemprop.template +933 -0
  18. workbench/model_scripts/chemprop/generated_model_script.py +933 -0
  19. workbench/model_scripts/chemprop/requirements.txt +11 -0
  20. workbench/model_scripts/custom_models/chem_info/fingerprints.py +134 -0
  21. workbench/model_scripts/custom_models/chem_info/morgan_fingerprints.py +1 -1
  22. workbench/model_scripts/custom_models/proximity/proximity.py +11 -4
  23. workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template +11 -5
  24. workbench/model_scripts/custom_models/uq_models/meta_uq.template +11 -5
  25. workbench/model_scripts/custom_models/uq_models/ngboost.template +11 -5
  26. workbench/model_scripts/custom_models/uq_models/proximity.py +11 -4
  27. workbench/model_scripts/ensemble_xgb/ensemble_xgb.template +11 -5
  28. workbench/model_scripts/pytorch_model/generated_model_script.py +365 -173
  29. workbench/model_scripts/pytorch_model/pytorch.template +362 -170
  30. workbench/model_scripts/scikit_learn/generated_model_script.py +302 -0
  31. workbench/model_scripts/script_generation.py +10 -7
  32. workbench/model_scripts/uq_models/generated_model_script.py +43 -27
  33. workbench/model_scripts/uq_models/mapie.template +40 -24
  34. workbench/model_scripts/xgb_model/generated_model_script.py +36 -7
  35. workbench/model_scripts/xgb_model/xgb_model.template +36 -7
  36. workbench/repl/workbench_shell.py +14 -5
  37. workbench/resources/open_source_api.key +1 -1
  38. workbench/scripts/endpoint_test.py +162 -0
  39. workbench/scripts/{lambda_launcher.py → lambda_test.py} +10 -0
  40. workbench/utils/chemprop_utils.py +761 -0
  41. workbench/utils/pytorch_utils.py +527 -0
  42. workbench/utils/xgboost_model_utils.py +10 -5
  43. workbench/web_interface/components/model_plot.py +7 -1
  44. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/METADATA +3 -3
  45. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/RECORD +49 -43
  46. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/entry_points.txt +2 -1
  47. workbench/core/cloud_platform/aws/aws_df_store.py +0 -404
  48. workbench/core/cloud_platform/aws/aws_parameter_store.py +0 -280
  49. workbench/model_scripts/__pycache__/script_generation.cpython-312.pyc +0 -0
  50. workbench/model_scripts/__pycache__/script_generation.cpython-313.pyc +0 -0
  51. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/WHEEL +0 -0
  52. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/licenses/LICENSE +0 -0
  53. {workbench-0.8.198.dist-info → workbench-0.8.203.dist-info}/top_level.txt +0 -0
workbench/core/artifacts/model_core.py
@@ -30,11 +30,23 @@ class ModelType(Enum):
     CLASSIFIER = "classifier"
     REGRESSOR = "regressor"
     CLUSTERER = "clusterer"
-    TRANSFORMER = "transformer"
     PROXIMITY = "proximity"
     PROJECTION = "projection"
     UQ_REGRESSOR = "uq_regressor"
     ENSEMBLE_REGRESSOR = "ensemble_regressor"
+    TRANSFORMER = "transformer"
+    UNKNOWN = "unknown"
+
+
+class ModelFramework(Enum):
+    """Enumerated Types for Workbench Model Frameworks"""
+
+    SKLEARN = "sklearn"
+    XGBOOST = "xgboost"
+    LIGHTGBM = "lightgbm"
+    PYTORCH_TABULAR = "pytorch_tabular"
+    CHEMPROP = "chemprop"
+    TRANSFORMER = "transformer"
     UNKNOWN = "unknown"
 
 
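The net effect: `ModelType.TRANSFORMER` moves to the end of the enum and a new `ModelFramework` enum is introduced alongside it. A minimal sketch of using the two together (assuming both are importable from `workbench.core.artifacts.model_core`, which the `features_to_model.py` import hunk below confirms):

```python
from workbench.core.artifacts.model_core import ModelFramework, ModelType

# Both enums carry an UNKNOWN fallback for missing/unrecognized metadata
print(ModelType.TRANSFORMER.value)    # "transformer"
print(ModelFramework.CHEMPROP.value)  # "chemprop"

# Round-trip from a stored metadata string (the pattern _get_model_framework uses)
assert ModelFramework("pytorch_tabular") is ModelFramework.PYTORCH_TABULAR
```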
@@ -87,11 +99,10 @@ class ModelCore(Artifact):
     ```
     """
 
-    def __init__(self, model_name: str, model_type: ModelType = None, **kwargs):
+    def __init__(self, model_name: str, **kwargs):
         """ModelCore Initialization
         Args:
             model_name (str): Name of Model in Workbench.
-            model_type (ModelType, optional): Set this for newly created Models. Defaults to None.
             **kwargs: Additional keyword arguments
         """
 
@@ -125,10 +136,8 @@ class ModelCore(Artifact):
             self.latest_model = self.model_meta["ModelPackageList"][0]
             self.description = self.latest_model.get("ModelPackageDescription", "-")
             self.training_job_name = self._extract_training_job_name()
-            if model_type:
-                self._set_model_type(model_type)
-            else:
-                self.model_type = self._get_model_type()
+            self.model_type = self._get_model_type()
+            self.model_framework = self._get_model_framework()
         except (IndexError, KeyError):
             self.log.critical(f"Model {self.model_name} appears to be malformed. Delete and recreate it!")
             return
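Breaking change for callers: `ModelCore` no longer accepts `model_type=` at construction; both the type and the new framework attribute are resolved from Workbench metadata. A hedged before/after sketch ("abalone-regression" is the example model used elsewhere in this release's `__main__` blocks):

```python
from workbench.core.artifacts.model_core import ModelCore

# 0.8.198: the type could be passed in (and would be written to metadata)
# model = ModelCore("abalone-regression", model_type=ModelType.REGRESSOR)

# 0.8.203: construction takes only the name; type/framework are read back
model = ModelCore("abalone-regression")
print(model.model_type)       # e.g. ModelType.REGRESSOR, from stored metadata
print(model.model_framework)  # e.g. ModelFramework.XGBOOST, or UNKNOWN if unset
```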
@@ -254,21 +263,25 @@ class ModelCore(Artifact):
         else:
             self.log.important(f"No inference data found for {self.model_name}!")
 
-    def get_inference_metrics(self, capture_name: str = "latest") -> Union[pd.DataFrame, None]:
+    def get_inference_metrics(self, capture_name: str = "any") -> Union[pd.DataFrame, None]:
         """Retrieve the inference performance metrics for this model
 
         Args:
-            capture_name (str, optional): Specific capture_name or "training" (default: "latest")
+            capture_name (str, optional): Specific capture_name (default: "any")
         Returns:
             pd.DataFrame: DataFrame of the Model Metrics
 
         Note:
-            If a capture_name isn't specified this will try to return something reasonable
+            If a capture_name isn't specified this will try to the 'first' available metrics
         """
         # Try to get the auto_capture 'training_holdout' or the training
-        if capture_name == "latest":
-            metrics_df = self.get_inference_metrics("auto_inference")
-            return metrics_df if metrics_df is not None else self.get_inference_metrics("model_training")
+        if capture_name == "any":
+            metric_list = self.list_inference_runs()
+            if metric_list:
+                return self.get_inference_metrics(metric_list[0])
+            else:
+                self.log.warning(f"No performance metrics found for {self.model_name}!")
+                return None
 
         # Grab the metrics captured during model training (could return None)
         if capture_name == "model_training":
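The `"latest"` sentinel is replaced by `"any"`, which walks `list_inference_runs()` instead of hard-coding the `auto_inference`/`model_training` fallback chain. A sketch of the new calling convention:

```python
from workbench.core.artifacts.model_core import ModelCore

model = ModelCore("abalone-regression")

# "any" (the new default) returns the metrics of the first listed inference run
metrics_df = model.get_inference_metrics()  # same as capture_name="any"

# Specific capture names still work, e.g. the training-time metrics
training_df = model.get_inference_metrics("model_training")

if metrics_df is None:
    print("No inference runs recorded for this model")
```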
@@ -860,7 +873,7 @@ class ModelCore(Artifact):
             return self.df_store.get(f"/workbench/models/{self.name}/shap_data")
         else:
             # Loop over the SHAP data and return a dict of DataFrames
-            shap_dfs = self.df_store.list_subfiles(f"/workbench/models/{self.name}/shap_data")
+            shap_dfs = self.df_store.list(f"/workbench/models/{self.name}/shap_data")
             shap_data = {}
             for df_location in shap_dfs:
                 key = df_location.split("/")[-1]
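This rename tracks the DFStore consolidation in this release (`aws_df_store.py` removed, `df_store_core.py` added). A sketch of the updated listing call, assuming `list()` accepts the same prefix that `list_subfiles()` did:

```python
from workbench.core.artifacts.model_core import ModelCore

model = ModelCore("abalone-regression")

# Enumerate the per-key SHAP DataFrames stored under the model's prefix
prefix = f"/workbench/models/{model.name}/shap_data"
shap_data = {loc.split("/")[-1]: model.df_store.get(loc) for loc in model.df_store.list(prefix)}
```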
@@ -972,6 +985,27 @@ class ModelCore(Artifact):
             self.log.warning(f"Could not determine model type for {self.model_name}!")
             return ModelType.UNKNOWN
 
+    def _set_model_framework(self, model_framework: ModelFramework):
+        """Internal: Set the Model Framework for this Model"""
+        self.model_framework = model_framework
+        self.upsert_workbench_meta({"workbench_model_framework": self.model_framework.value})
+        self.remove_health_tag("model_framework_unknown")
+
+    def _get_model_framework(self) -> ModelFramework:
+        """Internal: Query the Workbench Metadata to get the model framework
+        Returns:
+            ModelFramework: The ModelFramework of this Model
+        Notes:
+            This is an internal method that should not be called directly
+            Use the model_framework attribute instead
+        """
+        model_framework = self.workbench_meta().get("workbench_model_framework")
+        try:
+            return ModelFramework(model_framework)
+        except ValueError:
+            self.log.warning(f"Could not determine model framework for {self.model_name}!")
+            return ModelFramework.UNKNOWN
+
     def _load_training_metrics(self):
         """Internal: Retrieve the training metrics and Confusion Matrix for this model
         and load the data into the Workbench Metadata
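Models created before 0.8.203 have no `workbench_model_framework` key, so the accessor degrades to `UNKNOWN` rather than raising: `ModelFramework(None)` throws a `ValueError`, which is caught. A quick sketch of that fallback:

```python
from workbench.core.artifacts.model_core import ModelCore, ModelFramework

model = ModelCore("legacy-model")  # hypothetical model created before 0.8.203

# Metadata key absent -> ModelFramework(None) raises ValueError -> the accessor
# logs a warning and falls back to the UNKNOWN member
assert model.model_framework is ModelFramework.UNKNOWN
```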
workbench/core/artifacts/parameter_store_core.py
@@ -0,0 +1,98 @@
+"""ParameterStoreCore: Manages Workbench parameters in a Cloud Based Parameter Store."""
+
+import logging
+
+# Workbench Imports
+from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
+
+# Workbench Bridges Import
+from workbench_bridges.api import ParameterStore as BridgesParameterStore
+
+
+class ParameterStoreCore(BridgesParameterStore):
+    """ParameterStoreCore: Manages Workbench parameters in a Cloud Based Parameter Store.
+
+    Common Usage:
+        ```python
+        params = ParameterStoreCore()
+
+        # List Parameters
+        params.list()
+
+        ['/workbench/abalone_info',
+         '/workbench/my_data',
+         '/workbench/test',
+         '/workbench/pipelines/my_pipeline']
+
+        # Add Key
+        params.upsert("key", "value")
+        value = params.get("key")
+
+        # Add any data (lists, dictionaries, etc..)
+        my_data = {"key": "value", "number": 4.2, "list": [1,2,3]}
+        params.upsert("my_data", my_data)
+
+        # Retrieve data
+        return_value = params.get("my_data")
+        pprint(return_value)
+
+        {'key': 'value', 'list': [1, 2, 3], 'number': 4.2}
+
+        # Delete parameters
+        param_store.delete("my_data")
+        ```
+    """
+
+    def __init__(self):
+        """ParameterStoreCore Init Method"""
+        session = AWSAccountClamp().boto3_session
+
+        # Initialize parent with workbench config
+        super().__init__(boto3_session=session)
+        self.log = logging.getLogger("workbench")
+
+
+if __name__ == "__main__":
+    """Exercise the ParameterStoreCore Class"""
+
+    # Create a ParameterStoreCore manager
+    param_store = ParameterStoreCore()
+
+    # List the parameters
+    print("Listing Parameters...")
+    print(param_store.list())
+
+    # Add a new parameter
+    param_store.upsert("/workbench/test", "value")
+
+    # Get the parameter
+    print(f"Getting parameter 'test': {param_store.get('/workbench/test')}")
+
+    # Add a dictionary as a parameter
+    sample_dict = {"key": "str_value", "awesome_value": 4.2}
+    param_store.upsert("/workbench/my_data", sample_dict)
+
+    # Retrieve the parameter as a dictionary
+    retrieved_value = param_store.get("/workbench/my_data")
+    print("Retrieved value:", retrieved_value)
+
+    # List the parameters
+    print("Listing Parameters...")
+    print(param_store.list())
+
+    # List the parameters with a prefix
+    print("Listing Parameters with prefix '/workbench':")
+    print(param_store.list("/workbench"))
+
+    # Delete the parameters
+    param_store.delete("/workbench/test")
+    param_store.delete("/workbench/my_data")
+
+    # Out of scope tests
+    param_store.upsert("test", "value")
+    param_store.delete("test")
+
+    # Recursive delete test
+    param_store.upsert("/workbench/test/test1", "value1")
+    param_store.upsert("/workbench/test/test2", "value2")
+    param_store.delete_recursive("workbench/test/")
workbench/core/transforms/features_to_model/features_to_model.py
@@ -1,6 +1,7 @@
 """FeaturesToModel: Train/Create a Model from a Feature Set"""
 
 from pathlib import Path
+from typing import Union
 from sagemaker.estimator import Estimator
 import awswrangler as wr
 from datetime import datetime, timezone
@@ -9,7 +10,7 @@ import time
 # Local Imports
 from workbench.core.transforms.transform import Transform, TransformInput, TransformOutput
 from workbench.core.artifacts.feature_set_core import FeatureSetCore
-from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelImages
+from workbench.core.artifacts.model_core import ModelCore, ModelType, ModelFramework, ModelImages
 from workbench.core.artifacts.artifact import Artifact
 from workbench.model_scripts.script_generation import generate_model_script, fill_template
 from workbench.utils.model_utils import supported_instance_types
@@ -33,6 +34,7 @@ class FeaturesToModel(Transform):
         feature_name: str,
         model_name: str,
         model_type: ModelType,
+        model_framework=ModelFramework.XGBOOST,
         model_class=None,
         model_import_str=None,
         custom_script=None,
@@ -46,6 +48,7 @@ class FeaturesToModel(Transform):
             feature_name (str): Name of the FeatureSet to use as input
             model_name (str): Name of the Model to create as output
             model_type (ModelType): ModelType.REGRESSOR or ModelType.CLASSIFIER, etc.
+            model_framework (ModelFramework, optional): The model framework (default ModelFramework.XGBOOST)
             model_class (str, optional): The scikit model (e.g. KNeighborsRegressor) (default None)
             model_import_str (str, optional): The import string for the model (default None)
             custom_script (str, optional): Custom script to use for the model (default None)
@@ -65,6 +68,7 @@ class FeaturesToModel(Transform):
         self.input_type = TransformInput.FEATURE_SET
         self.output_type = TransformOutput.MODEL
         self.model_type = model_type
+        self.model_framework = model_framework
         self.model_class = model_class
         self.model_import_str = model_import_str
         self.custom_script = str(custom_script) if custom_script else None
@@ -80,12 +84,17 @@ class FeaturesToModel(Transform):
         self.inference_arch = inference_arch
 
     def transform_impl(
-        self, target_column: str, description: str = None, feature_list: list = None, train_all_data=False, **kwargs
+        self,
+        target_column: Union[str, list[str]],
+        description: str = None,
+        feature_list: list = None,
+        train_all_data=False,
+        **kwargs,
     ):
         """Generic Features to Model: Note you should create a new class and inherit from
         this one to include specific logic for your Feature Set/Model
         Args:
-            target_column (str): Column name of the target variable
+            target_column (str or list[str]): Column name(s) of the target variable(s)
             description (str): Description of the model (optional)
             feature_list (list[str]): A list of columns for the features (default None, will try to guess)
             train_all_data (bool): Train on ALL (100%) of the data (default False)
@@ -102,9 +111,11 @@ class FeaturesToModel(Transform):
         s3_training_path = feature_set.create_s3_training_data()
         self.log.info(f"Created new training data {s3_training_path}...")
 
-        # Report the target column
+        # Report the target column(s)
         self.target_column = target_column
-        self.log.info(f"Target column: {self.target_column}")
+        # Normalize target_column to a list for internal use
+        target_list = [target_column] if isinstance(target_column, str) else (target_column or [])
+        self.log.info(f"Target column(s): {self.target_column}")
 
         # Did they specify a feature list?
         if feature_list:
@@ -131,7 +142,7 @@ class FeaturesToModel(Transform):
                 "is_deleted",
                 "event_time",
                 "training",
-            ] + [self.target_column]
+            ] + target_list
             feature_list = [c for c in all_columns if c not in filter_list]
 
         # AWS Feature Store has 3 user column types (String, Integral, Fractional)
@@ -154,11 +165,14 @@ class FeaturesToModel(Transform):
         self.log.important(f"Feature List for Modeling: {self.model_feature_list}")
 
         # Set up our parameters for the model script
+        # ChemProp expects target_column as a list; other templates expect a string
+        target_for_template = target_list if self.model_framework == ModelFramework.CHEMPROP else self.target_column
         template_params = {
             "model_imports": self.model_import_str,
             "model_type": self.model_type,
+            "model_framework": self.model_framework,
             "model_class": self.model_class,
-            "target_column": self.target_column,
+            "target_column": target_for_template,
             "feature_list": self.model_feature_list,
             "compressed_features": feature_set.get_compressed_features(),
             "model_metrics_s3_path": self.model_training_root,
@@ -184,23 +198,27 @@ class FeaturesToModel(Transform):
         # Generate our model script
         script_path = generate_model_script(template_params)
 
-        # Metric Definitions for Regression
+        # Metric Definitions for Regression (matches model script output format)
         if self.model_type in [ModelType.REGRESSOR, ModelType.UQ_REGRESSOR, ModelType.ENSEMBLE_REGRESSOR]:
             metric_definitions = [
-                {"Name": "RMSE", "Regex": "RMSE: ([0-9.]+)"},
-                {"Name": "MAE", "Regex": "MAE: ([0-9.]+)"},
-                {"Name": "R2", "Regex": "R2: ([0-9.]+)"},
-                {"Name": "NumRows", "Regex": "NumRows: ([0-9]+)"},
+                {"Name": "rmse", "Regex": r"rmse: ([0-9.]+)"},
+                {"Name": "mae", "Regex": r"mae: ([0-9.]+)"},
+                {"Name": "medae", "Regex": r"medae: ([0-9.]+)"},
+                {"Name": "r2", "Regex": r"r2: ([0-9.-]+)"},
+                {"Name": "spearmanr", "Regex": r"spearmanr: ([0-9.-]+)"},
+                {"Name": "support", "Regex": r"support: ([0-9]+)"},
             ]
 
         # Metric Definitions for Classification
         elif self.model_type == ModelType.CLASSIFIER:
             # We need to get creative with the Classification Metrics
+            # Note: Classification only supports single target
+            class_target = target_list[0] if target_list else self.target_column
 
             # Grab all the target column class values (class labels)
             table = feature_set.data_source.table
-            self.class_labels = feature_set.query(f'select DISTINCT {self.target_column} FROM "{table}"')[
-                self.target_column
+            self.class_labels = feature_set.query(f'select DISTINCT {class_target} FROM "{table}"')[
+                class_target
             ].to_list()
 
             # Sanity check on the targets
@@ -234,12 +252,20 @@ class FeaturesToModel(Transform):
 
         # Create a Sagemaker Model with our script
         image = ModelImages.get_image_uri(self.sm_session.boto_region_name, self.training_image)
+
+        # Use GPU instance for ChemProp/PyTorch, CPU for others
+        if self.model_framework in [ModelFramework.CHEMPROP, ModelFramework.PYTORCH_TABULAR]:
+            train_instance_type = "ml.g6.xlarge"  # NVIDIA L4 GPU, ~$0.80/hr
+            self.log.important(f"Using GPU instance {train_instance_type} for {self.model_framework.value}")
+        else:
+            train_instance_type = "ml.m5.xlarge"
+
         self.estimator = Estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             role=self.workbench_role_arn,
             instance_count=1,
-            instance_type="ml.m5.xlarge",
+            instance_type=train_instance_type,
             sagemaker_session=self.sm_session,
             image_uri=image,
             metric_definitions=metric_definitions,
@@ -274,8 +300,10 @@ class FeaturesToModel(Transform):
         self.log.info("Post-Transform: Calling onboard() on the Model...")
         time.sleep(3)  # Give AWS time to complete Model register
 
-        # Store the model feature_list and target_column in the workbench_meta
-        output_model = ModelCore(self.output_name, model_type=self.model_type)
+        # Store the model metadata information
+        output_model = ModelCore(self.output_name)
+        output_model._set_model_type(self.model_type)
+        output_model._set_model_framework(self.model_framework)
         output_model.upsert_workbench_meta({"workbench_model_features": self.model_feature_list})
         output_model.upsert_workbench_meta({"workbench_model_target": self.target_column})
 
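Putting the new pieces together: a hedged sketch of training a multi-target ChemProp model, exercising the new `model_framework` argument, the list-valued `target_column`, and the GPU instance selection above (the output model name and target columns are illustrative, not from the package):

```python
from workbench.core.transforms.features_to_model.features_to_model import FeaturesToModel
from workbench.core.artifacts.model_core import ModelFramework, ModelType

to_model = FeaturesToModel(
    "aqsol_features",      # FeatureSet used elsewhere in this file's __main__
    "aqsol-chemprop-reg",  # hypothetical output model name
    model_type=ModelType.REGRESSOR,
    model_framework=ModelFramework.CHEMPROP,  # selects the chemprop template + ml.g6.xlarge
)
to_model.set_output_tags(["chemprop", "multi-target"])

# ChemProp templates receive target_column as a list; other frameworks get a string
to_model.transform(target_column=["solubility", "logd"], description="ChemProp multi-target")
```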
@@ -330,12 +358,11 @@ if __name__ == "__main__":
 
     # Regression Model
     input_name = "abalone_features"
-    output_name = "test-abalone-regression"
+    output_name = "abalone-regression"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.REGRESSOR)
     to_model.set_output_tags(["test"])
     to_model.transform(target_column="class_number_of_rings", description="Test Abalone Regression")
 
-    """
     # Classification Model
     input_name = "wine_features"
     output_name = "wine-classification"
@@ -345,10 +372,10 @@ if __name__ == "__main__":
 
     # Quantile Regression Model (Abalone)
    input_name = "abalone_features"
-    output_name = "abalone-quantile-reg"
+    output_name = "abalone-regression-uq"
     to_model = FeaturesToModel(input_name, output_name, ModelType.UQ_REGRESSOR)
-    to_model.set_output_tags(["abalone", "quantiles"])
-    to_model.transform(target_column="class_number_of_rings", description="Abalone Quantile Regression")
+    to_model.set_output_tags(["abalone", "uq"])
+    to_model.transform(target_column="class_number_of_rings", description="Abalone UQ Regression")
 
     # Scikit-Learn Kmeans Clustering Model
     input_name = "wine_features"
@@ -402,7 +429,7 @@ if __name__ == "__main__":
     scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
     my_script = scripts_root / "custom_models" / "chem_info" / "molecular_descriptors.py"
     input_name = "aqsol_features"
-    output_name = "smiles-to-taut-md-stereo-v0"
+    output_name = "test-smiles-to-taut-md-stereo"
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "molecular descriptors"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Molecular Descriptors")
@@ -415,13 +442,3 @@ if __name__ == "__main__":
     to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
     to_model.set_output_tags(["smiles", "morgan fingerprints"])
     to_model.transform(target_column=None, feature_list=["smiles"], description="Smiles to Morgan Fingerprints")
-
-    # Tautomerization Model
-    scripts_root = Path(__file__).resolve().parents[3] / "model_scripts"
-    my_script = scripts_root / "custom_models" / "chem_info" / "tautomerize.py"
-    input_name = "aqsol_features"
-    output_name = "tautomerize-v0"
-    to_model = FeaturesToModel(input_name, output_name, model_type=ModelType.TRANSFORMER, custom_script=my_script)
-    to_model.set_output_tags(["smiles", "tautomerization"])
-    to_model.transform(target_column=None, feature_list=["smiles"], description="Tautomerize Smiles")
-    """
workbench/core/transforms/pandas_transforms/pandas_to_features.py
@@ -68,6 +68,15 @@ class PandasToFeatures(Transform):
         self.output_df = input_df.copy()
         self.one_hot_columns = one_hot_columns or []
 
+        # Warn about known AWS Iceberg bug with event_time_column
+        if event_time_column is not None:
+            self.log.warning(
+                f"event_time_column='{event_time_column}' specified. Note: AWS has a known bug with "
+                "Iceberg FeatureGroups where varying event times across multiple days can cause "
+                "duplicate rows in the offline store. Setting event_time_column=None."
+            )
+            self.event_time_column = None
+
         # Now Prepare the DataFrame for its journey into an AWS FeatureGroup
         self.prep_dataframe()
 
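Because of this guard, passing an `event_time_column` is now effectively a no-op with a warning, which is why the `__main__` change below can safely re-add `event_time_column="date"` to the test. A sketch of the observable behavior (the sample DataFrame is illustrative):

```python
import pandas as pd

from workbench.core.transforms.pandas_transforms.pandas_to_features import PandasToFeatures

data_df = pd.DataFrame({"id": [1, 2], "date": ["2024-01-01", "2024-01-02"], "food": ["pizza", "tacos"]})

df_to_features = PandasToFeatures("test_features")
# Logs a warning about the AWS Iceberg duplicate-row bug, then forces
# event_time_column=None before the DataFrame is prepped for the FeatureGroup
df_to_features.set_input(data_df, id_column="id", event_time_column="date")
```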
@@ -400,7 +409,7 @@ class PandasToFeatures(Transform):
 
         # Set Hold Out Ids (if we got them during creation)
         if self.incoming_hold_out_ids:
-            self.output_feature_set.set_training_holdouts(self.id_column, self.incoming_hold_out_ids)
+            self.output_feature_set.set_training_holdouts(self.incoming_hold_out_ids)
 
     def ensure_feature_group_created(self, feature_group):
         status = feature_group.describe().get("FeatureGroupStatus")
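This matches the `feature_set.py` signature change in this release (+48/-11): `set_training_holdouts()` no longer takes the id column, presumably resolving it from the FeatureSet itself. A hedged sketch of the caller-side change:

```python
from workbench.api import FeatureSet

fs = FeatureSet("test_features")
hold_out_ids = [1, 2, 3]  # hypothetical holdout ids

# 0.8.198: fs.set_training_holdouts("id", hold_out_ids)
# 0.8.203: the id-column argument is gone
fs.set_training_holdouts(hold_out_ids)
```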
@@ -462,7 +471,7 @@ if __name__ == "__main__":
 
     # Create my DF to Feature Set Transform (with one-hot encoding)
     df_to_features = PandasToFeatures("test_features")
-    df_to_features.set_input(data_df, id_column="id", one_hot_columns=["food"])
+    df_to_features.set_input(data_df, id_column="id", event_time_column="date", one_hot_columns=["food"])
     df_to_features.set_output_tags(["test", "small"])
     df_to_features.transform()
 
workbench/core/views/view.py
@@ -91,11 +91,11 @@ class View:
             self.table, self.data_source.database, self.data_source.boto3_session
         )
 
-    def pull_dataframe(self, limit: int = 50000) -> Union[pd.DataFrame, None]:
+    def pull_dataframe(self, limit: int = 100000) -> Union[pd.DataFrame, None]:
         """Pull a DataFrame based on the view type
 
         Args:
-            limit (int): The maximum number of rows to pull (default: 50000)
+            limit (int): The maximum number of rows to pull (default: 100000)
 
         Returns:
             Union[pd.DataFrame, None]: The DataFrame for the view or None if it doesn't exist
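Only the default row cap changes here (50,000 to 100,000); explicit limits behave as before. A sketch, with the view accessor chain hypothetical:

```python
from workbench.api import FeatureSet

view = FeatureSet("test_features").view("training")  # hypothetical accessor chain

df = view.pull_dataframe()                 # now capped at 100,000 rows by default
df_head = view.pull_dataframe(limit=1000)  # explicit limits are unchanged
```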