workbench 0.8.177__py3-none-any.whl → 0.8.178__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



@@ -17,7 +17,7 @@ from workbench.core.artifacts.artifact import Artifact
  from workbench.core.artifacts.data_source_factory import DataSourceFactory
  from workbench.core.artifacts.athena_source import AthenaSource

- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Optional

  from workbench.utils.aws_utils import aws_throttle

@@ -509,6 +509,25 @@ class FeatureSetCore(Artifact):
          ].tolist()
          return hold_out_ids

+     def set_training_filter(self, filter_expression: Optional[str] = None):
+         """Set a filter expression for the training view for this FeatureSet
+
+         Args:
+             filter_expression (Optional[str]): A SQL filter expression (e.g., "age > 25 AND status = 'active'")
+                 If None or empty string, will reset to default training view with no filter
+                 (default: None)
+         """
+         from workbench.core.views import TrainingView
+
+         # Grab the existing holdout ids
+         holdout_ids = self.get_training_holdouts()
+
+         # Create a NEW training view
+         self.log.important(f"Setting Training Filter: {filter_expression}")
+         TrainingView.create(
+             self, id_column=self.id_column, holdout_ids=holdout_ids, filter_expression=filter_expression
+         )
+
      @classmethod
      def delete_views(cls, table: str, database: str):
          """Delete any views associated with this FeatureSet
@@ -707,7 +726,7 @@ if __name__ == "__main__":

      # Test getting the holdout ids
      print("Getting the hold out ids...")
-     holdout_ids = my_features.get_training_holdouts("id")
+     holdout_ids = my_features.get_training_holdouts()
      print(f"Holdout IDs: {holdout_ids}")

      # Get a sample of the data
@@ -729,16 +748,26 @@ if __name__ == "__main__":
      table = my_features.view("training").table
      df = my_features.query(f'SELECT id, name FROM "{table}"')
      my_holdout_ids = [id for id in df["id"] if id < 20]
-     my_features.set_training_holdouts("id", my_holdout_ids)
-
-     # Test the hold out set functionality with strings
-     print("Setting hold out ids (strings)...")
-     my_holdout_ids = [name for name in df["name"] if int(name.split(" ")[1]) > 80]
-     my_features.set_training_holdouts("name", my_holdout_ids)
+     my_features.set_training_holdouts(my_holdout_ids)

      # Get the training data
      print("Getting the training data...")
      training_data = my_features.get_training_data()
+     print(f"Training Data: {training_data.shape}")
+
+     # Test the filter expression functionality
+     print("Setting a filter expression...")
+     my_features.set_training_filter("id < 50 AND height > 65.0")
+     training_data = my_features.get_training_data()
+     print(f"Training Data: {training_data.shape}")
+     print(training_data)
+
+     # Remove training filter
+     print("Removing the filter expression...")
+     my_features.set_training_filter(None)
+     training_data = my_features.get_training_data()
+     print(f"Training Data: {training_data.shape}")
+     print(training_data)

      # Now delete the AWS artifacts associated with this Feature Set
      # print("Deleting Workbench Feature Set...")
@@ -3,7 +3,7 @@
  from typing import Union

  # Workbench Imports
- from workbench.api import DataSource, FeatureSet
+ from workbench.api import FeatureSet
  from workbench.core.views.view import View
  from workbench.core.views.create_view import CreateView
  from workbench.core.views.view_utils import get_column_list
@@ -34,6 +34,7 @@ class TrainingView(CreateView):
          source_table: str = None,
          id_column: str = None,
          holdout_ids: Union[list[str], list[int], None] = None,
+         filter_expression: str = None,
      ) -> Union[View, None]:
          """Factory method to create and return a TrainingView instance.

@@ -42,6 +43,8 @@ class TrainingView(CreateView):
              source_table (str, optional): The table/view to create the view from. Defaults to None.
              id_column (str, optional): The name of the id column. Defaults to None.
              holdout_ids (Union[list[str], list[int], None], optional): A list of holdout ids. Defaults to None.
+             filter_expression (str, optional): SQL filter expression (e.g., "age > 25 AND status = 'active'").
+                 Defaults to None.

          Returns:
              Union[View, None]: The created View object (or None if failed to create the view)
@@ -69,28 +72,36 @@ class TrainingView(CreateView):
          else:
              id_column = instance.auto_id_column

-         # If we don't have holdout ids, create a default training view
-         if not holdout_ids:
-             instance._default_training_view(instance.data_source, id_column)
-             return View(instance.data_source, instance.view_name, auto_create_view=False)
+         # Enclose each column name in double quotes
+         sql_columns = ", ".join([f'"{column}"' for column in column_list])
+
+         # Build the training assignment logic
+         if holdout_ids:
+             # Format the list of holdout ids for SQL IN clause
+             if all(isinstance(id, str) for id in holdout_ids):
+                 formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+             else:
+                 formatted_holdout_ids = ", ".join(map(str, holdout_ids))

-         # Format the list of holdout ids for SQL IN clause
-         if holdout_ids and all(isinstance(id, str) for id in holdout_ids):
-             formatted_holdout_ids = ", ".join(f"'{id}'" for id in holdout_ids)
+             training_logic = f"""CASE
+                 WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
+                 ELSE True
+             END AS training"""
          else:
-             formatted_holdout_ids = ", ".join(map(str, holdout_ids))
+             # Default 80/20 split using modulo
+             training_logic = f"""CASE
+                 WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True
+                 ELSE False
+             END AS training"""

-         # Enclose each column name in double quotes
-         sql_columns = ", ".join([f'"{column}"' for column in column_list])
+         # Build WHERE clause if filter_expression is provided
+         where_clause = f"\nWHERE {filter_expression}" if filter_expression else ""

          # Construct the CREATE VIEW query
          create_view_query = f"""
          CREATE OR REPLACE VIEW {instance.table} AS
-         SELECT {sql_columns}, CASE
-             WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
-             ELSE True
-         END AS training
-         FROM {instance.source_table}
+         SELECT {sql_columns}, {training_logic}
+         FROM {instance.source_table}{where_clause}
          """

          # Execute the CREATE VIEW query
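To make the refactor concrete, here is a standalone sketch of the string assembly above, with made-up table and column values (placeholders, not package defaults):

    # Mirrors the training_logic / where_clause assembly from the hunk above
    id_column = "auto_id"
    sql_columns = '"auto_id", "diameter"'
    formatted_holdout_ids = ", ".join(map(str, [0, 1, 2]))
    filter_expression = "diameter > 0.5"

    training_logic = f"""CASE
        WHEN {id_column} IN ({formatted_holdout_ids}) THEN False
        ELSE True
    END AS training"""
    where_clause = f"\nWHERE {filter_expression}" if filter_expression else ""

    print(f"""
    CREATE OR REPLACE VIEW my_view AS
    SELECT {sql_columns}, {training_logic}
    FROM my_source_table{where_clause}
    """)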
@@ -99,43 +110,13 @@ class TrainingView(CreateView):
          # Return the View
          return View(instance.data_source, instance.view_name, auto_create_view=False)

-     # This is an internal method that's used to create a default training view
-     def _default_training_view(self, data_source: DataSource, id_column: str):
-         """Create a default view in Athena that assigns roughly 80% of the data to training
-
-         Args:
-             data_source (DataSource): The Workbench DataSource object
-             id_column (str): The name of the id column
-         """
-         self.log.important(f"Creating default Training View {self.table}...")
-
-         # Drop any columns generated from AWS
-         aws_cols = ["write_time", "api_invocation_time", "is_deleted", "event_time"]
-         column_list = [col for col in data_source.columns if col not in aws_cols]
-
-         # Enclose each column name in double quotes
-         sql_columns = ", ".join([f'"{column}"' for column in column_list])
-
-         # Construct the CREATE VIEW query with a simple modulo operation for the 80/20 split
-         create_view_query = f"""
-         CREATE OR REPLACE VIEW "{self.table}" AS
-         SELECT {sql_columns}, CASE
-             WHEN MOD(ROW_NUMBER() OVER (ORDER BY {id_column}), 10) < 8 THEN True  -- Assign 80% to training
-             ELSE False  -- Assign roughly 20% to validation/test
-         END AS training
-         FROM {self.base_table_name}
-         """
-
-         # Execute the CREATE VIEW query
-         data_source.execute_statement(create_view_query)
-

  if __name__ == "__main__":
      """Exercise the Training View functionality"""
      from workbench.api import FeatureSet

      # Get the FeatureSet
-     fs = FeatureSet("test_features")
+     fs = FeatureSet("abalone_features")

      # Delete the existing training view
      training_view = TrainingView.create(fs)
@@ -152,9 +133,18 @@ if __name__ == "__main__":

      # Create a TrainingView with holdout ids
      my_holdout_ids = list(range(10))
-     training_view = TrainingView.create(fs, id_column="id", holdout_ids=my_holdout_ids)
+     training_view = TrainingView.create(fs, id_column="auto_id", holdout_ids=my_holdout_ids)

      # Pull the training data
      df = training_view.pull_dataframe()
      print(df.head())
      print(df["training"].value_counts())
+     print(f"Shape: {df.shape}")
+     print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
+
+     # Test the filter expression
+     training_view = TrainingView.create(fs, id_column="auto_id", filter_expression="diameter > 0.5")
+     df = training_view.pull_dataframe()
+     print(df.head())
+     print(f"Shape with filter: {df.shape}")
+     print(f"Diameter min: {df['diameter'].min()}, max: {df['diameter'].max()}")
@@ -22,7 +22,7 @@ from typing import List, Tuple

  # Template Placeholders
  TEMPLATE_PARAMS = {
- "target": "udm_asy_res_value",
+ "target": "logs",
  "features": ['chi2v', 'fr_sulfone', 'chi1v', 'bcut2d_logplow', 'fr_piperzine', 'kappa3', 'smr_vsa1', 'slogp_vsa5', 'fr_ketone_topliss', 'fr_sulfonamd', 'fr_imine', 'fr_benzene', 'fr_ester', 'chi2n', 'labuteasa', 'peoe_vsa2', 'smr_vsa6', 'bcut2d_chglo', 'fr_sh', 'peoe_vsa1', 'fr_allylic_oxid', 'chi4n', 'fr_ar_oh', 'fr_nh0', 'fr_term_acetylene', 'slogp_vsa7', 'slogp_vsa4', 'estate_vsa1', 'vsa_estate4', 'numbridgeheadatoms', 'numheterocycles', 'fr_ketone', 'fr_morpholine', 'fr_guanido', 'estate_vsa2', 'numheteroatoms', 'fr_nitro_arom_nonortho', 'fr_piperdine', 'nocount', 'numspiroatoms', 'fr_aniline', 'fr_thiophene', 'slogp_vsa10', 'fr_amide', 'slogp_vsa2', 'fr_epoxide', 'vsa_estate7', 'fr_ar_coo', 'fr_imidazole', 'fr_nitrile', 'fr_oxazole', 'numsaturatedrings', 'fr_pyridine', 'fr_hoccn', 'fr_ndealkylation1', 'numaliphaticheterocycles', 'fr_phenol', 'maxpartialcharge', 'vsa_estate5', 'peoe_vsa13', 'minpartialcharge', 'qed', 'fr_al_oh', 'slogp_vsa11', 'chi0n', 'fr_bicyclic', 'peoe_vsa12', 'fpdensitymorgan1', 'fr_oxime', 'molwt', 'fr_dihydropyridine', 'smr_vsa5', 'peoe_vsa5', 'fr_nitro', 'hallkieralpha', 'heavyatommolwt', 'fr_alkyl_halide', 'peoe_vsa8', 'fr_nhpyrrole', 'fr_isocyan', 'bcut2d_chghi', 'fr_lactam', 'peoe_vsa11', 'smr_vsa9', 'tpsa', 'chi4v', 'slogp_vsa1', 'phi', 'bcut2d_logphi', 'avgipc', 'estate_vsa11', 'fr_coo', 'bcut2d_mwhi', 'numunspecifiedatomstereocenters', 'vsa_estate10', 'estate_vsa8', 'numvalenceelectrons', 'fr_nh2', 'fr_lactone', 'vsa_estate1', 'estate_vsa4', 'numatomstereocenters', 'vsa_estate8', 'fr_para_hydroxylation', 'peoe_vsa3', 'fr_thiazole', 'peoe_vsa10', 'fr_ndealkylation2', 'slogp_vsa12', 'peoe_vsa9', 'maxestateindex', 'fr_quatn', 'smr_vsa7', 'minestateindex', 'numaromaticheterocycles', 'numrotatablebonds', 'fr_ar_nh', 'fr_ether', 'exactmolwt', 'fr_phenol_noorthohbond', 'slogp_vsa3', 'fr_ar_n', 'sps', 'fr_c_o_nocoo', 'bertzct', 'peoe_vsa7', 'slogp_vsa8', 'numradicalelectrons', 'molmr', 'fr_tetrazole', 'numsaturatedcarbocycles', 'bcut2d_mrhi', 'kappa1', 'numamidebonds', 'fpdensitymorgan2', 'smr_vsa8', 'chi1n', 'estate_vsa6', 'fr_barbitur', 'fr_diazo', 'kappa2', 'chi0', 'bcut2d_mrlow', 'balabanj', 'peoe_vsa4', 'numhacceptors', 'fr_sulfide', 'chi3n', 'smr_vsa2', 'fr_al_oh_notert', 'fr_benzodiazepine', 'fr_phos_ester', 'fr_aldehyde', 'fr_coo2', 'estate_vsa5', 'fr_prisulfonamd', 'numaromaticcarbocycles', 'fr_unbrch_alkane', 'fr_urea', 'fr_nitroso', 'smr_vsa10', 'fr_c_s', 'smr_vsa3', 'fr_methoxy', 'maxabspartialcharge', 'slogp_vsa9', 'heavyatomcount', 'fr_azide', 'chi3v', 'smr_vsa4', 'mollogp', 'chi0v', 'fr_aryl_methyl', 'fr_nh1', 'fpdensitymorgan3', 'fr_furan', 'fr_hdrzine', 'fr_arn', 'numaromaticrings', 'vsa_estate3', 'fr_azo', 'fr_halogen', 'estate_vsa9', 'fr_hdrzone', 'numhdonors', 'fr_alkyl_carbamate', 'fr_isothiocyan', 'minabspartialcharge', 'fr_al_coo', 'ringcount', 'chi1', 'estate_vsa7', 'fr_nitro_arom', 'vsa_estate9', 'minabsestateindex', 'maxabsestateindex', 'vsa_estate6', 'estate_vsa10', 'estate_vsa3', 'fr_n_o', 'fr_amidine', 'fr_thiocyan', 'fr_phos_acid', 'fr_c_o', 'fr_imide', 'numaliphaticrings', 'peoe_vsa6', 'vsa_estate2', 'nhohcount', 'numsaturatedheterocycles', 'slogp_vsa6', 'peoe_vsa14', 'fractioncsp3', 'bcut2d_mwlow', 'numaliphaticcarbocycles', 'fr_priamide', 'nacid', 'nbase', 'naromatom', 'narombond', 'sz', 'sm', 'sv', 'sse', 'spe', 'sare', 'sp', 'si', 'mz', 'mm', 'mv', 'mse', 'mpe', 'mare', 'mp', 'mi', 'xch_3d', 'xch_4d', 'xch_5d', 'xch_6d', 'xch_7d', 'xch_3dv', 'xch_4dv', 'xch_5dv', 'xch_6dv', 'xch_7dv', 'xc_3d', 'xc_4d', 'xc_5d', 'xc_6d', 
'xc_3dv', 'xc_4dv', 'xc_5dv', 'xc_6dv', 'xpc_4d', 'xpc_5d', 'xpc_6d', 'xpc_4dv', 'xpc_5dv', 'xpc_6dv', 'xp_0d', 'xp_1d', 'xp_2d', 'xp_3d', 'xp_4d', 'xp_5d', 'xp_6d', 'xp_7d', 'axp_0d', 'axp_1d', 'axp_2d', 'axp_3d', 'axp_4d', 'axp_5d', 'axp_6d', 'axp_7d', 'xp_0dv', 'xp_1dv', 'xp_2dv', 'xp_3dv', 'xp_4dv', 'xp_5dv', 'xp_6dv', 'xp_7dv', 'axp_0dv', 'axp_1dv', 'axp_2dv', 'axp_3dv', 'axp_4dv', 'axp_5dv', 'axp_6dv', 'axp_7dv', 'c1sp1', 'c2sp1', 'c1sp2', 'c2sp2', 'c3sp2', 'c1sp3', 'c2sp3', 'c3sp3', 'c4sp3', 'hybratio', 'fcsp3', 'num_stereocenters', 'num_unspecified_stereocenters', 'num_defined_stereocenters', 'num_r_centers', 'num_s_centers', 'num_stereobonds', 'num_e_bonds', 'num_z_bonds', 'stereo_complexity', 'frac_defined_stereo'],
  "compressed_features": [],
  "train_all_data": True
@@ -242,7 +242,7 @@ if __name__ == "__main__":
      print(f"R2: {xgb_r2:.3f}")

      # Define confidence levels we want to model
-     confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+     confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals

      # Store MAPIE models for each confidence level
      mapie_models = {}
@@ -459,6 +459,9 @@ def predict_fn(df, models) -> pd.DataFrame:
          if conf_level == 0.50:  # 50% CI
              df["q_25"] = y_pis[:, 0, 0]
              df["q_75"] = y_pis[:, 1, 0]
+         elif conf_level == 0.68:  # 68% CI
+             df["q_16"] = y_pis[:, 0, 0]
+             df["q_84"] = y_pis[:, 1, 0]
          elif conf_level == 0.80:  # 80% CI
              df["q_10"] = y_pis[:, 0, 0]
              df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +475,16 @@ def predict_fn(df, models) -> pd.DataFrame:
      # Add median (q_50) from XGBoost prediction
      df["q_50"] = df["prediction"]

-     # Calculate uncertainty metrics based on 95% interval
-     interval_width = df["q_975"] - df["q_025"]
-     df["prediction_std"] = interval_width / 3.92
+     # Calculate a pseudo-standard deviation from the 68% interval width
+     df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

      # Reorder the quantile columns for easier reading
-     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
      other_cols = [col for col in df.columns if col not in quantile_cols]
      df = df[other_cols + quantile_cols]

-     # Uncertainty score
-     df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
-
-     # Confidence bands
-     df["confidence_band"] = pd.cut(
-         df["uncertainty_score"],
-         bins=[0, 0.5, 1.0, 2.0, np.inf],
-         labels=["high", "medium", "low", "very_low"]
-     )
+     # Adjust the outer quantiles to ensure they encompass the prediction
+     df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+     df["q_975"] = np.maximum(df["q_975"], df["prediction"])

      return df
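The switch to (q_84 - q_16) / 2.0 works because, for a Gaussian, the 16th and 84th percentiles sit roughly one standard deviation either side of the mean. A quick standalone check on synthetic data (not from the package):

    import numpy as np

    rng = np.random.default_rng(42)
    y = rng.normal(loc=10.0, scale=2.0, size=100_000)
    q_16, q_84 = np.quantile(y, [0.16, 0.84])
    print((q_84 - q_16) / 2.0)  # ~2.0, recovering the true scale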
@@ -242,7 +242,7 @@ if __name__ == "__main__":
      print(f"R2: {xgb_r2:.3f}")

      # Define confidence levels we want to model
-     confidence_levels = [0.50, 0.80, 0.90, 0.95]  # 50%, 80%, 90%, 95% confidence intervals
+     confidence_levels = [0.50, 0.68, 0.80, 0.90, 0.95]  # 50%, 68%, 80%, 90%, 95% confidence intervals

      # Store MAPIE models for each confidence level
      mapie_models = {}
@@ -459,6 +459,9 @@ def predict_fn(df, models) -> pd.DataFrame:
          if conf_level == 0.50:  # 50% CI
              df["q_25"] = y_pis[:, 0, 0]
              df["q_75"] = y_pis[:, 1, 0]
+         elif conf_level == 0.68:  # 68% CI
+             df["q_16"] = y_pis[:, 0, 0]
+             df["q_84"] = y_pis[:, 1, 0]
          elif conf_level == 0.80:  # 80% CI
              df["q_10"] = y_pis[:, 0, 0]
              df["q_90"] = y_pis[:, 1, 0]
@@ -472,23 +475,16 @@ def predict_fn(df, models) -> pd.DataFrame:
      # Add median (q_50) from XGBoost prediction
      df["q_50"] = df["prediction"]

-     # Calculate uncertainty metrics based on 50% interval
-     interval_width = df["q_75"] - df["q_25"]
-     df["prediction_std"] = interval_width / 1.348
+     # Calculate a pseudo-standard deviation from the 68% interval width
+     df["prediction_std"] = (df["q_84"] - df["q_16"]) / 2.0

      # Reorder the quantile columns for easier reading
-     quantile_cols = ["q_025", "q_05", "q_10", "q_25", "q_75", "q_90", "q_95", "q_975"]
+     quantile_cols = ["q_025", "q_05", "q_10", "q_16", "q_25", "q_75", "q_84", "q_90", "q_95", "q_975"]
      other_cols = [col for col in df.columns if col not in quantile_cols]
      df = df[other_cols + quantile_cols]

-     # Uncertainty score
-     df["uncertainty_score"] = interval_width / (np.abs(df["prediction"]) + 1e-6)
-
-     # Confidence bands
-     df["confidence_band"] = pd.cut(
-         df["uncertainty_score"],
-         bins=[0, 0.5, 1.0, 2.0, np.inf],
-         labels=["high", "medium", "low", "very_low"]
-     )
+     # Adjust the outer quantiles to ensure they encompass the prediction
+     df["q_025"] = np.minimum(df["q_025"], df["prediction"])
+     df["q_975"] = np.maximum(df["q_975"], df["prediction"])

      return df
@@ -13,12 +13,13 @@ cm = ConfigManager()
  workbench_bucket = cm.get_config("WORKBENCH_BUCKET")


- def submit_to_sqs(script_path: str, size: str = "small") -> None:
+ def submit_to_sqs(script_path: str, size: str = "small", realtime: bool = False) -> None:
      """
      Upload script to S3 and submit message to SQS queue for processing.
      Args:
          script_path: Local path to the ML pipeline script
          size: Job size tier - "small" (default), "medium", or "large"
+         realtime: If True, sets serverless=False for real-time processing (default: False, meaning serverless=True)
      """
      print(f"\n{'=' * 60}")
      print("🚀 SUBMITTING ML PIPELINE JOB")
@@ -33,6 +34,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:

      print(f"📄 Script: {script_file.name}")
      print(f"📏 Size tier: {size}")
+     print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (serverless={'False' if realtime else 'True'})")
      print(f"🪣 Bucket: {workbench_bucket}")
      sqs = AWSAccountClamp().boto3_session.client("sqs")
      script_name = script_file.name
@@ -88,6 +90,10 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:

      # Prepare message
      message = {"script_path": s3_path, "size": size}
+
+     # Set serverless environment variable (defaults to True, False if --realtime)
+     message["environment"] = {"SERVERLESS": "False" if realtime else "True"}
+
      print("\n📨 Sending message to SQS...")

      # Send the message to SQS
@@ -110,6 +116,7 @@ def submit_to_sqs(script_path: str, size: str = "small") -> None:
      print(f"{'=' * 60}")
      print(f"📄 Script: {script_name}")
      print(f"📏 Size: {size}")
+     print(f"⚡ Mode: {'Real-time' if realtime else 'Serverless'} (SERVERLESS={'False' if realtime else 'True'})")
      print(f"🆔 Message ID: {message_id}")
      print("\n🔍 MONITORING LOCATIONS:")
      print(f"  • SQS Queue: AWS Console → SQS → {queue_name}")
@@ -126,9 +133,14 @@ def main():
      parser.add_argument(
          "--size", default="small", choices=["small", "medium", "large"], help="Job size tier (default: small)"
      )
+     parser.add_argument(
+         "--realtime",
+         action="store_true",
+         help="Run in real-time mode (sets serverless=False). Default is serverless mode (serverless=True)",
+     )
      args = parser.parse_args()
      try:
-         submit_to_sqs(args.script_file, args.size)
+         submit_to_sqs(args.script_file, args.size, realtime=args.realtime)
      except Exception as e:
          print(f"\n❌ ERROR: {e}")
          log.error(f"Error: {e}")
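Putting these script changes together, the SQS payload now carries an explicit serverless flag. A sketch of the message shape (the S3 path below is a made-up placeholder):

    # Shape of the message submit_to_sqs() now sends
    message = {
        "script_path": "s3://my-workbench-bucket/scripts/my_pipeline.py",  # placeholder
        "size": "medium",
        "environment": {"SERVERLESS": "False"},  # "True" unless --realtime is passed
    }

Only the new --realtime flag flips this to real-time mode; the default remains serverless.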
@@ -222,6 +222,7 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
          lower_95, upper_95 = df["q_025"], df["q_975"]
          lower_90, upper_90 = df["q_05"], df["q_95"]
          lower_80, upper_80 = df["q_10"], df["q_90"]
+         lower_68, upper_68 = df["q_16"], df["q_84"]
          lower_50, upper_50 = df["q_25"], df["q_75"]
      elif "prediction_std" in df.columns:
          lower_95 = df["prediction"] - 1.96 * df["prediction_std"]
@@ -230,6 +231,8 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
          upper_90 = df["prediction"] + 1.645 * df["prediction_std"]
          lower_80 = df["prediction"] - 1.282 * df["prediction_std"]
          upper_80 = df["prediction"] + 1.282 * df["prediction_std"]
+         lower_68 = df["prediction"] - 1.0 * df["prediction_std"]
+         upper_68 = df["prediction"] + 1.0 * df["prediction_std"]
          lower_50 = df["prediction"] - 0.674 * df["prediction_std"]
          upper_50 = df["prediction"] + 0.674 * df["prediction_std"]
      else:
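The hard-coded multipliers in this branch are the standard two-sided Gaussian z-values. A quick sanity check (assumes scipy is available, which this diff does not itself show):

    from scipy.stats import norm

    for level in [0.50, 0.68, 0.80, 0.90, 0.95]:
        z = norm.ppf(0.5 + level / 2.0)  # two-sided z multiplier
        print(f"{level:.2f} -> z = {z:.3f}")
    # 0.50 -> 0.674, 0.68 -> 0.994 (~1.0), 0.80 -> 1.282, 0.90 -> 1.645, 0.95 -> 1.960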
@@ -241,11 +244,13 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
      coverage_95 = np.mean((df[target_col] >= lower_95) & (df[target_col] <= upper_95))
      coverage_90 = np.mean((df[target_col] >= lower_90) & (df[target_col] <= upper_90))
      coverage_80 = np.mean((df[target_col] >= lower_80) & (df[target_col] <= upper_80))
+     coverage_68 = np.mean((df[target_col] >= lower_68) & (df[target_col] <= upper_68))
      coverage_50 = np.mean((df[target_col] >= lower_50) & (df[target_col] <= upper_50))
      avg_width_95 = np.mean(upper_95 - lower_95)
      avg_width_90 = np.mean(upper_90 - lower_90)
      avg_width_80 = np.mean(upper_80 - lower_80)
      avg_width_50 = np.mean(upper_50 - lower_50)
+     avg_width_68 = np.mean(upper_68 - lower_68)

      # --- CRPS (measures calibration + sharpness) ---
      z = (df[target_col] - df["prediction"]) / df["prediction_std"]
@@ -269,12 +274,14 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:
      # Collect results
      results = {
          "coverage_50": coverage_50,
+         "coverage_68": coverage_68,
          "coverage_80": coverage_80,
          "coverage_90": coverage_90,
          "coverage_95": coverage_95,
-         "avg_std": avg_std,
          "median_std": median_std,
+         "avg_std": avg_std,
          "avg_width_50": avg_width_50,
+         "avg_width_68": avg_width_68,
          "avg_width_80": avg_width_80,
          "avg_width_90": avg_width_90,
          "avg_width_95": avg_width_95,
@@ -286,12 +293,14 @@ def uq_metrics(df: pd.DataFrame, target_col: str) -> Dict[str, Any]:

      print("\n=== UQ Metrics ===")
      print(f"Coverage @ 50%: {coverage_50:.3f} (target: 0.50)")
+     print(f"Coverage @ 68%: {coverage_68:.3f} (target: 0.68)")
      print(f"Coverage @ 80%: {coverage_80:.3f} (target: 0.80)")
      print(f"Coverage @ 90%: {coverage_90:.3f} (target: 0.90)")
      print(f"Coverage @ 95%: {coverage_95:.3f} (target: 0.95)")
-     print(f"Avg Prediction StdDev: {avg_std:.3f}")
      print(f"Median Prediction StdDev: {median_std:.3f}")
+     print(f"Avg Prediction StdDev: {avg_std:.3f}")
      print(f"Average 50% Width: {avg_width_50:.3f}")
+     print(f"Average 68% Width: {avg_width_68:.3f}")
      print(f"Average 80% Width: {avg_width_80:.3f}")
      print(f"Average 90% Width: {avg_width_90:.3f}")
      print(f"Average 95% Width: {avg_width_95:.3f}")
@@ -259,7 +259,7 @@ def cross_fold_inference(workbench_model: Any, nfolds: int = 5) -> Dict[str, Any
      xgb_model._Booster = loaded_booster
      # Prepare data
      fs = FeatureSet(workbench_model.get_input())
-     df = fs.pull_dataframe()
+     df = fs.view("training").pull_dataframe()
      feature_cols = workbench_model.features()
      # Convert string features to categorical
      for col in feature_cols:
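This one-line change means cross-fold evaluation now pulls rows through the training view, so any filter_expression or holdouts set above are reflected, rather than reading the raw FeatureSet. A minimal sketch, reusing the illustrative FeatureSet name from elsewhere in this diff:

    from workbench.api import FeatureSet

    fs = FeatureSet("abalone_features")        # illustrative name
    df = fs.view("training").pull_dataframe()  # honors the view's filter and holdout logic
    print(df["training"].value_counts())       # the view adds a boolean "training" column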
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: workbench
- Version: 0.8.177
+ Version: 0.8.178
  Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
  Author-email: SuperCowPowers LLC <support@supercowpowers.com>
  License-Expression: MIT
@@ -55,7 +55,7 @@ workbench/core/artifacts/data_capture_core.py,sha256=q8f79rRTYiZ7T4IQRWXl8ZvPpcv
  workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99vQAmQ0WHE5jcm5RfsVTg,10619
  workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
  workbench/core/artifacts/endpoint_core.py,sha256=Q6wL0IpMgCkVssX-BvPwawgogQjq9klSaoBUZ6tEIuc,49146
- workbench/core/artifacts/feature_set_core.py,sha256=055VdSYR09HP4ygAuYvIYtHQ7Ec4XxsZygpgEl5H5jQ,29136
+ workbench/core/artifacts/feature_set_core.py,sha256=0wvW4VyZii0GmO6tBudoGEqZktLtb6spDyIkn7MkDcw,30292
  workbench/core/artifacts/model_core.py,sha256=ECDwQ0qM5qb1yGJ07U70BVdfkrW9m7p9e6YJWib3uR0,50855
  workbench/core/artifacts/monitor_core.py,sha256=M307yz7tEzOEHgv-LmtVy9jKjSbM98fHW3ckmNYrwlU,27897
  workbench/core/cloud_platform/cloud_meta.py,sha256=-g4-LTC3D0PXb3VfaXdLR1ERijKuHdffeMK_zhD-koQ,8809
@@ -118,7 +118,7 @@ workbench/core/views/create_view.py,sha256=2Ykzb2NvJGoD4PP4k2Bka46GDog9iGG5SWnAc
  workbench/core/views/display_view.py,sha256=9K4O77ZnKOh93aMRhxcQJQ1lqScLhuJnU_tHtYZ_U4E,2598
  workbench/core/views/inference_view.py,sha256=9s70M0dFdGq0tWvzMZfgUK7EPKtuvcQhux0uyRZuuLM,3293
  workbench/core/views/pandas_to_view.py,sha256=20uCsnG2iMh-U1VxqVUUtnrWAY98SeuHjmfJK_wcq1I,6422
- workbench/core/views/training_view.py,sha256=mUkv1oVhDG-896RdLNKxCg0j0yvudEcPnvL5EH8WERQ,6359
+ workbench/core/views/training_view.py,sha256=UWW8Asxtm_kV7Z8NooitMA4xC5vTc7lSWwTGbLdifqY,5900
  workbench/core/views/view.py,sha256=Ujzw6zLROP9oKfKm3zJwaOyfpyjh5uM9fAu1i3kUOig,11764
  workbench/core/views/view_utils.py,sha256=y0YuPW-90nAfgAD1UW_49-j7Mvncfm7-5rV8I_97CK8,12274
  workbench/core/views/storage/mdq_view.py,sha256=qf_ep1KwaXOIfO930laEwNIiCYP7VNOqjE3VdHfopRE,5195
@@ -140,8 +140,8 @@ workbench/model_scripts/custom_models/uq_models/Readme.md,sha256=UVpL-lvtTrLqwBe
  workbench/model_scripts/custom_models/uq_models/bayesian_ridge.template,sha256=U4LIlpp8Rbu3apyzPR7-55lvlutpTsCro_PUvQ5pklY,6457
  workbench/model_scripts/custom_models/uq_models/ensemble_xgb.template,sha256=0IJnSBACQ556ldEiPqR7yPCOOLJs1hQhHmPBvB2d9tY,13491
  workbench/model_scripts/custom_models/uq_models/gaussian_process.template,sha256=QbDUfkiPCwJ-c-4Twgu4utZuYZaAyeW_3T1IP-_tutw,6683
- workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=AcLf-vXOmn_vpTeiKpNKCW_dRhR8Co1sMFC84EPT4IE,22392
- workbench/model_scripts/custom_models/uq_models/mapie.template,sha256=Vou_g0ux-KOrs36S98g27Y8ckU9sdYrKWwypJjasQX4,18180
+ workbench/model_scripts/custom_models/uq_models/generated_model_script.py,sha256=DUH80Y-We_-3OomUNjvBdRPrNQLQb3zlSsKZIPiglU4,22402
+ workbench/model_scripts/custom_models/uq_models/mapie.template,sha256=SHP1Sd-nWMVF5sgB9Ski6C4IkQlm4g0EqpnJT1GfHl4,18204
  workbench/model_scripts/custom_models/uq_models/meta_uq.template,sha256=eawh0Fp3DhbdCXzWN6KloczT5ZS_ou4ayW65yUTTE4o,14109
  workbench/model_scripts/custom_models/uq_models/ngboost.template,sha256=9-O6P-SW50ul5Wl6es2DMWXSbrwOg7HWsdc8Qdln0MM,8278
  workbench/model_scripts/custom_models/uq_models/proximity.py,sha256=zqmNlX70LnWXr5fdtFFQppSNTLjlOciQVrjGr-g9jRE,13716
@@ -169,7 +169,7 @@ workbench/resources/signature_verify_pub.pem,sha256=V3-u-3_z2PH-805ybkKvzDOBwAbv
  workbench/scripts/check_double_bond_stereo.py,sha256=p5hnL54Weq77ES0HCELq9JeoM-PyUGkvVSeWYF2dKyo,7776
  workbench/scripts/glue_launcher.py,sha256=bIKQvfGxpAhzbeNvTnHfRW_5kQhY-169_868ZnCejJk,10692
  workbench/scripts/ml_pipeline_batch.py,sha256=1T5JnLlUJR7bwAGBLHmLPOuj1xFRqVIQX8PsuDhHy8o,4907
- workbench/scripts/ml_pipeline_sqs.py,sha256=7w67UUuZNYnxXiZG48gpoEFbH-c_cUfjMg0FgWI0DbQ,5100
+ workbench/scripts/ml_pipeline_sqs.py,sha256=COewJcFYuv5Pa_l0q0PA4ZZb-AQ_7opKJP4JTEKBQ2c,5847
  workbench/scripts/monitor_cloud_watch.py,sha256=s7MY4bsHts0nup9G0lWESCvgJZ9Mw1Eo-c8aKRgLjMw,9235
  workbench/scripts/redis_expire.py,sha256=DxI_RKSNlrW2BsJZXcsSbaWGBgPZdPhtzHjV9SUtElE,1120
  workbench/scripts/redis_report.py,sha256=iaJSuGPyLCs6e0TMcZDoT0YyJ43xJ1u74YD8FLnnUg4,990
@@ -219,7 +219,7 @@ workbench/utils/lambda_utils.py,sha256=7GhGRPyXn9o-toWb9HBGSnI8-DhK9YRkwhCSk_mNK
  workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYwoo7ho,6975
  workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
  workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
- workbench/utils/model_utils.py,sha256=7TYxTa2KCoLJfJ47QcnzmibMwKHX3bP37-sPvfqgdVM,12273
+ workbench/utils/model_utils.py,sha256=97yqEEeGLV8KSDt_RTGexcUEK1wU_UnmLj-cfuryPOs,12779
  workbench/utils/monitor_utils.py,sha256=kVaJ7BgUXs3VPMFYfLC03wkIV4Dq-pEhoXS0wkJFxCc,7858
  workbench/utils/pandas_utils.py,sha256=uTUx-d1KYfjbS9PMQp2_9FogCV7xVZR6XLzU5YAGmfs,39371
  workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
@@ -242,7 +242,7 @@ workbench/utils/workbench_cache.py,sha256=IQchxB81iR4eVggHBxUJdXxUCRkqWz1jKe5gxN
  workbench/utils/workbench_event_bridge.py,sha256=z1GmXOB-Qs7VOgC6Hjnp2DI9nSEWepaSXejACxTIR7o,4150
  workbench/utils/workbench_logging.py,sha256=WCuMWhQwibrvcGAyj96h2wowh6dH7zNlDJ7sWUzdCeI,10263
  workbench/utils/workbench_sqs.py,sha256=RwM80z7YWwdtMaCKh7KWF8v38f7eBRU7kyC7ZhTRuI0,2072
- workbench/utils/xgboost_model_utils.py,sha256=iiDJH0O81aO6aOTwgssqQygvTgjE7lRDRzLJ_fI3RVs,15554
+ workbench/utils/xgboost_model_utils.py,sha256=NNcALcBNOveqkIJiG7Wh7DS0O95RlGE3GZJbdSB8XWY,15571
  workbench/utils/chem_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  workbench/utils/chem_utils/fingerprints.py,sha256=Qvs8jaUwguWUq3Q3j695MY0t0Wk3BvroW-oWBwalMUo,5255
  workbench/utils/chem_utils/misc.py,sha256=Nevf8_opu-uIPrv_1_0ubuFVVo2_fGUkMoLAHB3XAeo,7372
@@ -287,9 +287,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
  workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
  workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
  workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
- workbench-0.8.177.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
- workbench-0.8.177.dist-info/METADATA,sha256=sjKEEHLha3-tDo9uYsRtpjPTHV_pj5PkucHuc2WWxBM,9210
- workbench-0.8.177.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- workbench-0.8.177.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
- workbench-0.8.177.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
- workbench-0.8.177.dist-info/RECORD,,
+ workbench-0.8.178.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
+ workbench-0.8.178.dist-info/METADATA,sha256=kS1snm2EjzaXVrpsg3TX28OmXqYDdZD1K7kQ0lXhNg8,9210
+ workbench-0.8.178.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ workbench-0.8.178.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
+ workbench-0.8.178.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+ workbench-0.8.178.dist-info/RECORD,,