wizata-dsapi 2.0.0.dev25__tar.gz → 2.0.0.dev27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {wizata_dsapi-2.0.0.dev25/wizata_dsapi.egg-info → wizata_dsapi-2.0.0.dev27}/PKG-INFO +1 -1
  2. wizata_dsapi-2.0.0.dev27/wizata_dsapi/models/__init__.py +1 -0
  3. wizata_dsapi-2.0.0.dev27/wizata_dsapi/models/common.py +681 -0
  4. wizata_dsapi-2.0.0.dev27/wizata_dsapi/plots/__init__.py +2 -0
  5. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/plots/common.py +141 -0
  6. wizata_dsapi-2.0.0.dev27/wizata_dsapi/version.py +1 -0
  7. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27/wizata_dsapi.egg-info}/PKG-INFO +1 -1
  8. wizata_dsapi-2.0.0.dev25/wizata_dsapi/models/__init__.py +0 -1
  9. wizata_dsapi-2.0.0.dev25/wizata_dsapi/models/common.py +0 -272
  10. wizata_dsapi-2.0.0.dev25/wizata_dsapi/plots/__init__.py +0 -2
  11. wizata_dsapi-2.0.0.dev25/wizata_dsapi/version.py +0 -1
  12. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/LICENSE.txt +0 -0
  13. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/README.rst +0 -0
  14. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/setup.cfg +0 -0
  15. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/setup.py +0 -0
  16. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/__init__.py +0 -0
  17. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/api_config.py +0 -0
  18. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/api_dto.py +0 -0
  19. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/api_interface.py +0 -0
  20. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/bucket.py +0 -0
  21. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/business_label.py +0 -0
  22. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/context.py +0 -0
  23. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/dashboard.py +0 -0
  24. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/dataframe_toolkit.py +0 -0
  25. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/datapoint.py +0 -0
  26. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/datastore.py +0 -0
  27. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/ds_dataframe.py +0 -0
  28. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/dsapi_json_encoder.py +0 -0
  29. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/edge_config.py +0 -0
  30. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/edge_device.py +0 -0
  31. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/edge_module.py +0 -0
  32. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/evaluation.py +0 -0
  33. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/execution.py +0 -0
  34. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/execution_log.py +0 -0
  35. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/experiment.py +0 -0
  36. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/graylog_log.py +0 -0
  37. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/group_system.py +0 -0
  38. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/ilogger.py +0 -0
  39. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/insight.py +0 -0
  40. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/mlmodel.py +0 -0
  41. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/mobile_asset.py +0 -0
  42. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/model_toolkit.py +0 -0
  43. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/notification.py +0 -0
  44. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/paged_query_result.py +0 -0
  45. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/pipeline.py +0 -0
  46. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/pipeline_image.py +0 -0
  47. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/plot.py +0 -0
  48. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/plots/theme.py +0 -0
  49. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/request.py +0 -0
  50. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/script.py +0 -0
  51. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/scripts/__init__.py +0 -0
  52. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/scripts/common.py +0 -0
  53. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/search.py +0 -0
  54. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/solution_component.py +0 -0
  55. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/streamlit_utils.py +0 -0
  56. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/template.py +0 -0
  57. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/template_config.py +0 -0
  58. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/trigger.py +0 -0
  59. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/twin.py +0 -0
  60. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/twinregistration.py +0 -0
  61. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/user.py +0 -0
  62. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/wizard_function.py +0 -0
  63. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/wizard_request.py +0 -0
  64. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/wizata_dsapi_client.py +0 -0
  65. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi/words.py +0 -0
  66. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi.egg-info/SOURCES.txt +0 -0
  67. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi.egg-info/dependency_links.txt +0 -0
  68. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi.egg-info/requires.txt +0 -0
  69. {wizata_dsapi-2.0.0.dev25 → wizata_dsapi-2.0.0.dev27}/wizata_dsapi.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: wizata_dsapi
3
- Version: 2.0.0.dev25
3
+ Version: 2.0.0.dev27
4
4
  Summary: Wizata Data Science Toolkit
5
5
  Author: Wizata S.A.
6
6
  Author-email: info@wizata.com
@@ -0,0 +1 @@
1
+ from .common import linear_regression, logistic_regression, isolation_forest, gradiant_boost_classifier, setpoint_optimizer, SetpointOptimizer, random_forest_regressor, hotelling_t2_monitor, HotellingT2Monitor, ridge_regression, random_forest_classifier, autoencoder_anomaly, AutoencoderAnomaly, mahalanobis_anomaly, MahalanobisAnomaly, gmm_regime_classifier, GMMRegimeClassifier
@@ -0,0 +1,681 @@
1
+ import wizata_dsapi
2
+
3
+ import pandas
4
+ import numpy
5
+
6
+ import sklearn
7
+ import sklearn.linear_model
8
+ import sklearn.ensemble
9
+ import sklearn.mixture
10
+ import sklearn.neighbors
11
+ import sklearn.neural_network
12
+ import sklearn.pipeline
13
+ import sklearn.preprocessing
14
+ import sklearn.decomposition
15
+ import sklearn.covariance
16
+
17
+
18
def extract_target_feat(context: wizata_dsapi.Context, single: bool = True):
    """
    Resolve the ``target_feat`` property of a training context.

    :param context: execution context whose ``properties`` mapping holds ``target_feat`` either
        as a column name (str) or as a list of column names.
    :param single: when True return one column name (str); when False return a flat list of names.
    :return: the target column name if ``single``, otherwise a list of target column names.
    :raises ValueError: if ``target_feat`` is missing from the properties, or ``single`` is
        requested but the list does not hold exactly one name.
    :raises TypeError: if ``target_feat`` is neither a str nor a list.
    """
    if "target_feat" not in context.properties:
        raise ValueError("training script requires a proper target_feat")

    target_feat = context.properties["target_feat"]

    if isinstance(target_feat, str):
        return target_feat if single else [target_feat]

    if isinstance(target_feat, list):
        if not single:
            # hand back the names themselves - wrapping the list again would nest it
            return list(target_feat)
        if len(target_feat) != 1:
            raise ValueError(f"expecting only one target_feat but found {len(target_feat)}")
        return target_feat[0]

    raise TypeError(f'target_feat must be a str or a list of str but found {target_feat.__class__.__name__}')
43
+
44
+
45
def linear_regression(context: wizata_dsapi.Context):
    """Train a linear regression model on all features to predict a single target column.

    Every dataframe column except the target is used as a feature. The fitted model is handed
    to ``context.set_model`` together with the feature column names.

    :param context: execution context providing ``dataframe``, ``properties`` and the model config.
    :raises ValueError: if the model config has no target feat, or ``target_feat`` does not
        resolve to exactly one column name.
    """
    df = context.dataframe

    model_config = context.get_model_config()
    if not model_config.has_target_feat():
        raise ValueError('linear_regression requires a target feat')

    # target_feat may be a str or a one-element list - normalise it to a single column name
    target_feat_name = extract_target_feat(context, single=True)

    x = df.drop(columns=[target_feat_name])
    y = df[target_feat_name]

    model = sklearn.linear_model.LinearRegression()
    model.fit(x, y)

    context.set_model(model, features=x.columns)
61
+
62
+
63
def logistic_regression(context: wizata_dsapi.Context):
    """Train a logistic regression classifier on all features to predict a binary target column.

    Every dataframe column except the target is used as a feature; the target is cast to int
    before fitting. The fitted model is handed to ``context.set_model`` with the feature names.

    :param context: execution context providing ``dataframe``, ``properties`` and the model config.
    :raises ValueError: if the model config has no target feat, or ``target_feat`` does not
        resolve to exactly one column name.
    """
    df = context.dataframe

    model_config = context.get_model_config()
    if not model_config.has_target_feat():
        raise ValueError('logistic_regression requires a target feat')

    # target_feat may be a str or a one-element list - normalise it to a single column name
    target_feat_name = extract_target_feat(context, single=True)

    x = df.drop(columns=[target_feat_name])
    y = df[target_feat_name]

    model = sklearn.linear_model.LogisticRegression()
    model.fit(x, y.astype(int))

    context.set_model(model, features=x.columns)
79
+
80
+
81
def isolation_forest(context: wizata_dsapi.Context):
    """Train an Isolation Forest for unsupervised anomaly detection using a sensitivity level (1-5).

    The ``sensitivity`` property selects the contamination given to the forest:
    1 -> 0.05, 2 -> 0.15, 3 -> 0.25, 4 -> 0.35, 5 -> 0.4.

    :param context: execution context providing ``dataframe``, ``properties`` and the model config.
    :return: a copy of the training dataframe with an extra ``isolation_forest_predict`` column
        holding the output of ``fit_predict``.
    :raises ValueError: if a target feat is configured, or ``sensitivity`` is missing or is not
        an integer from 1 to 5.
    """
    model_config = context.get_model_config()
    if model_config.has_target_feat():
        raise ValueError('isolation_forest does not requires a target feat')

    sensitivities = [0.05, 0.15, 0.25, 0.35, 0.4]
    try:
        if context.properties['sensitivity'] is None:
            raise KeyError("sensitivity is none")
        sensitivity = int(context.properties['sensitivity'])
        # 0 and negative levels would otherwise wrap around as negative list indexes
        if not 1 <= sensitivity <= len(sensitivities):
            raise IndexError(f"sensitivity {sensitivity} is out of range")
        contamination = sensitivities[sensitivity - 1]
    except Exception as e:
        raise ValueError(f'cannot extract sensitivity integer from 1 to 5 due to {e}') from e

    df = context.dataframe.copy()
    # remember the training columns before the prediction column is appended to the frame
    features = df.columns
    model = sklearn.ensemble.IsolationForest(contamination=contamination)
    df['isolation_forest_predict'] = model.fit_predict(df)
    context.set_model(model, features=features)
    return df
102
+
103
+
104
def gradiant_boost_classifier(context: wizata_dsapi.Context):
    """Train a Gradient Boosting classifier on all features to predict a target column.

    Every dataframe column except the target is used as a feature. The classifier is created
    with ``random_state=0`` and handed to ``context.set_model`` with the feature column names.

    :param context: execution context providing ``dataframe``, ``properties`` and the model config.
    :raises ValueError: if the model config has no target feat, or ``target_feat`` does not
        resolve to exactly one column name.
    """
    df = context.dataframe

    model_config = context.get_model_config()
    if not model_config.has_target_feat():
        raise ValueError('gradiant_boost_classifier requires a target feat')

    # target_feat may be a str or a one-element list - normalise it to a single column name
    target_feat_name = extract_target_feat(context, single=True)

    x = df.drop(columns=[target_feat_name])
    y = df[target_feat_name]

    model = sklearn.ensemble.GradientBoostingClassifier(random_state=0).fit(x, y)
    # register only the columns the model was fitted on - df.columns still contains the target
    context.set_model(model, features=x.columns)
118
+
119
+
120
class SetpointOptimizer:
    """
    Wraps a quality forecaster pipeline with a grid-search setpoint recommendation method.

    The wrapped pipeline predicts quality from telemetry + setpoint columns. ``recommend(X)``
    keeps each row's other columns fixed and grid-searches the stored setpoint bounds for the
    combination whose predicted quality is lowest (or highest, depending on ``direction``).

    :ivar pipeline: fitted predictor exposing ``predict`` (built by ``setpoint_optimizer`` as
        StandardScaler -> KNeighborsRegressor).
    :ivar setpoint_cols: ordered list of setpoint column names.
    :ivar feature_cols: ordered list of all feature columns used at training (telemetry + setpoints).
    :ivar bounds: dict mapping each setpoint column name to a (low, high) tuple.
    :ivar direction: 'maximize' to pick the highest prediction; any other value picks the lowest.
    :ivar grid_size: number of points per setpoint axis in the grid search.
    """

    def __init__(self, pipeline, setpoint_cols, feature_cols, bounds, direction, grid_size):
        self.pipeline = pipeline
        self.setpoint_cols = list(setpoint_cols)
        self.feature_cols = list(feature_cols)
        self.bounds = dict(bounds)
        self.direction = direction
        self.grid_size = int(grid_size)

    def predict(self, X):
        """Return the pipeline's predicted quality for each row of X."""
        return self.pipeline.predict(X)

    def recommend(self, X):
        """
        For each row of X, return the grid-search best setpoint combination.

        :param X: features matrix. A DataFrame is aligned by column name to ``feature_cols``;
            an ndarray is assumed to already follow the ``feature_cols`` order.
        :return: ndarray of shape [n_rows, n_setpoints] in the order of self.setpoint_cols.
        :raises KeyError: if X is a DataFrame missing one of ``feature_cols``.
        """
        if isinstance(X, pandas.DataFrame):
            # rows are re-labelled positionally with feature_cols below, so the frame must be
            # put in training order first - otherwise values end up under the wrong column
            X = X[self.feature_cols]
        else:
            X = pandas.DataFrame(X, columns=self.feature_cols)

        grids = [
            numpy.linspace(self.bounds[sp][0], self.bounds[sp][1], self.grid_size)
            for sp in self.setpoint_cols
        ]
        # one row per setpoint combination, one column per setpoint
        mesh = numpy.array(numpy.meshgrid(*grids)).reshape(len(self.setpoint_cols), -1).T

        pick_best = numpy.argmax if self.direction == "maximize" else numpy.argmin

        recs = numpy.zeros((len(X), len(self.setpoint_cols)))
        for i, (_, row) in enumerate(X.iterrows()):
            # replicate the row once per candidate, then overwrite only the setpoint columns
            candidates = pandas.DataFrame(
                numpy.tile(row.values, (len(mesh), 1)),
                columns=self.feature_cols
            )
            for j, sp in enumerate(self.setpoint_cols):
                candidates[sp] = mesh[:, j]
            preds = self.pipeline.predict(candidates)
            recs[i] = mesh[int(pick_best(preds))]

        return recs

    def output_names(self, suffix: str = "_recommended"):
        """Return one output column name per setpoint, in setpoint order, as ``<setpoint><suffix>``."""
        return [f"{sp}{suffix}" for sp in self.setpoint_cols]

    def get_inference_contract(self):
        """Describe how to run inference: the method name to invoke ('recommend') and the
        output column names it produces, in order."""
        return {
            "function": "recommend",
            "output_columns_names": self.output_names(),
        }
193
+
194
+
195
def setpoint_optimizer(context: wizata_dsapi.Context):
    """Fit a KNN quality forecaster and wrap it in a SetpointOptimizer stored on the context.

    The forecaster (StandardScaler -> KNeighborsRegressor) is fitted on every dataframe column
    except the target. Setpoint columns are the entries of ``context.datapoints`` that are
    present in the dataframe, are not the target, and whose ``business_type`` is
    ``BusinessType.SET_POINTS``. Each setpoint's search bounds are the 5th and 95th percentile
    of its training values.

    Required MLModelConfig:
    - train_script = 'wizata.models.setpoint_optimizer'
    - target_feat = '<quality column name>'
    - function = 'recommend' (or 'predict' for quality forecasting only)
    - output_columns_names = ['<sp1>_recommended', '<sp2>_recommended', ...] in setpoint order;
      the trained model's ``output_names()`` suggests a matching default.

    Properties:
    - k: KNN neighbors (default 5)
    - grid_size: points per setpoint axis (default 10 - cost is grid_size^n_setpoints per row)
    - direction: 'minimize' (default) or 'maximize' the target

    :raises ValueError: if no target feat is configured, the target list does not hold exactly
        one column, the target is not a dataframe column, no setpoint column is found, or
        ``direction`` is invalid.
    """
    frame = context.dataframe

    if not context.get_model_config().has_target_feat():
        raise ValueError('setpoint_optimizer requires a target_feat (the quality column to optimize)')

    quality_col = context.properties["target_feat"]
    if isinstance(quality_col, list):
        if len(quality_col) != 1:
            raise ValueError('setpoint_optimizer requires exactly one target_feat column')
        quality_col = quality_col[0]

    if quality_col not in frame.columns:
        raise ValueError(f"target_feat '{quality_col}' not found in dataframe columns")

    # keep only datapoints that are usable columns and flagged as setpoints
    setpoints = []
    for name, datapoint in (context.datapoints or {}).items():
        if name not in frame.columns or name == quality_col:
            continue
        if datapoint.business_type == wizata_dsapi.BusinessType.SET_POINTS:
            setpoints.append(name)

    if not setpoints:
        raise ValueError(
            "no setpoint datapoints found in context (BusinessType.SET_POINTS) — "
            "the optimizer needs at least one setpoint column to optimize"
        )

    neighbors = int(context.properties.get("k", 5))
    points_per_axis = int(context.properties.get("grid_size", 10))
    direction = context.properties.get("direction", "minimize")
    if direction not in ("minimize", "maximize"):
        raise ValueError(f"direction must be 'minimize' or 'maximize', got '{direction}'")

    features = frame.drop(columns=[quality_col])
    quality = frame[quality_col]

    steps = [
        ("scaler", sklearn.preprocessing.StandardScaler()),
        ("knn", sklearn.neighbors.KNeighborsRegressor(n_neighbors=neighbors)),
    ]
    forecaster = sklearn.pipeline.Pipeline(steps)
    forecaster.fit(features, quality)

    # search range per setpoint: 5th to 95th percentile of the training values
    bounds = {}
    for name in setpoints:
        low = float(features[name].quantile(0.05))
        high = float(features[name].quantile(0.95))
        bounds[name] = (low, high)

    context.set_model(
        SetpointOptimizer(
            pipeline=forecaster,
            setpoint_cols=setpoints,
            feature_cols=list(features.columns),
            bounds=bounds,
            direction=direction,
            grid_size=points_per_axis,
        ),
        features=features.columns,
    )
271
+
272
+
273
def random_forest_regressor(context: wizata_dsapi.Context):
    """Train a Random Forest Regressor on all features to predict a single target column.

    Every dataframe column except the target is used as a feature. The fitted model is handed
    to ``context.set_model`` with the feature column names.

    Properties:
    - n_estimators: number of trees (default 100)
    - max_depth: max depth per tree (default None = unbounded; '', 'None' and None all mean unbounded)
    - random_state: RNG seed for reproducibility (default 0)

    :param context: execution context providing ``dataframe``, ``properties`` and the model config.
    :raises ValueError: if the model config has no target feat, or ``target_feat`` does not
        resolve to exactly one column name.
    """
    df = context.dataframe

    model_config = context.get_model_config()
    if not model_config.has_target_feat():
        raise ValueError('random_forest_regressor requires a target_feat')

    # target_feat may be a str or a one-element list - normalise it to a single column name
    target_feat_name = extract_target_feat(context, single=True)

    x = df.drop(columns=[target_feat_name])
    y = df[target_feat_name]

    n_estimators = int(context.properties.get("n_estimators", 100))
    max_depth_prop = context.properties.get("max_depth")
    # properties may carry the "no limit" choice as None, an empty string or the text 'None'
    max_depth = int(max_depth_prop) if max_depth_prop not in (None, "", "None") else None
    random_state = int(context.properties.get("random_state", 0))

    model = sklearn.ensemble.RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(x, y)

    context.set_model(model, features=x.columns)
306
+
307
+
308
class HotellingT2Monitor:
    """
    Multivariate statistical process control (SPC) anomaly monitor built on a scaler + PCA pair.

    For each observation two statistics are computed from the scaled data:

    - T² (Hotelling's T²): sum over components of squared PC score divided by the component's
      explained variance - deviation inside the principal subspace.
    - SPE / Q statistic: squared error between the scaled row and its PCA reconstruction -
      deviation outside the principal subspace.

    :ivar scaler: fitted scaler exposing ``transform``.
    :ivar pca: fitted PCA exposing ``transform``, ``inverse_transform`` and ``explained_variance_``.
    :ivar feature_cols: ordered list of feature columns used at training.
    """

    def __init__(self, scaler, pca, feature_cols):
        self.scaler, self.pca = scaler, pca
        self.feature_cols = list(feature_cols)

    def _compute(self, X):
        """Return the (T², SPE) pair of per-row statistics for X."""
        frame = X if isinstance(X, pandas.DataFrame) else pandas.DataFrame(X, columns=self.feature_cols)
        standardized = self.scaler.transform(frame)
        scores = self.pca.transform(standardized)
        # T²: squared scores weighted by the inverse eigenvalue, summed per row
        weighted = scores ** 2 / self.pca.explained_variance_
        # SPE: squared distance between each scaled row and its reconstruction
        residual = standardized - self.pca.inverse_transform(scores)
        return weighted.sum(axis=1), (residual ** 2).sum(axis=1)

    def predict(self, X):
        """Return only the T² statistic per row (1-D)."""
        return self._compute(X)[0]

    def monitor(self, X):
        """Return an [n_rows, 2] array whose columns are T² and SPE."""
        t2, spe = self._compute(X)
        return numpy.column_stack((t2, spe))

    def get_inference_contract(self):
        """Describe how to run inference: invoke ``monitor``, producing columns T2 and SPE."""
        return dict(function="monitor", output_columns_names=["T2", "SPE"])
362
+
363
+
364
def _resolve_pca_n_components(raw):
    """Translate the raw ``n_components`` property into a value sklearn's PCA accepts."""
    import numbers

    message = "n_components must be an int (# components) or a float in (0, 1] (variance ratio)"

    if isinstance(raw, str):
        raw = raw.strip()
        try:
            # "1" is a component count, whereas "1.0" falls through to the ratio handling below
            raw = int(raw)
        except ValueError:
            pass

    if isinstance(raw, numbers.Integral) and not isinstance(raw, bool):
        if raw < 1:
            raise ValueError(message)
        return int(raw)

    try:
        value = float(raw)
    except (TypeError, ValueError):
        raise ValueError(message) from None

    if value > 1:
        return int(value)
    if value == 1:
        # PCA only takes floats strictly below 1; a ratio of 1.0 means keeping every component
        return None
    if 0 < value < 1:
        return value
    raise ValueError(message)


def hotelling_t2_monitor(context: wizata_dsapi.Context):
    """Train a PCA-based multivariate SPC monitor and store it on the context.

    The dataframe (expected to represent *normal* operation) is standardized with a
    StandardScaler, a PCA is fitted on the scaled data, and both are wrapped in a
    HotellingT2Monitor. At inference the monitor yields two values per row: T² (deviation within
    the principal subspace) and SPE (squared reconstruction error outside of it).

    Unsupervised: do **not** set target_feat. NaN values must be handled upstream.

    Property:
    - n_components: int (exact number of components) or float in (0, 1] (minimum explained
      variance ratio, default 0.95). An integer 1 means one component; a float 1.0 keeps all
      components. Floats above 1 are truncated to an int.

    :param context: execution context providing ``dataframe``, ``properties`` and the model config.
    :raises ValueError: if a target feat is configured, the dataframe contains NaN, or
        ``n_components`` cannot be interpreted.
    """
    df = context.dataframe

    model_config = context.get_model_config()
    if model_config.has_target_feat():
        raise ValueError('hotelling_t2_monitor is unsupervised — do not set target_feat')

    if df.isna().any().any():
        raise ValueError('hotelling_t2_monitor cannot handle NaN values — run interpolate or fillna upstream')

    n_components = _resolve_pca_n_components(context.properties.get("n_components", 0.95))

    scaler = sklearn.preprocessing.StandardScaler()
    x_scaled = scaler.fit_transform(df)

    pca = sklearn.decomposition.PCA(n_components=n_components)
    pca.fit(x_scaled)

    monitor = HotellingT2Monitor(
        scaler=scaler,
        pca=pca,
        feature_cols=list(df.columns),
    )

    context.set_model(monitor, features=df.columns)
407
+
408
+
409
def ridge_regression(context: wizata_dsapi.Context):
    """Train a Ridge (L2-regularized) linear regression to predict a single target column.

    Every dataframe column except the target is used as a feature. Intended as a drop-in
    alternative to linear_regression when features are highly correlated. The fitted model is
    handed to ``context.set_model`` with the feature column names.

    Property:
    - alpha: L2 regularization strength (default 1.0 — larger means more shrinkage)

    :param context: execution context providing ``dataframe``, ``properties`` and the model config.
    :raises ValueError: if the model config has no target feat, or ``target_feat`` does not
        resolve to exactly one column name.
    """
    df = context.dataframe

    model_config = context.get_model_config()
    if not model_config.has_target_feat():
        raise ValueError('ridge_regression requires a target_feat')

    # target_feat may be a str or a one-element list - normalise it to a single column name
    target_feat_name = extract_target_feat(context, single=True)

    x = df.drop(columns=[target_feat_name])
    y = df[target_feat_name]

    alpha = float(context.properties.get("alpha", 1.0))

    model = sklearn.linear_model.Ridge(alpha=alpha)
    model.fit(x, y)

    context.set_model(model, features=x.columns)
433
+
434
+
435
def random_forest_classifier(context: wizata_dsapi.Context):
    """Train a Random Forest Classifier on all features to predict a target class column.

    Every dataframe column except the target is used as a feature. The fitted model is handed
    to ``context.set_model`` with the feature column names.

    Properties:
    - n_estimators: number of trees (default 100)
    - max_depth: max depth per tree (default None = unbounded; '', 'None' and None all mean unbounded)
    - random_state: RNG seed for reproducibility (default 0)

    :param context: execution context providing ``dataframe``, ``properties`` and the model config.
    :raises ValueError: if the model config has no target feat, or ``target_feat`` does not
        resolve to exactly one column name.
    """
    df = context.dataframe

    model_config = context.get_model_config()
    if not model_config.has_target_feat():
        raise ValueError('random_forest_classifier requires a target_feat')

    # target_feat may be a str or a one-element list - normalise it to a single column name
    target_feat_name = extract_target_feat(context, single=True)

    x = df.drop(columns=[target_feat_name])
    y = df[target_feat_name]

    n_estimators = int(context.properties.get("n_estimators", 100))
    max_depth_prop = context.properties.get("max_depth")
    # properties may carry the "no limit" choice as None, an empty string or the text 'None'
    max_depth = int(max_depth_prop) if max_depth_prop not in (None, "", "None") else None
    random_state = int(context.properties.get("random_state", 0))

    model = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(x, y)

    context.set_model(model, features=x.columns)
468
+
469
+
470
class AutoencoderAnomaly:
    """
    Reconstruction-error anomaly scorer built on a scaler and a regressor trained to reproduce
    its own (scaled) input. ``predict`` returns, per row, the sum of squared differences between
    the scaled features and their reconstruction.

    The training function builds it from sklearn's StandardScaler and an MLPRegressor fitted
    with fit(X, X), using symmetric hidden_layer_sizes such as (16, 8, 16) to form a bottleneck.

    :ivar scaler: fitted scaler exposing ``transform``.
    :ivar mlp: fitted regressor exposing ``predict``, used as encoder/decoder.
    :ivar feature_cols: ordered list of feature columns used at training.
    """

    def __init__(self, scaler, mlp, feature_cols):
        self.scaler = scaler
        self.mlp = mlp
        self.feature_cols = list(feature_cols)

    def predict(self, X):
        """Return the per-row reconstruction error (anomaly score) as a 1-D array.

        :param X: DataFrame, or array-like whose columns follow ``feature_cols``.
        """
        if not isinstance(X, pandas.DataFrame):
            X = pandas.DataFrame(X, columns=self.feature_cols)
        x_scaled = self.scaler.transform(X)
        # MLPRegressor flattens its output to 1-D when there is a single output column; restore
        # the 2-D shape so the subtraction stays row-wise instead of broadcasting to (n, n)
        reconstructed = numpy.asarray(self.mlp.predict(x_scaled)).reshape(numpy.shape(x_scaled))
        return ((x_scaled - reconstructed) ** 2).sum(axis=1)

    def get_inference_contract(self):
        """Describe how to run inference: invoke ``predict``, producing column 'anomaly_score'."""
        return {
            "function": "predict",
            "output_columns_names": ["anomaly_score"],
        }
504
+
505
+
506
def autoencoder_anomaly(context: wizata_dsapi.Context):
    """Fit an MLP bottleneck autoencoder on the context dataframe and store it on the context.

    The dataframe is standardized with a StandardScaler, then an MLPRegressor is fitted to
    reproduce the scaled data from itself. Scaler and network are wrapped in an
    AutoencoderAnomaly, whose inference output is a single 'anomaly_score' per row.

    Unsupervised: do **not** set target_feat. NaN values must be handled upstream.

    Properties:
    - hidden_layer_sizes: tuple/list of hidden layer widths (default (16, 8, 16)), or a
      comma-separated string such as '16,8,16'; empty items in the string are ignored.
    - max_iter: training iterations (default 500)
    - random_state: RNG seed for reproducibility (default 0)

    :raises ValueError: if a target feat is configured, the dataframe contains NaN, or
        hidden_layer_sizes has an unsupported type.
    """
    frame = context.dataframe

    if context.get_model_config().has_target_feat():
        raise ValueError('autoencoder_anomaly is unsupervised — do not set target_feat')

    if frame.isna().any().any():
        raise ValueError('autoencoder_anomaly cannot handle NaN values — run interpolate or fillna upstream')

    raw_sizes = context.properties.get("hidden_layer_sizes", (16, 8, 16))
    if isinstance(raw_sizes, str):
        # '16, 8,16' -> (16, 8, 16); blank items are skipped
        tokens = (token.strip() for token in raw_sizes.split(","))
        hidden = tuple(int(token) for token in tokens if token)
    elif isinstance(raw_sizes, (list, tuple)):
        hidden = tuple(int(width) for width in raw_sizes)
    else:
        raise ValueError("hidden_layer_sizes must be a tuple, list, or comma-separated string")

    iterations = int(context.properties.get("max_iter", 500))
    seed = int(context.properties.get("random_state", 0))

    scaler = sklearn.preprocessing.StandardScaler()
    standardized = scaler.fit_transform(frame)

    network = sklearn.neural_network.MLPRegressor(
        hidden_layer_sizes=hidden,
        max_iter=iterations,
        random_state=seed,
    )
    # input and target are the same matrix: the network learns to reconstruct its input
    network.fit(standardized, standardized)

    context.set_model(
        AutoencoderAnomaly(scaler=scaler, mlp=network, feature_cols=list(frame.columns)),
        features=frame.columns,
    )
553
+
554
+
555
class MahalanobisAnomaly:
    """
    Lightweight multivariate anomaly detector based on the Mahalanobis distance computed by a
    fitted covariance estimator. The training function supplies sklearn's EmpiricalCovariance,
    which is the plain maximum-likelihood estimate (not an outlier-robust one). No PCA reduction
    is applied, so the output is a single distance score per row.

    :ivar covariance: fitted covariance estimator exposing ``mahalanobis``.
    :ivar feature_cols: ordered list of feature columns used at training.
    """

    def __init__(self, covariance, feature_cols):
        self.covariance = covariance
        self.feature_cols = list(feature_cols)

    def predict(self, X):
        """Return Mahalanobis distance per row (1-D array).

        :param X: DataFrame (aligned by column name to ``feature_cols``) or array-like whose
            columns already follow the ``feature_cols`` order.
        :raises KeyError: if X is a DataFrame missing one of ``feature_cols``.
        """
        if isinstance(X, pandas.DataFrame):
            # the estimator only sees raw values, so put the columns in training order first
            X = X[self.feature_cols]
        else:
            X = pandas.DataFrame(X, columns=self.feature_cols)
        # the estimator returns squared distances
        return numpy.sqrt(self.covariance.mahalanobis(X.values))

    def get_inference_contract(self):
        """Describe how to run inference: invoke ``predict``, producing 'mahalanobis_distance'."""
        return {
            "function": "predict",
            "output_columns_names": ["mahalanobis_distance"],
        }
582
+
583
+
584
def mahalanobis_anomaly(context: wizata_dsapi.Context):
    """Fit a Mahalanobis-distance anomaly detector on the context dataframe and store it.

    An EmpiricalCovariance (sklearn defaults) is fitted on the raw dataframe values and wrapped
    in a MahalanobisAnomaly, whose inference output is a single 'mahalanobis_distance' per row.
    No scaling or dimensionality reduction is applied.

    Unsupervised: do **not** set target_feat. NaN values must be handled upstream.

    Properties: (none)

    :raises ValueError: if a target feat is configured or the dataframe contains NaN.
    """
    frame = context.dataframe

    if context.get_model_config().has_target_feat():
        raise ValueError('mahalanobis_anomaly is unsupervised — do not set target_feat')

    if frame.isna().any().any():
        raise ValueError('mahalanobis_anomaly cannot handle NaN values — run interpolate or fillna upstream')

    estimator = sklearn.covariance.EmpiricalCovariance()
    estimator.fit(frame.values)

    detector = MahalanobisAnomaly(covariance=estimator, feature_cols=list(frame.columns))
    context.set_model(detector, features=frame.columns)
610
+
611
+
612
class GMMRegimeClassifier:
    """
    Process-regime detector pairing a scaler with a mixture model: each row is scaled and then
    assigned the cluster index returned by the mixture's ``predict``.

    :ivar gmm: fitted mixture model exposing ``predict`` (sklearn GaussianMixture at training).
    :ivar scaler: fitted scaler exposing ``transform`` (sklearn StandardScaler at training).
    :ivar feature_cols: ordered list of feature columns used at training.
    """

    def __init__(self, gmm, scaler, feature_cols):
        self.gmm, self.scaler = gmm, scaler
        self.feature_cols = list(feature_cols)

    def predict(self, X):
        """Return the regime index per row as a 1-D array; the indexes are cast to float."""
        frame = X if isinstance(X, pandas.DataFrame) else pandas.DataFrame(X, columns=self.feature_cols)
        regimes = self.gmm.predict(self.scaler.transform(frame))
        return regimes.astype(float)

    def get_inference_contract(self):
        """Describe how to run inference: invoke ``predict``, producing column 'regime'."""
        return dict(function="predict", output_columns_names=["regime"])
640
+
641
+
642
def gmm_regime_classifier(context: wizata_dsapi.Context):
    """Fit a Gaussian Mixture Model on the context dataframe to identify process regimes.

    The dataframe is standardized with a StandardScaler, a GaussianMixture is fitted on the
    scaled data, and both are wrapped in a GMMRegimeClassifier stored on the context. At
    inference the classifier yields a single 'regime' value per row (the cluster index).

    Unsupervised: do **not** set target_feat. NaN values must be handled upstream.

    Properties:
    - n_components: number of regimes / clusters (default 3)
    - covariance_type: 'full' (default), 'tied', 'diag', or 'spherical'
    - random_state: RNG seed for reproducibility (default 0)

    :raises ValueError: if a target feat is configured, the dataframe contains NaN, or
        covariance_type is not one of the supported values.
    """
    frame = context.dataframe

    if context.get_model_config().has_target_feat():
        raise ValueError('gmm_regime_classifier is unsupervised — do not set target_feat')

    if frame.isna().any().any():
        raise ValueError('gmm_regime_classifier cannot handle NaN values — run interpolate or fillna upstream')

    regime_count = int(context.properties.get("n_components", 3))
    covariance_type = context.properties.get("covariance_type", "full")
    supported = ("full", "tied", "diag", "spherical")
    if covariance_type not in supported:
        raise ValueError(f"covariance_type must be 'full', 'tied', 'diag', or 'spherical', got '{covariance_type}'")
    seed = int(context.properties.get("random_state", 0))

    # standardize first so no single feature dominates the clustering by scale alone
    scaler = sklearn.preprocessing.StandardScaler()
    standardized = scaler.fit_transform(frame)

    mixture = sklearn.mixture.GaussianMixture(
        n_components=regime_count,
        covariance_type=covariance_type,
        random_state=seed,
    )
    mixture.fit(standardized)

    context.set_model(
        GMMRegimeClassifier(gmm=mixture, scaler=scaler, feature_cols=list(frame.columns)),
        features=frame.columns,
    )
@@ -0,0 +1,2 @@
1
+ from .common import ts_chart, confusion_matrix, r_squared, anomalies_chart, parallel_coordinates, data_table, setpoint_recommendation, feature_importance
2
+ from . import theme