PyPI - wizata-dsapi - Versions diffs - 2.0.0.dev24__tar.gz → 2.0.0.dev26__tar.gz - Mend

wizata-dsapi 2.0.0.dev24tar.gz → 2.0.0.dev26tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

{wizata_dsapi-2.0.0.dev24/wizata_dsapi.egg-info → wizata_dsapi-2.0.0.dev26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wizata_dsapi
-Version: 2.0.0.dev24
+Version: 2.0.0.dev26
 Summary: Wizata Data Science Toolkit
 Author: Wizata S.A.
 Author-email: info@wizata.com

wizata_dsapi-2.0.0.dev26/wizata_dsapi/models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .common import linear_regression, logistic_regression, isolation_forest, gradiant_boost_classifier, setpoint_optimizer, SetpointOptimizer, random_forest_regressor, hotelling_t2_monitor, HotellingT2Monitor, ridge_regression, random_forest_classifier, autoencoder_anomaly, AutoencoderAnomaly, mahalanobis_anomaly, MahalanobisAnomaly, gmm_regime_classifier, GMMRegimeClassifier

wizata_dsapi-2.0.0.dev26/wizata_dsapi/models/common.py ADDED Viewed

@@ -0,0 +1,681 @@
+import wizata_dsapi
+import pandas
+import numpy
+import sklearn
+import sklearn.linear_model
+import sklearn.ensemble
+import sklearn.mixture
+import sklearn.neighbors
+import sklearn.neural_network
+import sklearn.pipeline
+import sklearn.preprocessing
+import sklearn.decomposition
+import sklearn.covariance
+def extract_target_feat(context: wizata_dsapi.Context, single: bool = True):
+    """
+    Read the target_feat property of the context and normalise its shape.
+    :param context: execution context whose properties mapping holds target_feat.
+    :param single: when True return one column name, when False return a list of column names.
+    :return: the target column name (str) if single, otherwise a flat list of target column names.
+    :raises ValueError: if target_feat is missing, or single is True and the list does not hold exactly one name.
+    :raises TypeError: if target_feat is neither a str nor a list.
+    """
+    if "target_feat" not in context.properties:
+        raise ValueError("training script requires a proper target_feat")
+    target_feat = context.properties["target_feat"]
+    if isinstance(target_feat, str):
+        return target_feat if single else [target_feat]
+    if isinstance(target_feat, list):
+        if not single:
+            # return a flat copy: wrapping the list a second time would yield [[...]]
+            return list(target_feat)
+        if len(target_feat) != 1:
+            raise ValueError(f"expecting only one target_feat but found {len(target_feat)}")
+        return target_feat[0]
+    raise TypeError(f'target_feat must be a str or a list of str but found {target_feat.__class__.__name__}')
+def linear_regression(context: wizata_dsapi.Context):
+    """
+    Fit a linear regression that predicts the target column from every other dataframe column.
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if the model config declares no target feat.
+    """
+    frame = context.dataframe
+    if not context.get_model_config().has_target_feat():
+        raise ValueError('linear_regression requires a target feat')
+    target = context.properties["target_feat"]
+    inputs = frame.drop(columns=[target])
+    labels = frame[target]
+    regressor = sklearn.linear_model.LinearRegression()
+    regressor.fit(inputs, labels)
+    # hand the fitted estimator to the context together with the input columns it was trained on
+    context.set_model(regressor, features=inputs.columns)
+def logistic_regression(context: wizata_dsapi.Context):
+    """
+    Fit a logistic regression classifier that predicts the target column from every other dataframe column.
+    The target values are cast to int before fitting.
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if the model config declares no target feat.
+    """
+    frame = context.dataframe
+    if not context.get_model_config().has_target_feat():
+        raise ValueError('logistic_regression requires a target feat')
+    target = context.properties["target_feat"]
+    inputs = frame.drop(columns=[target])
+    labels = frame[target]
+    classifier = sklearn.linear_model.LogisticRegression()
+    classifier.fit(inputs, labels.astype(int))
+    # hand the fitted estimator to the context together with the input columns it was trained on
+    context.set_model(classifier, features=inputs.columns)
+def isolation_forest(context: wizata_dsapi.Context):
+    """Train an Isolation Forest for unsupervised anomaly detection using a sensitivity level (1-5).
+    Property:
+      - sensitivity: integer level from 1 to 5, mapped to a contamination of 0.05, 0.15, 0.25, 0.35 or 0.4.
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :return: copy of the training dataframe with an added 'isolation_forest_predict' column.
+    :raises ValueError: if a target feat is configured, or sensitivity is missing, not an integer or outside 1-5.
+    """
+    model_config = context.get_model_config()
+    if model_config.has_target_feat():
+        raise ValueError('isolation_forest does not requires a target feat')
+    sensitivities = [0.05, 0.15, 0.25, 0.35, 0.4]
+    try:
+        sensitivity = int(context.properties['sensitivity'])
+    except (KeyError, TypeError, ValueError) as e:
+        raise ValueError(f'cannot extract sensitivity integer from 1 to {len(sensitivities)} due to {e}') from e
+    # reject 0 and negatives explicitly: sensitivities[sensitivity - 1] would silently wrap around
+    if not 1 <= sensitivity <= len(sensitivities):
+        raise ValueError(f'sensitivity must be an integer from 1 to {len(sensitivities)} but found {sensitivity}')
+    contamination = sensitivities[sensitivity - 1]
+    df = context.dataframe.copy()
+    # capture the training columns before the prediction column is appended to the copy
+    features = df.columns
+    model = sklearn.ensemble.IsolationForest(contamination=contamination)
+    df['isolation_forest_predict'] = model.fit_predict(df)
+    context.set_model(model, features=features)
+    return df
+def gradiant_boost_classifier(context: wizata_dsapi.Context):
+    """Train a Gradient Boosting classifier on all features to predict a target column.
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if the model config declares no target feat.
+    """
+    df = context.dataframe
+    model_config = context.get_model_config()
+    if not model_config.has_target_feat():
+        raise ValueError('gradiant_boost_classifier requires a target feat')
+    target_feat_name = context.properties["target_feat"]
+    x = df.drop(columns=[target_feat_name])
+    y = df[target_feat_name]
+    model = sklearn.ensemble.GradientBoostingClassifier(random_state=0).fit(x, y)
+    # declare only the input columns: df.columns would also list the target, which the model never receives as input
+    context.set_model(model, features=x.columns)
+class SetpointOptimizer:
+    """
+    Wraps a KNN quality forecaster (StandardScaler + KNeighborsRegressor) with a grid-search
+    setpoint recommendation method.
+    At training time, the underlying pipeline learns quality = f(telemetry + setpoints) from
+    historical data. At inference, recommend(X) keeps each row's telemetry fixed and grid-searches
+    over stored setpoint bounds (5th-95th percentile of train data) to find the combination that
+    minimizes (or maximizes) predicted quality.
+    :ivar pipeline: fitted sklearn Pipeline (StandardScaler -> KNeighborsRegressor).
+    :ivar setpoint_cols: ordered list of setpoint column names auto-detected at train time.
+    :ivar feature_cols: ordered list of all feature columns used at training (telemetry + setpoints).
+    :ivar bounds: dict mapping each setpoint column name to a (low, high) tuple.
+    :ivar direction: 'minimize' or 'maximize' the target quality.
+    :ivar grid_size: number of points per setpoint axis in the grid search.
+    """
+    def __init__(self, pipeline, setpoint_cols, feature_cols, bounds, direction, grid_size):
+        self.pipeline = pipeline
+        self.setpoint_cols = list(setpoint_cols)
+        self.feature_cols = list(feature_cols)
+        self.bounds = dict(bounds)
+        self.direction = direction
+        self.grid_size = int(grid_size)
+    def predict(self, X):
+        """Return predicted quality for each row — dual-use for validation or regular predict-mode pipelines."""
+        return self.pipeline.predict(X)
+    def recommend(self, X):
+        """
+        For each row of X, return the grid-search best setpoint combination.
+        A DataFrame input is re-selected by name into the training column order, so its columns
+        may arrive in any order and extra columns are ignored.
+        :param X: features matrix (DataFrame or ndarray) with columns matching feature_cols.
+        :return: ndarray of shape [n_rows, n_setpoints] in the order of self.setpoint_cols.
+        :raises KeyError: if X is a DataFrame lacking one of feature_cols.
+        """
+        if isinstance(X, pandas.DataFrame):
+            # row values are relabelled with feature_cols below, so they must be in that exact order
+            X = X[self.feature_cols]
+        else:
+            X = pandas.DataFrame(X, columns=self.feature_cols)
+        grids = [
+            numpy.linspace(self.bounds[sp][0], self.bounds[sp][1], self.grid_size)
+            for sp in self.setpoint_cols
+        ]
+        # every combination of grid points, one candidate per row: shape [grid_size ** n_setpoints, n_setpoints]
+        mesh = numpy.array(numpy.meshgrid(*grids)).reshape(len(self.setpoint_cols), -1).T
+        recs = numpy.zeros((len(X), len(self.setpoint_cols)))
+        for i, (_, row) in enumerate(X.iterrows()):
+            # repeat the row once per candidate, then overwrite only the setpoint columns
+            candidates = pandas.DataFrame(
+                numpy.tile(row.values, (len(mesh), 1)),
+                columns=self.feature_cols
+            )
+            for j, sp in enumerate(self.setpoint_cols):
+                candidates[sp] = mesh[:, j]
+            preds = self.pipeline.predict(candidates)
+            if self.direction == "maximize":
+                best_idx = int(numpy.argmax(preds))
+            else:
+                best_idx = int(numpy.argmin(preds))
+            recs[i] = mesh[best_idx]
+        return recs
+    def output_names(self, suffix: str = "_recommended"):
+        """Suggest MLModelConfig.output_columns_names matching the setpoint order (e.g. for UI prefill)."""
+        return [f"{sp}{suffix}" for sp in self.setpoint_cols]
+    def get_inference_contract(self):
+        """Self-describing inference contract — the platform reads this to override MLModelConfig at runtime.
+        Returns the function to invoke (.recommend) and the output column names that will be produced, in order."""
+        return {
+            "function": "recommend",
+            "output_columns_names": self.output_names(),
+        }
+def setpoint_optimizer(context: wizata_dsapi.Context):
+    """Fit a SetpointOptimizer: a scaled KNN regressor modelling quality = f(telemetry + setpoints),
+    wrapped so that inference can recommend setpoint values through a grid search.
+    Setpoint columns are the context.datapoints entries typed BusinessType.SET_POINTS that are
+    present in the dataframe; their search bounds are the 5th and 95th percentiles of the train data.
+    Required MLModelConfig:
+      - train_script = 'wizata.models.setpoint_optimizer'
+      - target_feat  = '<quality column name>'
+      - function     = 'recommend'  (or 'predict' for quality forecasting only)
+      - output_columns_names = ['<sp1>_recommended', '<sp2>_recommended', ...] in the order setpoints appear.
+        The trained model exposes `.output_names()` to suggest a matching default.
+    Properties:
+      - k: KNN neighbors (default 5)
+      - grid_size: points per setpoint axis (default 10 — total cost is grid_size^n_setpoints per row)
+      - direction: 'minimize' (default) or 'maximize' the target
+    :param context: execution context giving access to the dataframe, datapoints, properties and model config.
+    :raises ValueError: if target_feat is absent, not a single column, or not in the dataframe; if no
+        setpoint column is found; or if direction is neither 'minimize' nor 'maximize'.
+    """
+    frame = context.dataframe
+    if not context.get_model_config().has_target_feat():
+        raise ValueError('setpoint_optimizer requires a target_feat (the quality column to optimize)')
+    quality_col = context.properties["target_feat"]
+    if isinstance(quality_col, list):
+        if len(quality_col) != 1:
+            raise ValueError('setpoint_optimizer requires exactly one target_feat column')
+        quality_col = quality_col[0]
+    if quality_col not in frame.columns:
+        raise ValueError(f"target_feat '{quality_col}' not found in dataframe columns")
+    # keep the datapoints that are dataframe columns, are not the target and are typed as setpoints
+    setpoint_cols = []
+    for name, datapoint in (context.datapoints or {}).items():
+        if name not in frame.columns or name == quality_col:
+            continue
+        if datapoint.business_type == wizata_dsapi.BusinessType.SET_POINTS:
+            setpoint_cols.append(name)
+    if not setpoint_cols:
+        raise ValueError(
+            "no setpoint datapoints found in context (BusinessType.SET_POINTS) — "
+            "the optimizer needs at least one setpoint column to optimize"
+        )
+    neighbors = int(context.properties.get("k", 5))
+    points_per_axis = int(context.properties.get("grid_size", 10))
+    direction = context.properties.get("direction", "minimize")
+    if direction not in ("minimize", "maximize"):
+        raise ValueError(f"direction must be 'minimize' or 'maximize', got '{direction}'")
+    inputs = frame.drop(columns=[quality_col])
+    quality = frame[quality_col]
+    knn_pipeline = sklearn.pipeline.Pipeline([
+        ("scaler", sklearn.preprocessing.StandardScaler()),
+        ("knn", sklearn.neighbors.KNeighborsRegressor(n_neighbors=neighbors)),
+    ])
+    knn_pipeline.fit(inputs, quality)
+    # search range of each setpoint: 5th to 95th percentile of its training values
+    bounds = {}
+    for name in setpoint_cols:
+        series = inputs[name]
+        bounds[name] = (float(series.quantile(0.05)), float(series.quantile(0.95)))
+    wrapped = SetpointOptimizer(
+        pipeline=knn_pipeline,
+        setpoint_cols=setpoint_cols,
+        feature_cols=list(inputs.columns),
+        bounds=bounds,
+        direction=direction,
+        grid_size=points_per_axis,
+    )
+    context.set_model(wrapped, features=inputs.columns)
+def random_forest_regressor(context: wizata_dsapi.Context):
+    """Fit a Random Forest Regressor that predicts the target column from every other dataframe column.
+    The trained sklearn model exposes feature_importances_ for interpretability.
+    Properties:
+      - n_estimators: number of trees (default 100)
+      - max_depth: max depth per tree (default None = unbounded; '' and 'None' are read as None)
+      - random_state: RNG seed for reproducibility (default 0)
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if the model config declares no target feat.
+    """
+    frame = context.dataframe
+    if not context.get_model_config().has_target_feat():
+        raise ValueError('random_forest_regressor requires a target_feat')
+    target = context.properties["target_feat"]
+    inputs = frame.drop(columns=[target])
+    labels = frame[target]
+    tree_count = int(context.properties.get("n_estimators", 100))
+    raw_depth = context.properties.get("max_depth")
+    if raw_depth in (None, "", "None"):
+        # absent or blank depth means unbounded trees
+        depth = None
+    else:
+        depth = int(raw_depth)
+    seed = int(context.properties.get("random_state", 0))
+    forest = sklearn.ensemble.RandomForestRegressor(
+        n_estimators=tree_count,
+        max_depth=depth,
+        random_state=seed,
+    )
+    forest.fit(inputs, labels)
+    context.set_model(forest, features=inputs.columns)
+class HotellingT2Monitor:
+    """
+    Multivariate statistical process control (SPC) anomaly monitor built on a fitted
+    StandardScaler and PCA (the training data is assumed to represent normal operation).
+    For each observation it computes two statistics:
+      - T² (Hotelling's T²): sum of squared PC scores divided by the PCA eigenvalues — anomaly
+        *within* the principal subspace.
+      - SPE / Q statistic: squared reconstruction error of the scaled observation — anomaly
+        *orthogonal* to the principal subspace.
+    :ivar scaler: fitted sklearn StandardScaler.
+    :ivar pca: fitted sklearn PCA.
+    :ivar feature_cols: ordered list of feature columns used at training.
+    """
+    def __init__(self, scaler, pca, feature_cols):
+        self.feature_cols = list(feature_cols)
+        self.scaler = scaler
+        self.pca = pca
+    def _compute(self, X):
+        """Return the pair (T², SPE) of 1-D arrays holding one value per row of X."""
+        frame = X if isinstance(X, pandas.DataFrame) else pandas.DataFrame(X, columns=self.feature_cols)
+        scaled = self.scaler.transform(frame)
+        pc_scores = self.pca.transform(scaled)
+        # T² = Σ (tᵢ² / λᵢ) per row, λᵢ being the explained variance of component i
+        weighted = pc_scores ** 2 / self.pca.explained_variance_
+        t2 = weighted.sum(axis=1)
+        # SPE = ||x − x̂||² per row, x̂ being the scaled row rebuilt from its PC scores
+        residual = scaled - self.pca.inverse_transform(pc_scores)
+        spe = (residual ** 2).sum(axis=1)
+        return t2, spe
+    def predict(self, X):
+        """Return only the T² statistic per row as a 1-D array (default predict interface)."""
+        return self._compute(X)[0]
+    def monitor(self, X):
+        """Return an [n_rows, 2] array whose columns are T² and SPE."""
+        return numpy.column_stack(list(self._compute(X)))
+    def get_inference_contract(self):
+        """Self-describing contract — platform reads this to override MLModelConfig at runtime."""
+        contract = {"function": "monitor"}
+        contract["output_columns_names"] = ["T2", "SPE"]
+        return contract
+def hotelling_t2_monitor(context: wizata_dsapi.Context):
+    """Train a PCA-based multivariate SPC monitor on training data representative of *normal*
+    operation. At inference, produces two columns per row: T² (Hotelling's statistic — anomaly
+    within the principal subspace) and SPE (squared prediction error — novelty orthogonal to the
+    PC space). The classical industrial monitoring pattern — far more interpretable than a black-box
+    outlier score because T² and SPE can be decomposed back to per-sensor contributions.
+    Unsupervised: do **not** set target_feat. NaN values must be handled upstream.
+    Property:
+      - n_components: int >= 1 (exact number of components) or float strictly between 0 and 1
+        (minimum explained variance ratio, default 0.95). Any value >= 1 is converted to int.
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if target_feat is set, the dataframe contains NaN, or n_components is not numeric.
+    """
+    df = context.dataframe
+    model_config = context.get_model_config()
+    if model_config.has_target_feat():
+        raise ValueError('hotelling_t2_monitor is unsupervised — do not set target_feat')
+    if df.isna().any().any():
+        raise ValueError('hotelling_t2_monitor cannot handle NaN values — run interpolate or fillna upstream')
+    n_components_prop = context.properties.get("n_components", 0.95)
+    try:
+        n_components = float(n_components_prop)
+    except (TypeError, ValueError) as err:
+        raise ValueError("n_components must be an int (# components) or a float in (0, 1) (variance ratio)") from err
+    # PCA accepts a float only strictly inside (0, 1); exactly 1 must be passed as the integer 1
+    # (one component), so the int conversion starts at 1 inclusive
+    if n_components >= 1:
+        n_components = int(n_components)
+    scaler = sklearn.preprocessing.StandardScaler()
+    x_scaled = scaler.fit_transform(df)
+    pca = sklearn.decomposition.PCA(n_components=n_components)
+    pca.fit(x_scaled)
+    monitor = HotellingT2Monitor(
+        scaler=scaler,
+        pca=pca,
+        feature_cols=list(df.columns),
+    )
+    context.set_model(monitor, features=df.columns)
+def ridge_regression(context: wizata_dsapi.Context):
+    """Fit a Ridge (L2-regularized) linear regression that predicts the target column from every
+    other dataframe column. Intended as a drop-in replacement for linear_regression when features
+    are highly correlated.
+    Property:
+      - alpha: L2 regularization strength (default 1.0 — larger means more shrinkage)
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if the model config declares no target feat.
+    """
+    frame = context.dataframe
+    if not context.get_model_config().has_target_feat():
+        raise ValueError('ridge_regression requires a target_feat')
+    target = context.properties["target_feat"]
+    inputs = frame.drop(columns=[target])
+    labels = frame[target]
+    strength = float(context.properties.get("alpha", 1.0))
+    ridge = sklearn.linear_model.Ridge(alpha=strength)
+    ridge.fit(inputs, labels)
+    context.set_model(ridge, features=inputs.columns)
+def random_forest_classifier(context: wizata_dsapi.Context):
+    """Fit a Random Forest Classifier that predicts the target class column from every other
+    dataframe column. The trained sklearn model exposes feature_importances_ for interpretability.
+    Properties:
+      - n_estimators: number of trees (default 100)
+      - max_depth: max depth per tree (default None = unbounded; '' and 'None' are read as None)
+      - random_state: RNG seed for reproducibility (default 0)
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if the model config declares no target feat.
+    """
+    frame = context.dataframe
+    if not context.get_model_config().has_target_feat():
+        raise ValueError('random_forest_classifier requires a target_feat')
+    target = context.properties["target_feat"]
+    inputs = frame.drop(columns=[target])
+    labels = frame[target]
+    tree_count = int(context.properties.get("n_estimators", 100))
+    raw_depth = context.properties.get("max_depth")
+    if raw_depth in (None, "", "None"):
+        # absent or blank depth means unbounded trees
+        depth = None
+    else:
+        depth = int(raw_depth)
+    seed = int(context.properties.get("random_state", 0))
+    forest = sklearn.ensemble.RandomForestClassifier(
+        n_estimators=tree_count,
+        max_depth=depth,
+        random_state=seed,
+    )
+    forest.fit(inputs, labels)
+    context.set_model(forest, features=inputs.columns)
+class AutoencoderAnomaly:
+    """
+    MLP-based bottleneck autoencoder: the network is trained to reconstruct its own (scaled) input
+    and, at inference, the per-row squared reconstruction error is returned as one anomaly score.
+    Built on sklearn's MLPRegressor fitted with fit(X, X), which avoids a torch dependency; symmetric
+    hidden_layer_sizes such as (16, 8, 16) give a bottleneck of 8.
+    :ivar scaler: fitted sklearn StandardScaler.
+    :ivar mlp: fitted sklearn MLPRegressor used as encoder/decoder.
+    :ivar feature_cols: ordered list of feature columns used at training.
+    """
+    def __init__(self, scaler, mlp, feature_cols):
+        self.feature_cols = list(feature_cols)
+        self.scaler = scaler
+        self.mlp = mlp
+    def predict(self, X):
+        """Return the anomaly score (squared reconstruction error) of each row as a 1-D array."""
+        frame = X if isinstance(X, pandas.DataFrame) else pandas.DataFrame(X, columns=self.feature_cols)
+        scaled = self.scaler.transform(frame)
+        # the error is measured in scaled space, between the input and the network output
+        residual = scaled - self.mlp.predict(scaled)
+        return (residual ** 2).sum(axis=1)
+    def get_inference_contract(self):
+        """Self-describing contract — platform reads this to override MLModelConfig at runtime."""
+        contract = {"function": "predict"}
+        contract["output_columns_names"] = ["anomaly_score"]
+        return contract
+def autoencoder_anomaly(context: wizata_dsapi.Context):
+    """Fit an MLP bottleneck autoencoder on normal-operation data and store it as an
+    AutoencoderAnomaly, which returns one 'anomaly_score' per row at inference (squared
+    reconstruction error on scaled features).
+    Unsupervised: do **not** set target_feat. NaN values must be handled upstream.
+    Properties:
+      - hidden_layer_sizes: tuple/list of hidden layer widths (default (16, 8, 16) = bottleneck 8).
+        Pass as a comma-separated string from the UI (e.g. '16,8,16').
+      - max_iter: training iterations (default 500)
+      - random_state: RNG seed for reproducibility (default 0)
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if target_feat is set, the dataframe contains NaN, or hidden_layer_sizes has an unsupported type.
+    """
+    frame = context.dataframe
+    if context.get_model_config().has_target_feat():
+        raise ValueError('autoencoder_anomaly is unsupervised — do not set target_feat')
+    if frame.isna().any().any():
+        raise ValueError('autoencoder_anomaly cannot handle NaN values — run interpolate or fillna upstream')
+    raw_layers = context.properties.get("hidden_layer_sizes", (16, 8, 16))
+    if isinstance(raw_layers, str):
+        # comma-separated widths, blank entries are skipped
+        tokens = [token.strip() for token in raw_layers.split(",")]
+        layers = tuple(int(token) for token in tokens if token)
+    elif isinstance(raw_layers, (list, tuple)):
+        layers = tuple(map(int, raw_layers))
+    else:
+        raise ValueError("hidden_layer_sizes must be a tuple, list, or comma-separated string")
+    iterations = int(context.properties.get("max_iter", 500))
+    seed = int(context.properties.get("random_state", 0))
+    scaler = sklearn.preprocessing.StandardScaler()
+    scaled = scaler.fit_transform(frame)
+    network = sklearn.neural_network.MLPRegressor(
+        hidden_layer_sizes=layers,
+        max_iter=iterations,
+        random_state=seed,
+    )
+    # the scaled input is both the features and the regression target
+    network.fit(scaled, scaled)
+    detector = AutoencoderAnomaly(scaler=scaler, mlp=network, feature_cols=list(frame.columns))
+    context.set_model(detector, features=frame.columns)
+class MahalanobisAnomaly:
+    """
+    Lightweight multivariate anomaly detector based on Mahalanobis distance from training mean.
+    Uses the location and covariance estimated by sklearn's EmpiricalCovariance (the plain
+    maximum-likelihood estimate, not an outlier-robust one). Complements hotelling_t2_monitor —
+    same underlying idea but no PCA reduction, giving a single interpretable distance score per row.
+    :ivar covariance: fitted sklearn EmpiricalCovariance.
+    :ivar feature_cols: ordered list of feature columns used at training.
+    """
+    def __init__(self, covariance, feature_cols):
+        self.covariance = covariance
+        self.feature_cols = list(feature_cols)
+    def predict(self, X):
+        """Return Mahalanobis distance per row (1-D array).
+        A DataFrame input is re-selected by name into the training column order, so its columns
+        may arrive in any order and extra columns are ignored.
+        :param X: features matrix (DataFrame or ndarray) with columns matching feature_cols.
+        :raises KeyError: if X is a DataFrame lacking one of feature_cols.
+        """
+        if isinstance(X, pandas.DataFrame):
+            # the estimator receives bare values, so nothing downstream can detect a column mix-up
+            X = X[self.feature_cols]
+        else:
+            X = pandas.DataFrame(X, columns=self.feature_cols)
+        # mahalanobis() yields squared distances, hence the square root
+        return numpy.sqrt(self.covariance.mahalanobis(X.values))
+    def get_inference_contract(self):
+        """Self-describing contract — platform reads this to override MLModelConfig at runtime."""
+        return {
+            "function": "predict",
+            "output_columns_names": ["mahalanobis_distance"],
+        }
+def mahalanobis_anomaly(context: wizata_dsapi.Context):
+    """Fit a Mahalanobis-distance anomaly detector on normal-operation data and store it as a
+    MahalanobisAnomaly, which returns one 'mahalanobis_distance' per row at inference — a larger
+    distance means a more anomalous row. No dimensionality reduction is applied (unlike
+    hotelling_t2_monitor).
+    Unsupervised: do **not** set target_feat. NaN values must be handled upstream.
+    Properties: (none — uses sklearn's EmpiricalCovariance defaults)
+    :param context: execution context giving access to the dataframe and the model config.
+    :raises ValueError: if target_feat is set or the dataframe contains NaN.
+    """
+    frame = context.dataframe
+    if context.get_model_config().has_target_feat():
+        raise ValueError('mahalanobis_anomaly is unsupervised — do not set target_feat')
+    if frame.isna().any().any():
+        raise ValueError('mahalanobis_anomaly cannot handle NaN values — run interpolate or fillna upstream')
+    estimator = sklearn.covariance.EmpiricalCovariance()
+    # fitted on the bare values, so the estimator itself keeps no column names
+    estimator.fit(frame.values)
+    detector = MahalanobisAnomaly(covariance=estimator, feature_cols=list(frame.columns))
+    context.set_model(detector, features=frame.columns)
+class GMMRegimeClassifier:
+    """
+    Gaussian Mixture Model used for process-regime detection: at inference each row is assigned
+    to its most likely regime (cluster index 0..n_components-1).
+    :ivar gmm: fitted sklearn GaussianMixture.
+    :ivar scaler: fitted sklearn StandardScaler (so clusters are scale-invariant).
+    :ivar feature_cols: ordered list of feature columns used at training.
+    """
+    def __init__(self, gmm, scaler, feature_cols):
+        self.feature_cols = list(feature_cols)
+        self.scaler = scaler
+        self.gmm = gmm
+    def predict(self, X):
+        """Return the most likely regime index per row as a 1-D array of floats."""
+        frame = X if isinstance(X, pandas.DataFrame) else pandas.DataFrame(X, columns=self.feature_cols)
+        scaled = self.scaler.transform(frame)
+        labels = self.gmm.predict(scaled)
+        # the cluster indices are returned cast to float
+        return labels.astype(float)
+    def get_inference_contract(self):
+        """Self-describing contract — platform reads this to override MLModelConfig at runtime."""
+        contract = {"function": "predict"}
+        contract["output_columns_names"] = ["regime"]
+        return contract
+def gmm_regime_classifier(context: wizata_dsapi.Context):
+    """Fit a Gaussian Mixture Model on scaled multi-sensor data and store it as a
+    GMMRegimeClassifier, which returns one 'regime' value per row at inference (cluster index
+    0..n_components-1). Meant for separating operating modes (e.g. startup / steady / shutdown)
+    without labels.
+    Unsupervised: do **not** set target_feat. NaN values must be handled upstream.
+    Properties:
+      - n_components: number of regimes / clusters (default 3)
+      - covariance_type: 'full' (default), 'tied', 'diag', or 'spherical'
+      - random_state: RNG seed for reproducibility (default 0)
+    :param context: execution context giving access to the dataframe, the properties and the model config.
+    :raises ValueError: if target_feat is set, the dataframe contains NaN, or covariance_type is not supported.
+    """
+    frame = context.dataframe
+    if context.get_model_config().has_target_feat():
+        raise ValueError('gmm_regime_classifier is unsupervised — do not set target_feat')
+    if frame.isna().any().any():
+        raise ValueError('gmm_regime_classifier cannot handle NaN values — run interpolate or fillna upstream')
+    regime_count = int(context.properties.get("n_components", 3))
+    covariance_type = context.properties.get("covariance_type", "full")
+    if covariance_type not in ("full", "tied", "diag", "spherical"):
+        raise ValueError(f"covariance_type must be 'full', 'tied', 'diag', or 'spherical', got '{covariance_type}'")
+    seed = int(context.properties.get("random_state", 0))
+    scaler = sklearn.preprocessing.StandardScaler()
+    scaled = scaler.fit_transform(frame)
+    mixture = sklearn.mixture.GaussianMixture(
+        n_components=regime_count,
+        covariance_type=covariance_type,
+        random_state=seed,
+    )
+    mixture.fit(scaled)
+    wrapped = GMMRegimeClassifier(gmm=mixture, scaler=scaler, feature_cols=list(frame.columns))
+    context.set_model(wrapped, features=frame.columns)

wizata_dsapi-2.0.0.dev26/wizata_dsapi/version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "2.0.0.dev26"

{wizata_dsapi-2.0.0.dev24 → wizata_dsapi-2.0.0.dev26/wizata_dsapi.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wizata_dsapi
-Version: 2.0.0.dev24
+Version: 2.0.0.dev26
 Summary: Wizata Data Science Toolkit
 Author: Wizata S.A.
 Author-email: info@wizata.com

wizata_dsapi-2.0.0.dev24/wizata_dsapi/models/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- from .common import linear_regression, logistic_regression, isolation_forest, gradiant_boost_classifier

wizata_dsapi-2.0.0.dev24/wizata_dsapi/models/common.py DELETED Viewed

@@ -1,116 +0,0 @@
-import wizata_dsapi
-import pandas
-import numpy
-import sklearn
-import sklearn.linear_model
-import sklearn.ensemble
-def extract_target_feat(context: wizata_dsapi.Context, single: bool = True):
-    """
-    return a list of target_feat columns names if not single value or the single value target feat name
-    raise an error if configuration mismatch
-    """
-    if "target_feat" not in context.properties:
-        raise ValueError(f"training script requires a proper target_feat")
-    target_feat = context.properties["target_feat"]
-    if isinstance(target_feat, str):
-        if single:
-            return target_feat
-        else:
-            return [target_feat]
-    elif isinstance(target_feat, list):
-        if single:
-            if len(target_feat) == 1:
-                return target_feat[0]
-            else:
-                raise ValueError(f"expecting only one target_feat but found {len(target_feat)}")
-        else:
-            return [target_feat]
-    else:
-        raise TypeError(f'target_feat must be a str or a list of str but found {target_feat.__class__.__name__}')
-def linear_regression(context: wizata_dsapi.Context):
-    """Train a linear regression model on all features to predict a single target column."""
-    df = context.dataframe
-    model_config = context.get_model_config()
-    if not model_config.has_target_feat():
-        raise ValueError(f'linear_regression requires a target feat')
-    target_feat_name = context.properties["target_feat"]
-    x = df.drop(columns=[target_feat_name])
-    y = df[target_feat_name]
-    model = sklearn.linear_model.LinearRegression()
-    model.fit(x, y)
-    context.set_model(model, features=x.columns)
-def logistic_regression(context: wizata_dsapi.Context):
-    """Train a logistic regression classifier on all features to predict a binary target column."""
-    df = context.dataframe
-    model_config = context.get_model_config()
-    if not model_config.has_target_feat():
-        raise ValueError(f'logistic_regression requires a target feat')
-    target_feat_name = context.properties["target_feat"]
-    x = df.drop(columns=[target_feat_name])
-    y = df[target_feat_name]
-    model = sklearn.linear_model.LogisticRegression()
-    model.fit(x, y.astype(int))
-    context.set_model(model, features=x.columns)
-def isolation_forest(context: wizata_dsapi.Context):
-    """Train an Isolation Forest for unsupervised anomaly detection using a sensitivity level (1-5)."""
-    model_config = context.get_model_config()
-    if model_config.has_target_feat():
-        raise ValueError(f'isolation_forest does not requires a target feat')
-    try:
-        if context.properties['sensitivity'] is None:
-            raise KeyError("sensitivity is none")
-        sensitivity = int(context.properties['sensitivity'])
-        sensitivities = [0.05, 0.15, 0.25, 0.35, 0.4]
-        contamination = sensitivities[sensitivity - 1]
-    except Exception as e:
-        raise ValueError(f'cannot extract sensitivity integer from 0 to 4 due to {e}')
-    df = context.dataframe.copy()
-    model = sklearn.ensemble.IsolationForest(contamination=contamination)
-    df['isolation_forest_predict'] = model.fit_predict(df)
-    context.set_model(model, features=df.columns)
-    return df
-def gradiant_boost_classifier(context: wizata_dsapi.Context):
-    """Train a Gradient Boosting classifier on all features to predict a target column."""
-    df = context.dataframe
-    model_config = context.get_model_config()
-    if not model_config.has_target_feat():
-        raise ValueError(f'gradiant_boost_classifier requires a target feat')
-    target_feat_name = context.properties["target_feat"]
-    x = df.drop(columns=[target_feat_name])
-    y = df[target_feat_name]
-    model = sklearn.ensemble.GradientBoostingClassifier(random_state=0).fit(x, y)
-    context.set_model(model, features=df.columns)