segmentae-1.5.20-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
+ from enum import Enum
+ from typing import Dict
+
+
+ class PhaseType(str, Enum):
+     """Pipeline execution phases for SegmentAE reconstruction workflow."""
+
+     EVALUATION = "evaluation"
+     TESTING = "testing"
+     PREDICTION = "prediction"
+
+ class ClusterModel(str, Enum):
+     """Available clustering algorithms for data segmentation."""
+
+     KMEANS = "KMeans"
+     MINIBATCH_KMEANS = "MiniBatchKMeans"
+     GMM = "GMM"
+     AGGLOMERATIVE = "Agglomerative"
+
+ class ThresholdMetric(str, Enum):
+     """Reconstruction error metrics for anomaly detection thresholding."""
+
+     MSE = "mse"
+     MAE = "mae"
+     RMSE = "rmse"
+     MAX_ERROR = "max_error"
+
+ class EncoderType(str, Enum):
+     """Categorical variable encoding methods."""
+
+     IFREQUENCY = "IFrequencyEncoder"
+     LABEL = "LabelEncoder"
+     ONEHOT = "OneHotEncoder"
+
+ class ScalerType(str, Enum):
+     """Feature scaling methods for numerical normalization."""
+
+     MINMAX = "MinMaxScaler"
+     STANDARD = "StandardScaler"
+     ROBUST = "RobustScaler"
+
+ class ImputerType(str, Enum):
+     """Missing value imputation methods."""
+
+     SIMPLE = "Simple"
+
+ # Mapping dictionaries
+ METRIC_COLUMN_MAP: Dict[ThresholdMetric, str] = {
+     ThresholdMetric.MSE: "MSE_Recons_error",
+     ThresholdMetric.MAE: "MAE_Recons_error",
+     ThresholdMetric.RMSE: "RMSE_Recons_error",
+     ThresholdMetric.MAX_ERROR: "Max_Recons_error"
+ }
+
+ METRIC_NAME_MAP: Dict[str, ThresholdMetric] = {
+     "mse": ThresholdMetric.MSE,
+     "mae": ThresholdMetric.MAE,
+     "rmse": ThresholdMetric.RMSE,
+     "max_error": ThresholdMetric.MAX_ERROR
+ }
+
+ ENCODER_CLASS_MAP: Dict[EncoderType, str] = {
+     EncoderType.IFREQUENCY: "AutoIFrequencyEncoder",
+     EncoderType.LABEL: "AutoLabelEncoder",
+     EncoderType.ONEHOT: "AutoOneHotEncoder"
+ }
+
+ SCALER_CLASS_MAP: Dict[ScalerType, str] = {
+     ScalerType.MINMAX: "AutoMinMaxScaler",
+     ScalerType.STANDARD: "AutoStandardScaler",
+     ScalerType.ROBUST: "AutoRobustScaler"
+ }
+
+ IMPUTER_CLASS_MAP: Dict[ImputerType, str] = {
+     ImputerType.SIMPLE: "AutoSimpleImputer"
+ }
+
+ def get_metric_column_name(metric: ThresholdMetric) -> str:
+     """Return the reconstruction-error column name associated with a threshold metric."""
+     return METRIC_COLUMN_MAP[metric]
+
+ def parse_threshold_metric(metric_str: str) -> ThresholdMetric:
+     """Parse a metric name string (case-insensitive) into a ThresholdMetric member."""
+     metric_lower = metric_str.lower()
+     if metric_lower not in METRIC_NAME_MAP:
+         valid_metrics = ", ".join(METRIC_NAME_MAP.keys())
+         raise ValueError(
+             f"Unknown threshold metric: '{metric_str}'. "
+             f"Valid options are: {valid_metrics}"
+         )
+     return METRIC_NAME_MAP[metric_lower]
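A brief usage sketch of the two helper functions above (illustrative only; the import path segmentae.core.constants is an assumption, since the diff does not show file paths):

    from segmentae.core.constants import get_metric_column_name, parse_threshold_metric

    metric = parse_threshold_metric("RMSE")   # case-insensitive lookup -> ThresholdMetric.RMSE
    print(get_metric_column_name(metric))     # -> "RMSE_Recons_error"

    try:
        parse_threshold_metric("mape")        # not a valid metric name
    except ValueError as exc:
        print(exc)                            # message lists the valid options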
@@ -0,0 +1,60 @@
+ class SegmentAEError(Exception):
+     """Base exception class for all SegmentAE errors."""
+
+     def __init__(self, message: str):
+         self.message = message
+         super().__init__(self.message)
+
+
+ class ClusteringError(SegmentAEError):
+     """Exception raised for clustering-related errors."""
+
+     def __init__(self, message: str):
+         super().__init__(f"Clustering Error: {message}")
+
+
+ class ReconstructionError(SegmentAEError):
+     """Exception raised for reconstruction-related errors."""
+
+     def __init__(self, message: str):
+         super().__init__(f"Reconstruction Error: {message}")
+
+
+ class ValidationError(SegmentAEError):
+     """Exception raised for input validation errors."""
+
+     def __init__(self, message: str, suggestion: str = None):
+         error_msg = f"Validation Error: {message}"
+         if suggestion:
+             error_msg += f"\nSuggestion: {suggestion}"
+         super().__init__(error_msg)
+
+
+ class ModelNotFittedError(SegmentAEError):
+     """Exception raised when attempting to use a model before fitting."""
+
+     def __init__(self, message: str = None, component: str = "Model"):
+         if message is None:
+             message = (
+                 f"{component} must be fitted before use. "
+                 "Please call fit() or the appropriate fitting method first."
+             )
+         super().__init__(message)
+
+
+ class ConfigurationError(SegmentAEError):
+     """Exception raised for invalid configuration parameters."""
+
+     def __init__(self, message: str, valid_options: list = None):
+         error_msg = f"Configuration Error: {message}"
+         if valid_options:
+             options_str = ", ".join(str(opt) for opt in valid_options)
+             error_msg += f"\nValid options: {options_str}"
+         super().__init__(error_msg)
+
+
+ class AutoencoderError(SegmentAEError):
+     """Exception raised for autoencoder-related errors."""
+
+     def __init__(self, message: str):
+         super().__init__(f"Autoencoder Error: {message}")
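Because every class above derives from SegmentAEError, callers can catch the whole family with one except clause. A hedged sketch (the check_split_ratio helper and the import path are illustrative assumptions, not part of the package):

    from segmentae.core.exceptions import SegmentAEError, ValidationError  # assumed module path

    def check_split_ratio(ratio: float) -> None:
        if not 0.0 < ratio < 1.0:
            raise ValidationError(
                f"split_ratio must be between 0 and 1, got {ratio}",
                suggestion="Use a value such as 0.8",
            )

    try:
        check_split_ratio(1.5)
    except SegmentAEError as exc:   # also catches ValidationError and every other subclass
        print(exc)                  # "Validation Error: ..." plus the suggestion line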
@@ -0,0 +1,55 @@
+ from typing import Any, Dict, List, Protocol, Union
+
+ import numpy as np
+ import pandas as pd
+
+ # Type aliases for commonly used types
+ DataFrame = pd.DataFrame
+ Series = pd.Series
+ NDArray = np.ndarray
+ DictStrAny = Dict[str, Any]
+
+
+ class AutoencoderProtocol(Protocol):
+     """
+     Protocol defining the interface for autoencoder models.
+     """
+
+     def predict(self, X: Union[DataFrame, NDArray]) -> NDArray:
+         """
+         Generate reconstructions from input data.
+         """
+         ...
+
+
+ class ClusterModelProtocol(Protocol):
+     """
+     Protocol defining the interface for clustering models.
+     """
+
+     def fit(self, X: DataFrame) -> None:
+         """Fit clustering model to data."""
+         ...
+
+     def predict(self, X: DataFrame) -> NDArray:
+         """Predict cluster assignments."""
+         ...
+
+     @property
+     def n_clusters(self) -> int:
+         """Number of clusters."""
+         ...
+
+
+ class PreprocessorProtocol(Protocol):
+     """
+     Protocol defining the interface for preprocessing components.
+     """
+
+     def fit(self, X: DataFrame) -> 'PreprocessorProtocol':
+         """Fit preprocessor to data."""
+         ...
+
+     def transform(self, X: DataFrame) -> DataFrame:
+         """Transform data using fitted preprocessor."""
+         ...
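Since these are typing.Protocol classes, any object exposing the matching methods satisfies them structurally; no inheritance is required. A minimal sketch of an adapter that type-checks as ClusterModelProtocol (the KMeansAdapter class is hypothetical and not shipped in the package):

    import numpy as np
    import pandas as pd
    from sklearn.cluster import KMeans

    class KMeansAdapter:
        """Wraps scikit-learn KMeans behind the ClusterModelProtocol interface."""

        def __init__(self, n_clusters: int = 3):
            self._model = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)

        def fit(self, X: pd.DataFrame) -> None:
            self._model.fit(X)

        def predict(self, X: pd.DataFrame) -> np.ndarray:
            return self._model.predict(X)

        @property
        def n_clusters(self) -> int:
            return self._model.n_clusters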
@@ -0,0 +1,3 @@
+ from segmentae.data_sources.examples import load_dataset
+
+ __all__ = ['load_dataset']
@@ -0,0 +1,198 @@
+ from typing import Optional, Tuple
+
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from ucimlrepo import fetch_ucirepo
+
+
+ def load_dataset(
+     dataset_selection: str = 'htru2_dataset',
+     split_ratio: float = 0.8,
+     random_state: Optional[int] = 5
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
+     """
+     Load and preprocess datasets for anomaly detection tasks.
+
+     Provides access to several benchmark anomaly detection datasets including
+     credit card defaults, shuttle data, and pulsar data.
+     """
+
+     # Handle different dataset selections
+     if dataset_selection == "default_credit_card":
+         return _load_credit_card_dataset(split_ratio, random_state)
+
+     elif dataset_selection == "shuttle_148":
+         return _load_shuttle_dataset(split_ratio, random_state)
+
+     elif dataset_selection == "htru2_dataset":
+         return _load_htru2_dataset(split_ratio, random_state)
+
+     else:
+         raise ValueError(
+             f"Unknown dataset: '{dataset_selection}'. "
+             f"Available options: 'default_credit_card', 'shuttle_148', 'htru2_dataset'"
+         )
+
+ def _load_credit_card_dataset(
+     split_ratio: float,
+     random_state: Optional[int]
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
+     """Load the default of credit card clients dataset."""
+     # Source: https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients
+
+     # This research aimed at the case of customers' default payments in Taiwan and
+     # compares the predictive accuracy of probability of default among six data
+     # mining methods.
+
+     # Fetch dataset
+     info = fetch_ucirepo(id=350)
+
+     # Concatenate features and targets
+     data = pd.concat([info.data.features, info.data.targets], axis=1)
+     target = 'Y'
+
+     # Cast target values to integer
+     data[target] = data[target].astype(int)
+
+     # Separate normal and fraud instances
+     normal = data[data[target] == 0]
+     fraud = data[data[target] == 1]
+
+     # Split normal instances into training and testing sets
+     train, test = train_test_split(
+         normal,
+         train_size=split_ratio,
+         random_state=random_state
+     )
+
+     # Combine testing set with fraud instances and shuffle
+     test = pd.concat([test, fraud])
+     test = test.sample(frac=1, random_state=42)
+
+     # Reset index
+     train = train.reset_index(drop=True)
+     test = test.reset_index(drop=True)
+
+     # Print information
+     _print_dataset_info(train, test, target, suggested_split=0.75)
+
+     return train, test, target
+
+ def _load_shuttle_dataset(
+     split_ratio: float,
+     random_state: Optional[int]
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
+     """Load the Statlog Shuttle dataset."""
+     # Source: https://archive.ics.uci.edu/dataset/148/statlog+shuttle
+
+     # The shuttle dataset contains 9 attributes, all numerical. Approximately 80% of the data belongs to class 1.
+
+     # Fetch dataset
+     info = fetch_ucirepo(id=148)
+
+     # Concatenate features and targets
+     data = pd.concat([
+         info.data.features.reset_index(drop=True),
+         info.data.targets.reset_index(drop=True)
+     ], axis=1)
+
+     target = 'class'
+
+     # Adjust target values to binary (1=normal, others=anomaly)
+     data[target] = data[target].replace({1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1})
+     data[target] = data[target].astype(int)
+
+     # Separate normal and anomaly instances
+     normal = data[data[target] == 0]
+     anomaly = data[data[target] == 1]
+
+     # Split normal instances
+     train, test = train_test_split(
+         normal,
+         train_size=split_ratio,
+         random_state=random_state
+     )
+
+     # Combine and shuffle
+     test = pd.concat([test, anomaly])
+     test = test.sample(frac=1, random_state=42)
+
+     # Reset index
+     train = train.reset_index(drop=True)
+     test = test.reset_index(drop=True)
+
+     # Print information
+     _print_dataset_info(train, test, target, suggested_split=0.75)
+
+     return train, test, target
+
+ def _load_htru2_dataset(
+     split_ratio: float,
+     random_state: Optional[int]
+ ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
+     """Load the HTRU2 pulsar dataset."""
+     # Source: https://archive.ics.uci.edu/dataset/372/htru2
+
+     # Reference: R. J. Lyon, B. W. Stappers, S. Cooper, J. M. Brooke, J. D. Knowles,
+     # "Fifty Years of Pulsar Candidate Selection: From simple filters to a new
+     # principled real-time classification approach",
+     # Monthly Notices of the Royal Astronomical Society 459 (1), 1104-1123,
+     # DOI: 10.1093/mnras/stw656
+
+     # Fetch dataset
+     info = fetch_ucirepo(id=372)
+
+     # Concatenate features and targets
+     data = pd.concat([
+         info.data.features.reset_index(drop=True),
+         info.data.targets.reset_index(drop=True)
+     ], axis=1)
+
+     target = 'class'
+
+     # Cast target values to integer
+     data[target] = data[target].astype(int)
+
+     # Separate normal and anomaly instances
+     normal = data[data[target] == 0]
+     anomaly = data[data[target] == 1]
+
+     # Split normal instances
+     train, test = train_test_split(
+         normal,
+         train_size=split_ratio,
+         random_state=random_state
+     )
+
+     # Combine and shuffle
+     test = pd.concat([test, anomaly])
+     test = test.sample(frac=1, random_state=42)
+
+     # Reset index
+     train = train.reset_index(drop=True)
+     test = test.reset_index(drop=True)
+
+     # Print information
+     _print_dataset_info(train, test, target, suggested_split=0.9)
+
+     return train, test, target
+
+ def _print_dataset_info(
+     train: pd.DataFrame,
+     test: pd.DataFrame,
+     target: str,
+     suggested_split: float
+ ) -> None:
+     """Print dataset information."""
+     info = {
+         "Train Length": len(train),
+         "Test Length": len(test),
+         "Suggested Split_Ratio": suggested_split
+     }
+
+     # Add target distribution
+     for key, value in test[target].value_counts().to_dict().items():
+         label = "Anomalies [1]" if key == 1 else "Normal [0]"
+         info[label] = value
+
+     print(info)
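A minimal call sketch for load_dataset, assuming the data_sources __init__ re-export shown earlier and network access to the UCI repository via ucimlrepo:

    from segmentae.data_sources import load_dataset

    train, test, target = load_dataset(
        dataset_selection="htru2_dataset",  # or "default_credit_card", "shuttle_148"
        split_ratio=0.8,
        random_state=5,
    )
    # train contains only normal rows; test mixes held-out normal rows with all anomalies
    print(train.shape, test.shape, target)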
@@ -0,0 +1,6 @@
+ from segmentae.metrics.performance_metrics import metrics_classification, metrics_regression
+
+ __all__ = [
+     'metrics_classification',
+     'metrics_regression'
+ ]
@@ -0,0 +1,119 @@
+ from typing import Union
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.metrics import (
+     accuracy_score,
+     f1_score,
+     max_error,
+     mean_absolute_error,
+     mean_squared_error,
+     precision_score,
+     r2_score,
+     recall_score,
+     root_mean_squared_error,
+ )
+
+ from segmentae.core.exceptions import ValidationError
+
+
+ def metrics_classification(
+     y_true: Union[pd.Series, np.ndarray],
+     y_pred: Union[pd.Series, np.ndarray]
+ ) -> pd.DataFrame:
+     """
+     Calculate classification evaluation metrics.
+
+     Returns:
+         DataFrame containing accuracy, precision, recall, and F1 score metrics
+     """
+     # Validate inputs
+     _validate_classification_inputs(y_true, y_pred)
+
+     # Calculate metrics with zero_division handling
+     accuracy = accuracy_score(y_true, y_pred)
+     precision = precision_score(y_true, y_pred, zero_division=0)
+     recall = recall_score(y_true, y_pred, zero_division=0)
+     f1 = f1_score(y_true, y_pred, zero_division=0)
+
+     # Create metrics dictionary
+     metrics = {
+         'Accuracy': accuracy,
+         'Precision': precision,
+         'Recall': recall,
+         'F1 Score': f1
+     }
+
+     # Convert to DataFrame
+     return pd.DataFrame(metrics, index=[0])
+
+
+ def metrics_regression(
+     y_true: Union[pd.Series, np.ndarray],
+     y_pred: Union[pd.Series, np.ndarray]
+ ) -> pd.DataFrame:
+     """
+     Calculate regression evaluation metrics.
+
+     Returns:
+         DataFrame containing MAE, MSE, RMSE, R², and Max Error metrics
+     """
+     # Validate inputs
+     _validate_regression_inputs(y_true, y_pred)
+
+     # Calculate metrics
+     mae = mean_absolute_error(y_true, y_pred)
+     mse = mean_squared_error(y_true, y_pred)
+     rmse = root_mean_squared_error(y_true, y_pred)
+     r2 = r2_score(y_true, y_pred)
+     maxerror = max_error(y_true, y_pred)
+
+     # Create metrics dictionary
+     metrics = {
+         'Mean Absolute Error': mae,
+         'Mean Squared Error': mse,
+         'Root Mean Squared Error': rmse,
+         'R-squared': r2,
+         'Max Error': maxerror
+     }
+
+     # Convert to DataFrame
+     return pd.DataFrame(metrics, index=[0])
+
+
+ def _validate_classification_inputs(
+     y_true: Union[pd.Series, np.ndarray],
+     y_pred: Union[pd.Series, np.ndarray]
+ ) -> None:
+     """Validate inputs for classification metrics."""
+     if len(y_true) != len(y_pred):
+         raise ValidationError(
+             f"Length mismatch: y_true has {len(y_true)} samples, "
+             f"y_pred has {len(y_pred)} samples",
+             suggestion="Ensure both arrays have the same number of samples"
+         )
+
+     if len(y_true) == 0:
+         raise ValidationError(
+             "Empty arrays provided",
+             suggestion="Provide non-empty arrays with predictions"
+         )
+
+
+ def _validate_regression_inputs(
+     y_true: Union[pd.Series, np.ndarray],
+     y_pred: Union[pd.Series, np.ndarray]
+ ) -> None:
+     """Validate inputs for regression metrics."""
+     if len(y_true) != len(y_pred):
+         raise ValidationError(
+             f"Length mismatch: y_true has {len(y_true)} samples, "
+             f"y_pred has {len(y_pred)} samples",
+             suggestion="Ensure both arrays have the same number of samples"
+         )
+
+     if len(y_true) == 0:
+         raise ValidationError(
+             "Empty arrays provided",
+             suggestion="Provide non-empty arrays with predictions"
+         )
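An illustrative call to metrics_classification on a toy prediction vector, using the re-export from the metrics package __init__ shown above:

    import numpy as np
    from segmentae.metrics import metrics_classification

    y_true = np.array([0, 0, 1, 1, 1])
    y_pred = np.array([0, 1, 1, 1, 0])
    print(metrics_classification(y_true, y_pred))
    # One-row DataFrame with Accuracy, Precision, Recall, and F1 Score columns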
@@ -0,0 +1,6 @@
+ from segmentae.optimization.optimizer import OptimizerConfig, SegmentAE_Optimizer
+
+ __all__ = [
+     'SegmentAE_Optimizer',
+     'OptimizerConfig'
+ ]