PyPI - smallaxe - Versions diffs - 0.1.0__py3-none-any.whl - Mend

smallaxe 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

smallaxe/__init__.py +157 -0
smallaxe/_config.py +37 -0
smallaxe/auto/__init__.py +1 -0
smallaxe/datasets/__init__.py +13 -0
smallaxe/datasets/_data.py +240 -0
smallaxe/exceptions/__init__.py +120 -0
smallaxe/metrics/__init__.py +35 -0
smallaxe/metrics/classification.py +241 -0
smallaxe/metrics/regression.py +301 -0
smallaxe/pipeline/__init__.py +5 -0
smallaxe/pipeline/pipeline.py +691 -0
smallaxe/preprocessing/__init__.py +11 -0
smallaxe/preprocessing/encoder.py +410 -0
smallaxe/preprocessing/imputer.py +327 -0
smallaxe/preprocessing/scaler.py +285 -0
smallaxe/search/__init__.py +1 -0
smallaxe/training/__init__.py +16 -0
smallaxe/training/base.py +764 -0
smallaxe/training/classifiers.py +127 -0
smallaxe/training/mixins/__init__.py +15 -0
smallaxe/training/mixins/metadata_mixin.py +158 -0
smallaxe/training/mixins/param_mixin.py +151 -0
smallaxe/training/mixins/persistence_mixin.py +164 -0
smallaxe/training/mixins/spark_model_mixin.py +255 -0
smallaxe/training/mixins/validation_mixin.py +228 -0
smallaxe/training/random_forest.py +198 -0
smallaxe/training/regressors.py +125 -0
smallaxe/viz/__init__.py +1 -0
smallaxe-0.1.0.dist-info/METADATA +204 -0
smallaxe-0.1.0.dist-info/RECORD +33 -0
smallaxe-0.1.0.dist-info/WHEEL +5 -0
smallaxe-0.1.0.dist-info/licenses/LICENSE +21 -0
smallaxe-0.1.0.dist-info/top_level.txt +1 -0

smallaxe/__init__.py ADDED Viewed

@@ -0,0 +1,157 @@
+"""
+smallaxe - A PySpark MLOps library for simplified model training and optimization.
+"""
+from contextlib import contextmanager
+from typing import Any, Generator, Optional
+from smallaxe._config import (
+    VALID_CACHE_STRATEGIES,
+    VALID_VERBOSITY_LEVELS,
+    _config,
+)
+from smallaxe.exceptions import ConfigurationError
+# Version string of the smallaxe package.
+__version__ = "0.1.0"
+# Public API of the top-level package: the version string plus the getters,
+# setters and context manager for verbosity, Spark session, seed and cache strategy.
+__all__ = [
+    "__version__",
+    "set_verbosity",
+    "get_verbosity",
+    "verbosity",
+    "set_spark_session",
+    "get_spark_session",
+    "set_seed",
+    "get_seed",
+    "set_cache_strategy",
+    "get_cache_strategy",
+]
+def set_verbosity(level: str) -> None:
+    """Change the package-wide verbosity level.
+    Args:
+        level: The verbosity to apply. Must be 'quiet', 'normal', or 'verbose'.
+            - 'quiet': Only errors, no progress bars or info messages
+            - 'normal': Progress bars and key info (default)
+            - 'verbose': Detailed logging for debugging
+    Raises:
+        ConfigurationError: If ``level`` is not one of the valid verbosity levels.
+    """
+    if level in VALID_VERBOSITY_LEVELS:
+        _config._verbosity = level
+        return
+    # Invalid input: report it together with the accepted values; state is untouched.
+    allowed = list(VALID_VERBOSITY_LEVELS)
+    raise ConfigurationError(setting="verbosity", value=level, allowed_values=allowed)
+def get_verbosity() -> str:
+    """Return the verbosity level currently stored in the global configuration.
+    Returns:
+        The active level: 'quiet', 'normal', or 'verbose'.
+    """
+    current_level = _config._verbosity
+    return current_level
+@contextmanager
+def verbosity(level: str) -> Generator[None, None, None]:
+    """Temporarily switch the verbosity level for the body of a ``with`` block.
+    Args:
+        level: Verbosity level that should be active inside the context.
+    Raises:
+        ConfigurationError: If ``level`` is not a valid verbosity level.
+    Example:
+        >>> with smallaxe.verbosity('quiet'):
+        ...     model.fit(df, label_col='target')  # runs silently
+    """
+    saved_level = _config._verbosity
+    # Validation happens here, so an invalid level raises before the body runs.
+    set_verbosity(level)
+    try:
+        yield
+    finally:
+        # Written back directly (no re-validation) so it is restored even if the body raised.
+        _config._verbosity = saved_level
+def set_spark_session(spark: Any) -> None:
+    """Register the Spark session that smallaxe should use.
+    The value is stored as given; no validation is performed here.
+    Args:
+        spark: A SparkSession instance. If None, smallaxe will attempt
+            to get or create a session when needed.
+    """
+    session = spark
+    _config._spark_session = session
+def get_spark_session() -> Optional[Any]:
+    """Return the Spark session held in the global configuration.
+    Returns:
+        The SparkSession previously registered, or None if none has been set.
+    """
+    session = _config._spark_session
+    return session
+def set_seed(seed: Optional[int]) -> None:
+    """Set the global random seed for reproducibility.
+    This affects all random operations including train/test splits,
+    k-fold cross-validation, and hyperopt sampling.
+    Args:
+        seed: Integer seed value, or None to reset to no seed.
+    Raises:
+        ConfigurationError: If seed is not an integer or None. Booleans are
+            rejected as well, even though ``bool`` is a subclass of ``int``.
+    """
+    # isinstance(True, int) is True in Python, so bool must be excluded explicitly;
+    # otherwise set_seed(True) would silently store True as the seed.
+    is_valid_int = isinstance(seed, int) and not isinstance(seed, bool)
+    if seed is not None and not is_valid_int:
+        raise ConfigurationError(
+            message=f"Seed must be an integer or None, got {type(seed).__name__}."
+        )
+    _config._seed = seed
+def get_seed() -> Optional[int]:
+    """Return the random seed held in the global configuration.
+    Returns:
+        The seed value, or None when no seed is set.
+    """
+    current_seed = _config._seed
+    return current_seed
+def set_cache_strategy(strategy: str) -> None:
+    """Choose the caching strategy for PySpark operations.
+    Args:
+        strategy: The strategy to apply. Must be 'auto', 'always', or 'never'.
+            - 'auto': Smart caching - cache after preprocessing, unpersist after training
+            - 'always': Cache at every stage (use for debugging or small datasets)
+            - 'never': No automatic caching (manual control)
+    Raises:
+        ConfigurationError: If ``strategy`` is not one of the valid cache strategies.
+    """
+    if strategy in VALID_CACHE_STRATEGIES:
+        _config._cache_strategy = strategy
+        return
+    # Invalid input: report it together with the accepted values; state is untouched.
+    allowed = list(VALID_CACHE_STRATEGIES)
+    raise ConfigurationError(setting="cache_strategy", value=strategy, allowed_values=allowed)
+def get_cache_strategy() -> str:
+    """Return the cache strategy currently stored in the global configuration.
+    Returns:
+        The active strategy: 'auto', 'always', or 'never'.
+    """
+    current_strategy = _config._cache_strategy
+    return current_strategy

smallaxe/_config.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""Internal configuration state for smallaxe."""
+from typing import Any, Optional
+# Valid configuration values: the public setters in smallaxe/__init__.py
+# reject any verbosity or cache strategy that is not in these tuples.
+VALID_VERBOSITY_LEVELS = ("quiet", "normal", "verbose")
+VALID_CACHE_STRATEGIES = ("auto", "always", "never")
+# Default configuration values, used by _Config both at construction and on reset().
+DEFAULT_VERBOSITY = "normal"
+DEFAULT_CACHE_STRATEGY = "auto"
+DEFAULT_SEED = None
+class _Config:
+    """Container for smallaxe's process-wide settings.
+    Holds the verbosity, cache strategy, seed and Spark session. Callers are
+    expected to go through the module-level functions rather than touching
+    this object directly.
+    """
+    def __init__(self) -> None:
+        # Initial state and reset state are the same set of defaults.
+        self.reset()
+    def reset(self) -> None:
+        """Restore every setting to its default value."""
+        self._verbosity: str = DEFAULT_VERBOSITY
+        self._cache_strategy: str = DEFAULT_CACHE_STRATEGY
+        self._seed: Optional[int] = DEFAULT_SEED
+        self._spark_session: Optional[Any] = None
+# Global configuration instance: the single shared _Config object that the
+# getters and setters in smallaxe/__init__.py read and write.
+_config = _Config()

smallaxe/auto/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Auto module - automated training."""

smallaxe/datasets/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""Sample datasets for testing and demos."""
+from smallaxe.datasets._data import (
+    dataset_info,
+    load_sample_classification,
+    load_sample_regression,
+)
+# Public API of smallaxe.datasets: the two sample-data loaders and the info printer.
+__all__ = [
+    "load_sample_regression",
+    "load_sample_classification",
+    "dataset_info",
+]

smallaxe/datasets/_data.py ADDED Viewed

@@ -0,0 +1,240 @@
+"""Raw data generators for sample datasets."""
+import random
+from typing import List, Tuple
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.types import (
+    DoubleType,
+    IntegerType,
+    StringType,
+    StructField,
+    StructType,
+)
+# Regression dataset constants
+# NOTE: element order matters. _generate_regression_data pairs these lists
+# positionally with its sampling weights ([30, 50, 20] and [15, 45, 30, 10]).
+LOCATIONS = ["urban", "suburban", "rural"]
+CONDITIONS = ["excellent", "good", "fair", "poor"]
+# Classification dataset constants
+# NOTE: element order matters here too. _generate_classification_data samples
+# them with positional weights ([55, 25, 20] and [25, 25, 30, 20]).
+CONTRACTS = ["month-to-month", "one_year", "two_year"]
+PAYMENT_METHODS = ["credit_card", "bank_transfer", "electronic_check", "mailed_check"]
+def _generate_regression_data(
+    n_rows: int = 10000, seed: int = 42
+) -> List[Tuple[int, int, int, int, str, str, float]]:
+    """Generate synthetic housing data with realistic distributions.
+    Rows are drawn from a private ``random.Random(seed)`` generator, so the
+    output is reproducible for a given seed and the process-wide ``random``
+    state is not reseeded as a side effect.
+    The price is correlated with features:
+    - More bedrooms/bathrooms → higher price
+    - More sqft → higher price
+    - Newer homes (lower age) → higher price
+    - Urban > suburban > rural
+    - Excellent > good > fair > poor condition
+    Args:
+        n_rows: Number of rows to generate.
+        seed: Seed for the private random generator.
+    Returns:
+        A list of (bedrooms, bathrooms, sqft, age, location, condition, price)
+        tuples. ``price`` is always a float, including when the floor applies.
+    """
+    # Same stream as random.seed(seed) + module-level calls, without touching
+    # the global generator that callers may be relying on.
+    rng = random.Random(seed)
+    location_multipliers = {"urban": 1.3, "suburban": 1.0, "rural": 0.7}
+    condition_multipliers = {"excellent": 1.2, "good": 1.0, "fair": 0.85, "poor": 0.7}
+    # Loop-invariant sampling tables, built once instead of once per row.
+    bedroom_options = [1, 2, 3, 4, 5]
+    bedroom_weights = [10, 25, 35, 20, 10]
+    age_options = list(range(0, 51))
+    age_weights = [max(1, 50 - i) for i in range(51)]  # lower ages get larger weights
+    data = []
+    for _ in range(n_rows):
+        # Generate correlated features (draw order is kept so output matches the seed)
+        bedrooms = rng.choices(bedroom_options, weights=bedroom_weights)[0]
+        bathrooms = max(1, bedrooms - rng.randint(0, 1))
+        sqft = int(500 + bedrooms * 400 + rng.gauss(0, 200))
+        sqft = max(400, sqft)  # minimum sqft
+        age = rng.choices(age_options, weights=age_weights)[0]
+        location = rng.choices(LOCATIONS, weights=[30, 50, 20])[0]
+        condition = rng.choices(CONDITIONS, weights=[15, 45, 30, 10])[0]
+        # Calculate price with realistic correlation
+        base_price = 50000 + sqft * 150 + bedrooms * 10000 + bathrooms * 8000
+        age_discount = age * 1000
+        location_factor = location_multipliers[location]
+        condition_factor = condition_multipliers[condition]
+        price = (base_price - age_discount) * location_factor * condition_factor
+        price = price + rng.gauss(0, price * 0.1)  # Add noise
+        # Minimum price. The floor is a float literal so the element stays a
+        # float (an int 50000 would not match the declared float/DoubleType).
+        price = max(50000.0, round(price, 2))
+        data.append((bedrooms, bathrooms, sqft, age, location, condition, price))
+    return data
+def _generate_classification_data(
+    n_rows: int = 10000, seed: int = 42
+) -> List[Tuple[int, float, float, str, str, int]]:
+    """Generate synthetic customer churn data with realistic distributions.
+    Rows are drawn from a private ``random.Random(seed)`` generator, so the
+    output is reproducible for a given seed and the process-wide ``random``
+    state is not reseeded as a side effect.
+    Churn probability is correlated with features:
+    - Lower tenure → higher churn
+    - Higher monthly charges → higher churn
+    - Month-to-month contract → higher churn
+    - Electronic check payment → higher churn
+    Args:
+        n_rows: Number of rows to generate.
+        seed: Seed for the private random generator.
+    Returns:
+        A list of (tenure, monthly_charges, total_charges, contract,
+        payment_method, churn) tuples, where churn is 0 or 1.
+    """
+    # Same stream as random.seed(seed) + module-level calls, without touching
+    # the global generator that callers may be relying on.
+    rng = random.Random(seed)
+    contract_churn_base = {"month-to-month": 0.4, "one_year": 0.15, "two_year": 0.05}
+    payment_churn_modifier = {
+        "credit_card": -0.05,
+        "bank_transfer": -0.05,
+        "electronic_check": 0.1,
+        "mailed_check": 0.0,
+    }
+    # Loop-invariant sampling tables, built once instead of once per row.
+    tenure_options = list(range(1, 73))
+    tenure_weights = [max(1, 72 - i) for i in range(72)]  # shorter tenures get larger weights
+    data = []
+    for _ in range(n_rows):
+        # Generate features (draw order is kept so output matches the seed)
+        tenure = rng.choices(tenure_options, weights=tenure_weights)[0]
+        monthly_charges = round(rng.uniform(20, 120), 2)
+        total_charges = round(tenure * monthly_charges * rng.uniform(0.9, 1.1), 2)
+        contract = rng.choices(CONTRACTS, weights=[55, 25, 20])[0]
+        payment_method = rng.choices(PAYMENT_METHODS, weights=[25, 25, 30, 20])[0]
+        # Calculate churn probability
+        base_churn = contract_churn_base[contract]
+        tenure_modifier = max(0, (24 - tenure) / 100)  # Higher churn for low tenure
+        charge_modifier = (monthly_charges - 70) / 500  # Higher charges → more churn
+        payment_modifier = payment_churn_modifier[payment_method]
+        churn_prob = base_churn + tenure_modifier + charge_modifier + payment_modifier
+        churn_prob = max(0.02, min(0.8, churn_prob))  # Clamp probability
+        churn = 1 if rng.random() < churn_prob else 0
+        data.append((tenure, monthly_charges, total_charges, contract, payment_method, churn))
+    return data
+def load_sample_regression(spark: SparkSession, n_rows: int = 10000, seed: int = 42) -> DataFrame:
+    """Build the sample housing-price regression dataset as a Spark DataFrame.
+    Args:
+        spark: SparkSession instance.
+        n_rows: Number of rows to generate. Default is 10,000.
+        seed: Random seed for reproducibility. Default is 42.
+    Returns:
+        PySpark DataFrame with columns:
+            - bedrooms (int): Number of bedrooms (1-5)
+            - bathrooms (int): Number of bathrooms (1-5)
+            - sqft (int): Square footage (400+)
+            - age (int): Age of home in years (0-50)
+            - location (str): 'urban', 'suburban', or 'rural'
+            - condition (str): 'excellent', 'good', 'fair', or 'poor'
+            - price (float): House price in dollars (label column)
+    """
+    # (name, type) pairs in tuple order; every column is declared non-nullable.
+    column_specs = [
+        ("bedrooms", IntegerType()),
+        ("bathrooms", IntegerType()),
+        ("sqft", IntegerType()),
+        ("age", IntegerType()),
+        ("location", StringType()),
+        ("condition", StringType()),
+        ("price", DoubleType()),
+    ]
+    schema = StructType([StructField(name, dtype, False) for name, dtype in column_specs])
+    rows = _generate_regression_data(n_rows=n_rows, seed=seed)
+    return spark.createDataFrame(rows, schema)
+def load_sample_classification(
+    spark: SparkSession, n_rows: int = 10000, seed: int = 42
+) -> DataFrame:
+    """Build the sample customer-churn classification dataset as a Spark DataFrame.
+    Args:
+        spark: SparkSession instance.
+        n_rows: Number of rows to generate. Default is 10,000.
+        seed: Random seed for reproducibility. Default is 42.
+    Returns:
+        PySpark DataFrame with columns:
+            - tenure (int): Months as customer (1-72)
+            - monthly_charges (float): Monthly bill amount (20-120)
+            - total_charges (float): Total amount charged
+            - contract (str): 'month-to-month', 'one_year', or 'two_year'
+            - payment_method (str): Payment method used
+            - churn (int): 1 if churned, 0 otherwise (label column)
+    """
+    # (name, type) pairs in tuple order; every column is declared non-nullable.
+    column_specs = [
+        ("tenure", IntegerType()),
+        ("monthly_charges", DoubleType()),
+        ("total_charges", DoubleType()),
+        ("contract", StringType()),
+        ("payment_method", StringType()),
+        ("churn", IntegerType()),
+    ]
+    schema = StructType([StructField(name, dtype, False) for name, dtype in column_specs])
+    rows = _generate_classification_data(n_rows=n_rows, seed=seed)
+    return spark.createDataFrame(rows, schema)
+def dataset_info(dataset_name: str) -> None:
+    """Print a human-readable description of one of the sample datasets.
+    Args:
+        dataset_name: Either 'regression' or 'classification'.
+    Raises:
+        ValueError: If dataset_name is not recognized.
+    """
+    # Guard clause: reject unknown names before building any text.
+    if dataset_name not in ("regression", "classification"):
+        raise ValueError(
+            f"Unknown dataset: '{dataset_name}'. Use 'regression' or 'classification'."
+        )
+    if dataset_name == "regression":
+        description = """
+Sample Regression Dataset: Housing Prices
+==========================================
+Columns:
+  - bedrooms (int): Number of bedrooms (1-5)
+  - bathrooms (int): Number of bathrooms (1-5)
+  - sqft (int): Square footage of the home (400+)
+  - age (int): Age of the home in years (0-50)
+  - location (str): Location type - 'urban', 'suburban', or 'rural'
+  - condition (str): Home condition - 'excellent', 'good', 'fair', or 'poor'
+  - price (float): House price in dollars (LABEL COLUMN)
+Numerical features: bedrooms, bathrooms, sqft, age
+Categorical features: location, condition
+Label: price
+Usage:
+  from smallaxe.datasets import load_sample_regression
+  df = load_sample_regression(spark)
+"""
+    else:
+        description = """
+Sample Classification Dataset: Customer Churn
+==============================================
+Columns:
+  - tenure (int): Number of months as a customer (1-72)
+  - monthly_charges (float): Monthly bill amount (20-120)
+  - total_charges (float): Total amount charged over tenure
+  - contract (str): Contract type - 'month-to-month', 'one_year', or 'two_year'
+  - payment_method (str): 'credit_card', 'bank_transfer', 'electronic_check', or 'mailed_check'
+  - churn (int): 1 if customer churned, 0 otherwise (LABEL COLUMN)
+Numerical features: tenure, monthly_charges, total_charges
+Categorical features: contract, payment_method
+Label: churn (binary: 0 or 1)
+Class distribution: ~30% churn (1), ~70% no churn (0)
+Usage:
+  from smallaxe.datasets import load_sample_classification
+  df = load_sample_classification(spark)
+"""
+    print(description)

smallaxe/exceptions/__init__.py ADDED Viewed

@@ -0,0 +1,120 @@
+"""Custom exception classes for smallaxe.
+Exception Hierarchy:
+    SmallaxeError (base)
+    ├── ValidationError
+    ├── PreprocessingError
+    ├── ModelNotFittedError
+    ├── ColumnNotFoundError
+    ├── DependencyError
+    └── ConfigurationError
+"""
+from typing import List, Optional
+# Public API of smallaxe.exceptions: the base error followed by its six subclasses.
+__all__ = [
+    "SmallaxeError",
+    "ValidationError",
+    "PreprocessingError",
+    "ModelNotFittedError",
+    "ColumnNotFoundError",
+    "DependencyError",
+    "ConfigurationError",
+]
+class SmallaxeError(Exception):
+    """Root of the smallaxe exception hierarchy.
+    The text passed in is kept on ``self.message`` and also forwarded to
+    ``Exception`` so that ``str(err)`` returns it.
+    """
+    def __init__(self, message: str = "An error occurred in smallaxe."):
+        super().__init__(message)
+        self.message = message
+class ValidationError(SmallaxeError):
+    """Signals that input parameters or data failed validation."""
+    def __init__(self, message: str = "Invalid input parameters or data.") -> None:
+        # Only the default text differs from the base class.
+        super().__init__(message)
+class PreprocessingError(SmallaxeError):
+    """Signals that a required preprocessing step is missing.
+    When both ``algorithm`` and ``missing_step`` are given, the message is
+    generated from them and any ``message`` argument is ignored.
+    """
+    def __init__(
+        self,
+        message: str = "Missing required preprocessing steps.",
+        algorithm: Optional[str] = None,
+        missing_step: Optional[str] = None,
+    ):
+        self.algorithm = algorithm
+        self.missing_step = missing_step
+        if algorithm and missing_step:
+            requirement = f"{algorithm} requires {missing_step} in pipeline."
+            remedy = f"Add {missing_step} before the model step."
+            message = f"{requirement} {remedy}"
+        super().__init__(message)
+class ModelNotFittedError(SmallaxeError):
+    """Signals that predict() was called on a model that has not been fitted."""
+    def __init__(
+        self, message: str = "Model has not been fitted. Call fit() before predict()."
+    ) -> None:
+        # Only the default text differs from the base class.
+        super().__init__(message)
+class ColumnNotFoundError(SmallaxeError):
+    """Signals that a required column is absent from the DataFrame.
+    When ``column`` is given, the message is generated from it (plus the
+    available columns, if any) and any ``message`` argument is ignored.
+    """
+    def __init__(
+        self,
+        message: str = "Required column not found in DataFrame.",
+        column: Optional[str] = None,
+        available_columns: Optional[List[str]] = None,
+    ):
+        self.column = column
+        self.available_columns = available_columns
+        if column:
+            parts = [f"Column '{column}' not found in DataFrame."]
+            if available_columns:
+                parts.append(f"Available columns: {available_columns}")
+            message = " ".join(parts)
+        super().__init__(message)
+class DependencyError(SmallaxeError):
+    """Signals that an optional dependency is not installed.
+    When ``package`` is given, the message is generated from it (plus the
+    install command, if any) and any ``message`` argument is ignored.
+    """
+    def __init__(
+        self,
+        message: str = "Missing optional dependency.",
+        package: Optional[str] = None,
+        install_command: Optional[str] = None,
+    ):
+        self.package = package
+        self.install_command = install_command
+        if package:
+            parts = [f"{package} is not installed."]
+            if install_command:
+                parts.append(f"Install with: {install_command}")
+            message = " ".join(parts)
+        super().__init__(message)
+class ConfigurationError(SmallaxeError):
+    """Raised when configuration settings are invalid.
+    When ``setting`` is given together with a ``value`` (anything other than
+    None), the message is generated from them and any ``message`` argument is
+    ignored. Falsy values such as ``""`` or ``0`` still produce the detailed
+    message, since they are exactly the kind of input a setter may reject.
+    Args:
+        message: Text used when no setting/value pair is supplied.
+        setting: Name of the setting that was rejected.
+        value: The rejected value.
+        allowed_values: Accepted values, appended to the message when non-empty.
+    """
+    def __init__(
+        self,
+        message: str = "Invalid configuration settings.",
+        setting: Optional[str] = None,
+        value: Optional[str] = None,
+        allowed_values: Optional[List[str]] = None,
+    ):
+        # Compare against None rather than truthiness: an empty string is a
+        # legitimate (invalid) value that deserves the specific message.
+        if setting and value is not None:
+            message = f"Invalid value '{value}' for setting '{setting}'."
+            if allowed_values:
+                message += f" Allowed values: {allowed_values}"
+        self.setting = setting
+        self.value = value
+        self.allowed_values = allowed_values
+        super().__init__(message)

smallaxe/metrics/__init__.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""Metrics module - regression and classification metrics."""
+from smallaxe.metrics.classification import (
+    accuracy,
+    auc_pr,
+    auc_roc,
+    f1_score,
+    log_loss,
+    precision,
+    recall,
+)
+from smallaxe.metrics.regression import (
+    mae,
+    mape,
+    mse,
+    r2,
+    rmse,
+)
+# Public API of smallaxe.metrics: every metric imported above is re-exported,
+# grouped by task type.
+__all__ = [
+    # Regression metrics
+    "mse",
+    "rmse",
+    "mae",
+    "r2",
+    "mape",
+    # Classification metrics
+    "accuracy",
+    "precision",
+    "recall",
+    "f1_score",
+    "auc_roc",
+    "auc_pr",
+    "log_loss",
+]