smallaxe 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
smallaxe/__init__.py ADDED
@@ -0,0 +1,157 @@
1
+ """
2
+ smallaxe - A PySpark MLOps library for simplified model training and optimization.
3
+ """
4
+
5
+ from contextlib import contextmanager
6
+ from typing import Any, Generator, Optional
7
+
8
+ from smallaxe._config import (
9
+ VALID_CACHE_STRATEGIES,
10
+ VALID_VERBOSITY_LEVELS,
11
+ _config,
12
+ )
13
+ from smallaxe.exceptions import ConfigurationError
14
+
15
+ __version__ = "0.1.0"
16
+
17
+ __all__ = [
18
+ "__version__",
19
+ "set_verbosity",
20
+ "get_verbosity",
21
+ "verbosity",
22
+ "set_spark_session",
23
+ "get_spark_session",
24
+ "set_seed",
25
+ "get_seed",
26
+ "set_cache_strategy",
27
+ "get_cache_strategy",
28
+ ]
29
+
30
+
31
def set_verbosity(level: str) -> None:
    """Change the library-wide verbosity level.

    Args:
        level: Requested verbosity. One of 'quiet', 'normal', or 'verbose'.
            - 'quiet': Only errors, no progress bars or info messages
            - 'normal': Progress bars and key info (default)
            - 'verbose': Detailed logging for debugging

    Raises:
        ConfigurationError: If ``level`` is not one of the accepted values.
    """
    if level in VALID_VERBOSITY_LEVELS:
        _config._verbosity = level
        return

    # Anything else is rejected; the stored setting is left untouched.
    raise ConfigurationError(
        setting="verbosity",
        value=level,
        allowed_values=list(VALID_VERBOSITY_LEVELS),
    )
50
+
51
+
52
def get_verbosity() -> str:
    """Return the verbosity level stored in the global configuration.

    Returns:
        The current verbosity level ('quiet', 'normal', or 'verbose').
    """
    current_level = _config._verbosity
    return current_level
59
+
60
+
61
@contextmanager
def verbosity(level: str) -> Generator[None, None, None]:
    """Temporarily switch the verbosity level for the body of a ``with`` block.

    The level in effect on entry is restored on exit, including when the
    body raises.

    Args:
        level: Verbosity level to use inside the block.

    Raises:
        ConfigurationError: If ``level`` is not a valid verbosity level.

    Example:
        >>> with smallaxe.verbosity('quiet'):
        ...     model.fit(df, label_col='target')  # runs silently
    """
    saved_level = _config._verbosity
    # Validation runs before the try, so an invalid level raises without
    # ever entering the managed block.
    set_verbosity(level)
    try:
        yield
    finally:
        # Restore directly: the saved value was already the stored setting.
        _config._verbosity = saved_level
81
+
82
+
83
def set_spark_session(spark: Any) -> None:
    """Store the Spark session smallaxe should use.

    The value is stored as-is, without validation.

    Args:
        spark: A SparkSession instance, or None to clear the stored session.
            Per the package's own documentation, smallaxe then attempts to
            get or create a session when one is needed (that logic is not
            part of this module).
    """
    session = spark
    _config._spark_session = session
91
+
92
+
93
def get_spark_session() -> Optional[Any]:
    """Return the Spark session stored in the global configuration.

    Returns:
        The configured SparkSession, or None if none has been set.
    """
    session = _config._spark_session
    return session
100
+
101
+
102
def set_seed(seed: Optional[int]) -> None:
    """Set the global random seed for reproducibility.

    Intended to drive all random operations, including train/test splits,
    k-fold cross-validation, and hyperopt sampling.

    Args:
        seed: Integer seed value, or None to reset to no seed. Booleans are
            rejected even though ``bool`` is a subclass of ``int``.

    Raises:
        ConfigurationError: If seed is not an integer or None.
    """
    # bool passes isinstance(..., int); exclude it explicitly so that
    # set_seed(True) is reported instead of silently stored as a seed.
    is_plain_int = isinstance(seed, int) and not isinstance(seed, bool)
    if seed is not None and not is_plain_int:
        raise ConfigurationError(
            message=f"Seed must be an integer or None, got {type(seed).__name__}."
        )
    _config._seed = seed
119
+
120
+
121
def get_seed() -> Optional[int]:
    """Return the random seed stored in the global configuration.

    Returns:
        The current seed value, or None if no seed is set.
    """
    current_seed = _config._seed
    return current_seed
128
+
129
+
130
def set_cache_strategy(strategy: str) -> None:
    """Choose the caching strategy for PySpark operations.

    Args:
        strategy: Cache strategy. One of 'auto', 'always', or 'never'.
            - 'auto': Smart caching - cache after preprocessing, unpersist after training
            - 'always': Cache at every stage (use for debugging or small datasets)
            - 'never': No automatic caching (manual control)

    Raises:
        ConfigurationError: If ``strategy`` is not one of the accepted values.
    """
    if strategy in VALID_CACHE_STRATEGIES:
        _config._cache_strategy = strategy
        return

    # Anything else is rejected; the stored setting is left untouched.
    raise ConfigurationError(
        setting="cache_strategy",
        value=strategy,
        allowed_values=list(VALID_CACHE_STRATEGIES),
    )
149
+
150
+
151
def get_cache_strategy() -> str:
    """Return the cache strategy stored in the global configuration.

    Returns:
        The current cache strategy ('auto', 'always', or 'never').
    """
    current_strategy = _config._cache_strategy
    return current_strategy
smallaxe/_config.py ADDED
@@ -0,0 +1,37 @@
1
+ """Internal configuration state for smallaxe."""
2
+
3
+ from typing import Any, Optional
4
+
5
# Values accepted by the validated settings
VALID_VERBOSITY_LEVELS = ("quiet", "normal", "verbose")
VALID_CACHE_STRATEGIES = ("auto", "always", "never")

# Values each setting starts with and returns to on reset()
DEFAULT_VERBOSITY = "normal"
DEFAULT_CACHE_STRATEGY = "auto"
DEFAULT_SEED = None


class _Config:
    """Container for smallaxe's global configuration state.

    Holds the verbosity level, cache strategy, random seed and Spark session.
    It is not meant to be used directly - go through the module-level
    functions instead.
    """

    _verbosity: str
    _cache_strategy: str
    _seed: Optional[int]
    _spark_session: Optional[Any]

    def __init__(self) -> None:
        # A fresh instance is simply one that has been reset to the defaults.
        self.reset()

    def reset(self) -> None:
        """Put every setting back to its default value."""
        self._spark_session = None
        self._seed = DEFAULT_SEED
        self._cache_strategy = DEFAULT_CACHE_STRATEGY
        self._verbosity = DEFAULT_VERBOSITY


# Global configuration instance
_config = _Config()
@@ -0,0 +1 @@
1
+ """Auto module - automated training."""
@@ -0,0 +1,13 @@
1
+ """Sample datasets for testing and demos."""
2
+
3
+ from smallaxe.datasets._data import (
4
+ dataset_info,
5
+ load_sample_classification,
6
+ load_sample_regression,
7
+ )
8
+
9
+ __all__ = [
10
+ "load_sample_regression",
11
+ "load_sample_classification",
12
+ "dataset_info",
13
+ ]
@@ -0,0 +1,240 @@
1
+ """Raw data generators for sample datasets."""
2
+
3
+ import random
4
+ from typing import List, Tuple
5
+
6
+ from pyspark.sql import DataFrame, SparkSession
7
+ from pyspark.sql.types import (
8
+ DoubleType,
9
+ IntegerType,
10
+ StringType,
11
+ StructField,
12
+ StructType,
13
+ )
14
+
15
# Categorical values used by the regression (housing) dataset
LOCATIONS = ["urban", "suburban", "rural"]
CONDITIONS = ["excellent", "good", "fair", "poor"]

# Categorical values used by the classification (churn) dataset
CONTRACTS = ["month-to-month", "one_year", "two_year"]
PAYMENT_METHODS = [
    "credit_card",
    "bank_transfer",
    "electronic_check",
    "mailed_check",
]
22
+
23
+
24
def _generate_regression_data(
    n_rows: int = 10000, seed: int = 42
) -> List[Tuple[int, int, int, int, str, str, float]]:
    """Generate synthetic housing data with realistic distributions.

    The price is correlated with features:
    - More bedrooms/bathrooms → higher price
    - More sqft → higher price
    - Newer homes (lower age) → higher price
    - Urban > suburban > rural
    - Excellent > good > fair > poor condition

    Args:
        n_rows: Number of rows to generate.
        seed: Seed for the private random generator; the same seed always
            yields the same rows.

    Returns:
        A list of ``(bedrooms, bathrooms, sqft, age, location, condition,
        price)`` tuples. ``price`` is always a float.
    """
    # A private generator gives the same sequence as seeding the module-level
    # functions, but does not reset the process-wide ``random`` state that
    # the caller may be relying on.
    rng = random.Random(seed)

    location_multipliers = {"urban": 1.3, "suburban": 1.0, "rural": 0.7}
    condition_multipliers = {"excellent": 1.2, "good": 1.0, "fair": 0.85, "poor": 0.7}

    # Sampling tables are loop-invariant, so build them once.
    bedroom_options = [1, 2, 3, 4, 5]
    bedroom_weights = [10, 25, 35, 20, 10]
    age_options = list(range(0, 51))
    age_weights = [max(1, 50 - i) for i in range(51)]  # newer homes are more common

    data = []
    for _ in range(n_rows):
        # Generate correlated features (draw order fixes the output sequence).
        bedrooms = rng.choices(bedroom_options, weights=bedroom_weights)[0]
        bathrooms = max(1, bedrooms - rng.randint(0, 1))
        sqft = int(500 + bedrooms * 400 + rng.gauss(0, 200))
        sqft = max(400, sqft)  # minimum sqft
        age = rng.choices(age_options, weights=age_weights)[0]
        location = rng.choices(LOCATIONS, weights=[30, 50, 20])[0]
        condition = rng.choices(CONDITIONS, weights=[15, 45, 30, 10])[0]

        # Calculate price with realistic correlation
        base_price = 50000 + sqft * 150 + bedrooms * 10000 + bathrooms * 8000
        age_discount = age * 1000
        location_factor = location_multipliers[location]
        condition_factor = condition_multipliers[condition]

        price = (base_price - age_discount) * location_factor * condition_factor
        price = price + rng.gauss(0, price * 0.1)  # Add noise
        # Float floor: an int 50000 here would break the declared float
        # column type when the floor is hit.
        price = max(50000.0, round(price, 2))

        data.append((bedrooms, bathrooms, sqft, age, location, condition, price))

    return data
65
+
66
+
67
def _generate_classification_data(
    n_rows: int = 10000, seed: int = 42
) -> List[Tuple[int, float, float, str, str, int]]:
    """Generate synthetic customer churn data with realistic distributions.

    Churn probability is correlated with features:
    - Lower tenure → higher churn
    - Higher monthly charges → higher churn
    - Month-to-month contract → higher churn
    - Electronic check payment → higher churn

    Args:
        n_rows: Number of rows to generate.
        seed: Seed for the private random generator; the same seed always
            yields the same rows.

    Returns:
        A list of ``(tenure, monthly_charges, total_charges, contract,
        payment_method, churn)`` tuples, where ``churn`` is 0 or 1.
    """
    # A private generator gives the same sequence as seeding the module-level
    # functions, but does not reset the process-wide ``random`` state that
    # the caller may be relying on.
    rng = random.Random(seed)

    contract_churn_base = {"month-to-month": 0.4, "one_year": 0.15, "two_year": 0.05}
    payment_churn_modifier = {
        "credit_card": -0.05,
        "bank_transfer": -0.05,
        "electronic_check": 0.1,
        "mailed_check": 0.0,
    }

    # Sampling tables are loop-invariant, so build them once.
    tenure_options = list(range(1, 73))
    tenure_weights = [max(1, 72 - i) for i in range(72)]  # short tenures are more common

    data = []
    for _ in range(n_rows):
        # Generate features (draw order fixes the output sequence).
        tenure = rng.choices(tenure_options, weights=tenure_weights)[0]
        monthly_charges = round(rng.uniform(20, 120), 2)
        total_charges = round(tenure * monthly_charges * rng.uniform(0.9, 1.1), 2)
        contract = rng.choices(CONTRACTS, weights=[55, 25, 20])[0]
        payment_method = rng.choices(PAYMENT_METHODS, weights=[25, 25, 30, 20])[0]

        # Calculate churn probability
        base_churn = contract_churn_base[contract]
        tenure_modifier = max(0, (24 - tenure) / 100)  # Higher churn for low tenure
        charge_modifier = (monthly_charges - 70) / 500  # Higher charges → more churn
        payment_modifier = payment_churn_modifier[payment_method]

        churn_prob = base_churn + tenure_modifier + charge_modifier + payment_modifier
        churn_prob = max(0.02, min(0.8, churn_prob))  # Clamp probability

        churn = 1 if rng.random() < churn_prob else 0

        data.append((tenure, monthly_charges, total_charges, contract, payment_method, churn))

    return data
111
+
112
+
113
def load_sample_regression(spark: SparkSession, n_rows: int = 10000, seed: int = 42) -> DataFrame:
    """Build the sample regression dataset (housing prices) as a DataFrame.

    Args:
        spark: SparkSession instance used to create the DataFrame.
        n_rows: Number of rows to generate. Default is 10,000.
        seed: Random seed for reproducibility. Default is 42.

    Returns:
        PySpark DataFrame with columns:
        - bedrooms (int): Number of bedrooms (1-5)
        - bathrooms (int): Number of bathrooms (1-5)
        - sqft (int): Square footage (400+)
        - age (int): Age of home in years (0-50)
        - location (str): 'urban', 'suburban', or 'rural'
        - condition (str): 'excellent', 'good', 'fair', or 'poor'
        - price (float): House price in dollars (label column)
    """
    rows = _generate_regression_data(n_rows=n_rows, seed=seed)

    # (column name, Spark type) in the same order as the generated tuples;
    # every column is declared non-nullable.
    column_specs = [
        ("bedrooms", IntegerType),
        ("bathrooms", IntegerType),
        ("sqft", IntegerType),
        ("age", IntegerType),
        ("location", StringType),
        ("condition", StringType),
        ("price", DoubleType),
    ]
    schema = StructType(
        [StructField(column, spark_type(), False) for column, spark_type in column_specs]
    )
    return spark.createDataFrame(rows, schema)
145
+
146
+
147
def load_sample_classification(
    spark: SparkSession, n_rows: int = 10000, seed: int = 42
) -> DataFrame:
    """Build the sample classification dataset (customer churn) as a DataFrame.

    Args:
        spark: SparkSession instance used to create the DataFrame.
        n_rows: Number of rows to generate. Default is 10,000.
        seed: Random seed for reproducibility. Default is 42.

    Returns:
        PySpark DataFrame with columns:
        - tenure (int): Months as customer (1-72)
        - monthly_charges (float): Monthly bill amount (20-120)
        - total_charges (float): Total amount charged
        - contract (str): 'month-to-month', 'one_year', or 'two_year'
        - payment_method (str): Payment method used
        - churn (int): 1 if churned, 0 otherwise (label column)
    """
    rows = _generate_classification_data(n_rows=n_rows, seed=seed)

    # (column name, Spark type) in the same order as the generated tuples;
    # every column is declared non-nullable.
    column_specs = [
        ("tenure", IntegerType),
        ("monthly_charges", DoubleType),
        ("total_charges", DoubleType),
        ("contract", StringType),
        ("payment_method", StringType),
        ("churn", IntegerType),
    ]
    schema = StructType(
        [StructField(column, spark_type(), False) for column, spark_type in column_specs]
    )
    return spark.createDataFrame(rows, schema)
179
+
180
+
181
def dataset_info(dataset_name: str) -> None:
    """Print a description of one of the sample datasets.

    Args:
        dataset_name: Either 'regression' or 'classification'.

    Raises:
        ValueError: If dataset_name is not recognized.
    """
    # (name, description) pairs, checked in order with ``==``.
    catalog = (
        (
            "regression",
            """
Sample Regression Dataset: Housing Prices
==========================================

Columns:
- bedrooms (int): Number of bedrooms (1-5)
- bathrooms (int): Number of bathrooms (1-5)
- sqft (int): Square footage of the home (400+)
- age (int): Age of the home in years (0-50)
- location (str): Location type - 'urban', 'suburban', or 'rural'
- condition (str): Home condition - 'excellent', 'good', 'fair', or 'poor'
- price (float): House price in dollars (LABEL COLUMN)

Numerical features: bedrooms, bathrooms, sqft, age
Categorical features: location, condition
Label: price

Usage:
from smallaxe.datasets import load_sample_regression
df = load_sample_regression(spark)
""",
        ),
        (
            "classification",
            """
Sample Classification Dataset: Customer Churn
==============================================

Columns:
- tenure (int): Number of months as a customer (1-72)
- monthly_charges (float): Monthly bill amount (20-120)
- total_charges (float): Total amount charged over tenure
- contract (str): Contract type - 'month-to-month', 'one_year', or 'two_year'
- payment_method (str): 'credit_card', 'bank_transfer', 'electronic_check', or 'mailed_check'
- churn (int): 1 if customer churned, 0 otherwise (LABEL COLUMN)

Numerical features: tenure, monthly_charges, total_charges
Categorical features: contract, payment_method
Label: churn (binary: 0 or 1)

Class distribution: ~30% churn (1), ~70% no churn (0)

Usage:
from smallaxe.datasets import load_sample_classification
df = load_sample_classification(spark)
""",
        ),
    )

    for known_name, description in catalog:
        if dataset_name == known_name:
            print(description)
            return

    raise ValueError(
        f"Unknown dataset: '{dataset_name}'. Use 'regression' or 'classification'."
    )
@@ -0,0 +1,120 @@
1
+ """Custom exception classes for smallaxe.
2
+
3
+ Exception Hierarchy:
4
+ SmallaxeError (base)
5
+ ├── ValidationError
6
+ ├── PreprocessingError
7
+ ├── ModelNotFittedError
8
+ ├── ColumnNotFoundError
9
+ ├── DependencyError
10
+ └── ConfigurationError
11
+ """
12
+
13
+ from typing import List, Optional
14
+
15
+ __all__ = [
16
+ "SmallaxeError",
17
+ "ValidationError",
18
+ "PreprocessingError",
19
+ "ModelNotFittedError",
20
+ "ColumnNotFoundError",
21
+ "DependencyError",
22
+ "ConfigurationError",
23
+ ]
24
+
25
+
26
class SmallaxeError(Exception):
    """Root of the smallaxe exception hierarchy.

    Attributes:
        message: Human-readable description of the failure; also passed to
            ``Exception`` so it is what ``str(error)`` shows.
    """

    def __init__(self, message: str = "An error occurred in smallaxe."):
        super().__init__(message)
        self.message = message
32
+
33
+
34
class ValidationError(SmallaxeError):
    """Signals that input parameters or data are invalid."""

    def __init__(self, message: str = "Invalid input parameters or data."):
        # Only the default message differs from the base class.
        super().__init__(message=message)
39
+
40
+
41
class PreprocessingError(SmallaxeError):
    """Signals that a required preprocessing step is missing.

    Attributes:
        algorithm: Name of the algorithm that needs the step, if given.
        missing_step: Name of the missing step, if given.
    """

    def __init__(
        self,
        message: str = "Missing required preprocessing steps.",
        algorithm: Optional[str] = None,
        missing_step: Optional[str] = None,
    ):
        self.algorithm = algorithm
        self.missing_step = missing_step

        # A specific message replaces ``message`` only when both details
        # are supplied.
        has_details = bool(algorithm and missing_step)
        if has_details:
            requirement = f"{algorithm} requires {missing_step} in pipeline. "
            remedy = f"Add {missing_step} before the model step."
            message = requirement + remedy

        super().__init__(message)
58
+
59
+
60
class ModelNotFittedError(SmallaxeError):
    """Signals that predict() was called before fit()."""

    def __init__(self, message: str = "Model has not been fitted. Call fit() before predict()."):
        # Only the default message differs from the base class.
        super().__init__(message=message)
65
+
66
+
67
class ColumnNotFoundError(SmallaxeError):
    """Signals that a required column is missing from the DataFrame.

    Attributes:
        column: Name of the missing column, if given.
        available_columns: Columns that were present, if given.
    """

    def __init__(
        self,
        message: str = "Required column not found in DataFrame.",
        column: Optional[str] = None,
        available_columns: Optional[List[str]] = None,
    ):
        self.column = column
        self.available_columns = available_columns

        # A column name replaces ``message`` with a specific one; the list of
        # available columns is appended only when it is non-empty.
        if column:
            sentences = [f"Column '{column}' not found in DataFrame."]
            if available_columns:
                sentences.append(f"Available columns: {available_columns}")
            message = " ".join(sentences)

        super().__init__(message)
83
+
84
+
85
class DependencyError(SmallaxeError):
    """Signals that an optional dependency is not installed.

    Attributes:
        package: Name of the missing package, if given.
        install_command: Command that installs it, if given.
    """

    def __init__(
        self,
        message: str = "Missing optional dependency.",
        package: Optional[str] = None,
        install_command: Optional[str] = None,
    ):
        self.package = package
        self.install_command = install_command

        # A package name replaces ``message`` with a specific one; the install
        # hint is appended only when a command is supplied.
        if package:
            sentences = [f"{package} is not installed."]
            if install_command:
                sentences.append(f"Install with: {install_command}")
            message = " ".join(sentences)

        super().__init__(message)
101
+
102
+
103
class ConfigurationError(SmallaxeError):
    """Raised when configuration settings are invalid.

    Attributes:
        setting: Name of the offending setting, if given.
        value: The rejected value, if given.
        allowed_values: Values that would have been accepted, if given.
    """

    def __init__(
        self,
        message: str = "Invalid configuration settings.",
        setting: Optional[str] = None,
        value: Optional[object] = None,
        allowed_values: Optional[List[str]] = None,
    ):
        """Build the error, deriving a specific message when details are given.

        Args:
            message: Message used when ``setting``/``value`` are not supplied.
            setting: Name of the setting that was being changed.
            value: The rejected value. Any object is accepted, because
                callers pass through whatever the user supplied.
            allowed_values: Accepted values, appended to the message when
                non-empty.
        """
        # Compare ``value`` against None instead of testing truthiness: a
        # rejected value such as "" or 0 is falsy but must still be reported,
        # otherwise the caller only sees the generic default message.
        if setting and value is not None:
            message = f"Invalid value '{value}' for setting '{setting}'."
            if allowed_values:
                message += f" Allowed values: {allowed_values}"
        self.setting = setting
        self.value = value
        self.allowed_values = allowed_values
        super().__init__(message)
@@ -0,0 +1,35 @@
1
+ """Metrics module - regression and classification metrics."""
2
+
3
+ from smallaxe.metrics.classification import (
4
+ accuracy,
5
+ auc_pr,
6
+ auc_roc,
7
+ f1_score,
8
+ log_loss,
9
+ precision,
10
+ recall,
11
+ )
12
+ from smallaxe.metrics.regression import (
13
+ mae,
14
+ mape,
15
+ mse,
16
+ r2,
17
+ rmse,
18
+ )
19
+
20
+ __all__ = [
21
+ # Regression metrics
22
+ "mse",
23
+ "rmse",
24
+ "mae",
25
+ "r2",
26
+ "mape",
27
+ # Classification metrics
28
+ "accuracy",
29
+ "precision",
30
+ "recall",
31
+ "f1_score",
32
+ "auc_roc",
33
+ "auc_pr",
34
+ "log_loss",
35
+ ]