adamops-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adamops/__init__.py +40 -0
- adamops/cli.py +163 -0
- adamops/data/__init__.py +24 -0
- adamops/data/feature_engineering.py +284 -0
- adamops/data/loaders.py +922 -0
- adamops/data/preprocessors.py +227 -0
- adamops/data/splitters.py +218 -0
- adamops/data/validators.py +148 -0
- adamops/deployment/__init__.py +21 -0
- adamops/deployment/api.py +237 -0
- adamops/deployment/cloud.py +191 -0
- adamops/deployment/containerize.py +262 -0
- adamops/deployment/exporters.py +148 -0
- adamops/evaluation/__init__.py +24 -0
- adamops/evaluation/comparison.py +133 -0
- adamops/evaluation/explainability.py +143 -0
- adamops/evaluation/metrics.py +233 -0
- adamops/evaluation/reports.py +165 -0
- adamops/evaluation/visualization.py +238 -0
- adamops/models/__init__.py +21 -0
- adamops/models/automl.py +277 -0
- adamops/models/ensembles.py +228 -0
- adamops/models/modelops.py +308 -0
- adamops/models/registry.py +250 -0
- adamops/monitoring/__init__.py +21 -0
- adamops/monitoring/alerts.py +200 -0
- adamops/monitoring/dashboard.py +117 -0
- adamops/monitoring/drift.py +212 -0
- adamops/monitoring/performance.py +195 -0
- adamops/pipelines/__init__.py +15 -0
- adamops/pipelines/orchestrators.py +183 -0
- adamops/pipelines/workflows.py +212 -0
- adamops/utils/__init__.py +18 -0
- adamops/utils/config.py +457 -0
- adamops/utils/helpers.py +663 -0
- adamops/utils/logging.py +412 -0
- adamops-0.1.0.dist-info/METADATA +310 -0
- adamops-0.1.0.dist-info/RECORD +42 -0
- adamops-0.1.0.dist-info/WHEEL +5 -0
- adamops-0.1.0.dist-info/entry_points.txt +2 -0
- adamops-0.1.0.dist-info/licenses/LICENSE +21 -0
- adamops-0.1.0.dist-info/top_level.txt +1 -0
adamops/__init__.py
ADDED
@@ -0,0 +1,40 @@
```python
"""
AdamOps - A comprehensive MLOps library for end-to-end machine learning workflows.

AdamOps provides tools for:
- Data loading, validation, cleaning, and feature engineering
- Model training, registry, and ensemble methods
- AutoML with hyperparameter tuning
- Model evaluation and explainability
- Deployment to various platforms
- Monitoring and drift detection
- Pipeline orchestration

Author: AdamOps Team
Version: 0.1.0
"""

__version__ = "0.1.0"
__author__ = "AdamOps Team"

# Import core modules for easy access
from adamops.data import loaders, validators, preprocessors, splitters
from adamops.models import modelops, registry, ensembles, automl
from adamops.evaluation import metrics
from adamops.utils import config, logging as adamops_logging, helpers

__all__ = [
    "loaders",
    "validators",
    "preprocessors",
    "splitters",
    "modelops",
    "registry",
    "ensembles",
    "automl",
    "metrics",
    "config",
    "adamops_logging",
    "helpers",
    "__version__",
]
```
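The top-level `__init__.py` pulls the core submodules into the package namespace, so everything below is reachable from a single import. A minimal sketch of that surface, assuming the wheel is installed:

```python
# Minimal sketch of the import surface declared in adamops/__init__.py
# (assumes the adamops 0.1.0 wheel is installed).
import adamops

print(adamops.__version__)  # -> "0.1.0"

# Submodules re-exported at the top level:
from adamops import loaders, metrics, config
```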
adamops/cli.py
ADDED
@@ -0,0 +1,163 @@
```python
"""
AdamOps CLI Module

Command-line interface for AdamOps.
"""

import sys
from pathlib import Path
from typing import Optional

try:
    import click
    from rich.console import Console
    from rich.table import Table
    CLICK_AVAILABLE = True
except ImportError:
    CLICK_AVAILABLE = False

if CLICK_AVAILABLE:
    console = Console()

    @click.group()
    @click.version_option(version="0.1.0", prog_name="adamops")
    def main():
        """AdamOps - MLOps made simple."""
        pass

    @main.command()
    @click.option("--data", "-d", required=True, help="Path to data file")
    @click.option("--target", "-t", required=True, help="Target column name")
    @click.option("--algorithm", "-a", default="auto", help="Algorithm to use")
    @click.option("--task", default="auto", help="Task type: classification, regression, auto")
    @click.option("--output", "-o", default="model.joblib", help="Output model path")
    def train(data: str, target: str, algorithm: str, task: str, output: str):
        """Train a model."""
        console.print(f"[bold blue]Loading data from {data}...[/]")

        from adamops.data.loaders import load_auto
        from adamops.models.automl import quick_run
        from adamops.models.modelops import train as train_model
        from adamops.deployment.exporters import export_joblib

        df = load_auto(data)
        X = df.drop(columns=[target])
        y = df[target]

        console.print(f"[bold blue]Training {algorithm} model...[/]")

        if algorithm == "auto":
            model = quick_run(X, y, task)
        else:
            model = train_model(X, y, task, algorithm)

        export_joblib(model, output)
        console.print(f"[bold green]Model saved to {output}[/]")

    @main.command()
    @click.option("--model", "-m", required=True, help="Path to model file")
    @click.option("--data", "-d", required=True, help="Path to test data")
    @click.option("--target", "-t", required=True, help="Target column name")
    def evaluate(model: str, data: str, target: str):
        """Evaluate a model."""
        from adamops.deployment.exporters import load_model
        from adamops.data.loaders import load_auto
        from adamops.evaluation.metrics import evaluate as eval_metrics

        console.print("[bold blue]Loading model and data...[/]")

        model_obj = load_model(model)
        df = load_auto(data)
        X = df.drop(columns=[target])
        y = df[target]

        y_pred = model_obj.predict(X)
        metrics = eval_metrics(y, y_pred)

        table = Table(title="Evaluation Results")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        for name, value in metrics.items():
            if isinstance(value, float):
                table.add_row(name, f"{value:.4f}")
            else:
                table.add_row(name, str(value))

        console.print(table)

    @main.command()
    @click.option("--model", "-m", required=True, help="Path to model file")
    @click.option("--type", "deploy_type", default="api", help="Deployment type: api, docker")
    @click.option("--port", "-p", default=8000, help="API port")
    @click.option("--output", "-o", default="./deploy", help="Output directory for docker")
    def deploy(model: str, deploy_type: str, port: int, output: str):
        """Deploy a model."""
        from adamops.deployment.exporters import load_model

        model_obj = load_model(model)

        if deploy_type == "api":
            from adamops.deployment.api import run_api
            console.print(f"[bold blue]Starting API on port {port}...[/]")
            run_api(model_obj, port=port)

        elif deploy_type == "docker":
            from adamops.deployment.containerize import containerize
            console.print("[bold blue]Creating Docker deployment...[/]")
            result = containerize(model, output)
            console.print(f"[bold green]Files created in {output}[/]")
            for name, path in result.items():
                console.print(f"  - {name}: {path}")

    @main.command()
    @click.option("--data", "-d", required=True, help="Path to data file")
    def validate(data: str):
        """Validate a data file."""
        from adamops.data.loaders import load_auto
        from adamops.data.validators import validate as validate_data

        console.print(f"[bold blue]Validating {data}...[/]")

        df = load_auto(data)
        report = validate_data(df)

        console.print(report.summary())

    @main.command()
    @click.argument("workflow_name")
    def run_workflow(workflow_name: str):
        """Run a predefined workflow."""
        from adamops.pipelines.workflows import create_ml_pipeline

        console.print(f"[bold blue]Running workflow: {workflow_name}[/]")

        workflow = create_ml_pipeline(workflow_name)
        result = workflow.run()

        console.print("[bold green]Workflow completed![/]")
        console.print(workflow.get_status())

    @main.command()
    def info():
        """Show AdamOps information."""
        from adamops import __version__

        console.print("[bold blue]AdamOps - MLOps Made Simple[/]")
        console.print(f"Version: {__version__}")
        console.print()
        console.print("Available commands:")
        console.print("  train        - Train a model")
        console.print("  evaluate     - Evaluate a model")
        console.print("  deploy       - Deploy a model")
        console.print("  validate     - Validate data")
        console.print("  run-workflow - Run a workflow")

else:
    def main():
        print("CLI requires click and rich. Install with: pip install click rich")
        sys.exit(1)


if __name__ == "__main__":
    main()
```
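Note the import-time guard: the whole CLI is defined only when `click` and `rich` import cleanly, and a stub `main()` that prints install instructions takes over otherwise, so the console script never dies with a bare `ImportError`. A self-contained sketch of the same pattern (standalone, not AdamOps's actual code):

```python
# Self-contained sketch of the optional-dependency CLI pattern used above:
# probe for the extra at import time and degrade to a stub entry point.
import sys

try:
    import click  # optional extra
    CLICK_AVAILABLE = True
except ImportError:
    CLICK_AVAILABLE = False

if CLICK_AVAILABLE:
    @click.command()
    def main():
        """Real entry point, defined only when click is present."""
        click.echo("CLI ready")
else:
    def main():
        print("CLI requires click. Install with: pip install click")
        sys.exit(1)

if __name__ == "__main__":
    main()
```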
adamops/data/__init__.py
ADDED
@@ -0,0 +1,24 @@
```python
"""
AdamOps Data Module

Provides comprehensive data handling capabilities:
- loaders: Load data from various sources (CSV, Excel, JSON, SQL, API, compressed files)
- validators: Validate data types, missing values, duplicates, shapes, and statistics
- preprocessors: Clean data (handle missing values, outliers, duplicates, type conversion)
- feature_engineering: Encode, scale, and generate features
- splitters: Split data for training and evaluation
"""

from adamops.data import loaders
from adamops.data import validators
from adamops.data import preprocessors
from adamops.data import feature_engineering
from adamops.data import splitters

__all__ = [
    "loaders",
    "validators",
    "preprocessors",
    "feature_engineering",
    "splitters",
]
```
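Read together with `cli.py` above, the intended flow through this module is load, then validate. A hedged sketch using only names that appear elsewhere in this diff (`load_auto`, `validate`, `report.summary()`); the CSV path is a hypothetical placeholder:

```python
# Data-module flow implied by cli.py: load -> validate -> report.
# load_auto and validate(df).summary() appear in cli.py above; the
# input file name is a hypothetical placeholder.
from adamops.data.loaders import load_auto
from adamops.data.validators import validate

df = load_auto("train.csv")
report = validate(df)
print(report.summary())
```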
adamops/data/feature_engineering.py
ADDED
@@ -0,0 +1,284 @@
```python
"""
AdamOps Feature Engineering Module

Provides encoding, scaling, feature selection, and auto feature generation.
"""

from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import pandas as pd
from sklearn.preprocessing import (
    OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler,
    MinMaxScaler, RobustScaler, MaxAbsScaler, PolynomialFeatures
)
from sklearn.feature_selection import (
    VarianceThreshold, SelectKBest, mutual_info_classif, mutual_info_regression, RFE
)
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from adamops.utils.logging import get_logger

logger = get_logger(__name__)


# =============================================================================
# Encoding
# =============================================================================

def encode_onehot(
    df: pd.DataFrame, columns: List[str], drop_first: bool = False,
    handle_unknown: str = "ignore"
) -> pd.DataFrame:
    """One-hot encode categorical columns."""
    df = df.copy()
    for col in columns:
        dummies = pd.get_dummies(df[col], prefix=col, drop_first=drop_first)
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    logger.info(f"One-hot encoded {len(columns)} columns")
    return df


def encode_label(df: pd.DataFrame, columns: List[str]) -> Tuple[pd.DataFrame, Dict]:
    """Label encode categorical columns. Returns df and encoders dict."""
    df = df.copy()
    encoders = {}
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le
    logger.info(f"Label encoded {len(columns)} columns")
    return df, encoders


def encode_ordinal(
    df: pd.DataFrame, columns: List[str],
    categories: Optional[Dict[str, List]] = None
) -> pd.DataFrame:
    """Ordinal encode columns with optional category order."""
    df = df.copy()
    for col in columns:
        if categories and col in categories:
            cat_map = {v: i for i, v in enumerate(categories[col])}
            df[col] = df[col].map(cat_map)
        else:
            df[col] = pd.Categorical(df[col]).codes
    return df


def encode_target(
    df: pd.DataFrame, columns: List[str], target: str, smoothing: float = 1.0
) -> pd.DataFrame:
    """Target encode categorical columns."""
    df = df.copy()
    global_mean = df[target].mean()

    for col in columns:
        agg = df.groupby(col)[target].agg(['mean', 'count'])
        smooth = (agg['count'] * agg['mean'] + smoothing * global_mean) / (agg['count'] + smoothing)
        df[col + '_target'] = df[col].map(smooth)
        df = df.drop(col, axis=1)

    return df


def encode(
    df: pd.DataFrame, columns: List[str], method: str = "onehot", **kwargs
) -> pd.DataFrame:
    """Encode categorical columns with specified method."""
    if method == "onehot":
        return encode_onehot(df, columns, **kwargs)
    elif method == "label":
        return encode_label(df, columns, **kwargs)[0]
    elif method == "ordinal":
        return encode_ordinal(df, columns, **kwargs)
    elif method == "target" and "target" in kwargs:
        return encode_target(df, columns, kwargs["target"])
    else:
        raise ValueError(f"Unknown encoding method: {method}")
```
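The smoothing in `encode_target` shrinks each category mean toward the global mean, pulling hardest on rare categories. A quick pandas-only check of the formula on a toy frame (independent of this package):

```python
# Toy check of the target-encoding smoothing used in encode_target:
# smooth = (count * cat_mean + smoothing * global_mean) / (count + smoothing)
import pandas as pd

df = pd.DataFrame({"city": ["a", "a", "a", "b"], "y": [1, 1, 0, 1]})
global_mean = df["y"].mean()                      # 0.75
agg = df.groupby("city")["y"].agg(["mean", "count"])
smoothing = 1.0
smooth = (agg["count"] * agg["mean"] + smoothing * global_mean) / (agg["count"] + smoothing)
# "a" (3 rows, mean 0.667) barely moves: (3*0.667 + 0.75)/4 = 0.6875
# "b" (1 row,  mean 1.0)  is pulled hard: (1*1.0  + 0.75)/2 = 0.875
print(df["city"].map(smooth))
```

The scaling section of the same file follows.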
```python
# =============================================================================
# Scaling
# =============================================================================

def scale_standard(df: pd.DataFrame, columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Standardize features (zero mean, unit variance)."""
    df = df.copy()
    cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
    scaler = StandardScaler()
    df[cols] = scaler.fit_transform(df[cols])
    return df


def scale_minmax(df: pd.DataFrame, columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Scale features to [0, 1] range."""
    df = df.copy()
    cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
    scaler = MinMaxScaler()
    df[cols] = scaler.fit_transform(df[cols])
    return df


def scale_robust(df: pd.DataFrame, columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Scale with median and IQR (robust to outliers)."""
    df = df.copy()
    cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
    scaler = RobustScaler()
    df[cols] = scaler.fit_transform(df[cols])
    return df


def scale(
    df: pd.DataFrame, method: str = "standard", columns: Optional[List[str]] = None
) -> pd.DataFrame:
    """Scale numeric columns with specified method."""
    if method == "standard":
        return scale_standard(df, columns)
    elif method == "minmax":
        return scale_minmax(df, columns)
    elif method == "robust":
        return scale_robust(df, columns)
    elif method == "maxabs":
        df = df.copy()
        cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
        df[cols] = MaxAbsScaler().fit_transform(df[cols])
        return df
    else:
        raise ValueError(f"Unknown scaling method: {method}")
```
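The scalers differ mainly in how they respond to outliers: `scale_robust` centers on the median and divides by the IQR, so one extreme value barely moves the bulk of the data, while `scale_standard` lets it drag the mean and variance. A quick comparison (scikit-learn only, independent of this package):

```python
# Compare standard vs. robust scaling on data with one outlier.
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler

x = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])  # 100 is an outlier

print(StandardScaler().fit_transform(x).ravel())  # outlier drags the mean/std
print(RobustScaler().fit_transform(x).ravel())    # median/IQR: bulk stays near 0
```

The feature-selection section of the file follows.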
```python
# =============================================================================
# Feature Selection
# =============================================================================

def select_by_variance(
    df: pd.DataFrame, threshold: float = 0.0, columns: Optional[List[str]] = None
) -> pd.DataFrame:
    """Remove low variance features."""
    cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
    selector = VarianceThreshold(threshold=threshold)
    selected = selector.fit_transform(df[cols])
    selected_cols = [cols[i] for i in selector.get_support(indices=True)]
    df_result = df.drop(cols, axis=1)
    df_result[selected_cols] = selected
    logger.info(f"Selected {len(selected_cols)}/{len(cols)} features by variance")
    return df_result


def select_by_correlation(
    df: pd.DataFrame, threshold: float = 0.9, target: Optional[str] = None
) -> pd.DataFrame:
    """Remove highly correlated features."""
    df = df.copy()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target and target in num_cols:
        num_cols.remove(target)

    corr = df[num_cols].corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [c for c in upper.columns if any(upper[c] > threshold)]

    logger.info(f"Dropping {len(to_drop)} highly correlated features")
    return df.drop(to_drop, axis=1)


def select_by_importance(
    df: pd.DataFrame, target: str, n_features: int = 10, task: str = "classification"
) -> pd.DataFrame:
    """Select features by tree-based importance."""
    X = df.drop(target, axis=1).select_dtypes(include=[np.number])
    y = df[target]

    model = RandomForestClassifier(n_estimators=50, random_state=42) if task == "classification" \
        else RandomForestRegressor(n_estimators=50, random_state=42)
    model.fit(X, y)

    importance = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
    top_features = importance.head(n_features).index.tolist()

    logger.info(f"Selected top {n_features} features by importance")
    return df[[target] + top_features]


def select_features(
    df: pd.DataFrame, target: str, method: str = "importance", n_features: int = 10, **kwargs
) -> pd.DataFrame:
    """Select features using specified method."""
    if method == "variance":
        return select_by_variance(df, **kwargs)
    elif method == "correlation":
        return select_by_correlation(df, target=target, **kwargs)
    elif method == "importance":
        return select_by_importance(df, target, n_features, **kwargs)
    else:
        raise ValueError(f"Unknown selection method: {method}")
```
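`select_by_correlation` masks everything but the strict upper triangle of the absolute correlation matrix, so each pair is tested once and only the later column of a correlated pair is dropped. The trick in isolation:

```python
# The upper-triangle trick from select_by_correlation, in isolation:
# mask the lower triangle and diagonal so each pair is considered once.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8], "c": [4, 1, 3, 2]})
corr = df.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if (upper[c] > 0.9).any()]
print(to_drop)  # ['b'] -- perfectly correlated with 'a', so it is dropped
```

The feature-generation section closes out the file.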
```python
# =============================================================================
# Feature Generation
# =============================================================================

def generate_polynomial(
    df: pd.DataFrame, columns: List[str], degree: int = 2, include_bias: bool = False
) -> pd.DataFrame:
    """Generate polynomial features."""
    df = df.copy()
    poly = PolynomialFeatures(degree=degree, include_bias=include_bias)
    poly_features = poly.fit_transform(df[columns])
    poly_names = poly.get_feature_names_out(columns)
    df_poly = pd.DataFrame(poly_features, columns=poly_names, index=df.index)
    return pd.concat([df.drop(columns, axis=1), df_poly], axis=1)


def generate_interactions(
    df: pd.DataFrame, columns: List[str], operations: List[str] = ["multiply"]
) -> pd.DataFrame:
    """Generate interaction features between columns."""
    df = df.copy()
    for i, col1 in enumerate(columns):
        for col2 in columns[i+1:]:
            if "multiply" in operations:
                df[f"{col1}_x_{col2}"] = df[col1] * df[col2]
            if "add" in operations:
                df[f"{col1}_+_{col2}"] = df[col1] + df[col2]
            if "divide" in operations:
                df[f"{col1}_/_{col2}"] = df[col1] / (df[col2] + 1e-8)
    return df


def generate_datetime_features(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Extract datetime features from a column."""
    df = df.copy()
    dt = pd.to_datetime(df[column])
    prefix = column
    df[f"{prefix}_year"] = dt.dt.year
    df[f"{prefix}_month"] = dt.dt.month
    df[f"{prefix}_day"] = dt.dt.day
    df[f"{prefix}_dayofweek"] = dt.dt.dayofweek
    df[f"{prefix}_hour"] = dt.dt.hour
    df[f"{prefix}_is_weekend"] = dt.dt.dayofweek.isin([5, 6]).astype(int)
    return df


def auto_feature_engineering(
    df: pd.DataFrame, target: Optional[str] = None,
    polynomial: bool = False, interactions: bool = False, datetime_cols: Optional[List[str]] = None
) -> pd.DataFrame:
    """Automatic feature engineering pipeline."""
    logger.info("Running auto feature engineering")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target and target in num_cols:
        num_cols.remove(target)

    if datetime_cols:
        for col in datetime_cols:
            df = generate_datetime_features(df, col)

    if polynomial and len(num_cols) <= 5:
        df = generate_polynomial(df, num_cols[:5], degree=2)

    if interactions and len(num_cols) >= 2:
        df = generate_interactions(df, num_cols[:4])

    logger.info(f"Feature engineering complete. New shape: {df.shape}")
    return df
```
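To make the derived columns concrete, here is a pandas-only mirror of what `generate_datetime_features` adds for a single timestamp column (independent of this package; `ts` is a hypothetical column name):

```python
# Pandas-only mirror of generate_datetime_features on a toy frame,
# showing the columns the helper derives from one timestamp column.
import pandas as pd

df = pd.DataFrame({"ts": ["2024-01-06 09:30", "2024-01-08 17:00"]})
dt = pd.to_datetime(df["ts"])
df["ts_year"] = dt.dt.year
df["ts_month"] = dt.dt.month
df["ts_day"] = dt.dt.day
df["ts_dayofweek"] = dt.dt.dayofweek          # Monday = 0
df["ts_hour"] = dt.dt.hour
df["ts_is_weekend"] = dt.dt.dayofweek.isin([5, 6]).astype(int)
print(df)  # 2024-01-06 is a Saturday -> ts_is_weekend == 1
```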