kaizenstat 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kaizenstat/__init__.py +3 -0
- kaizenstat/cli.py +53 -0
- kaizenstat/core.py +282 -0
- kaizenstat-0.1.0.dist-info/METADATA +115 -0
- kaizenstat-0.1.0.dist-info/RECORD +8 -0
- kaizenstat-0.1.0.dist-info/WHEEL +5 -0
- kaizenstat-0.1.0.dist-info/entry_points.txt +2 -0
- kaizenstat-0.1.0.dist-info/top_level.txt +1 -0
kaizenstat/__init__.py
ADDED
kaizenstat/cli.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from kaizenstat.core import KaizenStat
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
|
|
7
|
+
console = Console()
|
|
8
|
+
|
|
9
|
+
def main(argv=None):
    """Run the KaizenStat command-line interface.

    Three sub-commands are exposed: ``audit``, ``heal`` and ``benchmark``.
    Each loads the CSV given as ``csv_path`` into a DataFrame and hands it to
    the matching ``KaizenStat`` static method.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, so existing zero-argument callers
            keep working.

    Exits with status 1 when no sub-command is given, when the CSV cannot be
    loaded, or when the selected command fails with a ``ValueError`` /
    ``OSError`` (for example an unknown benchmark target or an unwritable
    output path). argparse itself exits with status 2 on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="KaizenStat CLI - Zero-friction AutoML & Data Cleaning Toolkit")
    subparsers = parser.add_subparsers(dest="command")

    # Audit command
    audit_parser = subparsers.add_parser("audit", help="Audit dataset for potential issues")
    audit_parser.add_argument("csv_path", help="Path to the CSV dataset file")
    audit_parser.add_argument("--target", help="Optional name of the target column", default=None)

    # Heal command
    heal_parser = subparsers.add_parser("heal", help="Clean/heal dataset automatically")
    heal_parser.add_argument("csv_path", help="Path to the CSV dataset file")
    heal_parser.add_argument("--target", help="Optional name of the target column", default=None)
    heal_parser.add_argument("--method", choices=["fill_median", "fill_mean", "drop_rows"], default="fill_median", help="Method used for healing missing values")
    heal_parser.add_argument("-o", "--output", help="Path to output the healed CSV. Defaults to '<filename>_healed.csv'", default=None)

    # Benchmark command
    benchmark_parser = subparsers.add_parser("benchmark", help="Benchmark AutoML models on a target column")
    benchmark_parser.add_argument("csv_path", help="Path to the CSV dataset file")
    benchmark_parser.add_argument("--target", required=True, help="Name of the target column")

    args = parser.parse_args(argv)

    if not args.command:
        parser.print_help()
        sys.exit(1)

    try:
        df = pd.read_csv(args.csv_path)
    except Exception as e:
        console.print(f"[bold red]Error loading dataset:[/] {e}")
        sys.exit(1)

    # Report expected failures the same way as load errors instead of letting
    # a traceback reach the terminal.
    try:
        if args.command == "audit":
            KaizenStat.audit(df, target_column=args.target)
        elif args.command == "heal":
            healed_df = KaizenStat.heal(df, target_column=args.target, method=args.method)
            output_path = args.output
            if not output_path:
                # Default name: swap a trailing ".csv" for "_healed.csv",
                # otherwise just append "_healed".
                if args.csv_path.endswith(".csv"):
                    output_path = args.csv_path[:-4] + "_healed.csv"
                else:
                    output_path = args.csv_path + "_healed"
            healed_df.to_csv(output_path, index=False)
            console.print(f"[bold green]Saved healed dataset to:[/] {output_path}")
        elif args.command == "benchmark":
            KaizenStat.benchmark(df, target_column=args.target)
    except (ValueError, OSError) as e:
        console.print(f"[bold red]Error:[/] {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
kaizenstat/core.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
KAIZENSTAT (v3.0): The Zero-Friction 'Daily Friend' Pipeline.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import time
|
|
7
|
+
from typing import Optional, Dict, List
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from rich.console import Console
|
|
13
|
+
from rich.panel import Panel
|
|
14
|
+
from rich.table import Table
|
|
15
|
+
|
|
16
|
+
from sklearn.compose import ColumnTransformer
|
|
17
|
+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
18
|
+
from sklearn.model_selection import train_test_split
|
|
19
|
+
from sklearn.linear_model import LogisticRegression, Ridge
|
|
20
|
+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
|
|
21
|
+
from sklearn.neural_network import MLPClassifier, MLPRegressor
|
|
22
|
+
from sklearn.pipeline import Pipeline
|
|
23
|
+
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_absolute_error
|
|
24
|
+
|
|
25
|
+
# Initialize Rich console
|
|
26
|
+
console = Console()
|
|
27
|
+
|
|
28
|
+
class KaizenStat:
    """
    KAIZENSTAT (v3.0): The Zero-Friction 'Daily Friend' Pipeline.

    A namespace of static methods that operate on a pandas DataFrame:

    * ``audit``     - print a diagnostic table (duplicates, NaNs, infs,
      constant columns, target integrity).
    * ``heal``      - return a cleaned copy of the frame.
    * ``benchmark`` - fit three baseline models and return a ranked
      leaderboard.

    All progress output goes through the module-level rich ``console``.
    """

    # Strategies accepted by heal(); checked before any work is done.
    _HEAL_METHODS = ("fill_median", "fill_mean", "drop_rows")

    # =========================================================================
    # MODULE 1: THE VALIDATOR (kz.audit)
    # =========================================================================
    @staticmethod
    def audit(df: pd.DataFrame, target_column: Optional[str] = None) -> None:
        """
        Print a diagnostic table for ``df``; the frame is not modified.

        Checks reported: matrix dimensions, duplicate rows, missing values,
        infinite values in numeric columns, columns with at most one distinct
        value, and (when ``target_column`` is given) whether the target exists
        and how many of its labels are missing.

        Args:
            df: Frame to inspect.
            target_column: Optional name of the label column.
        """
        console.print(Panel.fit("[bold cyan]🔍 KAIZENSTAT AUDIT: Diagnostic Sweep[/bold cyan]", border_style="cyan"))

        passed = "[green]Passed[/green]"

        table = Table(show_header=True, header_style="bold white", border_style="bright_black")
        table.add_column("Diagnostic Check", style="cyan")
        table.add_column("Status / Findings", justify="left")

        table.add_row("Matrix Dimensions", f"{df.shape[0]} Rows × {df.shape[1]} Columns")

        dupes = df.duplicated().sum()
        table.add_row("Duplicate Rows", f"[red]Failed: {dupes} duplicate rows found[/red]" if dupes > 0 else passed)

        total_missing = df.isna().sum().sum()
        if total_missing > 0:
            missing_cols = df.columns[df.isna().any()].tolist()
            table.add_row("Missing Values (NaNs)", f"[red]Failed: {total_missing} NaNs across {len(missing_cols)} columns[/red]")
        else:
            table.add_row("Missing Values (NaNs)", passed)

        # np.isinf is only meaningful for numeric columns.
        num_cols = df.select_dtypes(include=[np.number])
        infs = np.isinf(num_cols).sum().sum() if not num_cols.empty else 0
        table.add_row("Infinite Values (inf)", f"[red]Failed: {infs} infs found[/red]" if infs > 0 else passed)

        # nunique() ignores NaN, so an all-NaN column also counts as constant.
        dead_cols = [col for col in df.columns if df[col].nunique() <= 1]
        table.add_row("Constant Columns (No Variance)", f"[yellow]Warning: {len(dead_cols)} found {dead_cols}[/yellow]" if dead_cols else passed)

        if target_column:
            if target_column not in df.columns:
                table.add_row("Target Variable", f"[bold red]CRITICAL: '{target_column}' not found in dataset![/bold red]")
            else:
                target_nans = df[target_column].isna().sum()
                table.add_row("Target Variable Integrity", f"[red]Failed: {target_nans} missing labels[/red]" if target_nans > 0 else passed)

        console.print(table)
        console.print("[dim]Audit complete. Proceed to kz.heal() to automatically repair failed checks.[/dim]\n")

    # =========================================================================
    # MODULE 2: THE ADVANCED HEALER (kz.heal)
    # =========================================================================
    @staticmethod
    def heal(df: pd.DataFrame, target_column: Optional[str] = None, method: str = "fill_median") -> pd.DataFrame:
        """
        Return a repaired copy of ``df``; the input frame is left untouched.

        Steps, in order:

        1. Replace +/-inf in numeric columns with NaN.
        2. Drop rows whose target is missing (only when ``target_column`` is
           present in the frame). Because step 1 runs first, an infinite label
           is dropped here rather than imputed later.
        3. Drop duplicated rows.
        4. Drop columns that are more than 90% missing, and non-target columns
           with at most one distinct value.
        5. Handle the remaining NaNs according to ``method``.

        Args:
            df: Frame to clean.
            target_column: Optional name of the label column.
            method: ``"fill_median"`` or ``"fill_mean"`` impute numeric
                columns with that statistic and every other column with its
                mode (``"Unknown"`` when no mode exists); ``"drop_rows"``
                drops every row that still contains a NaN.

        Returns:
            The cleaned DataFrame.

        Raises:
            ValueError: If ``method`` is not one of the three strategies. The
                check happens up front, before any output or processing.
        """
        if method not in KaizenStat._HEAL_METHODS:
            raise ValueError(f"Unknown healing method: '{method}'. Choose 'fill_median', 'fill_mean', or 'drop_rows'.")

        console.print(f"[bold green]🩹 KAIZENSTAT HEALER: Initiating Auto-Repair (Strategy: {method.upper()})[/bold green]")
        clean_df = df.copy()

        # Treat infinities as missing before anything else so that every later
        # step (target drop, duplicate detection, imputation) sees them as NaN.
        num_cols = clean_df.select_dtypes(include=[np.number]).columns
        if len(num_cols) > 0:
            clean_df[num_cols] = clean_df[num_cols].replace([np.inf, -np.inf], np.nan)

        if target_column and target_column in clean_df.columns:
            target_missing = clean_df[target_column].isna().sum()
            if target_missing > 0:
                clean_df = clean_df.dropna(subset=[target_column])
                console.print(f" [green]✓[/green] Dropped {target_missing} rows where target '{target_column}' was missing.")

        dupes = clean_df.duplicated().sum()
        if dupes > 0:
            clean_df = clean_df.drop_duplicates()
            console.print(f" [green]✓[/green] Removed {dupes} duplicated rows.")

        # Iterate over a snapshot of the column names because columns are
        # dropped while looping.
        for col in list(clean_df.columns):
            if clean_df[col].isna().mean() > 0.90:
                clean_df = clean_df.drop(columns=[col])
                console.print(f" [green]✓[/green] Dropped '{col}' (Over 90% missing data).")
            elif clean_df[col].nunique() <= 1 and col != target_column:
                clean_df = clean_df.drop(columns=[col])
                console.print(f" [green]✓[/green] Dropped '{col}' (Zero variance / Constant feature).")

        if method == "drop_rows":
            before_drop = len(clean_df)
            clean_df = clean_df.dropna()
            rows_dropped = before_drop - len(clean_df)
            console.print(f" [bold yellow]⚠ Strict Mode Active:[/bold yellow] Dropped {rows_dropped} rows containing NaNs.")
        else:
            # method is "fill_mean" or "fill_median" (validated above).
            numeric_cols = clean_df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) > 0:
                if method == "fill_mean":
                    clean_df[numeric_cols] = clean_df[numeric_cols].fillna(clean_df[numeric_cols].mean())
                    console.print(" [green]✓[/green] Filled numeric NaNs using [bold]Mean (Average)[/bold].")
                else:
                    clean_df[numeric_cols] = clean_df[numeric_cols].fillna(clean_df[numeric_cols].median())
                    console.print(" [green]✓[/green] Filled numeric NaNs using [bold]Median[/bold].")

            cat_cols = clean_df.select_dtypes(exclude=[np.number]).columns
            for col in cat_cols:
                modes = clean_df[col].mode()
                mode_val = modes.iloc[0] if not modes.empty else "Unknown"
                clean_df[col] = clean_df[col].fillna(mode_val)
            if len(cat_cols) > 0:
                console.print(" [green]✓[/green] Filled categorical strings using [bold]Mode[/bold].")

        console.print(" [bold green]✓ Dataset is fully healed and ready for modeling.[/bold green]\n")
        return clean_df

    # =========================================================================
    # MODULE 3: THE FITTER (kz.benchmark)
    # =========================================================================
    @staticmethod
    def _is_classification_target(y: pd.Series) -> bool:
        """Decide whether ``y`` should be modelled as a classification target.

        Object, string, categorical and boolean targets are always treated as
        class labels. Any other target counts as classification only when it
        has fewer distinct values than 5% of its length.
        """
        dtype = y.dtype
        label_like = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or pd.api.types.is_bool_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if label_like:
            return True
        return bool(y.nunique() < (len(y) * 0.05))

    @staticmethod
    def _resolve_params(name: str, defaults: Dict, overrides: Dict[str, Dict]) -> Dict:
        """Return ``defaults`` updated with the user's overrides for model ``name``.

        User keys win; defaults the user did not mention (for example
        ``random_state``) are kept.
        """
        return {**defaults, **(overrides.get(name) or {})}

    @staticmethod
    def _rank_leaderboard(rows: List[Dict]) -> pd.DataFrame:
        """Build the leaderboard frame from per-model result rows.

        Each row carries a private boolean ``"_ok"`` flag. Successful models
        are ordered by descending "Primary Score"; failed models always come
        last, so a failure (score 0.0) cannot outrank a successful model with
        a negative score. The flag is removed from the returned frame.
        """
        if not rows:
            return pd.DataFrame(columns=["Model", "Primary Score", "Secondary Metric", "Time (s)", "Status"])
        board = pd.DataFrame(rows).sort_values(by=["_ok", "Primary Score"], ascending=[False, False])
        return board.drop(columns=["_ok"]).reset_index(drop=True)

    @staticmethod
    def benchmark(df: pd.DataFrame, target_column: str, overrides: Optional[Dict[str, Dict]] = None) -> pd.DataFrame:
        """
        Fit three baseline models on ``df`` and return a ranked leaderboard.

        The task type is detected from the target column. Numeric features are
        standard-scaled and all other features one-hot encoded. The data is
        split 80/20 with ``random_state=42``, and each model ("Linear",
        "RandomForest", "NeuralNet") is fitted inside its own try/except so a
        single failure does not abort the sweep.

        Args:
            df: Frame containing the features and the target column.
            target_column: Name of the label column.
            overrides: Optional mapping of model name to constructor keyword
                arguments. The given keys are merged on top of that model's
                built-in defaults.

        Returns:
            DataFrame with columns "Model", "Primary Score" (accuracy or R²),
            "Secondary Metric", "Time (s)" and "Status". Successful models
            come first, best score first; failed models come last.

        Raises:
            ValueError: If ``target_column`` is not a column of ``df``.
        """
        console.print(Panel.fit("[bold magenta]🚀 KAIZENSTAT AUTO-ML: Fitter & Tuner[/bold magenta]", border_style="magenta"))

        if target_column not in df.columns:
            raise ValueError(f"Target column '{target_column}' not found.")

        X = df.drop(columns=[target_column])
        y = df[target_column]

        is_classification = KaizenStat._is_classification_target(y)

        task_str = "CLASSIFICATION" if is_classification else "REGRESSION"
        console.print(f"[dim]Detected Objective: [bold gold1]{task_str}[/bold gold1][/dim]\n")

        num_features = X.select_dtypes(include=[np.number]).columns
        cat_features = X.select_dtypes(exclude=[np.number]).columns

        preprocessor = ColumnTransformer(transformers=[
            ('num', StandardScaler(), num_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features)
        ])

        overrides = overrides or {}
        forest_defaults = {"n_estimators": 50, "random_state": 42}
        net_defaults = {"max_iter": 500, "hidden_layer_sizes": (64, 32), "random_state": 42}
        forest_params = KaizenStat._resolve_params("RandomForest", forest_defaults, overrides)
        net_params = KaizenStat._resolve_params("NeuralNet", net_defaults, overrides)

        if is_classification:
            models = {
                "Linear": LogisticRegression(**KaizenStat._resolve_params("Linear", {"max_iter": 1000}, overrides)),
                "RandomForest": RandomForestClassifier(**forest_params),
                "NeuralNet": MLPClassifier(**net_params),
            }
        else:
            models = {
                "Linear": Ridge(**KaizenStat._resolve_params("Linear", {}, overrides)),
                "RandomForest": RandomForestRegressor(**forest_params),
                "NeuralNet": MLPRegressor(**net_params),
            }

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        leaderboard_data = []

        for name, model in models.items():
            pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

            try:
                # Only the fit is timed; prediction and scoring are excluded.
                start_time = time.time()
                pipeline.fit(X_train, y_train)
                latency = time.time() - start_time

                preds = pipeline.predict(X_test)

                if is_classification:
                    primary = accuracy_score(y_test, preds)
                    secondary = f"F1: {f1_score(y_test, preds, average='weighted', zero_division=0):.3f}"
                else:
                    primary = r2_score(y_test, preds)
                    secondary = f"MAE: {mean_absolute_error(y_test, preds):.2f}"

                leaderboard_data.append({
                    "Model": name, "Primary Score": primary, "Secondary Metric": secondary,
                    "Time (s)": latency, "Status": "[green]SUCCESS[/green]", "_ok": True
                })
            except Exception as e:
                # Deliberate best-effort: record the failure and keep going.
                leaderboard_data.append({
                    "Model": name, "Primary Score": 0.0, "Secondary Metric": "N/A",
                    "Time (s)": 0.0, "Status": f"[red]FAIL: {str(e)[:15]}[/red]", "_ok": False
                })

        df_results = KaizenStat._rank_leaderboard(leaderboard_data)

        table = Table(title="🏆 MODEL LEADERBOARD", border_style="bright_black")
        table.add_column("Rank", justify="center")
        table.add_column("Architecture", style="cyan")
        metric_name = "Accuracy" if is_classification else "R² Score"
        table.add_column(metric_name, justify="right", style="green")
        table.add_column("Secondary", justify="right")
        table.add_column("Latency", justify="right", style="yellow")
        table.add_column("Status", justify="center")

        for idx, row in df_results.iterrows():
            score_fmt = f"{row['Primary Score']*100:.2f}%" if is_classification else f"{row['Primary Score']:.4f}"
            table.add_row(f"#{idx + 1}", row["Model"], score_fmt, row["Secondary Metric"], f"{row['Time (s)']:.3f}s", row["Status"])

        console.print(table)
        return df_results
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
# =========================================================================
|
|
237
|
+
# WORKFLOW EXECUTION (The "Ample Data" Enterprise Test)
|
|
238
|
+
# =========================================================================
|
|
239
|
+
if __name__ == "__main__":
    # Self-contained demo: build a synthetic dataset, corrupt it on purpose,
    # then run audit -> heal -> benchmark on it.
    from sklearn.datasets import make_classification

    n_rows = 15000
    # Seeded generator so the categorical columns (and therefore the whole
    # demo) are reproducible, matching the random_state=42 used below.
    rng = np.random.default_rng(42)

    console.print(f"\n[bold yellow]Generating {n_rows:,} rows of synthetic Enterprise Data...[/bold yellow]\n")

    # 1. Generate the numeric feature matrix and binary labels
    X_raw, y_raw = make_classification(
        n_samples=n_rows,
        n_features=12,
        n_informative=8,
        random_state=42
    )

    # 2. Convert to DataFrame with realistic column names
    cols = [f"sensor_metric_{i}" for i in range(10)] + ['user_category', 'region_code']
    df_massive = pd.DataFrame(X_raw, columns=cols)

    # Overwrite the last two columns with categorical strings to exercise the
    # one-hot branch of the preprocessor
    df_massive['user_category'] = rng.choice(['Premium', 'Standard', 'Free'], size=n_rows)
    df_massive['region_code'] = rng.choice(['US', 'EU', 'ASIA'], size=n_rows)

    # 3. Add the target variable
    df_massive['target_conversion'] = y_raw

    # 4. Inject data corruption for heal() to repair
    #    (.loc slices are label-based and include both end points)
    df_massive.loc[100:500, 'sensor_metric_2'] = np.nan  # Missing data
    df_massive.loc[8000:8100, 'sensor_metric_5'] = np.inf  # Math errors
    df_massive.loc[12000:12050, 'target_conversion'] = np.nan  # Missing targets

    # --- RUN KAIZENSTAT ---
    # Audit reports the injected NaNs, infs and missing targets
    KaizenStat.audit(df_massive, target_column="target_conversion")

    # Healer returns a cleaned copy
    df_clean = KaizenStat.heal(df_massive, target_column="target_conversion", method="fill_median")

    # Per-model constructor overrides; random_state is set explicitly so the
    # leaderboard is repeatable
    my_tweaks = {
        "RandomForest": {"n_estimators": 150, "max_depth": 15, "n_jobs": -1, "random_state": 42},
        "NeuralNet": {"hidden_layer_sizes": (128, 64), "max_iter": 300, "random_state": 42}
    }

    # Benchmark prints the leaderboard and also returns it as a DataFrame
    leaderboard = KaizenStat.benchmark(df_clean, target_column="target_conversion", overrides=my_tweaks)
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kaizenstat
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Zero-friction AutoML + Data Cleaning Toolkit
|
|
5
|
+
Author: Masuddar Rahman
|
|
6
|
+
Requires-Python: >=3.8
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: pandas
|
|
9
|
+
Requires-Dist: numpy
|
|
10
|
+
Requires-Dist: scikit-learn
|
|
11
|
+
Requires-Dist: rich
|
|
12
|
+
Dynamic: author
|
|
13
|
+
Dynamic: description
|
|
14
|
+
Dynamic: description-content-type
|
|
15
|
+
Dynamic: requires-dist
|
|
16
|
+
Dynamic: requires-python
|
|
17
|
+
Dynamic: summary
|
|
18
|
+
|
|
19
|
+
# 🚀 KaizenStat
|
|
20
|
+
|
|
21
|
+
[PyPI](https://pypi.org/project/kaizenstat/)
|
|
22
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
23
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
24
|
+
|
|
25
|
+
**KaizenStat** is a zero-friction data validation, automatic cleaning, and AutoML benchmarking toolkit designed to fit right into your daily data science workflow. It helps you diagnose and repair dataset issues instantly and trains baseline models to give you immediate insights.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## ✨ Features
|
|
30
|
+
|
|
31
|
+
- 🔍 **`kz.audit()`**: Instantly sweep datasets for duplicates, NaNs, infs, constant columns, and target label integrity.
|
|
32
|
+
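- 🩹 **`kz.heal()`**: Automatically clean datasets by dropping rows with missing targets, removing duplicates, converting infinite values to missing, dropping dead/constant columns, and either imputing the remaining missing data (mean or median for numeric columns, mode for the rest) or dropping incomplete rows.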
|
|
33
|
+
- 🚀 **`kz.benchmark()`**: Auto-detects objectives (classification/regression), builds pre-processing pipelines, trains elite models (Linear/Ridge, RandomForest, Neural Networks), and ranks them on a beautiful leaderboard.
|
|
34
|
+
- 💻 **CLI Interface**: Command line utility (`kz`) to audit, heal, or benchmark CSV datasets directly from the terminal.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## 📦 Installation
|
|
39
|
+
|
|
40
|
+
Install KaizenStat from PyPI:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install kaizenstat
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or install it locally in editable mode for development:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install -e .
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## 🚀 Quickstart Usage
|
|
55
|
+
|
|
56
|
+
### Python API
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import pandas as pd
|
|
60
|
+
from kaizenstat import KaizenStat
|
|
61
|
+
|
|
62
|
+
# Load dataset
|
|
63
|
+
df = pd.read_csv("data.csv")
|
|
64
|
+
|
|
65
|
+
# 1. Audit dataset
|
|
66
|
+
KaizenStat.audit(df, target_column="target")
|
|
67
|
+
|
|
68
|
+
# 2. Automatically repair dataset issues
|
|
69
|
+
clean_df = KaizenStat.heal(df, target_column="target", method="fill_median")
|
|
70
|
+
|
|
71
|
+
# 3. Benchmark ML models
|
|
72
|
+
leaderboard = KaizenStat.benchmark(clean_df, target_column="target")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### 💻 Command Line Interface (CLI)
|
|
76
|
+
|
|
77
|
+
KaizenStat provides a powerful CLI tool named `kz` right out of the box:
|
|
78
|
+
|
|
79
|
+
#### Audit a dataset:
|
|
80
|
+
```bash
|
|
81
|
+
kz audit data.csv --target price
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
#### Heal a dataset:
|
|
85
|
+
```bash
|
|
86
|
+
kz heal data.csv --target price --method fill_median -o clean_data.csv
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
#### Benchmark a dataset:
|
|
90
|
+
```bash
|
|
91
|
+
kz benchmark clean_data.csv --target price
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## 🛠 Development and Packaging
|
|
97
|
+
|
|
98
|
+
Build the package using `build`:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install build twine
|
|
102
|
+
python -m build
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Upload to PyPI:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
twine upload dist/*
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## 📄 License
|
|
114
|
+
|
|
115
|
+
Distributed under the MIT License. See `LICENSE` for more information.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
kaizenstat/__init__.py,sha256=oc_o5t2h75KFQsRLI6QZ4qNVhSCcd77GmBMGg18WhBA,55
|
|
2
|
+
kaizenstat/cli.py,sha256=IdzJ8X1GiS1GF2hFLRVzV1nwX6kW1SJSqkxks-nU7Cc,2394
|
|
3
|
+
kaizenstat/core.py,sha256=higyV-MEDz9XMGurKYFASwbrfIC0IlpO9s1-pZODB28,13817
|
|
4
|
+
kaizenstat-0.1.0.dist-info/METADATA,sha256=fcUnNayNxHSsdKzyjfmQQnf7Cko2KtDVgciLnvHQgG4,2989
|
|
5
|
+
kaizenstat-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
kaizenstat-0.1.0.dist-info/entry_points.txt,sha256=OwnHUDfKzmwNf0QdWtjlRRJgmGpq1XqQguqLJ4JLOjE,43
|
|
7
|
+
kaizenstat-0.1.0.dist-info/top_level.txt,sha256=V12e6GeQEjP9isPHVwsQZEK2sXYtJzSRktjTNshejAo,11
|
|
8
|
+
kaizenstat-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
kaizenstat
|