simpute 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
simpute/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from simpute.core import Simpute
2
+ from simpute.utils import ColumnProfile
3
+
4
+ __version__ = "0.1.0"
5
+ __all__ = ["Simpute", "ColumnProfile", "__version__"]
simpute/core.py ADDED
@@ -0,0 +1,291 @@
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.base import BaseEstimator, TransformerMixin
9
+
10
+ from simpute.models import buildmodel, pickmodel
11
+ from simpute.utils import (
12
+ ColumnProfile,
13
+ decodecolumn,
14
+ expandfeatures,
15
+ featurecolumns,
16
+ isnumerical,
17
+ profilecolumn,
18
+ profiledataframe,
19
+ selectfeatures,
20
+ )
21
+
22
+
23
class Simpute(BaseEstimator, TransformerMixin):
    """Adaptive per-column imputer with automatic model selection.

    Every target column is profiled, a model name is chosen for it with
    ``pickmodel`` and the model is built with ``buildmodel``.  Missing cells
    are then predicted from the selected feature columns.  Targets for which
    no model can be trained are filled with a fallback value (median for
    numerical columns, most frequent value otherwise).

    Parameters
    ----------
    columns:
        Columns to impute.  When ``None``, every column that is not listed
        in ``exclude`` is a target.
    exclude:
        Columns that are never used as features; they are also skipped as
        targets when ``columns`` is ``None``.
    maskratio:
        Stored on the instance; it is not read anywhere in this class.
    randomstate:
        Seed of the row subsample drawn when a column has more training
        rows than ``_MAXTRAINROWS``.
    """

    # Upper bound on the number of rows a single column model is trained on.
    _MAXTRAINROWS = 15000
    # Number of predictor columns requested from ``selectfeatures``.
    _TOPFEATURES = 6
    # Maximum number of imputation sweeps performed by ``transform``.
    _MAXPASSES = 3
    # Numerical targets with at most this many distinct values are candidates
    # for rounding their predictions.
    _ROUNDCARDINALITY = 20
    # Model names whose training/prediction matrices are left unfilled.
    _NATIVEMISSING = frozenset({"LGBMRegressor", "LGBMClassifier", "CatBoostClassifier"})

    def __init__(
        self,
        columns: list[str] | None = None,
        exclude: list[str] | None = None,
        maskratio: float = 0.0,
        randomstate: int = 42,
    ) -> None:
        self.columns = columns
        # Stored exactly as passed.  ``sklearn.base.clone`` re-creates the
        # estimator from ``get_params()`` and raises when the constructor
        # swaps a parameter for another object, which ``exclude or []`` did
        # for ``None`` and for an empty list.
        self.exclude = exclude
        self.maskratio = maskratio
        self.randomstate = randomstate
        self.profiles_: dict[str, ColumnProfile] = {}
        self.models_: dict[str, Any] = {}
        self.featuremaps_: dict[str, list[str]] = {}
        self.dummycolumns_: dict[str, dict[str, list[str]]] = {}
        self.targetencodings_: dict[str, dict[object, int]] = {}
        self.featurefills_: dict[str, dict[str, float | str | bool]] = {}
        self.fallbacks_: dict[str, float | object] = {}
        self.booltargets_: set[str] = set()
        self.usenative_: dict[str, bool] = {}
        self.roundtargets_: set[str] = set()
        self.fittedcolumns_: list[str] = []
        self.isfitted_: bool = False

    def _targets(self, df: pd.DataFrame) -> list[str]:
        """Return the columns of ``df`` that should be imputed."""
        if self.columns is not None:
            return [column for column in self.columns if column in df.columns]
        excluded = self.exclude or []
        return [column for column in df.columns if column not in excluded]

    def _columnorder(self, df: pd.DataFrame, columns: list[str]) -> list[str]:
        """Sort ``columns``: numerical first, then fewest missing, then name."""

        def sortkey(column: str) -> tuple[int, int, str]:
            profile = self.profiles_.get(column)
            rank = 0 if profile is not None and profile.kind == "numerical" else 1
            return rank, int(df[column].isna().sum()), column

        return sorted(columns, key=sortkey)

    def _preparefeatures(self, df: pd.DataFrame, target: str, features: list[str]) -> pd.DataFrame:
        """Expand ``features`` into a float matrix.

        The dummy-column layout learned on the first call for ``target`` is
        stored and passed to ``expandfeatures`` on every later call.
        """
        dummymap = self.dummycolumns_.get(target)
        expanded, learned = expandfeatures(df[features], features, dummymap)
        if target not in self.dummycolumns_:
            self.dummycolumns_[target] = learned
        return expanded.astype(float)

    def _preparetarget(self, series: pd.Series, target: str) -> pd.Series:
        """Return ``series`` as floats, label-encoding non-numerical targets.

        For non-numerical targets the label-to-code mapping is stored in
        ``targetencodings_[target]``.
        """
        if isnumerical(series):
            return series.astype(float)
        labels = sorted(series.dropna().unique(), key=str)
        mapping = {label: index for index, label in enumerate(labels)}
        self.targetencodings_[target] = mapping
        return series.map(mapping).astype(float)

    def _fallbackvalue(self, series: pd.Series) -> float | object:
        """Return the median (numerical) or first mode of ``series``.

        ``NaN`` is returned when the series has no observed value.
        """
        observed = series.dropna()
        if observed.empty:
            return np.nan
        if isnumerical(series):
            return float(observed.median())
        modes = observed.mode()
        return modes.iloc[0] if not modes.empty else observed.iloc[0]

    def _nativemissing(self, modelname: str) -> bool:
        """Tell whether ``modelname`` is given unfilled feature matrices."""
        return modelname in self._NATIVEMISSING

    def _featurefills(self, df: pd.DataFrame, features: list[str]) -> dict[str, float | str | bool]:
        """Compute one fill value per feature from the rows of ``df``.

        Numerical features use the median, other features their first mode.
        A non-numerical feature without any observed value gets no entry
        (previously this raised ``IndexError``); it is simply left unfilled.
        """
        fills: dict[str, float | str | bool] = {}
        for feature in features:
            series = df[feature]
            observed = series.dropna()
            if isnumerical(series):
                # NaN for an all-missing column; the callers in this class
                # zero-fill the expanded matrix afterwards.
                fills[feature] = float(observed.median())
                continue
            modes = observed.mode()
            if modes.empty:
                continue
            fills[feature] = modes.iloc[0]
        return fills

    def _resetcolumn(self, target: str) -> None:
        """Forget everything learned for ``target``."""
        self.models_.pop(target, None)
        self.featuremaps_.pop(target, None)
        self.dummycolumns_.pop(target, None)
        self.targetencodings_.pop(target, None)
        self.featurefills_.pop(target, None)
        self.fallbacks_.pop(target, None)
        self.usenative_.pop(target, None)
        self.booltargets_.discard(target)
        self.roundtargets_.discard(target)

    def _beginfit(self, data: pd.DataFrame) -> list[str]:
        """Profile the targets of ``data`` and clear all learned state."""
        targets = self._targets(data)
        self.profiles_ = profiledataframe(data, targets)
        self.models_.clear()
        self.featuremaps_.clear()
        self.dummycolumns_.clear()
        self.targetencodings_.clear()
        self.featurefills_.clear()
        self.fallbacks_.clear()
        self.usenative_.clear()
        self.booltargets_.clear()
        self.roundtargets_.clear()
        return targets

    def _fitcolumn(self, data: pd.DataFrame, target: str) -> bool:
        """Train the model for ``target`` on the rows where it is observed.

        Returns ``True`` when a model was stored in ``models_`` and ``False``
        when the column cannot be modelled (no observed value, no feature,
        fewer than two usable rows, or a categorical target with a single
        observed label).  Unmodelled columns are later filled with the
        fallback value.
        """
        self._resetcolumn(target)
        profile = self.profiles_.get(target) or profilecolumn(target, data[target])
        self.profiles_[target] = profile
        if profile.missingnessflag == "high_missing" and data[target].isna().all():
            return False
        features = selectfeatures(data, target, self.exclude, topk=self._TOPFEATURES)
        if not features:
            return False
        observed = data[target].notna()
        if observed.sum() < 2:
            return False

        trainframe = data.loc[observed].copy()
        fills = self._featurefills(trainframe, features)
        filledframe = trainframe.copy()
        for feature, fill in fills.items():
            filledframe[feature] = filledframe[feature].fillna(fill)

        xfilled = self._preparefeatures(filledframe, target, features).fillna(0.0)
        ytrain = self._preparetarget(trainframe[target], target)
        # Positional boolean mask, so the three frames stay row-aligned.
        usable = (xfilled.notna().all(axis=1) & ytrain.notna()).to_numpy()
        xfilled = xfilled.loc[usable]
        ytrain = ytrain.loc[usable]
        trainframe = trainframe.loc[usable]

        # A classifier cannot be trained on a single class; let the fallback
        # (most frequent label) fill such a column instead of crashing.
        singleclass = profile.kind == "categorical" and ytrain.nunique() < 2
        if len(xfilled) < 2 or singleclass:
            self._resetcolumn(target)
            return False

        modelname = pickmodel(profile, xfilled.values, ytrain.values, data[target])
        usenative = self._nativemissing(modelname)
        if usenative:
            # Native-missing models are trained on the unfilled matrix.
            xtrain = self._preparefeatures(trainframe, target, features)
        else:
            # Same rows, same fills and same dummy layout as the selection
            # matrix, so it is reused rather than rebuilt.
            xtrain = xfilled

        if len(xtrain) > self._MAXTRAINROWS:
            rng = np.random.default_rng(self.randomstate)
            picked = rng.choice(len(xtrain), self._MAXTRAINROWS, replace=False)
            xtrain = xtrain.iloc[picked]
            ytrain = ytrain.iloc[picked]

        self.profiles_[target] = ColumnProfile(
            profile.name,
            profile.kind,
            profile.missingratio,
            profile.cardinality,
            profile.distributionshape,
            profile.missingnessflag,
            modelname,
        )
        model = buildmodel(modelname, self.profiles_[target], len(xtrain), xtrain.shape[1])
        model.fit(xtrain.values, ytrain.values)

        self.models_[target] = model
        self.featuremaps_[target] = features
        self.featurefills_[target] = fills
        self.fallbacks_[target] = self._fallbackvalue(data[target])
        self.usenative_[target] = usenative
        if pd.api.types.is_bool_dtype(data[target]):
            self.booltargets_.add(target)
        if profile.kind == "numerical" and profile.cardinality <= self._ROUNDCARDINALITY:
            # Round predictions only when every observed value is integral;
            # a low-cardinality column of fractional values must keep them.
            seen = data.loc[observed, target].astype(float)
            if np.allclose(seen, np.round(seen)):
                self.roundtargets_.add(target)
        return True

    def _predictvalues(self, model: Any, xpred: pd.DataFrame) -> np.ndarray:
        """Return the model predictions for ``xpred`` as a flat array."""
        preds = np.asarray(model.predict(xpred.values))
        return preds.ravel()

    def _imputecolumn(self, result: pd.DataFrame, target: str) -> None:
        """Fill the missing cells of ``result[target]`` in place."""
        missing = result[target].isna()
        if not missing.any():
            return
        if target not in self.models_:
            result.loc[missing, target] = self._fallbackvalue(result[target])
            return

        features = self.featuremaps_[target]
        block = result.loc[missing, features].copy()
        usenative = self.usenative_.get(target, False)
        if not usenative:
            for feature, fill in self.featurefills_[target].items():
                block[feature] = block[feature].fillna(fill)

        xpred = self._preparefeatures(block, target, features)
        if not usenative:
            xpred = xpred.fillna(0.0)

        preds = self._predictvalues(self.models_[target], xpred)
        profile = self.profiles_[target]
        if profile.kind == "categorical":
            decoded = decodecolumn(pd.Series(preds), self.targetencodings_[target])
            if target in self.booltargets_:
                result.loc[missing, target] = decoded.astype(bool).values
            else:
                result.loc[missing, target] = decoded.values
        else:
            if target in self.roundtargets_:
                preds = np.round(preds)
            result.loc[missing, target] = np.asarray(preds, dtype=float)

        # Predictions that could not be decoded leave NaN behind.
        stillmissing = result[target].isna()
        if stillmissing.any():
            result.loc[stillmissing, target] = self.fallbacks_[target]

    def fit(self, df: pd.DataFrame, y: Any = None) -> Simpute:
        """Profile the target columns of ``df`` and train one model each.

        ``y`` is ignored.  Returns ``self``.
        """
        del y
        data = df.copy()
        targets = self._beginfit(data)
        for target in self._columnorder(data, targets):
            self._fitcolumn(data, target)
        self.fittedcolumns_ = list(self.models_)
        self.isfitted_ = True
        return self

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the target columns imputed.

        Raises
        ------
        RuntimeError
            If neither ``fit`` nor ``fit_transform`` has been called.
        """
        if not self.models_ and not getattr(self, "isfitted_", False):
            raise RuntimeError("Simpute is not fitted. Call fit before transform.")
        result = df.copy()
        # Fitted columns that are absent from ``df`` are skipped.
        present = [column for column in self.fittedcolumns_ if column in result.columns]
        order = self._columnorder(result, present)
        for _ in range(self._MAXPASSES):
            before = result.isna().sum().sum()
            for target in order:
                self._imputecolumn(result, target)
            if result.isna().sum().sum() == before:
                break
        # Targets without a trained model get the fallback of the data at hand.
        for column in self._targets(result):
            if column not in self.models_ and result[column].isna().any():
                result[column] = result[column].fillna(self._fallbackvalue(df[column]))
        return result

    def fit_transform(self, df: pd.DataFrame, y: Any = None) -> pd.DataFrame:
        """Fit and impute ``df`` in a single sequential pass.

        Unlike ``fit`` followed by ``transform``, only columns that contain
        missing values are fitted, and each column is imputed right after
        its model is trained, so columns handled later are trained on the
        already filled values.  ``y`` is ignored.
        """
        del y
        data = df.copy()
        targets = self._beginfit(data)

        result = data.copy()
        for target in self._columnorder(result, targets):
            if result[target].isna().sum() == 0:
                continue
            self._fitcolumn(result, target)
            self._imputecolumn(result, target)

        self.fittedcolumns_ = list(self.models_)
        self.isfitted_ = True
        return result

    def getprofiles(self) -> dict[str, ColumnProfile]:
        """Return a deep copy of the column profiles."""
        return copy.deepcopy(self.profiles_)

    def getmodelselection(self) -> dict[str, str]:
        """Return the chosen model name for every column that has one."""
        return {
            target: profile.modelname
            for target, profile in self.profiles_.items()
            if profile.modelname is not None
        }
simpute/models.py ADDED
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+ from catboost import CatBoostClassifier
7
+ from lightgbm import LGBMClassifier as LGBMC, LGBMRegressor as LGBMR
8
+ from sklearn.ensemble import ExtraTreesRegressor
9
+ from sklearn.linear_model import BayesianRidge, LogisticRegression
10
+ from sklearn.neighbors import KNeighborsRegressor
11
+ from sklearn.pipeline import Pipeline
12
+ from sklearn.preprocessing import StandardScaler
13
+ from sklearn.svm import LinearSVC
14
+
15
+ from simpute.utils import ColumnProfile, isdiscrete, ishighcardinality
16
+
17
+
18
def _lgbmparams(nrows: int) -> dict[str, Any]:
    """Build the LightGBM keyword arguments for a training set of ``nrows`` rows.

    The number of leaves is the integer square root of ``nrows`` limited to
    the range 15..63, and the number of estimators is ``nrows // 150``
    limited to the range 50..150.
    """
    rootrows = int(np.sqrt(nrows))
    leaves = max(15, rootrows)
    leaves = min(63, leaves)

    estimators = max(50, nrows // 150)
    estimators = min(150, estimators)

    params: dict[str, Any] = {}
    params["n_estimators"] = estimators
    params["num_leaves"] = leaves
    params["learning_rate"] = 0.05
    params["random_state"] = 42
    params["verbosity"] = -1
    params["n_jobs"] = -1
    return params
29
+
30
+
31
def candidates(profile: ColumnProfile, nrows: int, series: Any = None) -> list[str]:
    """List the model names eligible for a column, first choice first.

    Categorical columns get boosted classifiers when ``ishighcardinality``
    accepts their cardinality, logistic regression alone when they have at
    most two distinct values, and logistic regression plus a linear SVC
    otherwise.  Numerical columns get tree ensembles when there are at least
    1000 rows, when the column is discrete (only checked if ``series`` is
    given) or when its distribution shape is ``"skewed"``; otherwise KNN and
    Bayesian ridge.
    """
    if profile.kind == "categorical":
        if ishighcardinality(profile.cardinality):
            return ["CatBoostClassifier", "LGBMClassifier"]
        linear = ["LogisticRegression"]
        if profile.cardinality > 2:
            linear.append("LinearSVC")
        return linear

    discrete = series is not None and isdiscrete(series, profile.cardinality)
    skewed = profile.distributionshape == "skewed"
    if nrows >= 1000 or discrete or skewed:
        return ["LGBMRegressor", "ExtraTreesRegressor"]
    return ["KNNRegressor", "BayesianRidge"]
44
+
45
+
46
def selectmodel(profile: ColumnProfile, nrows: int, series: Any = None) -> str:
    """Return the first model name proposed by ``candidates``."""
    ranked = candidates(profile, nrows, series)
    return ranked[0]
48
+
49
+
50
def buildmodel(modelname: str, profile: ColumnProfile, nrows: int, nfeatures: int) -> Any:
    """Instantiate the unfitted estimator registered under ``modelname``.

    Parameters
    ----------
    modelname:
        One of ``"LGBMClassifier"``, ``"CatBoostClassifier"``,
        ``"LogisticRegression"``, ``"LinearSVC"``, ``"LGBMRegressor"``,
        ``"ExtraTreesRegressor"``, ``"KNNRegressor"`` or ``"BayesianRidge"``.
    profile, nfeatures:
        Accepted as part of the signature; not used by any branch.
    nrows:
        Number of training rows; it scales the size-dependent
        hyper-parameters.

    Raises
    ------
    ValueError
        If ``modelname`` is not one of the supported names.
    """
    if modelname == "LGBMClassifier":
        return LGBMC(**_lgbmparams(nrows))
    if modelname == "CatBoostClassifier":
        return CatBoostClassifier(
            iterations=min(300, max(100, nrows // 50)),
            depth=min(8, max(4, int(np.log2(nrows + 1)))),
            learning_rate=0.05,
            random_seed=42,
            verbose=False,
            thread_count=-1,
        )
    if modelname == "LogisticRegression":
        return Pipeline([
            ("scaler", StandardScaler()),
            ("model", LogisticRegression(
                max_iter=2000,
                C=1.0,
                class_weight="balanced",
                random_state=42,
            )),
        ])
    if modelname == "LinearSVC":
        return Pipeline([
            ("scaler", StandardScaler()),
            ("model", LinearSVC(max_iter=3000, class_weight="balanced", random_state=42)),
        ])
    if modelname == "LGBMRegressor":
        return LGBMR(**_lgbmparams(nrows))
    if modelname == "ExtraTreesRegressor":
        return ExtraTreesRegressor(
            n_estimators=min(300, max(100, nrows // 50)),
            max_features="sqrt",
            random_state=42,
            n_jobs=-1,
        )
    if modelname == "KNNRegressor":
        neighbors = min(50, max(5, int(np.sqrt(nrows))))
        # The floor of 5 can exceed the number of training rows; asking a
        # KNN model for more neighbours than it has samples fails at predict
        # time, so cap the value at ``nrows`` (and keep it at least 1).
        neighbors = max(1, min(neighbors, nrows))
        return Pipeline([
            ("scaler", StandardScaler()),
            ("model", KNeighborsRegressor(n_neighbors=neighbors, weights="distance", n_jobs=-1)),
        ])
    if modelname == "BayesianRidge":
        return Pipeline([
            ("scaler", StandardScaler()),
            ("model", BayesianRidge()),
        ])
    raise ValueError(f"Unsupported model: {modelname}")
98
+
99
+
100
def pickmodel(
    profile: ColumnProfile,
    xtrain: np.ndarray,
    ytrain: np.ndarray,
    series: Any = None,
) -> str:
    """Choose one model name among ``candidates`` for the given column.

    With at least 1000 training targets and more than one candidate,
    ``"LGBMRegressor"`` (numerical) or ``"CatBoostClassifier"``
    (categorical) is returned when it is among the candidates.  In every
    other case the first candidate wins.  ``xtrain`` is not inspected.
    """
    nrows = len(ytrain)
    options = candidates(profile, nrows, series)
    if len(options) != 1 and nrows >= 1000:
        preferred = {
            "numerical": "LGBMRegressor",
            "categorical": "CatBoostClassifier",
        }.get(profile.kind)
        if preferred in options:
            return preferred
    return options[0]
simpute/utils.py ADDED
@@ -0,0 +1,187 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from dataclasses import dataclass
5
+ from typing import Literal
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ from scipy import stats
10
+
11
+ ColumnKind = Literal["numerical", "categorical"]
12
+ DistributionShape = Literal["skewed", "normal_uniform"]
13
+ MissingnessFlag = Literal["ok", "high_missing"]
14
+
15
+ HIGH_MISSING_THRESHOLD = 0.70
16
+ HIGH_CARDINALITY_THRESHOLD = 10
17
+ SKEW_THRESHOLD = 1.0
18
+
19
+
20
@dataclass(frozen=True)
class ColumnProfile:
    """Immutable summary of one column, as built by ``profilecolumn``."""

    # Column label.
    name: str
    # "numerical" or "categorical".
    kind: ColumnKind
    # Fraction of missing cells.
    missingratio: float
    # Number of distinct observed values.
    cardinality: int
    # "skewed" / "normal_uniform"; ``None`` for categorical columns.
    distributionshape: DistributionShape | None
    # "high_missing" when the missing ratio is above the threshold, else "ok".
    missingnessflag: MissingnessFlag
    # Name of the model chosen for the column, if any.
    modelname: str | None = None
29
+
30
+
31
def isnumerical(series: pd.Series) -> bool:
    """Tell whether ``series`` has a numeric dtype other than boolean."""
    kind = series.dtype
    if pd.api.types.is_bool_dtype(kind):
        return False
    return pd.api.types.is_numeric_dtype(kind)
34
+
35
+
36
def iscategorical(series: pd.Series) -> bool:
    """Tell whether ``series`` has a boolean, object or categorical dtype.

    The categorical check uses ``isinstance(dtype, pd.CategoricalDtype)``,
    the replacement pandas documents for the deprecated
    ``pd.api.types.is_categorical_dtype``.
    """
    if pd.api.types.is_bool_dtype(series) or pd.api.types.is_object_dtype(series):
        return True
    # Accept a dtype as well as a Series, like the helper it replaces.
    dtype = getattr(series, "dtype", series)
    return isinstance(dtype, pd.CategoricalDtype)
38
+
39
+
40
def missingratio(series: pd.Series) -> float:
    """Return the fraction of missing cells in ``series``."""
    ismissing = series.isna()
    return float(ismissing.mean())
42
+
43
+
44
def cardinality(series: pd.Series) -> int:
    """Count the distinct observed (non-missing) values of ``series``."""
    observed = series.dropna()
    return int(observed.nunique())
46
+
47
+
48
def distributionshape(series: pd.Series) -> DistributionShape:
    """Classify the observed values of ``series`` by sample skewness.

    Returns ``"skewed"`` when there are at least 8 observed values and the
    absolute skewness reaches ``SKEW_THRESHOLD``; ``"normal_uniform"``
    otherwise.
    """
    sample = series.dropna().astype(float)
    if len(sample) >= 8 and abs(float(stats.skew(sample))) >= SKEW_THRESHOLD:
        return "skewed"
    return "normal_uniform"
56
+
57
+
58
def ishighcardinality(card: int) -> bool:
    """Tell whether ``card`` is strictly above ``HIGH_CARDINALITY_THRESHOLD``."""
    limit = HIGH_CARDINALITY_THRESHOLD
    return card > limit
60
+
61
+
62
def isdiscrete(series: pd.Series, card: int) -> bool:
    """Tell whether a numerical column only holds a few whole numbers.

    The column must be numerical and ``card`` must not exceed 20.  Integer
    dtypes qualify directly; other dtypes qualify when every observed value
    is numerically close to its rounded value.
    """
    if not isnumerical(series) or card > 20:
        return False
    if pd.api.types.is_integer_dtype(series):
        return True
    observed = series.dropna().astype(float)
    wholenumbers = np.allclose(observed, np.round(observed))
    return bool(wholenumbers)
71
+
72
+
73
def profilecolumn(name: str, series: pd.Series) -> ColumnProfile:
    """Build the ``ColumnProfile`` of ``series``.

    A warning is emitted when the missing ratio is above
    ``HIGH_MISSING_THRESHOLD``; the profile is then flagged
    ``"high_missing"``.  The distribution shape is only computed for
    numerical columns.
    """
    ratio = missingratio(series)
    flag: MissingnessFlag = "ok"
    if ratio > HIGH_MISSING_THRESHOLD:
        flag = "high_missing"
        warnings.warn(
            f"Column '{name}' has {ratio:.1%} missing values (> {HIGH_MISSING_THRESHOLD:.0%}). "
            "Imputation reliability may be limited.",
            stacklevel=2,
        )

    numeric = isnumerical(series)
    card = cardinality(series)
    if numeric:
        kind, shape = "numerical", distributionshape(series)
    else:
        kind, shape = "categorical", None
    return ColumnProfile(name, kind, ratio, card, shape, flag)
94
+
95
+
96
def profiledataframe(df: pd.DataFrame, columns: list[str] | None = None) -> dict[str, ColumnProfile]:
    """Profile the requested columns of ``df``, keyed by column name.

    All columns are profiled when ``columns`` is ``None``; requested names
    that are not in ``df`` are skipped.
    """
    wanted = list(df.columns) if columns is None else columns
    profiles: dict[str, ColumnProfile] = {}
    for column in wanted:
        if column in df.columns:
            profiles[column] = profilecolumn(column, df[column])
    return profiles
99
+
100
+
101
def featurecolumns(df: pd.DataFrame, target: str, exclude: list[str] | None = None) -> list[str]:
    """Return the columns of ``df`` other than ``target`` and ``exclude``.

    The order of ``df.columns`` is preserved.
    """
    blocked = set(exclude or [])
    blocked.add(target)
    kept = []
    for column in df.columns:
        if column not in blocked:
            kept.append(column)
    return kept
104
+
105
+
106
def selectfeatures(
    df: pd.DataFrame,
    target: str,
    exclude: list[str] | None = None,
    topk: int = 6,
) -> list[str]:
    """Pick up to ``topk`` predictor columns for ``target``.

    All candidate columns are returned unranked when there are at most
    ``topk`` of them, or when fewer than 20 rows have the target and every
    candidate observed.  Otherwise candidates are ranked by mutual
    information with the target, computed on at most 8000 complete rows.
    Only candidates with a positive score are kept, unless none has one, in
    which case the ``topk`` best ranked are returned.
    """
    from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

    pool = featurecolumns(df, target, exclude)
    if len(pool) <= topk:
        return pool

    complete = df[target].notna() & df[pool].notna().all(axis=1)
    if complete.sum() < 20:
        return pool

    xframe = df.loc[complete, pool].copy()
    yseries = df.loc[complete, target]
    if len(xframe) > 8000:
        xframe = xframe.sample(8000, random_state=42)
        yseries = yseries.loc[xframe.index]

    # Numerical predictors are used as floats, the others as category codes.
    encoded = {}
    for column in pool:
        values = xframe[column]
        if isnumerical(values):
            encoded[column] = values.astype(float)
        else:
            encoded[column] = pd.Categorical(values).codes
    xnum = pd.DataFrame(encoded)

    if isnumerical(yseries):
        scores = mutual_info_regression(xnum, yseries.astype(float), random_state=42)
    else:
        scores = mutual_info_classif(xnum, yseries.astype(str), random_state=42)

    ranked = sorted(zip(pool, scores), key=lambda item: item[1], reverse=True)
    best = ranked[:topk]
    informative = [column for column, score in best if score > 0]
    if informative:
        return informative
    return [column for column, _ in best]
139
+
140
+
141
def expandfeatures(
    df: pd.DataFrame,
    columns: list[str],
    dummycolumns: dict[str, list[str]] | None = None,
) -> tuple[pd.DataFrame, dict[str, list[str]]]:
    """Turn ``columns`` of ``df`` into a float feature matrix.

    Numerical columns are cast to float; every other column is cast to
    ``str`` and one-hot encoded.  When ``dummycolumns`` holds a layout for a
    column, the dummies are aligned to it (absent names become 0.0, extra
    names are dropped).  Otherwise the layout produced here is recorded.

    Returns the matrix and the layouts recorded during this call.  Names in
    ``columns`` that are not in ``df`` are ignored.
    """
    pieces: list[pd.DataFrame] = []
    learned: dict[str, list[str]] = {}
    for column in columns:
        if column not in df.columns:
            continue
        if isnumerical(df[column]):
            pieces.append(df[[column]].astype(float))
            continue
        onehot = pd.get_dummies(df[column].astype(str), prefix=column, dtype=float)
        if dummycolumns and column in dummycolumns:
            onehot = onehot.reindex(columns=dummycolumns[column], fill_value=0.0)
        else:
            learned[column] = list(onehot.columns)
        pieces.append(onehot)

    if pieces:
        return pd.concat(pieces, axis=1), learned
    return pd.DataFrame(index=df.index), learned
166
+
167
+
168
def encodeframe(df: pd.DataFrame, columns: list[str]) -> tuple[pd.DataFrame, dict[str, dict[object, int]]]:
    """Return a copy of ``df`` with ``columns`` converted to numbers.

    Numerical columns are cast to float.  Other columns are label-encoded,
    with labels ordered by their string form; their label-to-code mappings
    are returned alongside the frame.  Names missing from ``df`` are skipped.
    """
    encoded = df.copy()
    maps: dict[str, dict[object, int]] = {}
    for column in columns:
        if column not in encoded.columns:
            continue
        series = encoded[column]
        if isnumerical(series):
            encoded[column] = series.astype(float)
        else:
            labels = sorted(series.dropna().unique(), key=str)
            maps[column] = {label: code for code, label in enumerate(labels)}
            encoded[column] = series.map(maps[column])
    return encoded, maps
183
+
184
+
185
def decodecolumn(series: pd.Series, mapping: dict[object, int]) -> pd.Series:
    """Map the codes in ``series`` back to labels by inverting ``mapping``.

    Codes without a label become missing values.
    """
    codetolabel = dict(zip(mapping.values(), mapping.keys()))
    return series.map(codetolabel)
@@ -0,0 +1,214 @@
1
+ Metadata-Version: 2.4
2
+ Name: simpute
3
+ Version: 0.1.0
4
+ Summary: Smart Impute: adaptive per-column missing value imputation
5
+ Author: Hvllvix
6
+ Maintainer: Hvllvix
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/Hvllvix/Simpute
9
+ Project-URL: Repository, https://github.com/Hvllvix/Simpute
10
+ Project-URL: Documentation, https://github.com/Hvllvix/Simpute#readme
11
+ Project-URL: Bug Tracker, https://github.com/Hvllvix/Simpute/issues
12
+ Keywords: imputation,missing-data,machine-learning,sklearn,simpute
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: numpy>=1.24
28
+ Requires-Dist: pandas>=2.0
29
+ Requires-Dist: scikit-learn>=1.3
30
+ Requires-Dist: scipy>=1.10
31
+ Requires-Dist: lightgbm>=4.0
32
+ Requires-Dist: catboost>=1.2
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=7.0; extra == "dev"
35
+ Requires-Dist: matplotlib>=3.7; extra == "dev"
36
+ Requires-Dist: seaborn>=0.13; extra == "dev"
37
+ Dynamic: license-file
38
+
39
+ # Simple Imputation
40
+
41
+ **Simpute** (**sim**ple + im**pute**) is an adaptive missing-value imputation library for tabular data. Instead of applying one global strategy to every column, it profiles each feature, selects a tailored model, and imputes columns sequentially so earlier fills inform later ones.
42
+
43
+ Install from PyPI as `simpute`. Source and releases live at [github.com/Hvllvix/Simpute](https://github.com/Hvllvix/Simpute).
44
+
45
+ ---
46
+
47
+ ## Why Simpute
48
+
49
+ Most imputers pick a single method (mean, median, MICE, KNN) for the whole table. Real datasets mix binary flags, low-cardinality categories, high-cardinality text-like fields, skewed counts, and smooth continuous variables. Simpute treats each column on its own terms.
50
+
51
+ | Approach | Simpute |
52
+ |----------|---------|
53
+ | Strategy | Per-column profiling and model routing |
54
+ | API | Scikit-learn `fit` / `transform` / `fit_transform` |
55
+ | Models | LightGBM, CatBoost, logistic/SVM, KNN, Bayesian Ridge, Extra Trees |
56
+ | Safety | Guard test suite with ground-truth verification |
57
+ | Warnings | Flags columns above 70% missingness |
58
+
59
+ ---
60
+
61
+ ## Installation
62
+
63
+ ```bash
64
+ pip install simpute
65
+ ```
66
+
67
+ Development install with tests and plotting extras:
68
+
69
+ ```bash
70
+ git clone https://github.com/Hvllvix/Simpute.git
71
+ cd Simpute
72
+ pip install -e ".[dev]"
73
+ ```
74
+
75
+ ---
76
+
77
+ ## Quick Start
78
+
79
+ ```python
80
+ import pandas as pd
81
+ from simpute import Simpute
82
+
83
+ df = pd.read_csv("data.csv")
84
+
85
+ imputer = Simpute(exclude=["Student_ID"])
86
+ filled = imputer.fit_transform(df)
87
+
88
+ print(imputer.getmodelselection())
89
+ print(imputer.getprofiles())
90
+ ```
91
+
92
+ `exclude` keeps identifier columns out of the imputation loop. Use `columns=[...]` instead when you only want to impute a subset.
93
+
94
+ ---
95
+
96
+ ## How It Works
97
+
98
+ 1. **Profile** each target column (type, missingness, cardinality, distribution shape).
99
+ 2. **Select features** with mutual information (top 6 predictors by default).
100
+ 3. **Route** to a candidate model based on the column profile.
101
+ 4. **Fit** on observed rows, then **impute** missing cells column by column.
102
+ 5. **Warn** when missingness exceeds 70% on a column.
103
+
104
+ Sequential imputation means numerical columns are generally filled before categorical ones, and values imputed in earlier columns become features for later columns.
105
+
106
+ ---
107
+
108
+ ## Model Selection
109
+
110
+ | Column profile | Candidate models |
111
+ |----------------|------------------|
112
+ | High-cardinality categorical | CatBoost Classifier, LightGBM Classifier |
113
+ | Low-cardinality / binary categorical | Logistic Regression, Linear SVC |
114
+ | Large numerical tables (1000+ rows) | LightGBM Regressor, Extra Trees Regressor |
115
+ | Skewed or discrete numerical | LightGBM Regressor, Extra Trees Regressor |
116
+ | Normal / uniform continuous | KNN Regressor, Bayesian Ridge |
117
+
118
+ Inspect the chosen backend per column after fitting:
119
+
120
+ ```python
121
+ imputer.getmodelselection()
122
+ # {'Pre_Semester_GPA': 'LGBMRegressor', 'Major_Category': 'CatBoostClassifier', ...}
123
+ ```
124
+
125
+ ---
126
+
127
+ ## API Reference
128
+
129
+ | Method | Description |
130
+ |--------|-------------|
131
+ | `fit(df)` | Profile columns, train per-column models |
132
+ | `transform(df)` | Impute using fitted models |
133
+ | `fit_transform(df)` | Fit and impute in one pass (recommended) |
134
+ | `getprofiles()` | Column profiles used during routing |
135
+ | `getmodelselection()` | Model name chosen for each imputed column |
136
+
137
+ Constructor options: `columns`, `exclude`, `maskratio`, `randomstate`.
138
+
139
+ ---
140
+
141
+ ## Guard Tests
142
+
143
+ The guard suite (`tests/guard.py`) masks values in [`tests/data/test.csv`](tests/data/test.csv), imputes them, and checks:
144
+
145
+ - No NaN values remain after imputation
146
+ - Categorical predictions stay within the original domain
147
+ - Numerical predictions stay within bounded ranges
148
+ - Imputation beats adaptive random baselines on held-out masked cells
149
+ - Model selection is deterministic and profile-consistent
150
+ - High-missingness columns emit warnings
151
+ - `transform` before `fit` raises `RuntimeError`
152
+
153
+ See [`tests/data/README.md`](tests/data/README.md) for column descriptions and how to swap in your own CSV.
154
+
155
+ ```bash
156
+ pytest tests/guard.py -v
157
+ ```
158
+
159
+ Metric summary table (MAE for continuous columns, accuracy for nominal):
160
+
161
+ ```bash
162
+ python tests/guard.py
163
+ ```
164
+
165
+ ---
166
+
167
+ ## Validation Plots
168
+
169
+ Generated on the bundled test dataset (`MASKRATIO=0.15`, `SEED=42`):
170
+
171
+ | Plot | Description |
172
+ |------|-------------|
173
+ | [Imputation density](Assets/Plots/imputation_density.png) | KDE of observed vs post-imputation continuous distributions |
174
+ | [Missingness heatmap](Assets/Plots/missingness_heatmap.png) | Feature completeness before and after imputation |
175
+ | [Model allocation](Assets/Plots/model_allocation_grid.png) | Which backend was assigned per column |
176
+
177
+ <p align="center">
178
+ <img src="Assets/Plots/imputation_density.png" alt="Imputation density comparison" width="800"/>
179
+ </p>
180
+
181
+ <p align="center">
182
+ <img src="Assets/Plots/missingness_heatmap.png" alt="Missingness heatmap before and after imputation" width="800"/>
183
+ </p>
184
+
185
+ <p align="center">
186
+ <img src="Assets/Plots/model_allocation_grid.png" alt="Per-column model allocation" width="800"/>
187
+ </p>
188
+
189
+ Regenerate locally:
190
+
191
+ ```bash
192
+ python scripts/generate_plots.py
193
+ ```
194
+
195
+ ---
196
+
197
+ ## Requirements
198
+
199
+ - Python 3.10+
200
+ - NumPy, Pandas, SciPy, scikit-learn, LightGBM, CatBoost
201
+
202
+ ---
203
+
204
+ ## Contributing
205
+
206
+ 1. Fork [Hvllvix/Simpute](https://github.com/Hvllvix/Simpute)
207
+ 2. Create a branch, make changes, run `pytest tests/guard.py -v`
208
+ 3. Open a pull request
209
+
210
+ ---
211
+
212
+ ## License
213
+
214
+ MIT
@@ -0,0 +1,9 @@
1
+ simpute/__init__.py,sha256=jMhOCHHC-RivknFBBhkIvvgjYudlFldEjRkx4y0DC4A,150
2
+ simpute/core.py,sha256=1M6qIC0Deiuy_ebJiCaIuv0O74QqjEr6Nf-GIrF18T8,10197
3
+ simpute/models.py,sha256=pisRcmsgGFkYhsWLex7y02CPQYMlhLcQUB19mfVSRUQ,3837
4
+ simpute/utils.py,sha256=3aXb64FxVtQEEyN7aps_YHvVA6Vk66qg2XhMJHCE7Dk,5905
5
+ simpute-0.1.0.dist-info/licenses/LICENSE,sha256=OF0agoaZ50g--hQij9dCPB4Tk5SEJmhxW5KxxmrbPVI,1077
6
+ simpute-0.1.0.dist-info/METADATA,sha256=r3Yfxxf8R0XSVez-XP5rB4_IYtcVdT0irCCl3Ly0RvY,6820
7
+ simpute-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ simpute-0.1.0.dist-info/top_level.txt,sha256=mpxS5gq56wil9Hki9UAPOjascgCeAJ9Jb9uAoxCBilg,8
9
+ simpute-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Simpute Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ simpute