py-evofe 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
evofe/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ """
2
+ evoFE — Evolutionary Feature Engineering (Python)
3
+ ==================================================
4
+
5
+ Quick start
6
+ -----------
7
+ from evofe import EvoFE
8
+
9
+ evo = EvoFE(task="multiclass", evaluator="xgboost")
10
+ evo.fit(df_train, y_train)
11
+ df_enriched = evo.transform(df_test)
12
+ preds = evo.predict(df_test)
13
+
14
+ Lower-level API
15
+ ---------------
16
+ from evofe.evolution import evolve_features
17
+ from evofe.evaluation import evaluate_fitness, apply_individual
18
+ from evofe.evaluation.tuning import make_tunable
19
+ """
20
+
21
+ from .transformers import EvoTransformer
22
+ from .estimator import EvoFE
23
+ from .evolution.engine import evolve_features, EvoRecipe
24
+ from .evaluation.cv import evaluate_fitness, apply_individual
25
+ from .evaluation.tuning import make_tunable
26
+ from .builtin import register_transformer
27
+
28
# Version string of the package.
__version__ = "0.1.0"
# Names exported by ``from evofe import *``; each one is imported above.
__all__ = [
    "EvoFE",
    "EvoRecipe",
    "EvoTransformer",
    "evolve_features",
    "evaluate_fitness",
    "apply_individual",
    "make_tunable",
    "register_transformer",
]
@@ -0,0 +1,32 @@
1
+ from .math import create_math_transformers
2
+ from .supervised import create_supervised_transformers
3
+ from .grouping import create_grouping_transformers
4
+ from .reduction import create_reduction_transformers
5
+ from .clustering import create_clustering_transformers
6
+ from .categorical import create_categorical_transformers
7
+
8
# Global registry of all built-in transformers.
evo_transformers = {}

# Fill the registry from every builtin module. The factories run in this fixed
# order, so if two modules return the same key the later one wins.
for _create in (
    create_math_transformers,
    create_supervised_transformers,
    create_grouping_transformers,
    create_reduction_transformers,
    create_clustering_transformers,
    create_categorical_transformers,
):
    evo_transformers.update(_create())
del _create  # keep the loop variable out of the module namespace
18
+
19
def register_transformer(name: str, transformer):
    """
    Add a user-supplied feature transformer to the global pool.

    An entry already stored under ``name`` is replaced.

    Args:
        name: Key under which the transformer is stored in ``evo_transformers``.
        transformer: An ``EvoTransformer`` instance.

    Raises:
        TypeError: If ``transformer`` is not an ``EvoTransformer`` instance.
    """
    from ..transformers import EvoTransformer

    if isinstance(transformer, EvoTransformer):
        evo_transformers[name] = transformer
        return
    raise TypeError("transformer must be an instance of EvoTransformer.")
31
+
32
# Public names of this module: the registry dict and its registration helper.
__all__ = ["evo_transformers", "register_transformer"]
@@ -0,0 +1,396 @@
1
+ import polars as pl
2
+ import numpy as np
3
+ import math
4
+ from functools import partial
5
+ from ..transformers import EvoTransformer
6
+ from ..utils import gene_col_name
7
+
8
+ # --- FREQUENCY ENCODING ---
9
+
10
def _fit_frequency_encode(data, input_cols, target_col=None, params=None):
    """
    Learn how often each value of the first input column occurs.

    Args:
        data: A polars DataFrame, or anything ``pl.DataFrame`` can wrap.
        input_cols: Column names; only the first one is used.
        target_col: Unused.
        params: Unused.

    Returns:
        A dict with ``"mapping"`` (value -> row count) and ``"default_val"``
        (the median of those counts, or 0.0 when no median is available).
    """
    column = input_cols[0]
    frame = data if isinstance(data, pl.DataFrame) else pl.DataFrame(data)

    counts = frame.group_by(column).len().rename({"len": "N"})
    median_count = counts["N"].median()

    categories = counts[column].to_list()
    occurrences = counts["N"].to_list()
    return {
        "mapping": dict(zip(categories, occurrences)),
        "default_val": 0.0 if median_count is None else median_count,
    }
20
+
21
def _apply_frequency_encode(data, input_cols, state, params=None):
    """
    Build the expression that swaps each value for its fitted frequency.

    Args:
        data: Unused; the result is a lazy polars expression.
        input_cols: Column names; only the first one is used.
        state: Dict holding ``"mapping"`` and ``"default_val"``.
        params: Unused.

    Returns:
        A Float64 polars expression; values missing from the mapping get
        ``state["default_val"]``.
    """
    encoded = pl.col(input_cols[0]).replace_strict(
        state["mapping"], default=state["default_val"]
    )
    return encoded.cast(pl.Float64)
26
+
27
+ # --- ONE-HOT ENCODING ---
28
+
29
def _fit_one_hot_encode(data, input_cols, target_col=None, params=None):
    """
    Pick the most frequent categories of the first input column.

    A category qualifies when it covers at least 5% of the non-null rows; at
    most the five most frequent qualifying categories are kept, most frequent
    first. Categories with equal counts are ordered by first appearance in
    ``data`` so that a given position always refers to the same category.

    Args:
        data: A polars DataFrame, or anything ``pl.DataFrame`` can wrap.
        input_cols: Column names; only the first one is used.
        target_col: Unused.
        params: Unused.

    Returns:
        A dict ``{"top_categories": [...]}``; the list is empty when the
        column has no non-null values.
    """
    min_share = 0.05
    max_categories = 5

    x_col = input_cols[0]
    if not isinstance(data, pl.DataFrame):
        data = pl.DataFrame(data)

    # maintain_order=True: a plain group_by returns groups in arbitrary order,
    # which would make the ranking of tied categories differ between runs.
    stats = (
        data.select(pl.col(x_col).drop_nulls())
        .group_by(x_col, maintain_order=True)
        .len()
        .rename({"len": "count"})
    )
    total_n = stats["count"].sum()
    if total_n == 0:
        return {"top_categories": []}

    # Keep tied categories in their (first-appearance) group order.
    stats = stats.with_columns(pct=pl.col("count") / total_n).sort(
        "count", descending=True, maintain_order=True
    )
    top_cats = stats.filter(pl.col("pct") >= min_share)[x_col].to_list()
    return {"top_categories": top_cats[:max_categories]}
43
+
44
def _apply_one_hot_encode(data, input_cols, state, params=None):
    """
    Build one indicator expression of the one-hot encoding.

    ``params["comp_idx"]`` (default 1) selects the indicator: ``k`` picks the
    k-th entry of ``state["top_categories"]`` (1-based), and 6 picks the
    "everything else" indicator, which also fires on nulls.

    Args:
        data: Unused; the result is a lazy polars expression.
        input_cols: Column names; only the first one is used.
        state: Dict that may hold ``"top_categories"``.
        params: Optional dict that may hold ``"comp_idx"``.

    Returns:
        A polars expression yielding 1.0 / 0.0. An index with no matching
        category yields the literal 0.0; the "else" indicator with no fitted
        categories yields the literal 1.0.
    """
    column = input_cols[0]
    component = params.get("comp_idx", 1) if isinstance(params, dict) else 1
    categories = state.get("top_categories", [])

    if component == 6:
        if len(categories) == 0:
            return pl.lit(1.0)
        is_other = pl.col(column).is_in(categories).not_() | pl.col(column).is_null()
        return pl.when(is_other).then(1.0).otherwise(0.0)

    position = component - 1
    if not (0 <= position < len(categories)):
        return pl.lit(0.0)
    return pl.when(pl.col(column) == categories[position]).then(1.0).otherwise(0.0)
63
+
64
+ # --- DATETIME EXTRACT ---
65
+
66
def _apply_datetime_extract(data, input_cols, state=None, params=None):
    """
    Build an expression extracting one calendar component from a column.

    ``params["component"]`` (default ``"month"``) may be ``"year"``,
    ``"month"``, ``"day"``, ``"hour"``, ``"day_of_week"`` or ``"weekend"``;
    any other value produces the literal 0.0.

    Args:
        data: Only inspected to decide whether the column holds strings that
            must be parsed first (polars or pandas-like input; anything else
            is treated as strings).
        input_cols: Column names; only the first one is used.
        state: Unused.
        params: Optional dict that may hold ``"component"``.

    Returns:
        A Float64 polars expression with nulls and NaNs replaced by 0.0.
    """
    column = input_cols[0]
    component = params.get("component", "month") if isinstance(params, dict) else "month"

    if isinstance(data, pl.DataFrame):
        needs_parsing = data[column].dtype in [pl.Utf8, pl.String]
    elif hasattr(data, 'dtypes'):
        import pandas as pd
        needs_parsing = pd.api.types.is_string_dtype(data[column])
    else:
        needs_parsing = True

    stamp = pl.col(column)
    if needs_parsing:
        stamp = stamp.str.to_datetime(strict=False)

    # Ordered (name, builder) pairs; builders are only invoked on a match.
    extractors = (
        ("year", lambda ts: ts.dt.year()),
        ("month", lambda ts: ts.dt.month()),
        ("day", lambda ts: ts.dt.day()),
        ("hour", lambda ts: ts.dt.hour()),
        ("day_of_week", lambda ts: ts.dt.weekday()),
        # 6 and 7 are Saturday and Sunday in polars' ISO weekday numbering.
        ("weekend", lambda ts: pl.when(ts.dt.weekday().is_in([6, 7])).then(1.0).otherwise(0.0)),
    )
    result = pl.lit(0.0)
    for key, build in extractors:
        if component == key:
            result = build(stamp)
            break

    return result.fill_null(0.0).fill_nan(0.0).cast(pl.Float64)
101
+
102
+ # --- QUANTILE BINNING ---
103
+
104
def _fit_quantile_binning(data, input_cols, target_col=None, params=None):
    """
    Learn quantile bin boundaries for the first input column.

    Nulls, NaNs and infinities are ignored when computing the quantiles:
    a single NaN would otherwise turn every boundary into NaN and collapse
    the binning into a constant feature.

    Args:
        data: A polars DataFrame, or anything ``pl.DataFrame`` can wrap.
        input_cols: Column names; only the first one is used.
        target_col: Unused.
        params: Optional dict; ``params["Q"]`` is the number of quantile bins
            (default 5).

    Returns:
        A dict ``{"boundaries": [...]}`` with the sorted, de-duplicated
        quantile values, or ``[-inf, inf]`` when there is no usable value or
        the quantiles cannot be computed.
    """
    fallback = [-float('inf'), float('inf')]

    x_col = input_cols[0]
    if not isinstance(data, pl.DataFrame):
        data = pl.DataFrame(data)
    x = data[x_col].drop_nulls().to_numpy()

    Q = 5
    if isinstance(params, dict):
        Q = params.get("Q", Q)

    # Deliberately best-effort: a column that cannot be binned (or an unusable
    # Q) degrades to a single catch-all bin instead of aborting the fit.
    try:
        x = x[np.isfinite(x)]
        if len(x) == 0:
            return {"boundaries": fallback}
        boundaries = np.quantile(x, np.linspace(0, 1, Q + 1))
        boundaries = np.unique(boundaries).tolist()
    except Exception:
        boundaries = fallback
    return {"boundaries": boundaries}
120
+
121
def _apply_quantile_binning(data, input_cols, state, params=None):
    """
    Assign each value of the first input column to its fitted quantile bin.

    Args:
        data: A polars DataFrame, or anything ``pl.DataFrame`` can wrap.
        input_cols: Column names; only the first one is used.
        state: Dict that may hold ``"boundaries"``.
        params: Unused.

    Returns:
        A float64 numpy array of bin numbers in ``1 .. len(boundaries) - 1``
        (out-of-range values are clipped into the first or last bin), with 0.0
        for missing values. With fewer than two boundaries every entry is 1.0.
    """
    column = input_cols[0]
    frame = data if isinstance(data, pl.DataFrame) else pl.DataFrame(data)
    values = frame[column].cast(pl.Float64).to_numpy()

    edges = state.get("boundaries", [])
    if len(edges) <= 1:
        return np.ones(len(values), dtype=np.float64)

    last_bin = len(edges) - 1
    bins = np.clip(np.digitize(values, edges), 1, last_bin).astype(np.float64)

    # Missing values get their own bin number, 0.0.
    bins[np.isnan(values)] = 0.0
    return bins
137
+
138
def _apply_quantile_binning_cat(data, input_cols, state, params=None):
    """
    Quantile-bin a column and return the bin numbers as strings.

    Takes the same arguments as ``_apply_quantile_binning``.

    Returns:
        A list of strings such as ``"0"``, ``"1"``, ... one per row.
    """
    numeric_bins = _apply_quantile_binning(data, input_cols, state, params)
    labels = []
    for bin_number in numeric_bins:
        labels.append(str(int(bin_number)))
    return labels
141
+
142
+ # --- LOG BINNING ---
143
+
144
def _apply_log_binning(data, input_cols, state=None, params=None):
    """
    Build the expression ``floor(log_base(|x| + 1))`` for the first column.

    Args:
        data: Unused; the result is a lazy polars expression.
        input_cols: Column names; only the first one is used.
        state: Unused.
        params: Optional dict; ``params["base"]`` is the logarithm base
            (default 2).

    Returns:
        A Float64 polars expression; infinite, null and NaN results become 0.0.
    """
    column = input_cols[0]
    log_base = params.get("base", 2) if isinstance(params, dict) else 2

    magnitude = pl.col(column).cast(pl.Float64).abs() + 1
    bin_index = (magnitude.log() / math.log(log_base)).floor()
    unusable = bin_index.is_infinite() | bin_index.is_null() | bin_index.is_nan()
    return pl.when(unusable).then(0.0).otherwise(bin_index).cast(pl.Float64)
153
+
154
def _apply_log_binning_cat(data, input_cols, state=None, params=None):
    """
    Log-bin a column and return the bin numbers as strings.

    Takes the same arguments as ``_apply_log_binning``.

    Returns:
        A String polars expression holding the integer bin number.
    """
    numeric_bins = _apply_log_binning(data, input_cols, state, params)
    as_integer = numeric_bins.cast(pl.Int64)
    return as_integer.cast(pl.String)
157
+
158
+ # --- RANK TRANSFORM ---
159
+
160
def _fit_rank_transform(data, input_cols, target_col=None, params=None):
    """
    Store the sorted finite values of the first input column.

    Args:
        data: A polars DataFrame, or anything ``pl.DataFrame`` can wrap.
        input_cols: Column names; only the first one is used.
        target_col: Unused.
        params: Unused.

    Returns:
        A dict ``{"sorted_x": array}`` with the non-null, finite values in
        ascending order, or ``{"sorted_x": None}`` when there are none.
    """
    column = input_cols[0]
    frame = data if isinstance(data, pl.DataFrame) else pl.DataFrame(data)

    observed = frame[column].drop_nulls().to_numpy()
    finite = observed[np.isfinite(observed)]
    return {"sorted_x": np.sort(finite) if len(finite) != 0 else None}
169
+
170
def _apply_rank_transform(data, input_cols, state, params=None):
    """
    Map each value of the first input column to a rank-based score.

    With a fitted reference (``state["sorted_x"]``) the score is the fraction
    of reference values less than or equal to the value. Without one, values
    are ranked among themselves (average ranks) and divided by the number of
    rows. Missing values always score 0.5.

    Args:
        data: A polars DataFrame, or anything ``pl.DataFrame`` can wrap.
        input_cols: Column names; only the first one is used.
        state: Dict that may hold ``"sorted_x"``.
        params: Unused.

    Returns:
        A float64 numpy array with one score per row.
    """
    x_col = input_cols[0]
    if not isinstance(data, pl.DataFrame):
        data = pl.DataFrame(data)
    x = data[x_col].cast(pl.Float64).to_numpy()
    missing = np.isnan(x)

    sorted_x = state.get("sorted_x")
    if sorted_x is None or len(sorted_x) == 0:
        from scipy.stats import rankdata

        # Rank only the non-missing values: rankdata's default
        # nan_policy='propagate' returns all-NaN ranks as soon as the input
        # contains a single NaN, which would poison every row.
        res = np.full(len(x), 0.5, dtype=np.float64)
        if not missing.all():
            res[~missing] = rankdata(x[~missing], method="average") / len(x)
        return res

    counts = np.searchsorted(sorted_x, x, side='right')
    res = counts.astype(np.float64) / len(sorted_x)
    res[missing] = 0.5
    return res
188
+
189
+ # --- WOE ENCODE ---
190
+
191
def _fit_woe_encode(data, input_cols, target_col, params=None):
    """
    Learn a weight-of-evidence value for each category of the first column.

    Only binary targets are supported; the larger of the two sorted target
    values is treated as the event. Rows whose target is null are ignored:
    they are neither events nor non-events.

    Args:
        data: A polars DataFrame, or anything ``pl.DataFrame`` can wrap.
        input_cols: Column names; only the first one is used.
        target_col: Name of the target column.
        params: Unused; accepted so the signature matches the other fit
            functions in this module.

    Returns:
        A dict with ``"mapping"`` (category -> WoE), ``"fallback"`` (0.0) and
        ``"is_binary"``. When the target does not have exactly two distinct
        non-null values the mapping is empty and ``"is_binary"`` is False.

    Raises:
        ValueError: If ``target_col`` is None.
    """
    if target_col is None:
        raise ValueError("target_col must be provided for supervised woe_encode.")

    x_col = input_cols[0]
    if not isinstance(data, pl.DataFrame):
        data = pl.DataFrame(data)

    # Unlabelled rows must not be counted as non-events.
    labeled = data.filter(pl.col(target_col).is_not_null())

    y_unique = sorted(labeled[target_col].unique().to_list())
    if len(y_unique) != 2:
        return {"mapping": {}, "fallback": 0.0, "is_binary": False}

    event_val = y_unique[1]
    y_bin = labeled.select(
        pl.when(pl.col(target_col) == event_val).then(1.0).otherwise(0.0).alias("y_bin")
    )["y_bin"]

    total_events = y_bin.sum()
    total_non_events = len(y_bin) - total_events

    # The +1.0 here and the +0.5 below keep every ratio finite and non-zero,
    # even for categories that contain only events or only non-events.
    denom_events = total_events + 1.0
    denom_non_events = total_non_events + 1.0

    df_temp = pl.DataFrame({x_col: labeled[x_col], "y_bin": y_bin})
    stats = df_temp.group_by(x_col).agg([
        pl.col("y_bin").sum().alias("events"),
        pl.col("y_bin").count().alias("n"),
    ])
    stats = stats.with_columns(
        non_events=pl.col("n") - pl.col("events")
    )
    stats = stats.with_columns(
        p_event_given_cat=(pl.col("events") + 0.5) / denom_events,
        p_non_event_given_cat=(pl.col("non_events") + 0.5) / denom_non_events,
    )
    stats = stats.with_columns(
        woe=(pl.col("p_event_given_cat") / pl.col("p_non_event_given_cat")).log()
    )

    mapping = dict(zip(stats[x_col].to_list(), stats["woe"].to_list()))
    return {"mapping": mapping, "fallback": 0.0, "is_binary": True}
233
+
234
def _apply_woe_encode(data, input_cols, state, params=None):
    """
    Build the expression that swaps each category for its fitted WoE value.

    Args:
        data: Unused; the result is a lazy polars expression.
        input_cols: Column names; only the first one is used.
        state: Dict produced by ``_fit_woe_encode``.
        params: Unused; accepted so the signature matches the other apply
            functions in this module.

    Returns:
        A Float64 polars expression; categories missing from the mapping get
        ``state["fallback"]``. When the fit found a non-binary target the
        literal 0.0 is returned instead.
    """
    x_col = input_cols[0]
    if not state.get("is_binary", False):
        return pl.lit(0.0)

    mapping = state["mapping"]
    fallback = state["fallback"]
    return pl.col(x_col).replace_strict(mapping, default=fallback).cast(pl.Float64)
242
+
243
+ # --- TARGET ENCODE MULTICLASS ---
244
+
245
def _fit_target_encode_multiclass(data, input_cols, target_col, params=None):
    """
    Learn smoothed per-class target rates for each category of the first column.

    For every target class ``c`` and category ``k`` the stored value is
    ``(n_k * rate_k + 10 * global_rate) / (n_k + 10)``, where ``rate_k`` is the
    share of class ``c`` among the rows of category ``k``. Rows whose target is
    null are ignored so they do not lower every class's rate.

    Args:
        data: A polars DataFrame, or anything ``pl.DataFrame`` can wrap.
        input_cols: Column names; only the first one is used.
        target_col: Name of the target column.
        params: Unused; accepted so the signature matches the other fit
            functions in this module.

    Returns:
        A dict with ``"classes"`` (sorted distinct target values),
        ``"mappings"`` (one category -> rate dict per class) and
        ``"global_means"`` (one overall rate per class), all in class order.

    Raises:
        ValueError: If ``target_col`` is None.
    """
    if target_col is None:
        raise ValueError("target_col must be provided for supervised target_encode_multiclass.")

    x_col = input_cols[0]
    if not isinstance(data, pl.DataFrame):
        data = pl.DataFrame(data)

    # Unlabelled rows belong to no class and must not dilute the rates.
    labeled = data.filter(pl.col(target_col).is_not_null())

    classes = sorted(labeled[target_col].unique().to_list())
    mappings = []
    global_means = []
    smoothing = 10.0

    for c in classes:
        y_bin = labeled.select(
            pl.when(pl.col(target_col) == c).then(1.0).otherwise(0.0).alias("y_bin")
        )["y_bin"]
        global_mean = y_bin.mean()
        if global_mean is None:
            global_mean = 0.0
        global_means.append(global_mean)

        df_temp = pl.DataFrame({x_col: labeled[x_col], "y_bin": y_bin})
        stats = df_temp.group_by(x_col).agg([
            pl.col("y_bin").mean().alias("mean"),
            pl.col("y_bin").count().alias("n"),
        ])
        # Shrink small categories towards the overall class rate.
        stats = stats.with_columns(
            smoothed=(pl.col("n") * pl.col("mean") + smoothing * global_mean)
            / (pl.col("n") + smoothing)
        )

        mappings.append(dict(zip(stats[x_col].to_list(), stats["smoothed"].to_list())))

    return {"mappings": mappings, "global_means": global_means, "classes": classes}
279
+
280
def _apply_target_encode_multiclass(data, input_cols, state, params=None):
    """
    Build the target-encoding expression for one target class.

    ``params["comp_idx"]`` (default 0) is a zero-based position into the
    fitted per-class mappings; out-of-range values are clamped to the first
    or last class.

    Args:
        data: Unused; the result is a lazy polars expression.
        input_cols: Column names; only the first one is used.
        state: Dict that may hold ``"mappings"`` and ``"global_means"``.
        params: Optional dict that may hold ``"comp_idx"``.

    Returns:
        A Float64 polars expression; categories missing from the mapping get
        that class's global mean. Without fitted mappings the literal 0.0 is
        returned.
    """
    column = input_cols[0]
    requested = params.get("comp_idx", 0) if isinstance(params, dict) else 0

    per_class_maps = state.get("mappings", [])
    priors = state.get("global_means", [])
    if not per_class_maps:
        return pl.lit(0.0)

    last = len(per_class_maps) - 1
    position = max(0, min(requested, last))
    encoded = pl.col(column).replace_strict(
        per_class_maps[position], default=priors[position]
    )
    return encoded.cast(pl.Float64)
297
+
298
+ # --- CREATE CATEGORICAL TRANSFORMERS ---
299
+
300
def create_categorical_transformers() -> dict:
    """
    Build the built-in categorical and binning transformers.

    Returns:
        A dict mapping each transformer name to its ``EvoTransformer``.
    """
    # (name, column-name prefix, remaining EvoTransformer keyword arguments)
    specs = (
        ("frequency_encode", "freq", dict(
            type_="unary",
            input_type="categorical",
            fit_func=_fit_frequency_encode,
            apply_func=_apply_frequency_encode,
        )),
        ("one_hot_encode", "ohe", dict(
            type_="unary",
            input_type="categorical",
            output_type="numeric",
            fit_func=_fit_one_hot_encode,
            apply_func=_apply_one_hot_encode,
        )),
        ("datetime_extract", "dt", dict(
            type_="unary",
            input_type="categorical",
            apply_func=_apply_datetime_extract,
        )),
        ("quantile_binning", "qb", dict(
            type_="unary",
            input_type="numeric",
            fit_func=_fit_quantile_binning,
            apply_func=_apply_quantile_binning,
        )),
        ("quantile_binning_cat", "qbc", dict(
            type_="unary",
            input_type="numeric",
            output_type="categorical",
            fit_func=_fit_quantile_binning,
            apply_func=_apply_quantile_binning_cat,
        )),
        ("log_binning", "lb", dict(
            type_="unary",
            input_type="numeric",
            apply_func=_apply_log_binning,
        )),
        ("log_binning_cat", "lbc", dict(
            type_="unary",
            input_type="numeric",
            output_type="categorical",
            apply_func=_apply_log_binning_cat,
        )),
        ("rank_transform", "rnk", dict(
            type_="unary",
            input_type="numeric",
            fit_func=_fit_rank_transform,
            apply_func=_apply_rank_transform,
        )),
        ("woe_encode", "woe", dict(
            type_="supervised_unary",
            input_type="categorical",
            fit_func=_fit_woe_encode,
            apply_func=_apply_woe_encode,
        )),
        ("target_encode_multiclass", "temc", dict(
            type_="supervised_unary",
            input_type="categorical",
            fit_func=_fit_target_encode_multiclass,
            apply_func=_apply_target_encode_multiclass,
        )),
    )

    registry = {}
    for name, prefix, options in specs:
        registry[name] = EvoTransformer(
            name=name,
            name_generator=partial(gene_col_name, transformer_name=name, prefix=prefix),
            **options,
        )
    return registry