py-evofe 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_evofe-0.1.0/PKG-INFO +127 -0
- py_evofe-0.1.0/README.md +111 -0
- py_evofe-0.1.0/pyproject.toml +31 -0
- py_evofe-0.1.0/src/evofe/__init__.py +38 -0
- py_evofe-0.1.0/src/evofe/builtin/__init__.py +32 -0
- py_evofe-0.1.0/src/evofe/builtin/categorical.py +396 -0
- py_evofe-0.1.0/src/evofe/builtin/clustering.py +324 -0
- py_evofe-0.1.0/src/evofe/builtin/grouping.py +134 -0
- py_evofe-0.1.0/src/evofe/builtin/math.py +115 -0
- py_evofe-0.1.0/src/evofe/builtin/reduction.py +157 -0
- py_evofe-0.1.0/src/evofe/builtin/supervised.py +198 -0
- py_evofe-0.1.0/src/evofe/estimator.py +299 -0
- py_evofe-0.1.0/src/evofe/evaluation/__init__.py +14 -0
- py_evofe-0.1.0/src/evofe/evaluation/cv.py +493 -0
- py_evofe-0.1.0/src/evofe/evaluation/metrics.py +127 -0
- py_evofe-0.1.0/src/evofe/evaluation/models.py +157 -0
- py_evofe-0.1.0/src/evofe/evaluation/tuning.py +262 -0
- py_evofe-0.1.0/src/evofe/evolution/__init__.py +5 -0
- py_evofe-0.1.0/src/evofe/evolution/engine.py +495 -0
- py_evofe-0.1.0/src/evofe/evolution/individual.py +392 -0
- py_evofe-0.1.0/src/evofe/evolution/population.py +36 -0
- py_evofe-0.1.0/src/evofe/py.typed +0 -0
- py_evofe-0.1.0/src/evofe/transformers.py +118 -0
- py_evofe-0.1.0/src/evofe/utils.py +56 -0
py_evofe-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: py-evofe
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Evolutionary feature engineering for tabular data, driven by a genetic algorithm
|
|
5
|
+
Requires-Dist: lightgbm>=4.6.0
|
|
6
|
+
Requires-Dist: optuna>=4.9.0
|
|
7
|
+
Requires-Dist: pandas>=3.0.3
|
|
8
|
+
Requires-Dist: polars>=1.36.1
|
|
9
|
+
Requires-Dist: pyarrow>=24.0.0
|
|
10
|
+
Requires-Dist: scikit-learn>=1.6.1
|
|
11
|
+
Requires-Dist: scipy>=1.13.1
|
|
12
|
+
Requires-Dist: umap-learn>=0.5.12
|
|
13
|
+
Requires-Dist: xgboost>=2.1.4
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# py-evofe: Evolutionary Feature Engineering in Python
|
|
18
|
+
|
|
19
|
+
[PyPI package](https://pypi.org/project/py-evofe/)
|
|
20
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
21
|
+
[](https://pypi.org/project/py-evofe/)
|
|
22
|
+
|
|
23
|
+
**py-evofe** is a Python library that uses a genetic algorithm to automatically discover, combine, and optimize feature transformations for tabular datasets. Instead of manually engineering interaction terms, ratios, or binning strategies, `py-evofe` searches the space of possible feature recipes to maximize the predictive performance of LightGBM or XGBoost models.
|
|
24
|
+
|
|
25
|
+
It implements a scikit-learn compatible interface (`fit`, `transform`, `predict`), allowing seamless integration into standard ML pipelines.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Features
|
|
30
|
+
|
|
31
|
+
* **Scikit-Learn Interface:** Compatible with scikit-learn's `Pipeline`, `GridSearchCV`, and cross-validation tools.
|
|
32
|
+
* **Genetic Algorithm Optimization:** Searches the feature transformation space using selection, crossover, and mutation.
|
|
33
|
+
* **Hierarchical Chaining:** Evolved features can build on top of other proven features from previous generations (e.g., `log(ratio(x1, x2))`).
|
|
34
|
+
* **Stateful Transformers:** Includes PCA, SVD, UMAP, Genie Clustering, Lumbermark Clustering, and Deadwood Anomaly Detection.
|
|
35
|
+
* **Performance Caching:** Features are cached using matrix hashing to avoid redundant computations (such as k-NN searches or UMAP projections) across cross-validation folds.
|
|
36
|
+
* **Flexible Evaluation:** Supports both Cross-Validation (`cv`) and stratified Train/Validation/Holdout Split (`split`) strategies.
|
|
37
|
+
* **Alternative & Custom Metrics:** Optimize for standard metrics (LogLoss, AUC, F1, MAE) or use the custom Temperature Scaled Refinement (`ts_refinement`) metric.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
You can install the released version of **py-evofe** from PyPI with:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install py-evofe
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Or using `uv`:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
uv pip install py-evofe
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Quick Start
|
|
58
|
+
|
|
59
|
+
Here is a quick example using the Breast Cancer dataset for a binary classification task:
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
import polars as pl
|
|
63
|
+
from sklearn.datasets import load_breast_cancer
|
|
64
|
+
from sklearn.model_selection import train_test_split
|
|
65
|
+
from evofe import EvoFE
|
|
66
|
+
|
|
67
|
+
# Load the dataset and keep a small subset of feature columns
|
|
68
|
+
bc = load_breast_cancer(as_frame=True)
|
|
69
|
+
feature_cols = bc.feature_names[:8].tolist() # use first 8 features for speed
|
|
70
|
+
df = pl.from_pandas(bc.frame[feature_cols + ["target"]])
|
|
71
|
+
|
|
72
|
+
X = df.drop("target")
|
|
73
|
+
y = df["target"].to_numpy()
|
|
74
|
+
|
|
75
|
+
# Split into train/test
|
|
76
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
77
|
+
X.to_numpy(), y, test_size=0.25, random_state=42, stratify=y
|
|
78
|
+
)
|
|
79
|
+
X_train_df = pl.DataFrame(X_train, schema=X.columns)
|
|
80
|
+
X_test_df = pl.DataFrame(X_test, schema=X.columns)
|
|
81
|
+
|
|
82
|
+
# 1. Create and configure EvoFE
|
|
83
|
+
evo = EvoFE(
|
|
84
|
+
task="classification", # "classification" | "multiclass" | "regression"
|
|
85
|
+
evaluator="lightgbm", # "lightgbm" | "xgboost"
|
|
86
|
+
pop_size=10, # population size
|
|
87
|
+
n_generations=5, # max evolutionary generations
|
|
88
|
+
cv_folds=3, # CV folds per fitness evaluation
|
|
89
|
+
verbose=True
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
# 2. Fit: Runs evolution to discover best features
|
|
93
|
+
evo.fit(X_train_df, y_train)
|
|
94
|
+
|
|
95
|
+
# 3. Get evolved feature recipe
|
|
96
|
+
recipe = evo.get_recipe()
|
|
97
|
+
print(f"Best Fitness (exp(-log_loss)): {recipe.fitness:.4f}")
|
|
98
|
+
print("Evolved genes:")
|
|
99
|
+
for gene in recipe.genes:
|
|
100
|
+
print(f" • {gene.to_formula()} -> {gene.output_col}")
|
|
101
|
+
|
|
102
|
+
# 4. Transform: Add evolved features to test data
|
|
103
|
+
X_test_enriched = evo.transform(X_test_df)
|
|
104
|
+
print(f"Enriched test columns: {X_test_enriched.columns}")
|
|
105
|
+
|
|
106
|
+
# 5. Predict using the best evolved model
|
|
107
|
+
predictions = evo.predict(X_test_df)
|
|
108
|
+
probabilities = evo.predict_proba(X_test_df)
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Supported Transformers
|
|
114
|
+
|
|
115
|
+
| Category | Transformers |
|
|
116
|
+
| :--- | :--- |
|
|
117
|
+
| **Arithmetic & Math** | `log`, `sqrt`, `reciprocal`, `power`, `add`, `subtract`, `multiply`, `divide`, `normalized_difference`, `log_ratio` |
|
|
118
|
+
| **Group-by Aggregations** | `groupby_mean`, `groupby_median`, `groupby_sd`, `groupby_max`, `groupby_min`, `groupby_ratio`, `groupby_zscore`, `groupby_quantile` |
|
|
119
|
+
| **Encoding & Binning** | `target_encode`, `target_encode_multiclass`, `frequency_encode`, `one_hot_encode`, `quantile_binning`, `log_binning`, `rank_transform`, `datetime_extract` |
|
|
120
|
+
| **Dimensionality Reduction** | `pca`, `truncated_svd`, `random_projection`, `umap` |
|
|
121
|
+
| **Graph & Clustering** | `genie`, `genie_centroid_dist`, `lumbermark`, `lumbermark_centroid_dist`, `mst_score`, `deadwood` |
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## License
|
|
126
|
+
|
|
127
|
+
This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.
|
py_evofe-0.1.0/README.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# py-evofe: Evolutionary Feature Engineering in Python
|
|
2
|
+
|
|
3
|
+
[PyPI package](https://pypi.org/project/py-evofe/)
|
|
4
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
5
|
+
[](https://pypi.org/project/py-evofe/)
|
|
6
|
+
|
|
7
|
+
**py-evofe** is a Python library that uses a genetic algorithm to automatically discover, combine, and optimize feature transformations for tabular datasets. Instead of manually engineering interaction terms, ratios, or binning strategies, `py-evofe` searches the space of possible feature recipes to maximize the predictive performance of LightGBM or XGBoost models.
|
|
8
|
+
|
|
9
|
+
It implements a scikit-learn compatible interface (`fit`, `transform`, `predict`), allowing seamless integration into standard ML pipelines.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
* **Scikit-Learn Interface:** Compatible with scikit-learn's `Pipeline`, `GridSearchCV`, and cross-validation tools.
|
|
16
|
+
* **Genetic Algorithm Optimization:** Searches the feature transformation space using selection, crossover, and mutation.
|
|
17
|
+
* **Hierarchical Chaining:** Evolved features can build on top of other proven features from previous generations (e.g., `log(ratio(x1, x2))`).
|
|
18
|
+
* **Stateful Transformers:** Includes PCA, SVD, UMAP, Genie Clustering, Lumbermark Clustering, and Deadwood Anomaly Detection.
|
|
19
|
+
* **Performance Caching:** Features are cached using matrix hashing to avoid redundant computations (such as k-NN searches or UMAP projections) across cross-validation folds.
|
|
20
|
+
* **Flexible Evaluation:** Supports both Cross-Validation (`cv`) and stratified Train/Validation/Holdout Split (`split`) strategies.
|
|
21
|
+
* **Alternative & Custom Metrics:** Optimize for standard metrics (LogLoss, AUC, F1, MAE) or use the custom Temperature Scaled Refinement (`ts_refinement`) metric.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
You can install the released version of **py-evofe** from PyPI with:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install py-evofe
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Or using `uv`:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
uv pip install py-evofe
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
Here is a quick example using the Breast Cancer dataset for a binary classification task:
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import polars as pl
|
|
47
|
+
from sklearn.datasets import load_breast_cancer
|
|
48
|
+
from sklearn.model_selection import train_test_split
|
|
49
|
+
from evofe import EvoFE
|
|
50
|
+
|
|
51
|
+
# Load the dataset and keep a small subset of feature columns
|
|
52
|
+
bc = load_breast_cancer(as_frame=True)
|
|
53
|
+
feature_cols = bc.feature_names[:8].tolist() # use first 8 features for speed
|
|
54
|
+
df = pl.from_pandas(bc.frame[feature_cols + ["target"]])
|
|
55
|
+
|
|
56
|
+
X = df.drop("target")
|
|
57
|
+
y = df["target"].to_numpy()
|
|
58
|
+
|
|
59
|
+
# Split into train/test
|
|
60
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
61
|
+
X.to_numpy(), y, test_size=0.25, random_state=42, stratify=y
|
|
62
|
+
)
|
|
63
|
+
X_train_df = pl.DataFrame(X_train, schema=X.columns)
|
|
64
|
+
X_test_df = pl.DataFrame(X_test, schema=X.columns)
|
|
65
|
+
|
|
66
|
+
# 1. Create and configure EvoFE
|
|
67
|
+
evo = EvoFE(
|
|
68
|
+
task="classification", # "classification" | "multiclass" | "regression"
|
|
69
|
+
evaluator="lightgbm", # "lightgbm" | "xgboost"
|
|
70
|
+
pop_size=10, # population size
|
|
71
|
+
n_generations=5, # max evolutionary generations
|
|
72
|
+
cv_folds=3, # CV folds per fitness evaluation
|
|
73
|
+
verbose=True
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# 2. Fit: Runs evolution to discover best features
|
|
77
|
+
evo.fit(X_train_df, y_train)
|
|
78
|
+
|
|
79
|
+
# 3. Get evolved feature recipe
|
|
80
|
+
recipe = evo.get_recipe()
|
|
81
|
+
print(f"Best Fitness (exp(-log_loss)): {recipe.fitness:.4f}")
|
|
82
|
+
print("Evolved genes:")
|
|
83
|
+
for gene in recipe.genes:
|
|
84
|
+
print(f" • {gene.to_formula()} -> {gene.output_col}")
|
|
85
|
+
|
|
86
|
+
# 4. Transform: Add evolved features to test data
|
|
87
|
+
X_test_enriched = evo.transform(X_test_df)
|
|
88
|
+
print(f"Enriched test columns: {X_test_enriched.columns}")
|
|
89
|
+
|
|
90
|
+
# 5. Predict using the best evolved model
|
|
91
|
+
predictions = evo.predict(X_test_df)
|
|
92
|
+
probabilities = evo.predict_proba(X_test_df)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Supported Transformers
|
|
98
|
+
|
|
99
|
+
| Category | Transformers |
|
|
100
|
+
| :--- | :--- |
|
|
101
|
+
| **Arithmetic & Math** | `log`, `sqrt`, `reciprocal`, `power`, `add`, `subtract`, `multiply`, `divide`, `normalized_difference`, `log_ratio` |
|
|
102
|
+
| **Group-by Aggregations** | `groupby_mean`, `groupby_median`, `groupby_sd`, `groupby_max`, `groupby_min`, `groupby_ratio`, `groupby_zscore`, `groupby_quantile` |
|
|
103
|
+
| **Encoding & Binning** | `target_encode`, `target_encode_multiclass`, `frequency_encode`, `one_hot_encode`, `quantile_binning`, `log_binning`, `rank_transform`, `datetime_extract` |
|
|
104
|
+
| **Dimensionality Reduction** | `pca`, `truncated_svd`, `random_projection`, `umap` |
|
|
105
|
+
| **Graph & Clustering** | `genie`, `genie_centroid_dist`, `lumbermark`, `lumbermark_centroid_dist`, `mst_score`, `deadwood` |
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## License
|
|
110
|
+
|
|
111
|
+
This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "py-evofe"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Evolutionary feature engineering for tabular data, driven by a genetic algorithm"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"lightgbm>=4.6.0",
|
|
9
|
+
"optuna>=4.9.0",
|
|
10
|
+
"pandas>=3.0.3",
|
|
11
|
+
"polars>=1.36.1",
|
|
12
|
+
"pyarrow>=24.0.0",
|
|
13
|
+
"scikit-learn>=1.6.1",
|
|
14
|
+
"scipy>=1.13.1",
|
|
15
|
+
"umap-learn>=0.5.12",
|
|
16
|
+
"xgboost>=2.1.4",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[build-system]
|
|
20
|
+
requires = ["uv_build>=0.11.21,<0.12.0"]
|
|
21
|
+
build-backend = "uv_build"
|
|
22
|
+
|
|
23
|
+
[dependency-groups]
|
|
24
|
+
dev = [
|
|
25
|
+
"mypy>=1.19.1",
|
|
26
|
+
"pytest>=8.4.2",
|
|
27
|
+
"ruff>=0.15.18",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[tool.uv.build-backend]
|
|
31
|
+
module-name = "evofe"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
evoFE — Evolutionary Feature Engineering (Python)
|
|
3
|
+
==================================================
|
|
4
|
+
|
|
5
|
+
Quick start
|
|
6
|
+
-----------
|
|
7
|
+
from evofe import EvoFE
|
|
8
|
+
|
|
9
|
+
evo = EvoFE(task="multiclass", evaluator="xgboost")
|
|
10
|
+
evo.fit(df_train, y_train)
|
|
11
|
+
df_enriched = evo.transform(df_test)
|
|
12
|
+
preds = evo.predict(df_test)
|
|
13
|
+
|
|
14
|
+
Lower-level API
|
|
15
|
+
---------------
|
|
16
|
+
from evofe.evolution import evolve_features
|
|
17
|
+
from evofe.evaluation import evaluate_fitness, apply_individual
|
|
18
|
+
from evofe.evaluation.tuning import make_tunable
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from .transformers import EvoTransformer
|
|
22
|
+
from .estimator import EvoFE
|
|
23
|
+
from .evolution.engine import evolve_features, EvoRecipe
|
|
24
|
+
from .evaluation.cv import evaluate_fitness, apply_individual
|
|
25
|
+
from .evaluation.tuning import make_tunable
|
|
26
|
+
from .builtin import register_transformer
|
|
27
|
+
|
|
28
|
+
__version__ = "0.1.0"
|
|
29
|
+
__all__ = [
|
|
30
|
+
"EvoFE",
|
|
31
|
+
"EvoRecipe",
|
|
32
|
+
"EvoTransformer",
|
|
33
|
+
"evolve_features",
|
|
34
|
+
"evaluate_fitness",
|
|
35
|
+
"apply_individual",
|
|
36
|
+
"make_tunable",
|
|
37
|
+
"register_transformer",
|
|
38
|
+
]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from .math import create_math_transformers
|
|
2
|
+
from .supervised import create_supervised_transformers
|
|
3
|
+
from .grouping import create_grouping_transformers
|
|
4
|
+
from .reduction import create_reduction_transformers
|
|
5
|
+
from .clustering import create_clustering_transformers
|
|
6
|
+
from .categorical import create_categorical_transformers
|
|
7
|
+
|
|
8
|
+
# Global registry of all built-in transformers
|
|
9
|
+
evo_transformers = {}
|
|
10
|
+
|
|
11
|
+
# Register all builtin modules
|
|
12
|
+
evo_transformers.update(create_math_transformers())
|
|
13
|
+
evo_transformers.update(create_supervised_transformers())
|
|
14
|
+
evo_transformers.update(create_grouping_transformers())
|
|
15
|
+
evo_transformers.update(create_reduction_transformers())
|
|
16
|
+
evo_transformers.update(create_clustering_transformers())
|
|
17
|
+
evo_transformers.update(create_categorical_transformers())
|
|
18
|
+
|
|
19
|
+
def register_transformer(name: str, transformer):
    """
    Add a custom feature transformer to the global ``evo_transformers`` pool.

    The transformer is stored under ``name``; an entry already registered
    under the same name is replaced.

    Args:
        name: Key under which the transformer is stored in the registry.
        transformer: The ``EvoTransformer`` instance to register.

    Raises:
        TypeError: If ``transformer`` is not an ``EvoTransformer`` instance.
    """
    # Imported at call time rather than at module import time.
    from ..transformers import EvoTransformer

    if isinstance(transformer, EvoTransformer):
        evo_transformers[name] = transformer
        return
    raise TypeError("transformer must be an instance of EvoTransformer.")
|
|
31
|
+
|
|
32
|
+
__all__ = ["evo_transformers", "register_transformer"]
|