pyeggp 1.0.3__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyeggp-1.0.3 → pyeggp-1.0.4}/PKG-INFO +170 -5
- pyeggp-1.0.4/README.md +178 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/pyproject.toml +1 -1
- pyeggp-1.0.4/src/pyeggp/__init__.py +377 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp.egg-info/PKG-INFO +170 -5
- {pyeggp-1.0.3 → pyeggp-1.0.4}/test/test_pyeggp.py +1 -1
- pyeggp-1.0.3/README.md +0 -13
- pyeggp-1.0.3/src/pyeggp/__init__.py +0 -176
- {pyeggp-1.0.3 → pyeggp-1.0.4}/LICENSE +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/setup.cfg +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/setup.py +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp/__main__.py +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp/_binding.pyi +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp/binding.i +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp/typing.py +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp.egg-info/SOURCES.txt +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp.egg-info/dependency_links.txt +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp.egg-info/entry_points.txt +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp.egg-info/requires.txt +0 -0
- {pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp.egg-info/top_level.txt +0 -0
{pyeggp-1.0.3 → pyeggp-1.0.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyeggp
-Version: 1.0.3
+Version: 1.0.4
 Summary: Python Wheels for eggp algorithm.
 Author-email: Fabricio Olivetti <folivetti@users.noreply.github.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -695,16 +695,181 @@ Provides-Extra: mypy
 Requires-Dist: types_setuptools>=45; extra == "mypy"
 Dynamic: license-file
 
-# 
+# PyEGGP
 
-Python
+A Python package for symbolic regression using e-graph-based genetic programming. PyEGGP provides a scikit-learn compatible API for evolutionary symbolic regression tasks.
 
-
+More info [here](https://github.com/folivetti/srtree/tree/main/apps/eggp)
 
-
+## Installation
 
 ```bash
 pip install pyeggp
 ```
 
+[... the remaining added lines are the rest of the new README — Features, Usage, Parameters, Available Functions, Methods, Results, License, and Citation — appended to the long description, identical to pyeggp-1.0.4/README.md below ...]
 
 The bindings were created following the amazing example written by [wenkokke](https://github.com/wenkokke/example-haskell-wheel)
pyeggp-1.0.4/README.md
ADDED
@@ -0,0 +1,178 @@

# PyEGGP

A Python package for symbolic regression using e-graph-based genetic programming. PyEGGP provides a scikit-learn compatible API for evolutionary symbolic regression tasks.

More info [here](https://github.com/folivetti/srtree/tree/main/apps/eggp)

## Installation

```bash
pip install pyeggp
```

## Features

- Scikit-learn compatible API with `fit()` and `predict()` methods
- Genetic programming approach with e-graph representation
- Support for **multi-view symbolic regression** [see here](https://arxiv.org/abs/2402.04298)
- Customizable evolutionary parameters (population size, tournament selection, etc.)
- Flexible function set selection
- Various loss functions for different problem types
- Parameter optimization with multiple restarts
- Optional expression simplification through equality saturation
- Ability to save and load e-graphs

## Usage

### Basic Example

```python
from pyeggp import PyEGGP
import numpy as np

# Create sample data
X = np.linspace(-10, 10, 100).reshape(-1, 1)
y = 2 * X.ravel() + 3 * np.sin(X.ravel()) + np.random.normal(0, 1, 100)

# Create and fit the model
model = PyEGGP(gen=100, nonterminals="add,sub,mul,div,sin,cos")
model.fit(X, y)

# Make predictions
y_pred = model.predict(X)

# Examine the results
print(model.results)
```
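Because `PyEGGP` follows the scikit-learn estimator API, the fitted model also exposes `score()`, which returns the R² of the best expression on the given data:

```python
# R² of the best model on the training data
print(model.score(X, y))
```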

### Multi-View Symbolic Regression

```python
from pyeggp import PyEGGP
import numpy as np

# Create multiple views of data
X1 = np.linspace(-5, 5, 50).reshape(-1, 1)
y1 = np.sin(X1.ravel()) + np.random.normal(0, 0.1, 50)

X2 = np.linspace(0, 10, 100).reshape(-1, 1)
y2 = np.sin(X2.ravel()) + np.random.normal(0, 0.2, 100)

# Create and fit multi-view model
model = PyEGGP(gen=150, nPop=200)
model.fit_mvsr([X1, X2], [y1, y2])

# Make predictions for each view
y_pred1 = model.predict_mvsr(X1, view=0)
y_pred2 = model.predict_mvsr(X2, view=1)
```

### Integration with scikit-learn

```python
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pyeggp import PyEGGP

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create and fit model
model = PyEGGP(gen=150, nPop=150, optIter=100)
model.fit(X_train, y_train)

# Evaluate on test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse}")
```

## Parameters

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `gen` | int | 100 | Number of generations to run |
| `nPop` | int | 100 | Population size |
| `maxSize` | int | 15 | Maximum allowed size for expressions (max 100) |
| `nTournament` | int | 3 | Tournament size for parent selection |
| `pc` | float | 0.9 | Probability of performing crossover |
| `pm` | float | 0.3 | Probability of performing mutation |
| `nonterminals` | str | "add,sub,mul,div" | Comma-separated list of allowed functions |
| `loss` | str | "MSE" | Loss function: "MSE", "Gaussian", "Bernoulli", or "Poisson" |
| `optIter` | int | 50 | Number of iterations for parameter optimization |
| `optRepeat` | int | 2 | Number of restarts for parameter optimization |
| `nParams` | int | -1 | Maximum number of parameters (-1 for unlimited) |
| `split` | int | 1 | Data splitting ratio for validation |
| `simplify` | bool | False | Whether to apply equality saturation to simplify expressions |
| `dumpTo` | str | "" | Filename to save the final e-graph |
| `loadFrom` | str | "" | Filename to load an e-graph to resume search |

## Available Functions

The following functions can be used in the `nonterminals` parameter:

- Basic operations: `add`, `sub`, `mul`, `div`
- Powers: `power`, `powerabs`, `square`, `cube`
- Roots: `sqrt`, `sqrtabs`, `cbrt`
- Trigonometric: `sin`, `cos`, `tan`, `asin`, `acos`, `atan`
- Hyperbolic: `sinh`, `cosh`, `tanh`, `asinh`, `acosh`, `atanh`
- Others: `abs`, `log`, `logabs`, `exp`, `recip`, `aq` (analytical quotient)
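The protected variants (`powerabs` = x^|y|, `sqrtabs` = sqrt(|x|), `logabs` = log(|x|), and `aq` = x/sqrt(1 + y²)) avoid domain errors when intermediate expressions produce negative or near-zero arguments. A minimal sketch of selecting such a function set:

```python
# Function set restricted to arithmetic plus protected log/sqrt and the analytical quotient
model = PyEGGP(gen=100, nonterminals="add,sub,mul,div,logabs,sqrtabs,aq")
```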

## Methods

### Core Methods
- `fit(X, y)`: Fits the symbolic regression model
- `predict(X)`: Generates predictions using the best model
- `score(X, y)`: Computes R² score of the best model

### Multi-View Methods
- `fit_mvsr(Xs, ys)`: Fits a multi-view regression model
- `predict_mvsr(X, view)`: Generates predictions for a specific view
- `evaluate_best_model_view(X, view)`: Evaluates the best model on a specific view
- `evaluate_model_view(X, ix, view)`: Evaluates a specific model on a specific view

### Utility Methods
- `evaluate_best_model(X)`: Evaluates the best model on the given data
- `evaluate_model(ix, X)`: Evaluates the model with index `ix` on the given data
- `get_model(idx)`: Returns a model function and its visual representation (see the sketch below)
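A minimal sketch of `get_model`, assuming a fitted model whose `results` table contains a row with `id` 0 (the `theta` parsing mirrors the package's own evaluation code):

```python
import numpy as np

func, expr = model.get_model(0)
print(expr)  # readable form: parameters shown as A, B, ... and features as X0, X1, ...

# The returned function expects the raw feature matrix and the fitted parameter vector
row = model.results[model.results['id'] == 0].iloc[0]
t = np.array(list(map(float, row['theta'].split(';'))))
y_hat = func(X, t)
```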

## Results

After fitting, the `results` attribute contains a pandas DataFrame with details about the discovered models, including:
- Mathematical expressions
- Model complexity
- Parameter values
- Error metrics
- NumPy-compatible expressions
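The exact column set comes from the eggp output, but the columns the package itself relies on include `id`, `theta` (semicolon-separated parameter values), and `Numpy` (a NumPy-evaluable expression string); the last row holds the best model. A quick inspection sketch:

```python
best = model.results.iloc[-1]
print(best.Numpy)   # NumPy-compatible expression string
print(best.theta)   # fitted parameters, separated by ';'
```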

## License

[LICENSE]

## Citation

If you use PyEGGP in your research, please cite:

```bibtex
@inproceedings{eggp,
  author = {de Franca, Fabricio Olivetti and Kronberger, Gabriel},
  title = {Improving Genetic Programming for Symbolic Regression with Equality Graphs},
  year = {2025},
  isbn = {9798400714658},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3712256.3726383},
  doi = {10.1145/3712256.3726383},
  booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference},
  pages = {},
  numpages = {9},
  keywords = {Symbolic regression, Genetic programming, Equality saturation, Equality graphs},
  location = {Malaga, Spain},
  series = {GECCO '25},
  archivePrefix = {arXiv},
  eprint = {2501.17848},
  primaryClass = {cs.LG},
}
```

The bindings were created following the amazing example written by [wenkokke](https://github.com/wenkokke/example-haskell-wheel).
{pyeggp-1.0.3 → pyeggp-1.0.4}/pyproject.toml

@@ -12,7 +12,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "pyeggp"
-version = "1.0.3"
+version = "1.0.4"
 authors = [{ name = "Fabricio Olivetti", email = "folivetti@users.noreply.github.com" }]
 description = "Python Wheels for eggp algorithm."
 readme = "README.md"
pyeggp-1.0.4/src/pyeggp/__init__.py
ADDED
@@ -0,0 +1,377 @@

import atexit
from contextlib import contextmanager
from threading import Lock
from typing import Iterator, List
import string
from io import StringIO
import tempfile
import csv

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.metrics import mean_squared_error, r2_score

from ._binding import (
    unsafe_hs_pyeggp_version,
    unsafe_hs_pyeggp_main,
    unsafe_hs_pyeggp_run,
    unsafe_hs_pyeggp_init,
    unsafe_hs_pyeggp_exit,
)

VERSION: str = "1.0.4"


_hs_rts_init: bool = False
_hs_rts_lock: Lock = Lock()


def hs_rts_exit() -> None:
    global _hs_rts_lock
    with _hs_rts_lock:
        unsafe_hs_pyeggp_exit()


@contextmanager
def hs_rts_init(args: List[str] = []) -> Iterator[None]:
    global _hs_rts_init
    global _hs_rts_lock
    with _hs_rts_lock:
        if not _hs_rts_init:
            _hs_rts_init = True
            unsafe_hs_pyeggp_init(args)
            atexit.register(hs_rts_exit)
    yield None


def version() -> str:
    with hs_rts_init():
        return unsafe_hs_pyeggp_version()


def main(args: List[str] = []) -> int:
    with hs_rts_init(args):
        return unsafe_hs_pyeggp_main()


def pyeggp_run(dataset: str, gen: int, nPop: int, maxSize: int, nTournament: int, pc: float, pm: float, nonterminals: str, loss: str, optIter: int, optRepeat: int, nParams: int, split: int, simplify: int, dumpTo: str, loadFrom: str) -> str:
    with hs_rts_init():
        return unsafe_hs_pyeggp_run(dataset, gen, nPop, maxSize, nTournament, pc, pm, nonterminals, loss, optIter, optRepeat, nParams, split, simplify, dumpTo, loadFrom)

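
# pyeggp_run drives one full eggp search through the Haskell FFI and returns the
# result table as a CSV string; PyEGGP.fit() below writes the dataset to a CSV
# file and parses this returned string into a pandas DataFrame.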

def make_function(expression, loss="MSE"):
    def func(x, t):
        y = eval(expression)
        if loss == "Bernoulli":
            return 1/(1 + np.exp(-y))
        elif loss == "Poisson":
            return np.exp(y)
        return y
    return func

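# Example (hypothetical expression string, in the format of the results table's
# "Numpy" column used by get_model below): the closure evaluates the string with
# the feature matrix `x` and the parameter vector `t` in scope.
#
#   f = make_function("t[0]*x[:, 0] + t[1]")
#   f(np.array([[1.0], [2.0]]), np.array([2.0, 1.0]))  # -> array([3., 5.])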

class PyEGGP(BaseEstimator, RegressorMixin):
    """ Builds a symbolic regression model using eggp.

    Parameters
    ----------
    gen : int, default=100
        The number of generations.

    nPop : int, default=100
        Population size.

    maxSize : int, default=15
        Maximum allowed size for the expression.
        This should not be larger than 100 as the e-graph may grow
        too large.

    nTournament : int, default=3
        Tournament size. During parent selection it will
        pick `nTournament` expressions at random and
        return the best among them.

    pc : float, default=0.9
        Probability of performing the crossover operator.

    pm : float, default=0.3
        Probability of performing the mutation operator.

    nonterminals : str, default="add,sub,mul,div"
        String with a comma-separated list of nonterminals.
        These are the functions allowed during the search.
        Available functions: add,sub,mul,div,power,powerabs,aq,abs,sin,cos,
                             tan,sinh,cosh,tanh,asin,acos,atan,asinh,acosh,
                             atanh,sqrt,sqrtabs,cbrt,square,log,logabs,exp,
                             recip,cube.
        Where `aq` is the analytical quotient (x/sqrt(1 + y^2)),
        `powerabs` is the protected power (x^|y|),
        `sqrtabs` is the protected sqrt (sqrt(|x|)),
        `logabs` is the protected log (log(|x|)),
        `recip` is the reciprocal (1/x), and
        `cbrt` is the cubic root.

    loss : {"MSE", "Gaussian", "Bernoulli", "Poisson"}, default="MSE"
        Loss function used to evaluate the expressions:
        - MSE (mean squared error) should be used for regression problems.
        - Gaussian likelihood should be used for regression problems when you want to
          fit the error term.
        - Bernoulli likelihood should be used for classification problems.
        - Poisson likelihood should be used when the data distribution follows a Poisson.

    optIter : int, default=50
        Number of iterations for the parameter optimization.

    optRepeat : int, default=2
        Number of restarts for the parameter optimization.

    nParams : int, default=-1
        Maximum number of parameters. If set to -1 it will
        allow the expression to have any number of parameters.
        If set to a number > 0, it will limit the number of parameters,
        but allow each parameter to appear multiple times in the expression.
        E.g., t0 * x0 + exp(t0*x0 + t1)

    split : int, default=1
        How to split the data to create the validation set.
        If set to 1, it will use the whole data for fitting the parameters and
        calculating the fitness function.
        If set to n>1, it will use 1/n of the data for calculating the fitness function
        and the remainder for fitting the parameters.

    simplify : bool, default=False
        Whether to apply a final step of equality saturation to simplify the expressions.

    dumpTo : str, default=""
        If not empty, it will save the final e-graph into the given file.

    loadFrom : str, default=""
        If not empty, it will load an e-graph and resume the search.
        The user must ensure that the loaded e-graph is from the same
        dataset and loss function.

    Examples
    --------
    >>> from pyeggp import PyEGGP
    >>> import numpy as np
    >>> X = np.arange(100).reshape(100, 1)
    >>> y = np.zeros((100, ))
    >>> estimator = PyEGGP()
    >>> estimator.fit(X, y)
    >>>
    >>> estimator = PyEGGP(loss="Bernoulli")
    >>> estimator.fit(X, y)
    """
    def __init__(self, gen=100, nPop=100, maxSize=15, nTournament=3, pc=0.9, pm=0.3, nonterminals="add,sub,mul,div", loss="MSE", optIter=50, optRepeat=2, nParams=-1, split=1, simplify=False, dumpTo="", loadFrom=""):
        nts = ("add,sub,mul,div,power,powerabs,"
               "aq,abs,sin,cos,tan,sinh,cosh,tanh,"
               "asin,acos,atan,asinh,acosh,atanh,sqrt,"
               "sqrtabs,cbrt,square,log,logabs,exp,recip,cube")
        losses = ["MSE", "Gaussian", "Bernoulli", "Poisson"]
        if gen < 1:
            raise ValueError('gen should be at least 1')
        if nPop < 1:
            raise ValueError('nPop should be at least 1')
        if maxSize < 1 or maxSize > 100:
            raise ValueError('maxSize should be a value between 1 and 100')
        if nTournament < 1 or nTournament > nPop:
            raise ValueError('nTournament should be a value between 1 and nPop')
        if pc < 0 or pc > 1:
            raise ValueError('pc should be between 0 and 1')
        if pm < 0 or pm > 1:
            raise ValueError('pm should be between 0 and 1')
        # validate the comma-separated tokens, not individual characters
        if any(t not in nts.split(",") for t in nonterminals.split(",")):
            raise ValueError(f'nonterminals must be a comma separated list of one or more of {nts}')
        if loss not in losses:
            raise ValueError(f'loss must be one of {losses}')
        if optIter < 0:
            raise ValueError('optIter must be a non-negative number')
        if optRepeat < 0:
            raise ValueError('optRepeat must be a non-negative number')
        if nParams < -1:
            raise ValueError('nParams must be either -1 or a non-negative number')
        if split < 1:
            raise ValueError('split must be equal to or greater than 1')
        if not isinstance(simplify, bool):
            raise TypeError('simplify must be a boolean')
        self.gen = gen
        self.nPop = nPop
        self.maxSize = maxSize
        self.nTournament = nTournament
        self.pc = pc
        self.pm = pm
        self.nonterminals = nonterminals
        self.loss = loss
        self.optIter = optIter
        self.optRepeat = optRepeat
        self.nParams = nParams
        self.split = split
        self.simplify = int(simplify)
        self.dumpTo = dumpTo
        self.loadFrom = loadFrom
        self.is_fitted_ = False

    def fit(self, X, y):
        ''' Fits the regression model.

        Parameters
        ----------
        X : np.array
            An m x n np.array describing m observations of n features.
        y : np.array
            An np.array of size m with the measured target values.
        '''
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        y = y.reshape(-1, 1)
        combined = np.hstack([X, y])
        header = [f"x{i}" for i in range(X.shape[1])] + ["y"]
        with tempfile.NamedTemporaryFile(mode='w+', newline='', delete=False, suffix='.csv') as temp_file:
            writer = csv.writer(temp_file)
            writer.writerow(header)
            writer.writerows(combined)
            dataset = temp_file.name

        csv_data = pyeggp_run(dataset, self.gen, self.nPop, self.maxSize, self.nTournament, self.pc, self.pm, self.nonterminals, self.loss, self.optIter, self.optRepeat, self.nParams, self.split, self.simplify, self.dumpTo, self.loadFrom)
        if len(csv_data) > 0:
            csv_io = StringIO(csv_data.strip())
            self.results = pd.read_csv(csv_io, header=0)
            self.is_fitted_ = True
        return self

    def fit_mvsr(self, Xs, ys):
        ''' Fits a multi-view regression model.

        Parameters
        ----------
        Xs : list(np.array)
            A list with k elements of m_k x n np.arrays describing m_k observations of n features.
        ys : list(np.array)
            A list of k elements of np.arrays of size m_k with the measured target values.
        '''
        if Xs[0].ndim == 1:
            Xs = [X.reshape(-1, 1) for X in Xs]
        ys = [y.reshape(-1, 1) for y in ys]
        combineds = [np.hstack([X, y]) for X, y in zip(Xs, ys)]
        header = [f"x{i}" for i in range(Xs[0].shape[1])] + ["y"]
        datasets = []
        for combined in combineds:
            with tempfile.NamedTemporaryFile(mode='w+', newline='', delete=False, suffix='.csv') as temp_file:
                writer = csv.writer(temp_file)
                writer.writerow(header)
                writer.writerows(combined)
                datasets.append(temp_file.name)

        csv_data = pyeggp_run(" ".join(datasets), self.gen, self.nPop, self.maxSize, self.nTournament, self.pc, self.pm, self.nonterminals, self.loss, self.optIter, self.optRepeat, self.nParams, self.split, self.simplify, self.dumpTo, self.loadFrom)
        if len(csv_data) > 0:
            csv_io = StringIO(csv_data.strip())
            self.results = pd.read_csv(csv_io, header=0, dtype={'theta': str})
            self.is_fitted_ = True
        return self

    def predict(self, X):
        ''' Generates the prediction using the best model (selected by accuracy).

        Parameters
        ----------
        X : np.array
            An m x n np.array describing m observations of n features.
            This array must have the same number of features as the training data.

        Return
        ------
        y : np.array
            A vector of predictions.

        A table with the fitted models and additional information
        will be stored as a Pandas dataframe in self.results.
        '''
        check_is_fitted(self)
        return self.evaluate_best_model(X)

    def predict_mvsr(self, X, view):
        ''' Generates the prediction using the best model (selected by accuracy)
        of the specified `view`.

        Parameters
        ----------
        X : np.array
            An m x n np.array describing m observations of n features.
            This array must have the same number of features as the training data.

        view : int
            The index of the view (starting at 0).

        Return
        ------
        y : np.array
            A vector of predictions.
        '''
        check_is_fitted(self)
        return self.evaluate_best_model_view(X, view)

    def evaluate_best_model(self, x):
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        t = np.array(list(map(float, self.results.iloc[-1].theta.split(";"))))
        y = eval(self.results.iloc[-1].Numpy)
        if self.loss == "Bernoulli":
            return 1/(1 + np.exp(-y))
        elif self.loss == "Poisson":
            return np.exp(y)
        return y

    def evaluate_best_model_view(self, x, view):
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        ix = self.results.iloc[-1].id
        best = self.results[self.results.id == ix].iloc[view]
        t = np.array(list(map(float, best.theta.split(";"))))
        y = eval(best.Numpy)
        if self.loss == "Bernoulli":
            return 1/(1 + np.exp(-y))
        elif self.loss == "Poisson":
            return np.exp(y)
        return y

    def evaluate_model_view(self, x, ix, view):
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        best = self.results[self.results.id == ix].iloc[view]
        t = np.array(list(map(float, best.theta.split(";"))))
        y = eval(best.Numpy)
        if self.loss == "Bernoulli":
            return 1/(1 + np.exp(-y))
        elif self.loss == "Poisson":
            return np.exp(y)
        return y

    def evaluate_model(self, ix, x):
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        # select the row of the requested model
        t = np.array(list(map(float, self.results.iloc[ix].theta.split(";"))))
        y = eval(self.results.iloc[ix].Numpy)
        if self.loss == "Bernoulli":
            return 1/(1 + np.exp(-y))
        elif self.loss == "Poisson":
            return np.exp(y)
        return y

    def score(self, X, y):
        ''' Calculates the score (single-view only).
        '''
        ypred = self.evaluate_best_model(X)
        return r2_score(y, ypred)

    def get_model(self, idx):
        ''' Get a `model` function and its visual representation. '''
        alphabet = list(string.ascii_uppercase)
        row = self.results[self.results['id'] == idx].iloc[0]
        visual_expression = row['Numpy']
        model = make_function(visual_expression, self.loss)
        n_params_used = len(row['theta'].split(sep=';'))

        # Works for solutions with fewer than 26 parameters
        for i in range(n_params_used):
            visual_expression = visual_expression.replace(f't[{i}]', alphabet[i])

        # Works for data with fewer than 50 dimensions
        for i in range(50):
            visual_expression = visual_expression.replace(f'x[:, {i}]', f'X{i}')

        return model, visual_expression
{pyeggp-1.0.3 → pyeggp-1.0.4}/src/pyeggp.egg-info/PKG-INFO

Same changes as PKG-INFO above: the version is bumped from 1.0.3 to 1.0.4 and the new README is appended to the long description.
{pyeggp-1.0.3 → pyeggp-1.0.4}/test/test_pyeggp.py

@@ -15,7 +15,7 @@ Z = df.values
 X = Z[:,:-1]
 y = Z[:,-1]
 
-reg = PyEGGP(100, 100, 10, 3, 0.9, 0.3, "add,sub,mul,div,log", "MSE", 50, 2, -1, 3,
+reg = PyEGGP(100, 100, 10, 3, 0.9, 0.3, "add,sub,mul,div,log", "MSE", 50, 2, -1, 3, True, "", "")
 reg.fit(X, y)
 print(reg.score(X, y))
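For reference, the positional arguments in the updated call map onto the constructor's keyword parameters as follows (equivalent sketch):

```python
reg = PyEGGP(gen=100, nPop=100, maxSize=10, nTournament=3, pc=0.9, pm=0.3,
             nonterminals="add,sub,mul,div,log", loss="MSE", optIter=50,
             optRepeat=2, nParams=-1, split=3, simplify=True, dumpTo="", loadFrom="")
```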
pyeggp-1.0.3/README.md
DELETED
@@ -1,13 +0,0 @@

# pyeggp - Python e-graph GP

Python bindings for [eggp](https://github.com/folivetti/srtree/blob/main/apps/eggp/README.md).

eggp (e-graph genetic programming) follows the same structure as traditional GP: the initial population is created using ramped half-and-half, respecting maximum size and maximum depth parameters; then, for a number of generations, it chooses two parents by tournament selection and applies subtree crossover with probability $pc$ followed by subtree mutation with probability $pm$, with the offspring replacing the current population according to a dominance criterion.

How to install the package:

```bash
pip install pyeggp
```

The bindings were created following the amazing example written by [wenkokke](https://github.com/wenkokke/example-haskell-wheel)
pyeggp-1.0.3/src/pyeggp/__init__.py
DELETED
@@ -1,176 +0,0 @@

[The module-level FFI plumbing — the imports, hs_rts_exit/hs_rts_init, version(), main(), and pyeggp_run() — was identical to the 1.0.4 file above, except for VERSION: str = "1.3.0". The rest of the removed file:]

def make_function(expression):
    def func(x, t):
        return eval(expression)
    return func

class PyEGGP(BaseEstimator, RegressorMixin):
    def __init__(self, gen=100, nPop=100, maxSize=15, nTournament=3, pc=0.9, pm=0.3, nonterminals="add,sub,mul,div", loss="MSE", optIter=50, optRepeat=2, nParams=-1, split=1, simplify=False, dumpTo="", loadFrom=""):
        self.gen = gen
        self.nPop = nPop
        self.maxSize = maxSize
        self.nTournament = nTournament
        self.pc = pc
        self.pm = pm
        self.nonterminals = nonterminals
        self.loss = loss
        self.optIter = optIter
        self.optRepeat = optRepeat
        self.nParams = nParams
        self.split = split
        self.simplify = int(simplify)
        self.dumpTo = dumpTo
        self.loadFrom = loadFrom
        self.is_fitted_ = False

    def fit(self, X, y):
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        y = y.reshape(-1, 1)
        combined = np.hstack([X, y])
        header = [f"x{i}" for i in range(X.shape[1])] + ["y"]
        with tempfile.NamedTemporaryFile(mode='w+', newline='', delete=False, suffix='.csv') as temp_file:
            writer = csv.writer(temp_file)
            writer.writerow(header)
            writer.writerows(combined)
            dataset = temp_file.name

        csv_data = pyeggp_run(dataset, self.gen, self.nPop, self.maxSize, self.nTournament, self.pc, self.pm, self.nonterminals, self.loss, self.optIter, self.optRepeat, self.nParams, self.split, self.simplify, self.dumpTo, self.loadFrom)
        if len(csv_data) > 0:
            csv_io = StringIO(csv_data.strip())
            self.results = pd.read_csv(csv_io, header=0)
            self.is_fitted_ = True
        return self

    def fit_mvsr(self, Xs, ys):
        if Xs[0].ndim == 1:
            Xs = [X.reshape(-1, 1) for X in Xs]
        ys = [y.reshape(-1, 1) for y in ys]
        combineds = [np.hstack([X, y]) for X, y in zip(Xs, ys)]
        header = [f"x{i}" for i in range(Xs[0].shape[1])] + ["y"]
        datasets = []
        for combined in combineds:
            with tempfile.NamedTemporaryFile(mode='w+', newline='', delete=False, suffix='.csv') as temp_file:
                writer = csv.writer(temp_file)
                writer.writerow(header)
                writer.writerows(combined)
                datasets.append(temp_file.name)

        csv_data = pyeggp_run(" ".join(datasets), self.gen, self.nPop, self.maxSize, self.nTournament, self.pc, self.pm, self.nonterminals, self.loss, self.optIter, self.optRepeat, self.nParams, self.split, self.simplify, self.dumpTo, self.loadFrom)
        if len(csv_data) > 0:
            csv_io = StringIO(csv_data.strip())
            self.results = pd.read_csv(csv_io, header=0, dtype={'theta': str})
            self.is_fitted_ = True
        return self

    def predict(self, X):
        check_is_fitted(self)
        return self.evaluate_best_model(X)

    def predict_mvsr(self, X, view):
        check_is_fitted(self)
        return self.evaluate_best_model_view(X, view)

    def evaluate_best_model(self, x):
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        t = np.array(list(map(float, self.results.iloc[-1].theta.split(";"))))
        return eval(self.results.iloc[-1].Numpy)

    def evaluate_best_model_view(self, x, view):
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        ix = self.results.iloc[-1].id
        best = self.results[self.results.id == ix].iloc[view]
        t = np.array(list(map(float, best.theta.split(";"))))
        return eval(best.Numpy)

    def evaluate_model_view(self, x, ix, view):
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        best = self.results[self.results.id == ix].iloc[view]
        t = np.array(list(map(float, best.theta.split(";"))))
        return eval(best.Numpy)

    def evaluate_model(self, ix, x):
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        t = np.array(list(map(float, self.results.iloc[-1].theta.split(";"))))
        return eval(self.results.iloc[i].Numpy)

    def score(self, X, y):
        ypred = self.evaluate_best_model(X)
        return r2_score(y, ypred)

    def get_model(self, idx):
        alphabet = list(string.ascii_uppercase)
        row = self.results[self.results['id'] == idx].iloc[0]
        visual_expression = row['Numpy']
        model = make_function(visual_expression)
        n_params_used = len(row['theta'].split(sep=';'))

        # Works for solutions with less than 26 parameters
        for i in range(n_params_used):
            visual_expression = visual_expression.replace(f't[{i}]', alphabet[i])

        # Works for data with less than 50 dimensions
        for i in range(50):
            visual_expression = visual_expression.replace(f'x[:, {i}]', f'X{i}')

        return model, visual_expression
All other files listed above (+0 -0) are unchanged.