gp-elite 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gp_elite-0.1.0/LICENSE +21 -0
- gp_elite-0.1.0/PKG-INFO +166 -0
- gp_elite-0.1.0/README.md +136 -0
- gp_elite-0.1.0/gp_elite/__init__.py +34 -0
- gp_elite-0.1.0/gp_elite/__main__.py +5 -0
- gp_elite-0.1.0/gp_elite/api.py +171 -0
- gp_elite-0.1.0/gp_elite/cli.py +32 -0
- gp_elite-0.1.0/gp_elite/core.py +8049 -0
- gp_elite-0.1.0/gp_elite/py.typed +0 -0
- gp_elite-0.1.0/gp_elite.egg-info/PKG-INFO +166 -0
- gp_elite-0.1.0/gp_elite.egg-info/SOURCES.txt +16 -0
- gp_elite-0.1.0/gp_elite.egg-info/dependency_links.txt +1 -0
- gp_elite-0.1.0/gp_elite.egg-info/entry_points.txt +2 -0
- gp_elite-0.1.0/gp_elite.egg-info/requires.txt +6 -0
- gp_elite-0.1.0/gp_elite.egg-info/top_level.txt +1 -0
- gp_elite-0.1.0/pyproject.toml +50 -0
- gp_elite-0.1.0/setup.cfg +4 -0
- gp_elite-0.1.0/tests/test_gp_elite.py +130 -0
gp_elite-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sabri Hakou
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
gp_elite-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gp-elite
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Régression symbolique par programmation génétique — lois interprétables sur données expérimentales (pur Python/NumPy)
|
|
5
|
+
Author: Sabri Hakou
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ariel95500-create/gp-elite
|
|
8
|
+
Project-URL: Repository, https://github.com/ariel95500-create/gp-elite
|
|
9
|
+
Project-URL: Issues, https://github.com/ariel95500-create/gp-elite/issues
|
|
10
|
+
Keywords: symbolic-regression,genetic-programming,machine-learning,interpretable-ml,equation-discovery,scientific-computing
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: numpy>=1.21
|
|
25
|
+
Requires-Dist: pandas>=1.3
|
|
26
|
+
Requires-Dist: scikit-learn>=1.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# GP_ELITE
|
|
32
|
+
|
|
33
|
+
**Genetic-programming symbolic regression — discover interpretable laws from your experimental data.**
|
|
34
|
+
|
|
35
|
+
*[🇫🇷 Version française](README.fr.md)*
|
|
36
|
+
|
|
37
|
+
GP_ELITE searches for a **mathematical formula** linking your variables to a target, instead of a black box. It is built for small experimental datasets (≤10 variables, 100–5000 points) where you want to *understand* the relationship: degradation laws, sensor calibration, engineering correlations, dose–response curves, physical laws.
|
|
38
|
+
|
|
39
|
+
Pure **Python / NumPy** — no Julia, no compilation, no GPU. `pip install` and you're ready.
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from gp_elite import symbolic_regression
|
|
43
|
+
|
|
44
|
+
result = symbolic_regression(X, y, feature_names=["cycle", "temperature", "current"])
|
|
45
|
+
print(result.expression) # capacity_SOH = 0.913 - 0.352·tanh(...)
|
|
46
|
+
print(result.r2_validation) # 0.996 (on data never seen during training)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Why GP_ELITE?
|
|
52
|
+
|
|
53
|
+
| | GP_ELITE | Neural networks | PySR (state of the art) |
|
|
54
|
+
|---|---|---|---|
|
|
55
|
+
| Output | **readable formula** | black box | readable formula |
|
|
56
|
+
| Installation | `pip install` (pure Python) | heavy | requires **Julia** |
|
|
57
|
+
| Overfitting guard | **built-in** (hold-out) | do it yourself | do it yourself |
|
|
58
|
+
| Variable selection | **importance report** | no | partial |
|
|
59
|
+
|
|
60
|
+
GP_ELITE's niche: **zero barrier to entry**. A lab engineer, a student, or a technician points at a CSV file and gets a validated law back — without becoming a developer.
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Installation
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
pip install gp-elite # from PyPI
|
|
68
|
+
# or, from source:
|
|
69
|
+
git clone https://github.com/ariel95500-create/gp-elite
|
|
70
|
+
cd gp-elite && pip install -e .
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Dependencies: `numpy`, `pandas`, `scikit-learn`.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Usage
|
|
78
|
+
|
|
79
|
+
### One line, on your own data (console UI)
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
gp-elite
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Choose mode **6 (generic CSV)**, point to your file, and keep the defaults. GP_ELITE detects the columns, holds out a validation set, evolves, and prints the discovered law with its generalization report.
|
|
86
|
+
|
|
87
|
+
### Programmatically (notebooks, pipelines)
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
import numpy as np
|
|
91
|
+
from gp_elite import symbolic_regression
|
|
92
|
+
|
|
93
|
+
X = np.random.uniform(1, 5, (200, 2))
|
|
94
|
+
y = 2.0 + 3.0 * np.sqrt(X[:, 0]) - 0.5 * X[:, 1]
|
|
95
|
+
|
|
96
|
+
result = symbolic_regression(
|
|
97
|
+
X, y,
|
|
98
|
+
feature_names=["a", "b"],
|
|
99
|
+
operators="physical", # 'physical' | 'trig' | 'full' | 'poly'
|
|
100
|
+
generations=60,
|
|
101
|
+
speed="fast", # 'ultrafast' | 'fast' | 'normal'
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
print(result.expression) # e.g. 2.0 + 3.0·sqrt(a) - 0.5·b
|
|
105
|
+
print(result.r2_validation) # quality on the hold-out set
|
|
106
|
+
print(result.size) # node count (readability)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Full example: battery degradation (NASA data)
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
python examples/battery_soh.py
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
From 168 real charge cycles, GP_ELITE discovers a state-of-health (SOH) law:
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
capacity_SOH ≈ 0.913 − 0.352 · tanh( cycle^((temperature/cycle)^0.485) )
|
|
121
|
+
|
|
122
|
+
R² validation = 0.996 (on cycles never seen) 12 nodes
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
A saturating degradation with cycle count, modulated by temperature — physically plausible, and **certified on unseen data**.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## What is GP_ELITE good (and less good) at?
|
|
130
|
+
|
|
131
|
+
**Good at**: physical / engineering laws with multiplicative or exponential structure, modest-size noisy experimental data, problems where interpretability matters most.
|
|
132
|
+
|
|
133
|
+
On a representative subset of the **Feynman Symbolic Regression Benchmark** (16 physics equations), in `fast` mode (~15 s/equation): **81% of equations solved at R² > 0.999**, mean R² 0.993.
|
|
134
|
+
|
|
135
|
+
**Less good at**: chaotic sequences (e.g. Collatz flight time — deterministic, but with an erratic, effectively random-looking component), more than 15–20 variables (the search space explodes), large datasets where raw accuracy outweighs interpretability (ensemble models dominate there).
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Technical features
|
|
140
|
+
|
|
141
|
+
- **Asymmetric island model** (explorer / cleaner / stigmergic) with periodic migration
|
|
142
|
+
- **Linear scaling** (Keijzer 2003): the engine searches for the *shape*; scale and offset coefficients are solved in closed form
|
|
143
|
+
- **ε-lexicase selection** (La Cava 2016) to preserve behavioral diversity
|
|
144
|
+
- **Island parallelism** (multi-core) — ≈ ×3 measured on 4 cores
|
|
145
|
+
- **Hold-out validation** + parsimonious champion selection (R² tolerance): built-in overfitting guard
|
|
146
|
+
- **Shift-free normalization** preserving multiplicative structure (x·y stays a clean product)
|
|
147
|
+
- **Transferable stigmergic memory** across runs (grammar export/import)
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Tests
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
pip install pytest
|
|
155
|
+
pytest -q
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT — see [LICENSE](LICENSE). Free to use, including commercially, with retention of the copyright notice.
|
|
163
|
+
|
|
164
|
+
## Citing GP_ELITE
|
|
165
|
+
|
|
166
|
+
If GP_ELITE is useful in academic work, see [CITATION.cff](CITATION.cff).
|
gp_elite-0.1.0/README.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# GP_ELITE
|
|
2
|
+
|
|
3
|
+
**Genetic-programming symbolic regression — discover interpretable laws from your experimental data.**
|
|
4
|
+
|
|
5
|
+
*[🇫🇷 Version française](README.fr.md)*
|
|
6
|
+
|
|
7
|
+
GP_ELITE searches for a **mathematical formula** linking your variables to a target, instead of a black box. It is built for small experimental datasets (≤10 variables, 100–5000 points) where you want to *understand* the relationship: degradation laws, sensor calibration, engineering correlations, dose–response curves, physical laws.
|
|
8
|
+
|
|
9
|
+
Pure **Python / NumPy** — no Julia, no compilation, no GPU. `pip install` and you're ready.
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from gp_elite import symbolic_regression
|
|
13
|
+
|
|
14
|
+
result = symbolic_regression(X, y, feature_names=["cycle", "temperature", "current"])
|
|
15
|
+
print(result.expression) # capacity_SOH = 0.913 - 0.352·tanh(...)
|
|
16
|
+
print(result.r2_validation) # 0.996 (on data never seen during training)
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Why GP_ELITE?
|
|
22
|
+
|
|
23
|
+
| | GP_ELITE | Neural networks | PySR (state of the art) |
|
|
24
|
+
|---|---|---|---|
|
|
25
|
+
| Output | **readable formula** | black box | readable formula |
|
|
26
|
+
| Installation | `pip install` (pure Python) | heavy | requires **Julia** |
|
|
27
|
+
| Overfitting guard | **built-in** (hold-out) | do it yourself | do it yourself |
|
|
28
|
+
| Variable selection | **importance report** | no | partial |
|
|
29
|
+
|
|
30
|
+
GP_ELITE's niche: **zero barrier to entry**. A lab engineer, a student, or a technician points at a CSV file and gets a validated law back — without becoming a developer.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install gp-elite # from PyPI
|
|
38
|
+
# or, from source:
|
|
39
|
+
git clone https://github.com/ariel95500-create/gp-elite
|
|
40
|
+
cd gp-elite && pip install -e .
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Dependencies: `numpy`, `pandas`, `scikit-learn`.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Usage
|
|
48
|
+
|
|
49
|
+
### One line, on your own data (console UI)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
gp-elite
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Choose mode **6 (generic CSV)**, point to your file, and keep the defaults. GP_ELITE detects the columns, holds out a validation set, evolves, and prints the discovered law with its generalization report.
|
|
56
|
+
|
|
57
|
+
### Programmatically (notebooks, pipelines)
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import numpy as np
|
|
61
|
+
from gp_elite import symbolic_regression
|
|
62
|
+
|
|
63
|
+
X = np.random.uniform(1, 5, (200, 2))
|
|
64
|
+
y = 2.0 + 3.0 * np.sqrt(X[:, 0]) - 0.5 * X[:, 1]
|
|
65
|
+
|
|
66
|
+
result = symbolic_regression(
|
|
67
|
+
X, y,
|
|
68
|
+
feature_names=["a", "b"],
|
|
69
|
+
operators="physical", # 'physical' | 'trig' | 'full' | 'poly'
|
|
70
|
+
generations=60,
|
|
71
|
+
speed="fast", # 'ultrafast' | 'fast' | 'normal'
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
print(result.expression) # e.g. 2.0 + 3.0·sqrt(a) - 0.5·b
|
|
75
|
+
print(result.r2_validation) # quality on the hold-out set
|
|
76
|
+
print(result.size) # node count (readability)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Full example: battery degradation (NASA data)
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
python examples/battery_soh.py
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
From 168 real charge cycles, GP_ELITE discovers a state-of-health (SOH) law:
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
capacity_SOH ≈ 0.913 − 0.352 · tanh( cycle^((temperature/cycle)^0.485) )
|
|
91
|
+
|
|
92
|
+
R² validation = 0.996 (on cycles never seen) 12 nodes
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
A saturating degradation with cycle count, modulated by temperature — physically plausible, and **certified on unseen data**.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## What is GP_ELITE good (and less good) at?
|
|
100
|
+
|
|
101
|
+
**Good at**: physical / engineering laws with multiplicative or exponential structure, modest-size noisy experimental data, problems where interpretability matters most.
|
|
102
|
+
|
|
103
|
+
On a representative subset of the **Feynman Symbolic Regression Benchmark** (16 physics equations), in `fast` mode (~15 s/equation): **81% of equations solved at R² > 0.999**, mean R² 0.993.
|
|
104
|
+
|
|
105
|
+
**Less good at**: chaotic sequences (e.g. Collatz flight time — deterministic, but with an erratic, effectively random-looking component), more than 15–20 variables (the search space explodes), large datasets where raw accuracy outweighs interpretability (ensemble models dominate there).
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## Technical features
|
|
110
|
+
|
|
111
|
+
- **Asymmetric island model** (explorer / cleaner / stigmergic) with periodic migration
|
|
112
|
+
- **Linear scaling** (Keijzer 2003): the engine searches for the *shape*; scale and offset coefficients are solved in closed form
|
|
113
|
+
- **ε-lexicase selection** (La Cava 2016) to preserve behavioral diversity
|
|
114
|
+
- **Island parallelism** (multi-core) — ≈ ×3 measured on 4 cores
|
|
115
|
+
- **Hold-out validation** + parsimonious champion selection (R² tolerance): built-in overfitting guard
|
|
116
|
+
- **Shift-free normalization** preserving multiplicative structure (x·y stays a clean product)
|
|
117
|
+
- **Transferable stigmergic memory** across runs (grammar export/import)
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## Tests
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
pip install pytest
|
|
125
|
+
pytest -q
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## License
|
|
131
|
+
|
|
132
|
+
MIT — see [LICENSE](LICENSE). Free to use, including commercially, with retention of the copyright notice.
|
|
133
|
+
|
|
134
|
+
## Citing GP_ELITE
|
|
135
|
+
|
|
136
|
+
If GP_ELITE is useful in academic work, see [CITATION.cff](CITATION.cff).
|
|
gp_elite-0.1.0/gp_elite/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GP_ELITE — Régression symbolique par programmation génétique (1-D, N-D, CSV).
|
|
3
|
+
|
|
4
|
+
Découverte de lois empiriques interprétables sur petits jeux de données
|
|
5
|
+
expérimentaux : ≤10 variables, 100-5000 points, où l'on veut une FORMULE
|
|
6
|
+
plutôt qu'une boîte noire. Pur Python/NumPy, sans dépendance exotique.
|
|
7
|
+
|
|
8
|
+
Caractéristiques principales
|
|
9
|
+
----------------------------
|
|
10
|
+
- Modèle en îles asymétriques (explorer / cleaner / stigmergic) avec migration
|
|
11
|
+
- Linear scaling (Keijzer) + sélection ε-lexicase
|
|
12
|
+
- Parallélisme des îles (ProcessPoolExecutor, multi-cœurs)
|
|
13
|
+
- Validation hold-out + sélection parcimonieuse du champion (tolérance R²)
|
|
14
|
+
- Normalisation shift-free préservant la structure multiplicative
|
|
15
|
+
- Mode CSV générique : pointez un fichier, obtenez une loi validée
|
|
16
|
+
- Mémoire stigmergique transférable entre exécutions
|
|
17
|
+
|
|
18
|
+
Exemple minimal
|
|
19
|
+
---------------
|
|
20
|
+
>>> import numpy as np
|
|
21
|
+
>>> from gp_elite import symbolic_regression
|
|
22
|
+
>>> X = np.random.uniform(1, 5, (200, 2))
|
|
23
|
+
>>> y = 2.0 + 3.0 * np.sqrt(X[:, 0]) - 0.5 * X[:, 1]
|
|
24
|
+
>>> result = symbolic_regression(X, y, feature_names=["a", "b"], generations=40)
|
|
25
|
+
>>> print(result.expression) # ex : 2.0 + 3.0*sqrt(a) - 0.5*b
|
|
26
|
+
>>> print(result.r2_validation)
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from .api import symbolic_regression, SRResult
|
|
30
|
+
from . import core
|
|
31
|
+
|
|
32
|
+
__version__ = "0.1.0"
|
|
33
|
+
|
|
34
|
+
__all__ = ["symbolic_regression", "SRResult", "core", "__version__"]
|
|
gp_elite-0.1.0/gp_elite/api.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""
|
|
2
|
+
API haut niveau de GP_ELITE.
|
|
3
|
+
|
|
4
|
+
Expose une fonction unique `symbolic_regression(X, y, ...)` qui encapsule le
|
|
5
|
+
moteur d'évolution et retourne un objet résultat propre. Pensé pour l'usage
|
|
6
|
+
programmatique (notebooks, pipelines) ; le menu interactif reste accessible
|
|
7
|
+
via la CLI `gp-elite` ou `python -m gp_elite`.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import io
|
|
12
|
+
import random
|
|
13
|
+
import contextlib
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import Optional, Sequence
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
|
|
19
|
+
from . import core
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class SRResult:
    """Outcome of a symbolic-regression run.

    Attributes
    ----------
    expression : str
        The formula that was found, as text (uses the column names when
        they were supplied).
    r2_validation : float or None
        R² on the hold-out set (None when validation is disabled).
    mse_validation : float or None
        MSE on the hold-out set.
    mse_train : float
        MSE on the training data.
    size : int
        Number of nodes in the expression tree.
    depth : int
        Depth of the expression tree.
    feature_names : list
        Names of the input variables.
    node : core.Node
        The raw expression tree (for evaluation / inspection).

    Methods
    -------
    predict(X_norm) -> ndarray
        Evaluate the formula on already-normalised features.
    """

    expression: str
    r2_validation: Optional[float]
    mse_validation: Optional[float]
    mse_train: float
    size: int
    depth: int
    feature_names: list
    node: "core.Node"

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Evaluate the formula on ALREADY-normalised features.

        ``X`` must be on the same scales as the data used for the fit; no
        scaling is applied here, the values are only converted to a float
        array before being handed to the engine.
        """
        features = np.asarray(X, dtype=float)
        return core.evaluate_vector(self.node, features)

    def __str__(self) -> str:
        """Return a compact one-line summary (expression, validation R², size)."""
        if self.r2_validation is None:
            r2_text = "n/a"
        else:
            r2_text = f"{self.r2_validation:.4f}"
        return f"SRResult(expr='{self.expression}', R²_val={r2_text}, size={self.size})"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def symbolic_regression(
    X: np.ndarray,
    y: np.ndarray,
    feature_names: Optional[Sequence[str]] = None,
    *,
    operators: str = "physical",
    normalize: str = "auto",
    generations: int = 100,
    speed: str = "fast",
    parallel: Optional[bool] = None,
    validation_split: float = 0.20,
    seed: Optional[int] = None,
    verbose: bool = False,
) -> SRResult:
    """Find a symbolic expression linking X to y.

    Parameters
    ----------
    X : ndarray (n_samples, n_features)
        Explanatory variables (raw values).
    y : ndarray (n_samples,)
        Target.
    feature_names : sequence of str, optional
        Column names (default ``X0, X1, …``); they appear in the formula.
    operators : {'physical', 'trig', 'full', 'poly'}
        Operator pool. An unknown name falls back to ``'physical'`` with a
        warning.
    normalize : {'auto', 'divmax', 'minmax', 'standard'}
        Feature scaling; ``'auto'`` is documented as shift-free when all
        features are positive.
    generations : int
        Number of generations.
    speed : {'ultrafast', 'fast', 'normal'}
        Population size / number of islands. Any other value is treated as
        ``'normal'`` with a warning.
    parallel : bool, optional
        True forces multi-processing, False disables it, None leaves the
        engine's automatic choice (documented as "≥4 cores").
    validation_split : float
        Hold-out fraction in ``[0, 1)``; ``0.0`` disables validation.
    seed : int, optional
        Reproducibility seed (seeds the global ``random`` and ``numpy`` RNGs).
    verbose : bool
        True shows the engine's detailed logs; otherwise stdout is muted
        while the engine runs.

    Returns
    -------
    SRResult

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, ``y`` is not 1-D with ``len(X)`` entries,
        ``feature_names`` does not match the number of columns, or
        ``validation_split`` is outside ``[0, 1)``.
    """
    import warnings

    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    if X.ndim != 2:
        raise ValueError(f"X doit être 2-D (n_samples, n_features), reçu {X.shape}")
    if y.ndim != 1 or len(y) != len(X):
        raise ValueError("y doit être 1-D de même longueur que X")
    n_feat = X.shape[1]

    if feature_names is None:
        feature_names = [f"X{i}" for i in range(n_feat)]
    feature_names = list(feature_names)
    if len(feature_names) != n_feat:
        raise ValueError("feature_names doit avoir une entrée par colonne de X")

    # Reject impossible hold-out fractions up front (NaN also fails this test).
    validation_split = float(validation_split)
    if not 0.0 <= validation_split < 1.0:
        raise ValueError(
            f"validation_split doit être dans [0, 1), reçu {validation_split!r}"
        )

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # -- Normalisation (reuses the engine's scaler selector) --
    # NOTE(review): (-2.0, 2.0) presumably is the target range of the scaled
    # features (it matches x_min/x_max below) - confirm against core.
    scaler, _ = core._choose_scaler(X, normalize, (-2.0, 2.0))
    X_scaled = scaler.fit_transform(X)

    # -- Operator pool --
    pool = (operators or "physical").lower()
    if pool not in core._GENCSV_POOLS:
        warnings.warn(
            f"operators={operators!r} inconnu ; repli sur 'physical'",
            stacklevel=2,
        )
        pool = "physical"
    b_ops, b_w, u_ops, u_w = core._GENCSV_POOLS[pool]

    fast = speed == "fast"
    ultrafast = speed == "ultrafast"
    if not (fast or ultrafast or speed == "normal"):
        warnings.warn(
            f"speed={speed!r} inconnu ; traité comme 'normal'",
            stacklevel=2,
        )

    def _placeholder(Xm):
        # Dummy target function: the real data is supplied through
        # X_override / y_override.
        return np.zeros(Xm.shape[0])

    sink = contextlib.nullcontext() if verbose else contextlib.redirect_stdout(io.StringIO())
    try:
        # The engine is configured through module-level state. Everything
        # that flips it into "generic CSV" mode lives inside the try so the
        # finally clause resets the mode even if configuration fails.
        core._GENERIC_BINARY_OPS, core._GENERIC_BINARY_WEIGHTS = list(b_ops), list(b_w)
        core._GENERIC_UNARY_OPS, core._GENERIC_UNARY_WEIGHTS = list(u_ops), list(u_w)
        core._GENERIC_CSV_MODE = True
        core.CSV_FEATURE_NAMES = list(feature_names)
        core.CSV_TARGET_NAME = "y"

        cfg = core.make_cfg_nd(n_features=n_feat, x_min=-2.0, x_max=2.0,
                               fast=fast, ultrafast=ultrafast, use_seeding=False,
                               use_lib=True, use_cograph=True, use_seqmem=True)
        cfg.GENERATIONS = int(generations)
        cfg.N_POINTS = len(y)
        cfg.VALIDATION_SPLIT = validation_split
        if parallel is not None:
            cfg.PARALLEL_ISLANDS = bool(parallel)

        with sink:
            best, X_full, y_full = core.evolve(
                _placeholder, cfg, problem_key="GENERIC_CSV",
                X_override=X_scaled, y_override=y)
            # Capture the expression WHILE CSV mode is active so that
            # to_string substitutes the column names (a, b, …) for X[i].
            expression = core.to_string(best)
    finally:
        core._GENERIC_CSV_MODE = False

    # -- Metrics --
    mse_tr = core._pure_mse(best, X_full, y_full)
    r2_val = mse_val = None
    # The hold-out arrays are module-level state in core. Whether a run with
    # validation disabled clears them is not visible from here, so they are
    # only read when validation was requested; this avoids reporting metrics
    # from a previous run's hold-out.
    if validation_split > 0.0:
        val_xs = getattr(core, "_VAL_XS", None)
        val_ys = getattr(core, "_VAL_YS", None)
        if val_xs is not None and val_ys is not None and len(val_ys) > 1:
            preds = core.evaluate_vector(best, val_xs)
            mse_val = float(np.mean((preds - val_ys) ** 2))
            var = float(np.var(val_ys))
            # R² is undefined for a constant hold-out target.
            r2_val = (1.0 - mse_val / var) if var > 1e-15 else float("nan")

    return SRResult(
        expression=expression,
        r2_validation=r2_val,
        mse_validation=mse_val,
        mse_train=float(mse_tr),
        size=core.tree_size(best),
        depth=core.tree_depth(best),
        feature_names=feature_names,
        node=best,
    )
|
|
gp_elite-0.1.0/gp_elite/cli.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Point d'entrée ligne de commande : `gp-elite` ou `python -m gp_elite`.
|
|
2
|
+
|
|
3
|
+
Lance le menu interactif du moteur (modes 1-D, N-D, Syracuse, CSV générique).
|
|
4
|
+
Le guard if __name__ est requis pour le parallélisme (multiprocessing spawn).
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import traceback
|
|
8
|
+
|
|
9
|
+
from . import core
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _report_fatal_error():
    """Print the fatal-error banner around the active exception's traceback."""
    rule = "=" * 70
    print("\n" + rule)
    print("ERREUR FATALE — traceback complet :")
    print(rule)
    traceback.print_exc()
    print(rule)


def main():
    """Run the engine's interactive menu and map failures to exit codes.

    Exits with status 130 on Ctrl-C. On any other exception it prints the
    full traceback, waits for Enter, and exits with status 1.
    """
    try:
        core._interactive_menu()
    except KeyboardInterrupt:
        print("\nInterrompu.")
        sys.exit(130)
    except Exception:
        _report_fatal_error()
        # Best-effort pause so the traceback can be read; a failure of the
        # prompt itself must not hide the exit status below.
        try:
            input("\nAppuyez sur Entrée pour quitter...")
        except Exception:
            pass
        sys.exit(1)


if __name__ == "__main__":
    main()
|