symbolfit 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Ho Fung Tsoi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.1
2
+ Name: symbolfit
3
+ Version: 0.0.1
4
+ Summary: Automatic parametric modeling with symbolic regression
5
+ Home-page: https://github.com/hftsoi/symbolfit
6
+ Author: Ho Fung Tsoi
7
+ Author-email: ho.fung.tsoi@cern.ch
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pysr==0.16.9
15
+ Requires-Dist: lmfit==1.2.2
16
+ Requires-Dist: pandas==2.2.3
17
+ Requires-Dist: matplotlib==3.8.2
18
+ Requires-Dist: seaborn==0.13.2
19
+
20
+ <p align="center">
21
+ <img src="https://raw.githubusercontent.com/hftsoi/symbolfit/main/docs/logo.png" width="300"/>
22
+ </p>
23
+
24
+ [![Documentation Status](https://readthedocs.org/projects/symbolfit/badge/?version=latest)](https://symbolfit.readthedocs.io)
25
+
26
+ An API to automate parametric modeling with symbolic regression, originally developed for data analysis in the experimental high-energy physics community, but also applicable beyond.
27
+
28
+ Symbolfit takes binned data with measurement/systematic uncertainties as input, utilizes [PySR](https://github.com/MilesCranmer/PySR) to perform a machine-search for batches of functional forms that model the data, parameterizes these functions, and utilizes [LMFIT](https://github.com/lmfit/lmfit-py) to re-optimize the functions and provide uncertainty estimation, all in one go.
29
+ It is designed to maximize automation with minimal human input. Each run produces a batch of functions with uncertainty estimation, which are evaluated, saved, and plotted automatically into readable output files, ready for downstream tasks.
30
+
31
+ - [Installation](#installation)
32
+ - [Getting Started](#getting-started)
33
+ - [Documentation](#documentation)
34
+ - [Citation](#citation)
35
+
36
+ ## Installation
37
+ **Prerequisite**
38
+
39
+ Install Julia (backend for PySR)
40
+ ```
41
+ curl -fsSL https://install.julialang.org | sh
42
+ ```
43
+ then check if installed properly
44
+ ```
45
+ julia --version
46
+ ```
47
+
48
+ **Installation via PyPI**
49
+
50
+ With Python>=3.9
51
+ ```
52
+ pip install symbolfit
53
+ ```
54
+ Upon first installation, run
55
+ ```
56
+ python3 -m pysr install
57
+ ```
58
+
59
+ ## Getting Started
60
+ To run an example fit (or ```python fit_example.py```):
61
+ ```
62
+ from symbolfit.symbolfit import *
63
+
64
+ dataset = importlib.import_module('examples.datasets.toy_dataset_1.dataset')
65
+ pysr_config = importlib.import_module('examples.pysr_configs.pysr_config_1')
66
+
67
+ model = SymbolFit(
68
+ x = dataset.x,
69
+ y = dataset.y,
70
+ y_up = dataset.y_up,
71
+ y_down = dataset.y_down,
72
+ pysr_config = pysr_config,
73
+ max_complexity = 60,
74
+ input_rescale = True,
75
+ scale_y_by = 'mean',
76
+ max_stderr = 40,
77
+ fit_y_unc = True,
78
+ random_seed = None,
79
+ loss_weights = None
80
+ )
81
+
82
+ model.fit()
83
+ ```
84
+ After the fit, save results to csv:
85
+ ```
86
+ model.save_to_csv(output_dir = 'output_dir/')
87
+ ```
88
+ and plot results to pdf:
89
+ ```
90
+ model.plot_to_pdf(
91
+ output_dir = 'output_dir/',
92
+ bin_widths_1d = dataset.bin_widths_1d,
93
+ #bin_edges_2d = dataset.bin_edges_2d,
94
+ plot_logy = False,
95
+ plot_logx = False
96
+ )
97
+ ```
98
+ Candidate functions with full substitutions can be printed in prompt:
99
+ ```
100
+ model.print_candidate(candidate_number = 10)
101
+ ```
102
+
103
+ Each run will produce a batch of candidate functions and will automatically save all results to five output files:
104
+ 1) ```candidates.csv```: saves all candidate functions and evaluations in a csv table.
105
+ 2) ```candidates_reduced.csv```: saves a reduced version for essential information without intermediate results.
106
+ 3) ```candidates.pdf```: plots all candidate functions with associated uncertainties one by one for fit quality evaluation.
107
+ 4) ```candidates_gof.pdf```: plots the goodness-of-fit scores.
108
+ 5) ```candidates_correlation.pdf```: plots the correlation matrices for the parameters of each candidate function.
109
+
110
+ ## Documentation
111
+ The documentation can be found [here](https://symbolfit.readthedocs.io) for more info and demonstrations.
112
+
113
+ ## Citation
114
+ If you find this useful in your research, please consider citing Symbolfit:
115
+ ```
116
+ Coming soon!
117
+ ```
118
+ and PySR:
119
+ ```
120
+ @misc{cranmerInterpretableMachineLearning2023,
121
+ title = {Interpretable {Machine} {Learning} for {Science} with {PySR} and {SymbolicRegression}.jl},
122
+ url = {http://arxiv.org/abs/2305.01582},
123
+ doi = {10.48550/arXiv.2305.01582},
124
+ urldate = {2023-07-17},
125
+ publisher = {arXiv},
126
+ author = {Cranmer, Miles},
127
+ month = may,
128
+ year = {2023},
129
+ note = {arXiv:2305.01582 [astro-ph, physics:physics]},
130
+ keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Computer Science - Symbolic Computation, Physics - Data Analysis, Statistics and Probability},
131
+ }
132
+ ```
133
+
@@ -0,0 +1,114 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/hftsoi/symbolfit/main/docs/logo.png" width="300"/>
3
+ </p>
4
+
5
+ [![Documentation Status](https://readthedocs.org/projects/symbolfit/badge/?version=latest)](https://symbolfit.readthedocs.io)
6
+
7
+ An API to automate parametric modeling with symbolic regression, originally developed for data analysis in the experimental high-energy physics community, but also applicable beyond.
8
+
9
+ Symbolfit takes binned data with measurement/systematic uncertainties as input, utilizes [PySR](https://github.com/MilesCranmer/PySR) to perform a machine-search for batches of functional forms that model the data, parameterizes these functions, and utilizes [LMFIT](https://github.com/lmfit/lmfit-py) to re-optimize the functions and provide uncertainty estimation, all in one go.
10
+ It is designed to maximize automation with minimal human input. Each run produces a batch of functions with uncertainty estimation, which are evaluated, saved, and plotted automatically into readable output files, ready for downstream tasks.
11
+
12
+ - [Installation](#installation)
13
+ - [Getting Started](#getting-started)
14
+ - [Documentation](#documentation)
15
+ - [Citation](#citation)
16
+
17
+ ## Installation
18
+ **Prerequisite**
19
+
20
+ Install Julia (backend for PySR)
21
+ ```
22
+ curl -fsSL https://install.julialang.org | sh
23
+ ```
24
+ then check if installed properly
25
+ ```
26
+ julia --version
27
+ ```
28
+
29
+ **Installation via PyPI**
30
+
31
+ With Python>=3.9
32
+ ```
33
+ pip install symbolfit
34
+ ```
35
+ Upon first installation, run
36
+ ```
37
+ python3 -m pysr install
38
+ ```
39
+
40
+ ## Getting Started
41
+ To run an example fit (or ```python fit_example.py```):
42
+ ```
43
+ from symbolfit.symbolfit import *
44
+
45
+ dataset = importlib.import_module('examples.datasets.toy_dataset_1.dataset')
46
+ pysr_config = importlib.import_module('examples.pysr_configs.pysr_config_1')
47
+
48
+ model = SymbolFit(
49
+ x = dataset.x,
50
+ y = dataset.y,
51
+ y_up = dataset.y_up,
52
+ y_down = dataset.y_down,
53
+ pysr_config = pysr_config,
54
+ max_complexity = 60,
55
+ input_rescale = True,
56
+ scale_y_by = 'mean',
57
+ max_stderr = 40,
58
+ fit_y_unc = True,
59
+ random_seed = None,
60
+ loss_weights = None
61
+ )
62
+
63
+ model.fit()
64
+ ```
65
+ After the fit, save results to csv:
66
+ ```
67
+ model.save_to_csv(output_dir = 'output_dir/')
68
+ ```
69
+ and plot results to pdf:
70
+ ```
71
+ model.plot_to_pdf(
72
+ output_dir = 'output_dir/',
73
+ bin_widths_1d = dataset.bin_widths_1d,
74
+ #bin_edges_2d = dataset.bin_edges_2d,
75
+ plot_logy = False,
76
+ plot_logx = False
77
+ )
78
+ ```
79
+ Candidate functions with full substitutions can be printed in prompt:
80
+ ```
81
+ model.print_candidate(candidate_number = 10)
82
+ ```
83
+
84
+ Each run will produce a batch of candidate functions and will automatically save all results to five output files:
85
+ 1) ```candidates.csv```: saves all candidate functions and evaluations in a csv table.
86
+ 2) ```candidates_reduced.csv```: saves a reduced version for essential information without intermediate results.
87
+ 3) ```candidates.pdf```: plots all candidate functions with associated uncertainties one by one for fit quality evaluation.
88
+ 4) ```candidates_gof.pdf```: plots the goodness-of-fit scores.
89
+ 5) ```candidates_correlation.pdf```: plots the correlation matrices for the parameters of each candidate function.
90
+
91
+ ## Documentation
92
+ The documentation can be found [here](https://symbolfit.readthedocs.io) for more info and demonstrations.
93
+
94
+ ## Citation
95
+ If you find this useful in your research, please consider citing Symbolfit:
96
+ ```
97
+ Coming soon!
98
+ ```
99
+ and PySR:
100
+ ```
101
+ @misc{cranmerInterpretableMachineLearning2023,
102
+ title = {Interpretable {Machine} {Learning} for {Science} with {PySR} and {SymbolicRegression}.jl},
103
+ url = {http://arxiv.org/abs/2305.01582},
104
+ doi = {10.48550/arXiv.2305.01582},
105
+ urldate = {2023-07-17},
106
+ publisher = {arXiv},
107
+ author = {Cranmer, Miles},
108
+ month = may,
109
+ year = {2023},
110
+ note = {arXiv:2305.01582 [astro-ph, physics:physics]},
111
+ keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Computer Science - Symbolic Computation, Physics - Data Analysis, Statistics and Probability},
112
+ }
113
+ ```
114
+
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,30 @@
1
+ [metadata]
2
+ name = symbolfit
3
+ version = 0.0.1
4
+ description = Automatic parametric modeling with symbolic regression
5
+ author = Ho Fung Tsoi
6
+ author_email = ho.fung.tsoi@cern.ch
7
+ license = MIT
8
+ license_files = LICENSE
9
+ long_description = file: README.md
10
+ long_description_content_type = text/markdown
11
+ url = https://github.com/hftsoi/symbolfit
12
+ classifiers =
13
+ Programming Language :: Python :: 3
14
+ License :: OSI Approved :: MIT License
15
+
16
+ [options]
17
+ packages = find:
18
+ python_requires = >=3.9
19
+ install_requires =
20
+ pysr==0.16.9
21
+ lmfit==1.2.2
22
+ pandas==2.2.3
23
+ matplotlib==3.8.2
24
+ seaborn==0.13.2
25
+ include_package_data = True
26
+
27
+ [egg_info]
28
+ tag_build =
29
+ tag_date = 0
30
+
@@ -0,0 +1,4 @@
1
+ from setuptools import setup
2
+
3
+ if __name__ == "__main__":
4
+ setup()
File without changes
@@ -0,0 +1,228 @@
1
+ import numpy as np
2
+ from .utils import *
3
+ import scipy
4
+
5
+
6
# Compute the refitted function, with all parameters at their best-fit values,
# or with one of them shifted by +/-1 sigma.
def func_evaluate(
    func_candidate,
    x,
    dim,
    param_shifted = None,
    sigma_pm = None,
    evaluate_pysr = False
):
    '''
    Compute the function values corresponding to the input array x.

    Arguments
    ---------
    func_candidate (pd.dataframe):
        A particular function candidate (single row of the func_candidates dataframe).

    x (np.ndarray):
        Numpy array of the independent variable.

    dim (int):
        Dimension of the input data (number of independent variables x0, x1, ...).

    param_shifted (str):
        Set to e.g. 'a3' to evaluate the function values with 'a3' shifted by one
        sigma while keeping the other parameters at their best-fit values.
        If None, evaluate the function values with all parameters at their best-fit values.

    sigma_pm (str):
        Set to '+' ('-') for +1 (-1) sigma of param_shifted.
        If None, evaluate the function values with all parameters at their best-fit values.

    evaluate_pysr (bool):
        True: evaluate with the original function from PySR.
        False: evaluate with the refitted function from LMFIT.


    Returns
    -------
    y_pred (np.ndarray):
        The predicted dependent variable y for the function candidate.
    '''

    if param_shifted is None:
        # Substitute each parameter name (a1, a2, ...) in the equation string with a
        # lookup of its value: the refitted value from LMFIT (second fit) or the
        # original value from PySR (first fit).
        if not evaluate_pysr:
            func = re.sub(r'\b(a\d+)\b',
                          r"func_candidate['Parameters: (best-fit, +1, -1)']['\1'][0]",
                          func_candidate['Parameterized equation, unscaled'])

        else:
            func = re.sub(r'\b(a\d+)\b',
                          r"func_candidate['Parameterization']['\1']",
                          func_candidate['Parameterized equation, unscaled'])

    else:
        # First substitute with all best-fit parameters.
        func = re.sub(r'\b(a\d+)\b',
                      r"func_candidate['Parameters: (best-fit, +1, -1)']['\1'][0]",
                      func_candidate['Parameterized equation, unscaled'])

        # Then replace the param_shifted lookup with its value shifted by one sigma.
        # Index [1] holds the +1 sigma offset and [2] the -1 sigma offset; both are
        # ADDED to the best-fit value, so the -1 sigma entry is presumably stored
        # with its (negative) sign — TODO confirm against the fit code.
        if sigma_pm == '+':
            func = re.sub(r"func_candidate\['Parameters: \(best-fit, \+1, -1\)'\]\['" + re.escape(param_shifted) + r"'\]\[0\]",
                          r"(func_candidate['Parameters: (best-fit, +1, -1)']['" + param_shifted + r"'][0] + func_candidate['Parameters: (best-fit, +1, -1)']['" + param_shifted + r"'][1])",
                          func)

        elif sigma_pm == '-':
            func = re.sub(r"func_candidate\['Parameters: \(best-fit, \+1, -1\)'\]\['" + re.escape(param_shifted) + r"'\]\[0\]",
                          r"(func_candidate['Parameters: (best-fit, +1, -1)']['" + param_shifted + r"'][0] + func_candidate['Parameters: (best-fit, +1, -1)']['" + param_shifted + r"'][2])",
                          func)

    if dim > 1:
        # Inject x0..x{dim-1} into module globals as column vectors so that the
        # eval() below can resolve them by name.
        for i in range(dim):
            globals()[f'x{i}'] = np.reshape(x[:, i], (-1, 1))

    else:
        x0 = x

    # NOTE: eval() runs a generated expression string; this is only acceptable
    # because the expression originates from PySR within this package, never from
    # untrusted external input.
    if re.findall(r'x\d+', func_candidate['Parameterized equation, unscaled']):
        return eval(func)

    else:
        # For a function not depending on x, broadcast the constant to the input length.
        return np.full((x.shape[0], 1), eval(func))
87
+
88
+
89
def add_gof(
    func_candidates,
    x,
    y,
    y_up,
    y_down,
    dim
):
    '''
    Compute goodness-of-fit metrics for all function candidates at once.

    Arguments
    ---------
    func_candidates (pd.dataframe):
        The full dataframe containing all function candidates.

    x (np.ndarray):
        Numpy array of the independent variable.

    y (np.ndarray):
        Numpy array of the dependent variable (input at central).

    y_up (np.ndarray):
        Numpy array for the +1 sigma variation of y, or None if unavailable.
        NOTE(review): used directly below as a per-bin uncertainty magnitude —
        confirm it is the uncertainty, not the shifted y value.

    y_down (np.ndarray):
        Numpy array for the -1 sigma variation of y, or None if unavailable.

    dim (int):
        Dimension of the input data.


    Returns
    -------
    func_candidates (pd.dataframe):
        The same dataframe with new columns added in place:
        RMSE: root-mean-square-error of the refitted (LMFIT) function,
        R2: coefficient of determination.
        When both y_up and y_down are provided, additionally:
        NDF: number of degrees of freedom,
        Chi2 (before ROF) / Chi2: Chi2 before (PySR) / after (LMFIT) the refit,
        Chi2/NDF (before ROF) / Chi2/NDF: reduced Chi2 before / after the refit,
        p-value (before ROF) / p-value: Chi2 tail probability before / after the refit.
        Otherwise only:
        RMSE (before ROF): RMSE of the PySR function before the refit.
    '''
    # Metrics for the functions after ROF (the re-optimized fit with LMFIT).
    rmse_values = []
    r2_values = []
    chi2_values = []
    ndf_values = []
    chi2_ndf_values = []
    p_values = []

    # Metrics for the functions before ROF (original PySR parameters).
    chi2_values_pysr = []
    chi2_ndf_values_pysr = []
    p_values_pysr = []
    rmse_values_pysr = []

    for i in range(len(func_candidates)):
        func_candidate = func_candidates.iloc[i]

        # Function evaluated with re-fitted parameters from LMFIT.
        y_pred = func_evaluate(func_candidate, x, dim)

        # Function evaluated with original parameters from PySR.
        y_pred_pysr = func_evaluate(func_candidate, x, dim, evaluate_pysr = True)

        rmse = np.sqrt(np.sum((y - y_pred)**2) / y.size)
        r2 = 1 - np.sum((y - y_pred)**2) / np.sum((y - np.mean(y))**2)

        rmse_values.append(round_a_number(rmse, 4))
        r2_values.append(round_a_number(r2, 4))

        # The uncertainties of input y are required to compute Chi2.
        if y_up is not None and y_down is not None:
            residual = y_pred - y
            residual_pysr = y_pred_pysr - y

            # For each bin, take up/down uncertainty if the residual error is +/-ve.
            # If either uncertainty input is 0, take the other one.
            y_unc = np.where(residual > 0,
                             np.where(y_up != 0, y_up, y_down),
                             np.where(y_down != 0, y_down, y_up))

            y_unc_pysr = np.where(residual_pysr > 0,
                                  np.where(y_up != 0, y_up, y_down),
                                  np.where(y_down != 0, y_down, y_up))

            # Pearson chi-square: sum of squared residuals over squared uncertainties.
            chi2 = np.sum(residual**2 / y_unc**2)
            chi2_pysr = np.sum(residual_pysr**2 / y_unc_pysr**2)

            # NDF = number of independent data point - number of varying parameters in the function.
            # A parameter counts as varying when its +1 sigma entry ([1]) is positive;
            # presumably fixed parameters carry zero uncertainty — TODO confirm.
            num_free_param = 0
            if len(func_candidate['Parameters: (best-fit, +1, -1)']) > 0:
                # Assumes parameters are keyed consecutively as a1, a2, ... aN.
                for j in range(len(func_candidate['Parameters: (best-fit, +1, -1)'])):
                    if func_candidate['Parameters: (best-fit, +1, -1)'][f'a{j+1}'][1] > 0:
                        num_free_param += 1

            if y.size - num_free_param > 0:
                ndf = y.size - num_free_param

            else:
                # Sentinel for an over-parameterized fit (free params >= data points);
                # makes the Chi2/NDF below negative, flagging the candidate as invalid.
                ndf = -1

            # Survival function of the chi-square distribution = p-value of the fit.
            p_value = scipy.stats.chi2.sf(chi2, ndf)
            p_value_pysr = scipy.stats.chi2.sf(chi2_pysr, ndf)

            ndf_values.append(ndf)
            chi2_values.append(round_a_number(chi2, 4))
            chi2_ndf_values.append(round_a_number(chi2/ndf, 4))
            chi2_values_pysr.append(round_a_number(chi2_pysr, 4))
            chi2_ndf_values_pysr.append(round_a_number(chi2_pysr/ndf, 4))
            p_values.append(round_a_number(p_value, 4))
            p_values_pysr.append(round_a_number(p_value_pysr, 4))

        else:
            # Without uncertainties only RMSE-type metrics are meaningful.
            rmse_pysr = np.sqrt(np.sum((y - y_pred_pysr)**2) / y.size)
            rmse_values_pysr.append(round_a_number(rmse_pysr, 4))

    func_candidates['RMSE'] = rmse_values
    func_candidates['R2'] = r2_values

    if y_up is not None and y_down is not None:
        func_candidates['NDF'] = ndf_values

        func_candidates['Chi2 (before ROF)'] = chi2_values_pysr
        func_candidates['Chi2'] = chi2_values

        func_candidates['Chi2/NDF (before ROF)'] = chi2_ndf_values_pysr
        func_candidates['Chi2/NDF'] = chi2_ndf_values

        func_candidates['p-value (before ROF)'] = p_values_pysr
        func_candidates['p-value'] = p_values

    else:
        func_candidates['RMSE (before ROF)'] = rmse_values_pysr

    return func_candidates
228
+
@@ -0,0 +1,20 @@
1
+ import numpy as np
2
+ from numpy import sin, cos, tan, exp, sinh, cosh, tanh, log10, log, where
3
+ from numpy import arcsin, arccos, arctan, arcsinh, arccosh, arctanh
4
+
5
+
6
def square(x):
    """Return the square of x (works on scalars and numpy arrays alike)."""
    return x ** 2
8
+
9
def cond(x, y):
    """Return y where x is strictly positive, and 0 elsewhere."""
    positive = x > 0.
    return where(positive, y, 0.)
11
+
12
def piecewise(y, x):
    """Return y where x is strictly positive, and 0 elsewhere."""
    mask = x > 0.
    return where(mask, y, 0.)
14
+
15
def gauss(x):
    """Unnormalized Gaussian bump: exp(-x^2)."""
    return exp(-(x ** 2))
17
+
18
def sigmoid(x):
    """Logistic function: 1 / (1 + exp(-x)), mapping the real line to (0, 1)."""
    denominator = 1. + exp(-x)
    return 1. / denominator
20
+