pysips-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pysips/regressor.py ADDED
@@ -0,0 +1,451 @@
+ """
+ PySIPS: Python package for Symbolic Inference via Posterior Sampling
+
+ This module provides a scikit-learn compatible interface for symbolic regression
+ using Sequential Monte Carlo (SMC) sampling with Bayesian model selection. It
+ combines symbolic expression generation, probabilistic proposal mechanisms, and
+ Laplace approximation for normalized marginal likelihood estimation to discover
+ mathematical expressions that best explain observed data.
+
+ The approach uses SMC to sample from a posterior distribution over symbolic
+ expressions, allowing for principled uncertainty quantification and model
+ selection in symbolic regression tasks. Unlike traditional genetic programming
+ approaches, this method provides probabilistic estimates of model quality and
+ can naturally handle model uncertainty.
+
+ Methodology
+ -----------
+ The algorithm works through the following steps:
+
+ 1. **Expression Generation**: Creates initial symbolic expressions using
+    configurable operators and complexity constraints
+
+ 2. **Proposal Mechanisms**: Uses a probabilistic combination of:
+    - Mutation operations (structural changes to expressions)
+    - Crossover operations (combining expressions from a gene pool)
+
+ 3. **Likelihood Evaluation**: Employs a Laplace approximation to estimate
+    the normalized marginal likelihood for Bayesian model comparison
+
+ 4. **SMC Sampling**: Uses Sequential Monte Carlo to sample from the
+    posterior distribution over symbolic expressions
+
+ 5. **Model Selection**: Chooses the final model using either of the two
+    criteria sketched below:
+    - Mode selection (most frequently sampled expression)
+    - Maximum likelihood selection (highest scoring expression)
+
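+ For intuition, the two selection criteria reduce to the following minimal
+ sketch over an already-sampled population (the expression strings and
+ likelihood values are illustrative placeholders, not sampler output):
+
+ >>> from collections import Counter
+ >>> import numpy as np
+ >>> sampled = ["X_0 + X_1", "X_0 * X_1", "X_0 + X_1"]
+ >>> nmls = [-10.2, -9.8, -10.2]
+ >>> Counter(sampled).most_common(1)[0][0]  # "mode": most frequent
+ 'X_0 + X_1'
+ >>> sampled[int(np.argmax(nmls))]  # "max_nml": highest likelihood
+ 'X_0 * X_1'
+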
+ Parameters Overview
+ -------------------
+ Expression Generation:
+ - operators: Mathematical operators to include
+ - max_complexity: Maximum expression graph size
+ - terminal_probability: Probability of terminal node selection
+ - constant_probability: Probability of constant vs variable terminals
+
+ Mutation Parameters:
+ - command_probability: Probability of operation changes
+ - node_probability: Probability of node replacement
+ - parameter_probability: Probability of constant modification
+ - prune_probability: Probability of expression pruning
+ - fork_probability: Probability of expression expansion
+
+ Sampling Parameters:
+ - num_particles: Population size for SMC
+ - num_mcmc_samples: MCMC steps per SMC iteration
+ - target_ess: Target effective sample size
+ - crossover_pool_size: Size of crossover gene pool
+
+ Usage Example
+ -------------
+ >>> from pysips import PysipsRegressor
+ >>> import numpy as np
+ >>>
+ >>> # Generate sample data
+ >>> X = np.random.randn(100, 2)
+ >>> y = X[:, 0]**2 + 2*X[:, 1] + np.random.normal(0, 0.1, 100)
+ >>>
+ >>> # Create and fit regressor
+ >>> regressor = PysipsRegressor(
+ ...     operators=['+', '*', 'pow'],
+ ...     max_complexity=20,
+ ...     num_particles=100,
+ ...     model_selection='mode',
+ ...     random_state=42
+ ... )
+ >>> regressor.fit(X, y)
+ >>>
+ >>> # Make predictions
+ >>> y_pred = regressor.predict(X)
+ >>>
+ >>> # Get discovered expression
+ >>> expression = regressor.get_expression()
+ >>> print(f"Discovered expression: {expression}")
+ >>>
+ >>> # Get all sampled models
+ >>> models, likelihoods = regressor.get_models()
+
+ Applications
+ ------------
+ This approach is particularly well-suited for:
+ - Scientific discovery where interpretability is crucial
+ - Problems requiring uncertainty quantification in model selection
+ - Cases where multiple plausible models exist and need to be ranked
+ - Regression tasks where symbolic relationships are preferred over black-box models
+ - Applications requiring principled model complexity control
+
+ Notes
+ -----
+ The method balances exploration and exploitation through:
+ - Probabilistic proposal selection between mutation and crossover
+ - Adaptive sampling that focuses on promising regions of expression space
+ - Multiple model selection criteria to handle different use cases
+
+ For best results, consider:
+ - Adjusting complexity limits based on problem difficulty
+ - Tuning mutation/crossover probabilities for your domain
+ - Using sufficient particles for good posterior approximation
+ - Setting an appropriate number of MCMC samples for mixing
+ """
+
+ from collections import Counter
+ import numpy as np
+ from sklearn.base import BaseEstimator, RegressorMixin
+ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+
+ from bingo.symbolic_regression import ComponentGenerator, AGraphGenerator
+
+ from .laplace_nmll import LaplaceNmll
+ from .mutation_proposal import MutationProposal
+ from .crossover_proposal import CrossoverProposal
+ from .random_choice_proposal import RandomChoiceProposal
+ from .sampler import sample
+
+ USE_PYTHON = True
+ USE_SIMPLIFICATION = True
+ DEFAULT_OPERATORS = ["+", "*"]
+ DEFAULT_PARAMETER_INITIALIZATION_BOUNDS = [-5, 5]
+
+
+ # pylint: disable=R0902,R0913,R0917,R0914
+ class PysipsRegressor(BaseEstimator, RegressorMixin):
+     """
+     A scikit-learn compatible wrapper for PySIPS symbolic regression.
+
+     Parameters
+     ----------
+     operators : list, default=['+', '*']
+         List of operators to use in symbolic expressions.
+
+     max_complexity : int, default=24
+         Maximum complexity of symbolic expressions.
+
+     terminal_probability : float, default=0.1
+         Probability of selecting a terminal during expression generation.
+
+     constant_probability : float or None, default=None
+         Probability of selecting a constant terminal. If None, it is set
+         to 1/(x_dim + 1).
+
+     command_probability : float, default=0.2
+         Probability of command mutation.
+
+     node_probability : float, default=0.2
+         Probability of node mutation.
+
+     parameter_probability : float, default=0.2
+         Probability of parameter mutation.
+
+     prune_probability : float, default=0.2
+         Probability of pruning mutation.
+
+     fork_probability : float, default=0.2
+         Probability of fork mutation.
+
+     repeat_mutation_probability : float, default=0.05
+         Probability of repeating a mutation.
+
+     crossover_pool_size : int or None, default=None
+         Size of the crossover pool. If None, defaults to num_particles.
+
+     mutation_prob : float, default=0.75
+         Probability of mutation (vs crossover).
+
+     crossover_prob : float, default=0.25
+         Probability of crossover (vs mutation).
+
+     exclusive : bool, default=True
+         Whether mutation and crossover are mutually exclusive.
+
+     num_particles : int, default=50
+         Number of particles for sampling.
+
+     num_mcmc_samples : int, default=5
+         Number of MCMC samples.
+
+     target_ess : float, default=0.8
+         Target effective sample size.
+
+     param_init_bounds : list, default=[-5, 5]
+         Bounds for parameter initialization.
+
+     opt_restarts : int, default=1
+         Number of optimization restarts.
+
+     model_selection : str, default="mode"
+         The way to choose the best model from the produced distribution of
+         models. Current options are "mode" for the most frequently occurring
+         model and "max_nml" for the model with maximum normalized marginal
+         likelihood.
+
+     random_state : int or None, default=None
+         Random seed for reproducibility.
+     """
+
+     def __init__(
+         self,
+         operators=None,
+         max_complexity=24,
+         terminal_probability=0.1,
+         constant_probability=None,
+         command_probability=0.2,
+         node_probability=0.2,
+         parameter_probability=0.2,
+         prune_probability=0.2,
+         fork_probability=0.2,
+         repeat_mutation_probability=0.05,
+         crossover_pool_size=None,
+         mutation_prob=0.75,
+         crossover_prob=0.25,
+         exclusive=True,
+         num_particles=50,
+         num_mcmc_samples=5,
+         target_ess=0.8,
+         param_init_bounds=None,
+         opt_restarts=1,
+         model_selection="mode",
+         random_state=None,
+     ):
+
+         self.operators = operators if operators is not None else DEFAULT_OPERATORS
+         self.max_complexity = max_complexity
+         self.terminal_probability = terminal_probability
+         self.constant_probability = constant_probability
+         self.command_probability = command_probability
+         self.node_probability = node_probability
+         self.parameter_probability = parameter_probability
+         self.prune_probability = prune_probability
+         self.fork_probability = fork_probability
+         self.repeat_mutation_probability = repeat_mutation_probability
+         self.crossover_pool_size = (
+             crossover_pool_size if crossover_pool_size is not None else num_particles
+         )
+         self.mutation_prob = mutation_prob
+         self.crossover_prob = crossover_prob
+         self.exclusive = exclusive
+         self.num_particles = num_particles
+         self.num_mcmc_samples = num_mcmc_samples
+         self.target_ess = target_ess
+         self.param_init_bounds = (
+             param_init_bounds
+             if param_init_bounds is not None
+             else DEFAULT_PARAMETER_INITIALIZATION_BOUNDS
+         )
+         self.opt_restarts = opt_restarts
+         self.model_selection = model_selection
+         self.random_state = random_state
+
+         # attributes set after fitting
+         self.n_features_in_ = None
+         self.models_ = None
+         self.likelihoods_ = None
+         self.best_model_ = None
+         self.best_likelihood_ = None
+
+     def _get_generator(self, x_dim):
+         """Create expression generator."""
+         constant_prob = self.constant_probability
+         if constant_prob is None:
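+             # default: split terminal probability evenly across the x_dim
+             # input variables plus one constant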
+             constant_prob = 1 / (x_dim + 1)
+
+         component_generator = ComponentGenerator(
+             input_x_dimension=x_dim,
+             terminal_probability=self.terminal_probability,
+             constant_probability=constant_prob,
+         )
+         for comp in self.operators:
+             component_generator.add_operator(comp)
+
+         return AGraphGenerator(
+             self.max_complexity,
+             component_generator,
+             use_python=USE_PYTHON,
+             use_simplification=USE_SIMPLIFICATION,
+         )
+
+     def _get_proposal(self, x_dim, generator):
+         """Create proposal operator."""
+         constant_prob = self.constant_probability
+         if constant_prob is None:
+             constant_prob = 1 / (x_dim + 1)
+
+         mutation = MutationProposal(
+             x_dim,
+             operators=self.operators,
+             terminal_probability=self.terminal_probability,
+             constant_probability=constant_prob,
+             command_probability=self.command_probability,
+             node_probability=self.node_probability,
+             parameter_probability=self.parameter_probability,
+             prune_probability=self.prune_probability,
+             fork_probability=self.fork_probability,
+             repeat_mutation_probability=self.repeat_mutation_probability,
+         )
+
+         # Generate crossover pool
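+         # (the set keeps only unique expressions, so the loop runs until
+         # crossover_pool_size distinct candidates have been generated)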
+         pool = set()
+         while len(pool) < self.crossover_pool_size:
+             pool.add(generator())
+         crossover = CrossoverProposal(list(pool))
+
+         # Create combined proposal
+         return RandomChoiceProposal(
+             [mutation, crossover],
+             [self.mutation_prob, self.crossover_prob],
+             self.exclusive,
+         )
+
+     def fit(self, X, y):
+         """
+         Fit the symbolic regression model to training data.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             Training input samples.
+         y : array-like of shape (n_samples,)
+             Target values.
+
+         Returns
+         -------
+         self : object
+             Returns self.
+         """
+         # Check and validate input data
+         X, y = check_X_y(X, y, y_numeric=True)
+         self.n_features_in_ = X.shape[1]
+
+         # Set up the sampling config
+         x_dim = X.shape[1]
+
+         # Create generator, proposal, and likelihood
+         generator = self._get_generator(x_dim)
+         proposal = self._get_proposal(x_dim, generator)
+         likelihood = LaplaceNmll(X, y)
+
+         # Run sampling
+         models, likelihoods = sample(
+             likelihood,
+             proposal,
+             generator,
+             seed=self.random_state,
+             kwargs={
+                 "num_particles": self.num_particles,
+                 "num_mcmc_samples": self.num_mcmc_samples,
+                 "target_ess": self.target_ess,
+             },
+         )
+
+         # Save the models and their likelihoods
+         self.models_ = models
+         self.likelihoods_ = likelihoods
+
+         # Select the best model
+         if self.model_selection == "max_nml":
+             best_idx = np.argmax(likelihoods)
+         elif self.model_selection == "mode":
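+             # map each distinct expression to the index of its last
+             # occurrence so that a representative model and its likelihood
+             # can be recovered once the most frequent expression is found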
+             model_indices = {model: i for i, model in enumerate(models)}
+             model_counts = Counter(self.models_)
+             most_common_model = model_counts.most_common(1)[0][0]
+             best_idx = model_indices[most_common_model]
+         else:
+             raise ValueError(
+                 f"model_selection method {self.model_selection} not recognized."
+             )
+
+         self.best_model_ = models[best_idx]
+         self.best_likelihood_ = likelihoods[best_idx]
+
+         return self
+
+     def predict(self, X):
+         """
+         Predict using the best symbolic regression model.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             Samples to predict.
+
+         Returns
+         -------
+         y_pred : array-like of shape (n_samples,)
+             Returns predicted values.
+         """
+         check_is_fitted(self, ["best_model_", "models_"])
+         X = check_array(X)
+
+         # Ensure consistent feature count
+         if X.shape[1] != self.n_features_in_:
+             raise ValueError(
+                 f"X has {X.shape[1]} features, but PysipsRegressor was "
+                 f"trained with {self.n_features_in_} features."
+             )
+
+         # Use the best model for prediction
+         return self.best_model_.evaluate_equation_at(X).flatten()
+
+     def score(self, X, y, sample_weight=None):
+         """
+         Return the coefficient of determination R^2 of the prediction.
+
+         Parameters
+         ----------
+         X : array-like of shape (n_samples, n_features)
+             Test samples.
+         y : array-like of shape (n_samples,)
+             True values for X.
+         sample_weight : array-like of shape (n_samples,), default=None
+             Sample weights.
+
+         Returns
+         -------
+         score : float
+             R^2 of self.predict(X) with respect to y.
+         """
+         # Use default implementation from scikit-learn
+         return super().score(X, y, sample_weight=sample_weight)
+
+     def get_expression(self):
+         """
+         Get the symbolic expression of the best model.
+
+         Returns
+         -------
+         expression : str
+             String representation of the best model.
+         """
+         check_is_fitted(self, ["best_model_"])
+         return str(self.best_model_)
+
+     def get_models(self):
+         """
+         Get all sampled models and their likelihoods.
+
+         Returns
+         -------
+         models : list
+             List of all sampled models.
+         likelihoods : list
+             List of corresponding likelihoods.
+         """
+         check_is_fitted(self, ["models_", "likelihoods_"])
+         return self.models_, self.likelihoods_
pysips/sampler.py ADDED
@@ -0,0 +1,159 @@
+ """
+ Sequential Monte Carlo (SMC) Sampling with Custom Prior and MCMC Kernel.
+
+ This module provides high-level functions for performing Sequential Monte Carlo
+ sampling using custom prior distributions and Metropolis-Hastings MCMC kernels.
+ It integrates with the smcpy library to provide adaptive sampling capabilities
+ with unique value generation.
+
+ The module is designed for scenarios where you need to sample from a parameter
+ space using a custom generator function while ensuring uniqueness of samples
+ and applying likelihood-based filtering.
+
+ Example
+ -------
+ >>> import numpy as np
+ >>>
+ >>> def my_likelihood(x):
+ ...     return np.exp(-0.5 * x**2)  # Gaussian-like likelihood
+ >>>
+ >>> def my_proposal(x):
+ ...     return x + np.random.normal(0, 0.1)  # Random walk proposal
+ >>>
+ >>> def my_generator():
+ ...     return np.random.uniform(-5, 5)  # Uniform parameter generator
+ >>>
+ >>> models, likelihoods = sample(my_likelihood, my_proposal, my_generator)
+ >>> print(f"Found {len(models)} models with likelihoods")
+
+ Notes
+ -----
+ This module uses the following workflow:
+ 1. Creates a custom Prior that generates unique values
+ 2. Sets up a Metropolis-Hastings MCMC kernel
+ 3. Runs adaptive SMC sampling
+ 4. Returns the final population of models and their likelihoods
+
+ The covariance calculation is disabled in the mutator as a workaround for
+ object-based parameters that may not support standard covariance computation.
+ """
+
+ # pylint: disable=R0913,R0917
+ import numpy as np
+ from smcpy import VectorMCMCKernel, AdaptiveSampler
+
+ from .metropolis import Metropolis
+ from .prior import Prior
+
+
+ def sample(likelihood, proposal, generator, multiprocess=False, kwargs=None, seed=None):
+     """
+     Perform Sequential Monte Carlo sampling with default parameters.
+
+     This is a high-level convenience function that sets up and runs SMC sampling
+     with commonly used default parameters. For more control over the sampling
+     process, use run_smc directly.
+
+     Parameters
+     ----------
+     likelihood : callable
+         Function that computes the likelihood of a given parameter value.
+         Should accept a single parameter and return a scalar likelihood value.
+     proposal : callable
+         Function that proposes new parameter values given a current value.
+         Used in the Metropolis-Hastings MCMC steps.
+     generator : callable
+         Function that generates initial parameter values when called with no
+         arguments. Should return hashable values for uniqueness tracking.
+     multiprocess : bool, optional
+         Whether to use multiprocessing for likelihood evaluations (default: False).
+     kwargs : dict, optional
+         Additional keyword arguments to override default SMC parameters.
+         Default parameters are {"num_particles": 5000, "num_mcmc_samples": 10}.
+     seed : int, optional
+         Random seed for reproducible results (default: None).
+
+     Returns
+     -------
+     models : list
+         List of parameter values from the final SMC population.
+     likelihoods : list
+         List of likelihood values corresponding to each model in the final population.
+
+     Examples
+     --------
+     >>> import numpy as np
+     >>>
+     >>> def likelihood_func(x):
+     ...     return np.exp(-0.5 * (x - 2)**2)
+     >>>
+     >>> def proposal_func(x):
+     ...     return x + np.random.normal(0, 0.5)
+     >>>
+     >>> def generator_func():
+     ...     return np.random.uniform(-10, 10)
+     >>>
+     >>> models, likes = sample(likelihood_func, proposal_func, generator_func)
+     >>> print(f"Sampled {len(models)} models")
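+     >>>
+     >>> # Overriding the defaults (illustrative values only)
+     >>> models, likes = sample(likelihood_func, proposal_func, generator_func,
+     ...                        kwargs={"num_particles": 500}, seed=3)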
+
+     Notes
+     -----
+     This function internally calls run_smc with default parameters. The default
+     configuration uses 5000 particles and 10 MCMC samples per SMC step, which
+     provides a reasonable balance between accuracy and computational cost for
+     many applications.
+     """
+     rng = np.random.default_rng(seed)
+
+     smc_kwargs = {"num_particles": 5000, "num_mcmc_samples": 10}
+     if kwargs is not None:
+         smc_kwargs.update(kwargs)
+     return run_smc(likelihood, proposal, generator, multiprocess, smc_kwargs, rng)
+
+
+ def run_smc(likelihood, proposal, generator, multiprocess, kwargs, rng):
+     """
+     Execute Sequential Monte Carlo sampling with full parameter control.
+
+     This function implements the core SMC sampling algorithm using a custom
+     prior distribution and Metropolis-Hastings MCMC kernel. It provides
+     complete control over all sampling parameters.
+
+     Parameters
+     ----------
+     likelihood : callable
+         Function that computes the likelihood of a given parameter value.
+     proposal : callable
+         Function that proposes new parameter values in MCMC steps.
+     generator : callable
+         Function that generates unique initial parameter values.
+     multiprocess : bool
+         Whether to enable multiprocessing for likelihood evaluations.
+     kwargs : dict
+         Keyword arguments for the SMC sampler (e.g., num_particles, num_mcmc_samples).
+     rng : numpy.random.Generator
+         Random number generator instance for reproducible sampling.
+
+     Returns
+     -------
+     models : list
+         Parameter values from the final SMC population, converted to list format.
+     likelihoods : list
+         Likelihood values for each model in the final population, computed
+         fresh to ensure consistency.
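+
+     Examples
+     --------
+     A minimal sketch of direct use (the callables mirror the `sample`
+     example above; all values are illustrative only):
+
+     >>> import numpy as np
+     >>> rng = np.random.default_rng(0)
+     >>> models, likes = run_smc(
+     ...     lambda x: np.exp(-0.5 * (x - 2) ** 2),  # likelihood
+     ...     lambda x: x + rng.normal(0, 0.5),       # proposal
+     ...     lambda: rng.uniform(-10, 10),           # generator
+     ...     multiprocess=False,
+     ...     kwargs={"num_particles": 500, "num_mcmc_samples": 10},
+     ...     rng=rng,
+     ... )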
+ """
141
+ prior = Prior(generator)
142
+
143
+ mcmc = Metropolis(
144
+ likelihood=likelihood,
145
+ proposal=proposal,
146
+ prior=prior,
147
+ multiprocess=multiprocess,
148
+ )
149
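+     # candidate expressions travel through smcpy as a single
+     # object-valued parameter, labeled "f"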
+     kernel = VectorMCMCKernel(mcmc, param_order=["f"], rng=rng)
+     smc = AdaptiveSampler(kernel)
+
+     # pylint: disable=W0212
+     # workaround: skip the mutator's covariance computation, which is not
+     # defined for object-valued parameters
+     smc._mutator._compute_cov = False
+     steps, _ = smc.sample(**kwargs)
+
+     models = steps[-1].params[:, 0].tolist()
+     likelihoods = [likelihood(c) for c in models]  # evaluate final population of equations
+
+     return models, likelihoods