oikan 0.0.3.3__tar.gz → 0.0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: oikan
3
- Version: 0.0.3.3
3
+ Version: 0.0.3.4
4
4
  Summary: OIKAN: Neuro-Symbolic ML for Scientific Discovery
5
5
  Author: Arman Zhalgasbayev
6
6
  License: MIT
@@ -14,6 +14,7 @@ Requires-Dist: torch
14
14
  Requires-Dist: numpy
15
15
  Requires-Dist: scikit-learn
16
16
  Requires-Dist: tqdm
17
+ Requires-Dist: sympy
17
18
  Dynamic: license-file
18
19
 
19
20
  <!-- logo in the center -->
@@ -118,7 +119,6 @@ model = OIKANRegressor(
118
119
  hidden_sizes=[32, 32], # Hidden layer sizes
119
120
  activation='relu', # Activation function (other options: 'tanh', 'leaky_relu', 'elu', 'swish', 'gelu')
120
121
  augmentation_factor=5, # Augmentation factor for data generation
121
- polynomial_degree=2, # Degree of polynomial basis functions
122
122
  alpha=0.1, # L1 regularization strength (Symbolic regression)
123
123
  sigma=0.1, # Standard deviation of Gaussian noise for data augmentation
124
124
  top_k=5, # Number of top features to select (Symbolic regression)
@@ -140,7 +140,7 @@ mse = mean_squared_error(y_test, y_pred)
140
140
  print("Mean Squared Error:", mse)
141
141
 
142
142
  # Get symbolic formula
143
- formula = model.get_formula()
143
+ formula = model.get_formula() # default: type='original' -> returns the full formula without pruning | other options: 'sympied' -> formula simplified with SymPy; 'latex' -> LaTeX format
144
144
  print("Symbolic Formula:", formula)
145
145
 
146
146
  # Get feature importances
@@ -168,7 +168,6 @@ model = OIKANClassifier(
168
168
  hidden_sizes=[32, 32], # Hidden layer sizes
169
169
  activation='relu', # Activation function (other options: 'tanh', 'leaky_relu', 'elu', 'swish', 'gelu')
170
170
  augmentation_factor=10, # Augmentation factor for data generation
171
- polynomial_degree=2, # Degree of polynomial basis functions
172
171
  alpha=0.1, # L1 regularization strength (Symbolic regression)
173
172
  sigma=0.1, # Standard deviation of Gaussian noise for data augmentation
174
173
  top_k=5, # Number of top features to select (Symbolic regression)
@@ -190,7 +189,7 @@ accuracy = model.score(X_test, y_test)
190
189
  print("Accuracy:", accuracy)
191
190
 
192
191
  # Get symbolic formulas for each class
193
- formulas = model.get_formula()
192
+ formulas = model.get_formula() # default: type='original' -> returns the full formulas without pruning | other options: 'sympied' -> formulas simplified with SymPy; 'latex' -> LaTeX format
194
193
  for i, formula in enumerate(formulas):
195
194
  print(f"Class {i} Formula:", formula)
196
195
 
@@ -212,6 +211,60 @@ loaded_model.load("outputs/model.json")
212
211
 
213
212
  ![OIKAN v0.0.3(1) Architecture](https://raw.githubusercontent.com/silvermete0r/oikan/main/docs/media/oikan-v0.0.3(1)-architecture-oop.png)
214
213
 
214
+ ## OIKAN Symbolic Model Compilers
215
+
216
+ OIKAN provides a set of symbolic model compilers to convert the symbolic formulas generated by the OIKAN model into different programming languages.
217
+
218
+ *Currently, we support: `Python`, `C++`, `C`, `JavaScript`, `Rust`, and `Go`. This allows users to easily integrate the generated formulas into their applications or systems.*
219
+
220
+ All compilers: [model_compilers/](model_compilers)
221
+
222
+ ### Example of Python Compiler
223
+
224
+ 1. Regression Model:
225
+ ```python
226
+ import numpy as np
227
+ import json
228
+
229
+ def predict(X, symbolic_model):
230
+ X = np.asarray(X)
231
+ X_transformed = evaluate_basis_functions(X, symbolic_model['basis_functions'],
232
+ symbolic_model['n_features'])
233
+ return np.dot(X_transformed, symbolic_model['coefficients'])
234
+
235
+ if __name__ == "__main__":
236
+ with open('outputs/california_housing_model.json', 'r') as f:
237
+ symbolic_model = json.load(f)
238
+ X = np.random.rand(10, symbolic_model['n_features'])
239
+ y_pred = predict(X, symbolic_model)
240
+ print(y_pred)
241
+ ```
242
+
243
+ 2. Classification Model:
244
+ ```python
245
+ import numpy as np
246
+ import json
247
+
248
+ def predict(X, symbolic_model):
249
+ X = np.asarray(X)
250
+ X_transformed = evaluate_basis_functions(X, symbolic_model['basis_functions'],
251
+ symbolic_model['n_features'])
252
+ logits = np.dot(X_transformed, np.array(symbolic_model['coefficients_list']).T)
253
+ probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
254
+ return np.argmax(probabilities, axis=1)
255
+
256
+ if __name__ == "__main__":
257
+ with open('outputs/iris_model.json', 'r') as f:
258
+ symbolic_model = json.load(f)
259
+ X = np.array([[5.1, 3.5, 1.4, 0.2],
260
+ [7.0, 3.2, 4.7, 1.4],
261
+ [6.3, 3.3, 6.0, 2.5]])
262
+ y_pred = predict(X, symbolic_model)
263
+ print(y_pred)
264
+ ```
265
+
266
+
267
+
215
268
  ## Contributing
216
269
 
217
270
  We welcome contributions! Key areas of interest:
@@ -100,7 +100,6 @@ model = OIKANRegressor(
100
100
  hidden_sizes=[32, 32], # Hidden layer sizes
101
101
  activation='relu', # Activation function (other options: 'tanh', 'leaky_relu', 'elu', 'swish', 'gelu')
102
102
  augmentation_factor=5, # Augmentation factor for data generation
103
- polynomial_degree=2, # Degree of polynomial basis functions
104
103
  alpha=0.1, # L1 regularization strength (Symbolic regression)
105
104
  sigma=0.1, # Standard deviation of Gaussian noise for data augmentation
106
105
  top_k=5, # Number of top features to select (Symbolic regression)
@@ -122,7 +121,7 @@ mse = mean_squared_error(y_test, y_pred)
122
121
  print("Mean Squared Error:", mse)
123
122
 
124
123
  # Get symbolic formula
125
- formula = model.get_formula()
124
+ formula = model.get_formula() # default: type='original' -> returns the full formula without pruning | other options: 'sympied' -> formula simplified with SymPy; 'latex' -> LaTeX format
126
125
  print("Symbolic Formula:", formula)
127
126
 
128
127
  # Get feature importances
@@ -150,7 +149,6 @@ model = OIKANClassifier(
150
149
  hidden_sizes=[32, 32], # Hidden layer sizes
151
150
  activation='relu', # Activation function (other options: 'tanh', 'leaky_relu', 'elu', 'swish', 'gelu')
152
151
  augmentation_factor=10, # Augmentation factor for data generation
153
- polynomial_degree=2, # Degree of polynomial basis functions
154
152
  alpha=0.1, # L1 regularization strength (Symbolic regression)
155
153
  sigma=0.1, # Standard deviation of Gaussian noise for data augmentation
156
154
  top_k=5, # Number of top features to select (Symbolic regression)
@@ -172,7 +170,7 @@ accuracy = model.score(X_test, y_test)
172
170
  print("Accuracy:", accuracy)
173
171
 
174
172
  # Get symbolic formulas for each class
175
- formulas = model.get_formula()
173
+ formulas = model.get_formula() # default: type='original' -> returns the full formulas without pruning | other options: 'sympied' -> formulas simplified with SymPy; 'latex' -> LaTeX format
176
174
  for i, formula in enumerate(formulas):
177
175
  print(f"Class {i} Formula:", formula)
178
176
 
@@ -194,6 +192,60 @@ loaded_model.load("outputs/model.json")
194
192
 
195
193
  ![OIKAN v0.0.3(1) Architecture](https://raw.githubusercontent.com/silvermete0r/oikan/main/docs/media/oikan-v0.0.3(1)-architecture-oop.png)
196
194
 
195
+ ## OIKAN Symbolic Model Compilers
196
+
197
+ OIKAN provides a set of symbolic model compilers to convert the symbolic formulas generated by the OIKAN model into different programming languages.
198
+
199
+ *Currently, we support: `Python`, `C++`, `C`, `JavaScript`, `Rust`, and `Go`. This allows users to easily integrate the generated formulas into their applications or systems.*
200
+
201
+ All compilers: [model_compilers/](model_compilers)
202
+
203
+ ### Example of Python Compiler
204
+
205
+ 1. Regression Model:
206
+ ```python
207
+ import numpy as np
208
+ import json
209
+
210
+ def predict(X, symbolic_model):
211
+ X = np.asarray(X)
212
+ X_transformed = evaluate_basis_functions(X, symbolic_model['basis_functions'],
213
+ symbolic_model['n_features'])
214
+ return np.dot(X_transformed, symbolic_model['coefficients'])
215
+
216
+ if __name__ == "__main__":
217
+ with open('outputs/california_housing_model.json', 'r') as f:
218
+ symbolic_model = json.load(f)
219
+ X = np.random.rand(10, symbolic_model['n_features'])
220
+ y_pred = predict(X, symbolic_model)
221
+ print(y_pred)
222
+ ```
223
+
224
+ 2. Classification Model:
225
+ ```python
226
+ import numpy as np
227
+ import json
228
+
229
+ def predict(X, symbolic_model):
230
+ X = np.asarray(X)
231
+ X_transformed = evaluate_basis_functions(X, symbolic_model['basis_functions'],
232
+ symbolic_model['n_features'])
233
+ logits = np.dot(X_transformed, np.array(symbolic_model['coefficients_list']).T)
234
+ probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
235
+ return np.argmax(probabilities, axis=1)
236
+
237
+ if __name__ == "__main__":
238
+ with open('outputs/iris_model.json', 'r') as f:
239
+ symbolic_model = json.load(f)
240
+ X = np.array([[5.1, 3.5, 1.4, 0.2],
241
+ [7.0, 3.2, 4.7, 1.4],
242
+ [6.3, 3.3, 6.0, 2.5]])
243
+ y_pred = predict(X, symbolic_model)
244
+ print(y_pred)
245
+ ```
246
+
247
+
248
+
197
249
  ## Contributing
198
250
 
199
251
  We welcome contributions! Key areas of interest:
@@ -7,7 +7,7 @@ from sklearn.linear_model import ElasticNet
7
7
  from abc import ABC, abstractmethod
8
8
  import json
9
9
  from .neural import TabularNet
10
- from .utils import evaluate_basis_functions, get_features_involved
10
+ from .utils import evaluate_basis_functions, get_features_involved, sympify_formula, get_latex_formula
11
11
  from sklearn.model_selection import train_test_split
12
12
  from sklearn.metrics import r2_score, accuracy_score
13
13
  from .exceptions import *
@@ -25,8 +25,6 @@ class OIKAN(ABC):
25
25
  Activation function for the neural network ('relu', 'tanh', 'leaky_relu', 'elu', 'swish', 'gelu').
26
26
  augmentation_factor : int, optional (default=10)
27
27
  Number of augmented samples per original sample.
28
- polynomial_degree : int, optional (default=2)
29
- Maximum degree of polynomial features for symbolic regression.
30
28
  alpha : float, optional (default=0.1)
31
29
  L1 regularization strength for Lasso in symbolic regression.
32
30
  sigma : float, optional (default=0.1)
@@ -45,7 +43,7 @@ class OIKAN(ABC):
45
43
  Whether to evaluate neural network performance before full training.
46
44
  """
47
45
  def __init__(self, hidden_sizes=[64, 64], activation='relu', augmentation_factor=10,
48
- polynomial_degree=2, alpha=0.1, sigma=0.1, epochs=100, lr=0.001, batch_size=32,
46
+ alpha=0.1, sigma=0.1, epochs=100, lr=0.001, batch_size=32,
49
47
  verbose=False, evaluate_nn=False, top_k=5):
50
48
  if not isinstance(hidden_sizes, list) or not all(isinstance(x, int) and x > 0 for x in hidden_sizes):
51
49
  raise InvalidParameterError("hidden_sizes must be a list of positive integers")
@@ -53,8 +51,6 @@ class OIKAN(ABC):
53
51
  raise InvalidParameterError(f"Unsupported activation function: {activation}")
54
52
  if not isinstance(augmentation_factor, int) or augmentation_factor < 1:
55
53
  raise InvalidParameterError("augmentation_factor must be a positive integer")
56
- if not isinstance(polynomial_degree, int) or polynomial_degree < 1:
57
- raise InvalidParameterError("polynomial_degree must be a positive integer")
58
54
  if not isinstance(top_k, int) or top_k < 1:
59
55
  raise InvalidParameterError("top_k must be a positive integer")
60
56
  if not 0 < lr < 1:
@@ -71,7 +67,6 @@ class OIKAN(ABC):
71
67
  self.hidden_sizes = hidden_sizes
72
68
  self.activation = activation
73
69
  self.augmentation_factor = augmentation_factor
74
- self.polynomial_degree = polynomial_degree
75
70
  self.alpha = alpha
76
71
  self.sigma = sigma
77
72
  self.epochs = epochs
@@ -92,23 +87,53 @@ class OIKAN(ABC):
92
87
  def predict(self, X):
93
88
  pass
94
89
 
95
def get_formula(self, type='original'):
    """
    Returns the symbolic formula(s) as a string (regression) or list of strings (classification).

    Parameters:
    -----------
    type : str, optional (default='original')
        Output format, matched case-insensitively:
        'original' -> every stored term with a non-zero coefficient, printed with 6 decimals;
        'sympied'  -> formula produced by sympify_formula;
        'latex'    -> formula produced by get_latex_formula.

    Returns:
    --------
    str or list of str
        A single formula when the symbolic model stores 'coefficients'; otherwise one
        "Class <label>: <formula>" string per entry of 'coefficients_list'.

    Raises:
    -------
    InvalidParameterError
        If `type` is not a string naming one of the supported formats.
    ValueError
        If no symbolic model is available yet.
    """
    # NOTE: `type` shadows the builtin, but it is part of the public signature and must stay.
    # Reject non-strings here so callers get InvalidParameterError instead of an
    # AttributeError raised by `.lower()`.
    if not isinstance(type, str) or type.lower() not in ('original', 'sympied', 'latex'):
        raise InvalidParameterError("Invalid type. Choose 'original', 'sympied', 'latex'.")
    if self.symbolic_model is None:
        raise ValueError("Model not fitted yet.")

    kind = type.lower()
    basis_functions = self.symbolic_model['basis_functions']

    if kind == 'original':
        def render(coefs):
            # Plain "coef*basis" terms; exact zeros are dropped.
            text = " + ".join(f"{coef:.6f}*{basis_functions[i]}"
                              for i, coef in enumerate(coefs) if coef != 0)
            return text if text else "0"
    else:
        converter = sympify_formula if kind == 'sympied' else get_latex_formula
        n_features = self.symbolic_model['n_features']

        def render(coefs):
            return converter(basis_functions, coefs, n_features)

    # Regression models store one coefficient vector; classifiers store one per class.
    if 'coefficients' in self.symbolic_model:
        return render(self.symbolic_model['coefficients'])
    return [f"Class {self.classes_[c]}: {render(coef)}"
            for c, coef in enumerate(self.symbolic_model['coefficients_list'])]
112
137
 
113
138
  def feature_importances(self):
114
139
  """
@@ -163,7 +188,6 @@ class OIKAN(ABC):
163
188
  # Convert numpy arrays and other non-serializable types to lists
164
189
  model_data = {
165
190
  'n_features': self.symbolic_model['n_features'],
166
- 'degree': self.symbolic_model['degree'],
167
191
  'basis_functions': self.symbolic_model['basis_functions']
168
192
  }
169
193
 
@@ -200,7 +224,6 @@ class OIKAN(ABC):
200
224
 
201
225
  self.symbolic_model = {
202
226
  'n_features': model_data['n_features'],
203
- 'degree': model_data['degree'],
204
227
  'basis_functions': model_data['basis_functions']
205
228
  }
206
229
 
@@ -222,7 +245,6 @@ class OIKAN(ABC):
222
245
 
223
246
  input_size = X.shape[1]
224
247
  self.neural_net = TabularNet(input_size, self.hidden_sizes, output_size, self.activation)
225
- optimizer = optim.Adam(self.neural_net.parameters(), lr=self.lr)
226
248
 
227
249
  # Train on the training set
228
250
  self._train_neural_net(X_train, y_train, output_size, loss_fn)
@@ -378,7 +400,6 @@ class OIKAN(ABC):
378
400
  selected_indices = np.where(np.abs(coef_refined) > 1e-6)[0]
379
401
  self.symbolic_model = {
380
402
  'n_features': X.shape[1],
381
- 'degree': self.polynomial_degree,
382
403
  'basis_functions': [basis_functions_refined[i] for i in selected_indices],
383
404
  'coefficients': coef_refined[selected_indices].tolist()
384
405
  }
@@ -398,7 +419,6 @@ class OIKAN(ABC):
398
419
  coefficients_list.append(coef_selected)
399
420
  self.symbolic_model = {
400
421
  'n_features': X.shape[1],
401
- 'degree': self.polynomial_degree,
402
422
  'basis_functions': basis_functions,
403
423
  'coefficients_list': coefficients_list
404
424
  }
@@ -0,0 +1,256 @@
1
+ import numpy as np
2
+ import sympy as sp
3
+ import json
4
+ from functools import lru_cache
5
+
6
def evaluate_basis_functions(X, basis_functions, n_features):
    """
    Evaluates basis functions on the input data.

    Parameters:
    -----------
    X : array-like of shape (n_samples, n_features)
        Input data. Converted with ``np.asarray`` so nested lists are accepted too.
    basis_functions : list
        List of basis function strings (e.g., '1', 'x0', 'x0^2', 'x0 x1', 'log1p_x0').
    n_features : int
        Number of input features. Not read by this function; kept so existing
        callers that pass it keep working.

    Returns:
    --------
    X_transformed : ndarray of shape (n_samples, n_basis_functions)
        Transformed data matrix.
    """
    # The docstring promises array-like input; without this, lists fail on `.shape`.
    X = np.asarray(X)
    X_transformed = np.zeros((X.shape[0], len(basis_functions)))
    for i, func in enumerate(basis_functions):
        if func == '1':
            # Constant (intercept) column.
            X_transformed[:, i] = 1
        elif func.startswith('log1p_x'):
            idx = int(func.split('_')[1][1:])
            # abs() keeps log1p defined for negative inputs.
            X_transformed[:, i] = np.log1p(np.abs(X[:, idx]))
        elif func.startswith('exp_x'):
            idx = int(func.split('_')[1][1:])
            # Inputs are clipped to [-10, 10] before exponentiation.
            X_transformed[:, i] = np.exp(np.clip(X[:, idx], -10, 10))
        elif func.startswith('sin_x'):
            idx = int(func.split('_')[1][1:])
            X_transformed[:, i] = np.sin(X[:, idx])
        elif '^' in func:
            # Power term such as 'x0^2'.
            var, power = func.split('^')
            idx = int(var[1:])
            X_transformed[:, i] = X[:, idx] ** int(power)
        elif ' ' in func:
            # Interaction term such as 'x0 x1': product of the listed columns.
            result = np.ones(X.shape[0])
            for name in func.split(' '):
                result *= X[:, int(name[1:])]
            X_transformed[:, i] = result
        else:
            # Plain linear term such as 'x3'.
            idx = int(func[1:])
            X_transformed[:, i] = X[:, idx]
    return X_transformed
52
+
53
def get_features_involved(basis_function):
    """
    Collects the feature indices that a basis function string refers to.

    Parameters:
    -----------
    basis_function : str
        Basis function in string form, e.g., 'x0', 'x0^2', 'x0 x1', 'log1p_x0'.

    Returns:
    --------
    set : Feature indices referenced by the basis function (empty for '1'
        and for strings that match none of the known forms).
    """
    if basis_function == '1':
        return set()

    # Non-linear wrappers look like '<name>_x<idx>'.
    if '_' in basis_function:
        pieces = basis_function.split('_')
        if len(pieces) == 2 and pieces[1].startswith('x'):
            return {int(pieces[1][1:])}
        return set()

    # Powers: 'x<idx>^<power>'.
    if '^' in basis_function:
        base = basis_function.split('^')[0]
        return {int(base[1:])}

    # Interactions: whitespace-separated variables.
    if ' ' in basis_function:
        return {int(token[1:]) for token in basis_function.split()}

    # Plain variable: 'x<idx>'.
    if basis_function.startswith('x'):
        return {int(basis_function[1:])}

    return set()
86
+
87
@lru_cache(maxsize=1000)
def _cached_sympify_formula(basis_functions_tuple, coefficients_tuple, n_features, threshold):
    """
    Internal function to perform SymPy formula simplification with caching.

    Parameters:
    -----------
    basis_functions_tuple : tuple
        Tuple of basis function strings (a tuple so the arguments are hashable for lru_cache).
    coefficients_tuple : tuple
        Tuple of coefficients, aligned with basis_functions_tuple.
    n_features : int
        Number of input features; symbols x0 .. x{n_features-1} are created.
    threshold : float
        Coefficients with absolute value below this are excluded.

    Returns:
    --------
    str
        Simplified formula as a string, or '0' if empty.
    """
    # Define symbolic variables
    x = sp.symbols(f'x0:{n_features}')
    expr = 0

    # Build the expression term by term
    for coef, func in zip(coefficients_tuple, basis_functions_tuple):
        if abs(coef) < threshold:
            continue  # Skip negligible coefficients
        if func == '1':
            term = coef
        elif func.startswith('log1p_x'):
            idx = int(func.split('_')[1][1:])
            term = coef * sp.log(1 + sp.Abs(x[idx]))
        elif func.startswith('exp_x'):
            idx = int(func.split('_')[1][1:])
            term = coef * sp.exp(x[idx])
        elif func.startswith('sin_x'):
            idx = int(func.split('_')[1][1:])
            term = coef * sp.sin(x[idx])
        elif '^' in func:
            var, power = func.split('^')
            idx = int(var[1:])
            term = coef * x[idx]**int(power)
        elif ' ' in func:
            # Interaction term: multiply the coefficient by every listed variable.
            term = coef
            for name in func.split(' '):
                term *= x[int(name[1:])]
        else:
            idx = int(func[1:])
            term = coef * x[idx]
        expr += term

    # Simplify the expression
    simplified_expr = sp.simplify(expr)

    # Convert one top-level term to a string with a rounded coefficient
    def format_term(term):
        if term.is_Mul:
            coeff = 1
            factors = []
            for factor in term.args:
                if factor.is_Number:
                    coeff *= float(factor)
                elif factor.is_Add:
                    # A sum used as a factor (e.g. after simplify() factors the
                    # expression) must be parenthesized, otherwise
                    # "c*x0*(x1 + 1)" would be printed as "c*x0*x1 + 1".
                    factors.append(f"({factor})")
                else:
                    factors.append(str(factor))
            if abs(coeff) < threshold:
                return None
            return f"{coeff:.5f}*{'*'.join(factors)}" if factors else f"{coeff:.5f}"
        elif term.is_Add:
            return None  # Sums are split into terms by the caller below
        elif term.is_Number:
            return f"{float(term):.5f}" if abs(float(term)) >= threshold else None
        else:
            # Bare symbol / function / power: implicit coefficient of 1.
            return f"{1.0:.5f}*{term}" if abs(1.0) >= threshold else None

    top_level = simplified_expr.args if simplified_expr.is_Add else (simplified_expr,)
    terms = []
    for term in top_level:
        formatted = format_term(term)
        if formatted:
            terms.append(formatted)

    formula = " + ".join(terms).replace("+ -", "- ")
    return formula if formula else "0"
182
+
183
def sympify_formula(basis_functions, coefficients, n_features, threshold=0.00005):
    """
    Builds a SymPy-simplified formula string, reusing cached results.

    Parameters:
    -----------
    basis_functions : list
        Basis function strings (e.g., 'x0', 'x0^2', 'x0 x1', 'exp_x0').
    coefficients : list
        One coefficient per basis function.
    n_features : int
        Number of input features.
    threshold : float, optional (default=0.00005)
        Coefficients whose absolute value is below this are left out.

    Returns:
    --------
    str
        Simplified formula as a string, or '0' if empty.
    """
    # The cached worker needs hashable arguments, so the sequences go in as tuples.
    return _cached_sympify_formula(
        tuple(basis_functions),
        tuple(coefficients),
        n_features,
        threshold,
    )
209
+
210
@lru_cache(maxsize=1000)
def _cached_get_latex_formula(formula):
    """
    Internal cached helper that renders a formula string as LaTeX.

    Parameters:
    -----------
    formula : str
        Formula string to parse with SymPy.

    Returns:
    --------
    str
        LaTeX formula as a string.
    """
    parsed = sp.sympify(formula)
    return sp.latex(parsed)
226
+
227
def get_latex_formula(basis_functions, coefficients, n_features, threshold=0.00005):
    """
    Produces a LaTeX formula from basis functions and coefficients, using the cached helpers.

    Parameters:
    -----------
    basis_functions : list
        Basis function strings (e.g., 'x0', 'x0^2', 'x0 x1', 'exp_x0').
    coefficients : list
        One coefficient per basis function.
    n_features : int
        Number of input features.
    threshold : float, optional (default=0.00005)
        Coefficients whose absolute value is below this are left out.

    Returns:
    --------
    str
        LaTeX formula as a string.
    """
    # Step 1: simplified plain-text formula; step 2: LaTeX rendering of that text.
    simplified = sympify_formula(basis_functions, coefficients, n_features, threshold)
    return _cached_get_latex_formula(simplified)
251
+
252
if __name__ == "__main__":
    # Demo: load a saved symbolic model and print it in both output formats.
    with open('outputs/california_housing_model.json', 'r') as f:
        model = json.load(f)
    demo_args = (model['basis_functions'], model['coefficients'], model['n_features'])
    print('Sympified formula:', sympify_formula(*demo_args))
    print('LaTeX formula:', get_latex_formula(*demo_args))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: oikan
3
- Version: 0.0.3.3
3
+ Version: 0.0.3.4
4
4
  Summary: OIKAN: Neuro-Symbolic ML for Scientific Discovery
5
5
  Author: Arman Zhalgasbayev
6
6
  License: MIT
@@ -14,6 +14,7 @@ Requires-Dist: torch
14
14
  Requires-Dist: numpy
15
15
  Requires-Dist: scikit-learn
16
16
  Requires-Dist: tqdm
17
+ Requires-Dist: sympy
17
18
  Dynamic: license-file
18
19
 
19
20
  <!-- logo in the center -->
@@ -118,7 +119,6 @@ model = OIKANRegressor(
118
119
  hidden_sizes=[32, 32], # Hidden layer sizes
119
120
  activation='relu', # Activation function (other options: 'tanh', 'leaky_relu', 'elu', 'swish', 'gelu')
120
121
  augmentation_factor=5, # Augmentation factor for data generation
121
- polynomial_degree=2, # Degree of polynomial basis functions
122
122
  alpha=0.1, # L1 regularization strength (Symbolic regression)
123
123
  sigma=0.1, # Standard deviation of Gaussian noise for data augmentation
124
124
  top_k=5, # Number of top features to select (Symbolic regression)
@@ -140,7 +140,7 @@ mse = mean_squared_error(y_test, y_pred)
140
140
  print("Mean Squared Error:", mse)
141
141
 
142
142
  # Get symbolic formula
143
- formula = model.get_formula()
143
+ formula = model.get_formula() # default: type='original' -> returns the full formula without pruning | other options: 'sympied' -> formula simplified with SymPy; 'latex' -> LaTeX format
144
144
  print("Symbolic Formula:", formula)
145
145
 
146
146
  # Get feature importances
@@ -168,7 +168,6 @@ model = OIKANClassifier(
168
168
  hidden_sizes=[32, 32], # Hidden layer sizes
169
169
  activation='relu', # Activation function (other options: 'tanh', 'leaky_relu', 'elu', 'swish', 'gelu')
170
170
  augmentation_factor=10, # Augmentation factor for data generation
171
- polynomial_degree=2, # Degree of polynomial basis functions
172
171
  alpha=0.1, # L1 regularization strength (Symbolic regression)
173
172
  sigma=0.1, # Standard deviation of Gaussian noise for data augmentation
174
173
  top_k=5, # Number of top features to select (Symbolic regression)
@@ -190,7 +189,7 @@ accuracy = model.score(X_test, y_test)
190
189
  print("Accuracy:", accuracy)
191
190
 
192
191
  # Get symbolic formulas for each class
193
- formulas = model.get_formula()
192
+ formulas = model.get_formula() # default: type='original' -> returns the full formulas without pruning | other options: 'sympied' -> formulas simplified with SymPy; 'latex' -> LaTeX format
194
193
  for i, formula in enumerate(formulas):
195
194
  print(f"Class {i} Formula:", formula)
196
195
 
@@ -212,6 +211,60 @@ loaded_model.load("outputs/model.json")
212
211
 
213
212
  ![OIKAN v0.0.3(1) Architecture](https://raw.githubusercontent.com/silvermete0r/oikan/main/docs/media/oikan-v0.0.3(1)-architecture-oop.png)
214
213
 
214
+ ## OIKAN Symbolic Model Compilers
215
+
216
+ OIKAN provides a set of symbolic model compilers to convert the symbolic formulas generated by the OIKAN model into different programming languages.
217
+
218
+ *Currently, we support: `Python`, `C++`, `C`, `JavaScript`, `Rust`, and `Go`. This allows users to easily integrate the generated formulas into their applications or systems.*
219
+
220
+ All compilers: [model_compilers/](model_compilers)
221
+
222
+ ### Example of Python Compiler
223
+
224
+ 1. Regression Model:
225
+ ```python
226
+ import numpy as np
227
+ import json
228
+
229
+ def predict(X, symbolic_model):
230
+ X = np.asarray(X)
231
+ X_transformed = evaluate_basis_functions(X, symbolic_model['basis_functions'],
232
+ symbolic_model['n_features'])
233
+ return np.dot(X_transformed, symbolic_model['coefficients'])
234
+
235
+ if __name__ == "__main__":
236
+ with open('outputs/california_housing_model.json', 'r') as f:
237
+ symbolic_model = json.load(f)
238
+ X = np.random.rand(10, symbolic_model['n_features'])
239
+ y_pred = predict(X, symbolic_model)
240
+ print(y_pred)
241
+ ```
242
+
243
+ 2. Classification Model:
244
+ ```python
245
+ import numpy as np
246
+ import json
247
+
248
+ def predict(X, symbolic_model):
249
+ X = np.asarray(X)
250
+ X_transformed = evaluate_basis_functions(X, symbolic_model['basis_functions'],
251
+ symbolic_model['n_features'])
252
+ logits = np.dot(X_transformed, np.array(symbolic_model['coefficients_list']).T)
253
+ probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
254
+ return np.argmax(probabilities, axis=1)
255
+
256
+ if __name__ == "__main__":
257
+ with open('outputs/iris_model.json', 'r') as f:
258
+ symbolic_model = json.load(f)
259
+ X = np.array([[5.1, 3.5, 1.4, 0.2],
260
+ [7.0, 3.2, 4.7, 1.4],
261
+ [6.3, 3.3, 6.0, 2.5]])
262
+ y_pred = predict(X, symbolic_model)
263
+ print(y_pred)
264
+ ```
265
+
266
+
267
+
215
268
  ## Contributing
216
269
 
217
270
  We welcome contributions! Key areas of interest:
@@ -2,3 +2,4 @@ torch
2
2
  numpy
3
3
  scikit-learn
4
4
  tqdm
5
+ sympy
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "oikan"
7
- version = "0.0.3.3"
7
+ version = "0.0.3.4"
8
8
  description = "OIKAN: Neuro-Symbolic ML for Scientific Discovery"
9
9
  readme = "README.md"
10
10
  authors = [{name = "Arman Zhalgasbayev"}]
@@ -12,7 +12,8 @@ dependencies = [
12
12
  "torch",
13
13
  "numpy",
14
14
  "scikit-learn",
15
- "tqdm"
15
+ "tqdm",
16
+ "sympy"
16
17
  ]
17
18
  requires-python = ">=3.7"
18
19
  license = {text = "MIT"}
@@ -7,6 +7,7 @@ setup(
7
7
  "torch",
8
8
  "numpy",
9
9
  "scikit-learn",
10
- "tqdm"
10
+ "tqdm",
11
+ "sympy"
11
12
  ]
12
13
  )
@@ -1,82 +0,0 @@
1
- import numpy as np
2
-
3
- def evaluate_basis_functions(X, basis_functions, n_features):
4
- """
5
- Evaluates basis functions on the input data.
6
-
7
- Parameters:
8
- -----------
9
- X : array-like of shape (n_samples, n_features)
10
- Input data.
11
- basis_functions : list
12
- List of basis function strings (e.g., '1', 'x0', 'x0^2', 'x0 x1', 'log1p_x0').
13
- n_features : int
14
- Number of input features.
15
-
16
- Returns:
17
- --------
18
- X_transformed : ndarray of shape (n_samples, n_basis_functions)
19
- Transformed data matrix.
20
- """
21
- X_transformed = np.zeros((X.shape[0], len(basis_functions)))
22
- for i, func in enumerate(basis_functions):
23
- if func == '1':
24
- X_transformed[:, i] = 1
25
- elif func.startswith('log1p_x'):
26
- idx = int(func.split('_')[1][1:])
27
- X_transformed[:, i] = np.log1p(np.abs(X[:, idx]))
28
- elif func.startswith('exp_x'):
29
- idx = int(func.split('_')[1][1:])
30
- X_transformed[:, i] = np.exp(np.clip(X[:, idx], -10, 10))
31
- elif func.startswith('sin_x'):
32
- idx = int(func.split('_')[1][1:])
33
- X_transformed[:, i] = np.sin(X[:, idx])
34
- elif '^' in func:
35
- var, power = func.split('^')
36
- idx = int(var[1:])
37
- X_transformed[:, i] = X[:, idx] ** int(power)
38
- elif ' ' in func:
39
- vars = func.split(' ')
40
- result = np.ones(X.shape[0])
41
- for var in vars:
42
- idx = int(var[1:])
43
- result *= X[:, idx]
44
- X_transformed[:, i] = result
45
- else:
46
- idx = int(func[1:])
47
- X_transformed[:, i] = X[:, idx]
48
- return X_transformed
49
-
50
- def get_features_involved(basis_function):
51
- """
52
- Extracts the feature indices involved in a basis function string.
53
-
54
- Parameters:
55
- -----------
56
- basis_function : str
57
- String representation of the basis function, e.g., 'x0', 'x0^2', 'x0 x1', 'log1p_x0'.
58
-
59
- Returns:
60
- --------
61
- set : Set of feature indices involved.
62
- """
63
- if basis_function == '1':
64
- return set()
65
- features = set()
66
- if '_' in basis_function: # Handle non-linear functions like 'log1p_x0'
67
- parts = basis_function.split('_')
68
- if len(parts) == 2 and parts[1].startswith('x'):
69
- idx = int(parts[1][1:])
70
- features.add(idx)
71
- elif '^' in basis_function: # Handle powers, e.g., 'x0^2'
72
- var = basis_function.split('^')[0]
73
- idx = int(var[1:])
74
- features.add(idx)
75
- elif ' ' in basis_function: # Handle interactions, e.g., 'x0 x1'
76
- for part in basis_function.split():
77
- idx = int(part[1:])
78
- features.add(idx)
79
- elif basis_function.startswith('x'):
80
- idx = int(basis_function[1:])
81
- features.add(idx)
82
- return features
File without changes
File without changes
File without changes
File without changes
File without changes