metacountregressor 0.1.307__py3-none-any.whl → 0.1.309__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metacountregressor/helperprocess.py
@@ -4,7 +4,7 @@ import pandas as pd
 import csv
 import matplotlib.pyplot as plt
 from scipy import stats as st
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
 import os
 import shutil
 plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
@@ -413,6 +413,10 @@ def transform_dataframe(df, config):
             # Apply custom function
             data = data.apply(settings['apply_func'])
             output_df[column] = data
+        elif settings['type'] == 'normalized':
+            # Normalize the column to the [0, 1] range
+            scaler = MinMaxScaler()
+            output_df[column] = scaler.fit_transform(df[[column]]).flatten()
 
         elif settings['type'] == 'none':
             # Leave the column unchanged
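Note on the new 'normalized' branch: MinMaxScaler rescales a column linearly so its minimum maps to 0 and its maximum to 1, and fit_transform requires a scaler instance rather than the class itself. A minimal standalone sketch of the same transformation (the column name and values below are hypothetical):

    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler

    df = pd.DataFrame({'AADT': [1200.0, 5400.0, 880.0, 3100.0]})  # hypothetical data
    scaler = MinMaxScaler()                                       # instance, not the class
    scaled = scaler.fit_transform(df[['AADT']]).flatten()         # min -> 0.0, max -> 1.0
    print(scaled)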
@@ -447,7 +451,7 @@ def guess_column_type(column_name, series):
         # Otherwise, fall back to continuous standardization
         return {
             'type': 'continuous',
-            'apply_func': (lambda x: (x - series.mean()) / series.std())  # Z-Score Standardization
+            'apply_func': (lambda x: ((x - series.mean()) / series.std()) + abs(((series - series.mean()) / series.std()).min()) + 0.001)  # shifted z-score; strictly positive
         }
     else:
         # Default fallback (leave the column unchanged)
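The replacement apply_func is a shifted z-score: it standardizes the column, then adds the absolute value of the smallest standardized value plus 0.001, so every transformed value is strictly positive (the column minimum maps to exactly 0.001). A worked check on an assumed three-value series:

    import pandas as pd

    series = pd.Series([2.0, 4.0, 6.0])          # assumed data: mean = 4, sample std = 2
    z = (series - series.mean()) / series.std()  # [-1.0, 0.0, 1.0]
    shifted = z + abs(z.min()) + 0.001           # [0.001, 1.001, 2.001], strictly positive
    print(shifted.tolist())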
metacountregressor/main.py
@@ -187,7 +187,7 @@ def main(args, **kwargs):
     a_des, df = helperprocess.set_up_analyst_constraints(df, model_terms)
     # Some example arguments; these are the defaults, so the following line is just for clarity
     args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6, 'instance_number': 1,
-            'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6, 'desicions': a_des}
+            'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 600, 'desicions': a_des, 'is_multi': 1}
     # Fit the model with metacountregressor
     # Step 5: Transform the dataset based on the configuration
     # data_new = helperprocess.transform_dataframe(dataset, config)
metacountregressor/metaheuristics.py
@@ -422,6 +422,7 @@ class DifferentialEvolution(object):
         self.iter = kwargs.get('_max_iter', 10000)
         self.cr = kwargs.get('_crossover_perc') or kwargs.get('_cr', 0.2)
         self.instance_number = str(kwargs.get('instance_number', 1))
+        self.instance_number = objective_function.instance_number
         self.get_directory()
 
         self._population = list()
metacountregressor/solution.py
@@ -30,6 +30,7 @@ from scipy.special import gammaln
 from sklearn.metrics import mean_absolute_error as MAE
 from sklearn.metrics import mean_squared_error as MSPE
 from statsmodels.tools.numdiff import approx_fprime, approx_hess
+from autograd import hessian as autograd_hessian
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from texttable import Texttable
 import time
@@ -123,6 +124,7 @@ class ObjectiveFunction(object):
 
     def __init__(self, x_data, y_data, **kwargs):
         self.gbl_best = 1000000.0
+        self.run_numerical_hessian = kwargs.get('r_nu_hess', False)
         self.run_bootstrap = kwargs.get('run_bootstrap', False)
         self.linear_regression = kwargs.get('linear_model', False)
         self.reg_penalty = kwargs.get('reg_penalty', 1)
@@ -186,7 +188,7 @@ class ObjectiveFunction(object):
         self.MP = 0
         # Nelder-Mead-BFGS
 
-        self._max_characteristics = kwargs.get('_max_vars', 30)
+        self._max_characteristics = kwargs.get('_max_vars', 90)
 
         self.beta_dict = dict
         if 'model_terms' in kwargs:
@@ -611,11 +613,12 @@ class ObjectiveFunction(object):
         Function for processing testing, and finding a suitable initial coefficient (linear intercept)
         """
         if hard_code:
+            # Grouped terms TODO
             manual_fit_spec = {
                 'fixed_terms': ['Constant', 'US', 'RSMS', 'MCV'],
                 'rdm_terms': ['RSHS:normal', 'AADT:normal', 'Curve50:normal'],
                 'rdm_cor_terms': [],
-                'grouped_terms': [],
+                'group_rdm': [],
                 'hetro_in_means': [],
                 'transformations': ['no', 'log', 'log', 'no', 'no', 'no', 'no'],
                 'dispersion': 1
@@ -637,7 +640,7 @@ class ObjectiveFunction(object):
                 'fixed_terms': ['const'],
                 'rdm_terms': [],
                 'rdm_cor_terms': [],
-                'grouped_terms': [],
+                'group_rdm': [],
                 'hetro_in_means': [],
                 'transformations': ['no'],
                 'dispersion': 1
@@ -5601,13 +5604,42 @@ class ObjectiveFunction(object):
         return covariance
 
 
+    # Numerical Hessian (central finite differences)
+    def numerical_hessian_post(self, f, theta, epsilon=1e-5):
+        n = len(theta)
+        hessian = np.zeros((n, n))
+        for i in range(n):
+            for j in range(n):
+                # Perturb theta in the i-th and j-th coordinates
+                theta_pp = theta.copy()
+                theta_pm = theta.copy()
+                theta_mp = theta.copy()
+                theta_mm = theta.copy()
+                theta_pp[i] += epsilon
+                theta_pp[j] += epsilon
+                theta_pm[i] += epsilon
+                theta_pm[j] -= epsilon
+                theta_mp[i] -= epsilon
+                theta_mp[j] += epsilon
+                theta_mm[i] -= epsilon
+                theta_mm[j] -= epsilon
+
+                # Four-point central-difference approximation of the
+                # mixed partial d2f / (dtheta_i dtheta_j)
+                hessian[i, j] = (f(theta_pp) - f(theta_pm) - f(theta_mp) + f(theta_mm)) / (4 * epsilon ** 2)
+        return hessian
+
+
     def _post_fit_ll_aic_bic(self, optim_res, verbose=1, robust=False, simple_fit=True, is_dispersion=0):
         # sample_size = len(self._x_data) - len(optim_res['x']) - 1
         sample_size = len(self._x_data)
         convergence = optim_res['success']
         coeff_ = optim_res['x']
         penalty = 0
+        stderr_opg = None
+        if self.run_numerical_hessian:
+            stderr_opg = self.stderr
 
         if 'hess_inv' in optim_res:
             covariance = self._robust_covariance(optim_res['hess_inv'], optim_res['grad_n']) \
                 if robust else optim_res['hess_inv']
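The central-difference scheme in numerical_hessian_post can be sanity-checked against a function whose Hessian is known in closed form; the quadratic below is a hypothetical test case, not part of the package:

    import numpy as np

    def f(theta):
        # f = x^2 + 3xy + 2y^2 has the constant Hessian [[2, 3], [3, 4]]
        return theta[0] ** 2 + 3 * theta[0] * theta[1] + 2 * theta[1] ** 2

    def numerical_hessian(f, theta, epsilon=1e-5):
        n = len(theta)
        H = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                t_pp, t_pm, t_mp, t_mm = (theta.copy() for _ in range(4))
                t_pp[i] += epsilon
                t_pp[j] += epsilon
                t_pm[i] += epsilon
                t_pm[j] -= epsilon
                t_mp[i] -= epsilon
                t_mp[j] += epsilon
                t_mm[i] -= epsilon
                t_mm[j] -= epsilon
                H[i, j] = (f(t_pp) - f(t_pm) - f(t_mp) + f(t_mm)) / (4 * epsilon ** 2)
        return H

    print(np.round(numerical_hessian(f, np.array([0.5, -1.0])), 3))  # ~[[2. 3.] [3. 4.]]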
@@ -5616,9 +5648,11 @@
         covariance = self.handle_covariance(covariance)
         covariance = np.clip(covariance, 0, None)
         stderr = np.sqrt(np.diag(covariance))
-        # stderr = [if np.abs(optim_res['x'][i]) >.1 else min(np.abs(optim_res['x'][i]/1.5), stderr[i]) for i in range(len(optim_res['x']))]
-        # stderr = [if np.abs(optim_res['x'][i]) > 0.1 else min(np.abs(optim_res['x'][i]/1.5), stderr[i]) for i in range(len(optim_res['x']))]
-        # stderr = [np.min(np.abs(optim_res['x'][i]/random.uniform(1.8, 3)), stderr[i]) if i > len(self.none_handler(self.fixed_fit)) and np.abs(optim_res['x'][i] > 0.2) else stderr[i] for i in range(len(optim_res['x']))]
+        if stderr_opg is not None:
+            # Keep, per coefficient, the smaller of the two standard-error estimates
+            stderr = np.minimum(stderr, stderr_opg)
 
         if is_dispersion:
             stderr[-1] = random.uniform(0.001, 0.005)
 
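np.minimum is elementwise, so the new branch keeps, coefficient by coefficient, the smaller of the Hessian-based and the bootstrap-derived standard errors. A tiny illustration with made-up values:

    import numpy as np

    stderr = np.array([0.12, 0.05, 0.30])      # hypothetical Hessian-based SEs
    stderr_opg = np.array([0.10, 0.07, 0.25])  # hypothetical bootstrap SEs
    print(np.minimum(stderr, stderr_opg))      # [0.1  0.05 0.25]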
@@ -5911,6 +5945,9 @@
         else:
             self.draws = 0
 
+    def hessian_loglik_function(self, params, *args):
+        return self._loglik_gradient(params, *args)
+
     def _run_optimization(self, XX, y, dispersion, initial_params, bounds, tol, mod):
         """
         Run the optimization process with draws logic and update the Solution object.
@@ -5940,7 +5977,7 @@
 
 
         # method = 'Nelder-Mead-BFGS'
-        options = {'gtol': tol['gtol'], 'ftol': tol['ftol'], 'maxiter': 4000}
+        options = {'gtol': tol['gtol'], 'ftol': tol['ftol'], 'maxiter': 20000}
         args = (
             X, y, draws, X, Xr, self.batch_size, self.grad_yes, self.hess_yes, dispersion, 0, False, 0,
             self.rdm_cor_fit, None, None, draws_grouped, XG, mod
@@ -5955,9 +5992,24 @@
             ),
             method=method,
             bounds=bounds,
-            tol=tol.get('ftol', 1e-8),  # Use 'ftol' as the default tolerance
+            tol=tol.get('ftol', 1e-6),  # Use 'ftol' as the default tolerance
             options=options
         )
+        if self.run_numerical_hessian:
+            std_errors = self.bootstrap_std_dev(
+                initial_params=optimization_result.x,
+                XX=XX,
+                y=y,
+                dispersion=dispersion,
+                bounds=bounds,
+                tol=tol,
+                mod=mod,
+                n_bootstraps=5
+            )
+            self.stderr = std_errors
+
 
 
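bootstrap_std_dev is internal to the package, but the pattern it names is standard: refit the model on resampled data and take the standard deviation of the estimates across refits. A generic sketch under assumed names (fit_fn is hypothetical, and n_bootstraps=5 is far fewer replicates than usually recommended, trading accuracy for speed):

    import numpy as np

    def bootstrap_std_errors(fit_fn, X, y, n_bootstraps=5, seed=0):
        # fit_fn(X, y) is assumed to return a fitted parameter vector
        rng = np.random.default_rng(seed)
        estimates = []
        for _ in range(n_bootstraps):
            idx = rng.integers(0, len(y), size=len(y))  # resample rows with replacement
            estimates.append(fit_fn(X[idx], y[idx]))
        return np.std(np.asarray(estimates), axis=0)    # per-parameter std. dev.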
@@ -6031,8 +6083,8 @@
             ),
             method=self.method_ll,
             bounds=bounds,
-            tol=tol.get('ftol', 1e-8),  # Use 'ftol' as the default tolerance
-            options={'gtol': tol['gtol'], 'ftol': tol['ftol'], 'maxiter': 2000}
+            tol=tol.get('ftol', 1e-6),  # Use 'ftol' as the default tolerance
+            options={'gtol': tol['gtol'], 'ftol': tol['ftol'], 'maxiter': 200}
         )
 
         # Store the parameter estimates from this bootstrap iteration
@@ -6121,6 +6173,7 @@
         # Validation metrics if test data is available (in-sample and out-of-sample MAE)
         in_sample_mae = None
         out_sample_mae = None
+        out_sample_validation = None
         if self.is_multi and XX_test is not None:
             in_sample_mae = self.validation(
                 optimization_result['x'], y, XX, dispersion=dispersion, model_nature=mod, testing=0
@@ -6128,8 +6181,12 @@
             out_sample_mae = self.validation(
                 optimization_result['x'], y_test, XX_test, dispersion=dispersion, model_nature=mod
             )
+            if self.val_percentage > 0:
+                out_sample_validation = self.validation(
+                    optimization_result['x'], y_test, XX_test, dispersion=dispersion, model_nature=mod, testing=1
+                )
 
-            return log_ll, aic, bic, stderr, zvalues, pvalue_alt, in_sample_mae, out_sample_mae
+            return log_ll, aic, bic, stderr, zvalues, pvalue_alt, in_sample_mae, out_sample_mae, out_sample_validation
 
         else:
             # Optimization failed, return None for all metrics
@@ -6224,7 +6281,8 @@
 
         # Dispersion adds one additional parameter if enabled
         dispersion_param = 1 if dispersion > 0 else 0
-        return sum(self.get_num_params()) + dispersion_param
+        total = sum(self.get_num_params()) + dispersion_param
+        return total
 
     def _build_initial_params(self, num_coefficients, dispersion):
         """
@@ -6238,11 +6296,11 @@
             Initial parameter array.
         """
         # Generate random initial coefficients
-        initial_params = np.random.uniform(-.1, 0.1, size=num_coefficients)
+        initial_params = np.random.uniform(0.0000, 0.01, size=num_coefficients)
         parma_sum = sum(self.get_num_params()[:2])
 
 
-        initial_params[parma_sum:-dispersion] = 0.5
+        initial_params[parma_sum:-dispersion] = 0.0001
 
         # Add dispersion parameter if applicable
         if dispersion > 0:
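The new initialization starts all coefficients in a small positive band [0, 0.01] and pins the random-parameter spread entries at 0.0001, instead of the previous symmetric [-0.1, 0.1] draws with 0.5 spreads. A sketch of the resulting layout under assumed sizes (the split point is illustrative):

    import numpy as np

    num_coefficients, dispersion = 8, 1
    initial_params = np.random.uniform(0.0, 0.01, size=num_coefficients)
    parma_sum = 4                                   # assumed count of mean parameters
    initial_params[parma_sum:-dispersion] = 0.0001  # near-zero starting spreads
    print(initial_params)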
@@ -6251,7 +6309,7 @@
 
         return initial_params
 
-    def fitRegression(self, mod, dispersion=0, maxiter=4000, batch_size=None, num_hess=False, **kwargs):
+    def fitRegression(self, mod, dispersion=0, maxiter=20000, batch_size=None, num_hess=False, **kwargs):
         """
         Fits a Poisson regression, NB regression (dispersion=1), or GP regression (dispersion=2).
 
@@ -6293,7 +6351,7 @@
         )
 
         # Post-process results
-        log_lik, aic, bic, stderr, zvalues, pvalues, in_sample_mae, out_sample_mae = self._postprocess_results(
+        log_lik, aic, bic, stderr, zvalues, pvalues, in_sample_mae, out_sample_mae, out_sample_val = self._postprocess_results(
            optimization_result, XX, XX_test, y, mod.get('y_test'), dispersion, mod
         )
 
@@ -6325,10 +6383,14 @@
 
         # Add metrics to solution object
         sol = Solution()  # Assuming Solution is the appropriate class to store results
+
         sol.add_objective(
             bic=bic,
             aic=aic,
             loglik=log_ll,
+            TRAIN=in_sample_mae,
+            TEST=out_sample_mae,
+            VAL=out_sample_val,
             num_parm=paramNum,
             GOF=other_measures
         )
metacountregressor-0.1.309.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: metacountregressor
-Version: 0.1.307
+Version: 0.1.309
 Summary: Extensive Testing for Estimation of Data Count Models
 Home-page: https://github.com/zahern/CountDataEstimation
 Author: Zeke Ahern
metacountregressor-0.1.309.dist-info/RECORD
@@ -3,18 +3,18 @@ metacountregressor/_device_cust.py,sha256=759fnKmTYccJm4Lpi9_1reurh6OB9d6q9soPR0
 metacountregressor/app_main.py,sha256=vY3GczTbGbBRalbzMkl_9jVW7RMgEOc6z2Dr1IZJv9c,10014
 metacountregressor/data_split_helper.py,sha256=M2fIMdIO8znUaYhx5wlacRyNWdQjNYu1z1wkE-kFUYU,3373
 metacountregressor/halton.py,sha256=jhovA45UBoZYU9g-hl6Lb2sBIx_ZBTNdPrpgkzR9fng,9463
-metacountregressor/helperprocess.py,sha256=ufdB6BcCIYN6btWdxyFlRCReuYEbVh6es1sdLsd8RTg,25917
-metacountregressor/main.py,sha256=xfpKN2w0kePHp_Q2HOPjtG15PLEN1L3sEnDw1PHBquw,23668
+metacountregressor/helperprocess.py,sha256=8PFxX3KTsWH0MlfhniDzKQOJQ63LmJ0eg6cYhQP_fRA,26162
+metacountregressor/main.py,sha256=tGOm8DdbdyDf316qIxDAre6l6GzfJIWYNYIBaSeIemI,23685
 metacountregressor/main_old.py,sha256=eTS4ygq27MnU-dZ_j983Ucb-D5XfbVF8OJQK2hVVLZc,24123
-metacountregressor/metaheuristics.py,sha256=eVlP9FO8StVxj7D6m8n6ekRR45sOtjZuoakr5tzb-H4,106944
+metacountregressor/metaheuristics.py,sha256=P0Xjlvhp1cEwZFACrqeeets6x8BK7F2iDyu1OfS4bog,107010
 metacountregressor/pareto_file.py,sha256=whySaoPAUWYjyI8zo0hwAOa3rFk6SIUlHSpqZiLur0k,23096
 metacountregressor/pareto_logger__plot.py,sha256=mEU2QN4wmsM7t39GJ_XhJ_jjsdl09JOmG0U2jICrAkI,30037
 metacountregressor/setup.py,sha256=5UcQCCLR8Fm5odA3MX78WwahavxFq4mVD6oq0IuQvAY,936
 metacountregressor/single_objective_finder.py,sha256=jVG7GJBqzSP4_riYr-kMMKy_LE3SlGmKMunNhHYxgRg,8011
-metacountregressor/solution.py,sha256=3YaugVfEcOQnrtqY5chH-qhBl_2DmI8CatZyjFdQngA,317534
+metacountregressor/solution.py,sha256=YRskJOR7MU50z22mdt5J9KLMmzHXZNXGnHRMLQPc3R0,319113
 metacountregressor/test_generated_paper2.py,sha256=pwOoRzl1jJIIOUAAvbkT6HmmTQ81mwpsshn9SLdKOg8,3927
-metacountregressor-0.1.307.dist-info/licenses/LICENSE.txt,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-metacountregressor-0.1.307.dist-info/METADATA,sha256=478JkHo4OCeggDG7O0ujZ0HMi_NLzHGpSBvGH3WIyBU,23581
-metacountregressor-0.1.307.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
-metacountregressor-0.1.307.dist-info/top_level.txt,sha256=zGG7UC5WIpr76gsFUpwJ4En2aCcoNTONBaS3OewwjR0,19
-metacountregressor-0.1.307.dist-info/RECORD,,
+metacountregressor-0.1.309.dist-info/licenses/LICENSE.txt,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+metacountregressor-0.1.309.dist-info/METADATA,sha256=vLvLKlMnboMQGkDkupIo-Uwr9gx-rdM5HuEvrt08uMs,23581
+metacountregressor-0.1.309.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+metacountregressor-0.1.309.dist-info/top_level.txt,sha256=zGG7UC5WIpr76gsFUpwJ4En2aCcoNTONBaS3OewwjR0,19
+metacountregressor-0.1.309.dist-info/RECORD,,
metacountregressor-0.1.309.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (77.0.3)
+Generator: setuptools (78.0.2)
 Root-Is-Purelib: true
 Tag: py3-none-any