metacountregressor 1.0.10__py3-none-any.whl → 1.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metacountregressor/main.py +9 -1
- metacountregressor/solution.py +705 -150
- {metacountregressor-1.0.10.dist-info → metacountregressor-1.0.12.dist-info}/METADATA +1 -1
- {metacountregressor-1.0.10.dist-info → metacountregressor-1.0.12.dist-info}/RECORD +7 -7
- {metacountregressor-1.0.10.dist-info → metacountregressor-1.0.12.dist-info}/WHEEL +1 -1
- {metacountregressor-1.0.10.dist-info → metacountregressor-1.0.12.dist-info}/licenses/LICENSE.txt +0 -0
- {metacountregressor-1.0.10.dist-info → metacountregressor-1.0.12.dist-info}/top_level.txt +0 -0
metacountregressor/main.py
CHANGED
@@ -186,8 +186,16 @@ def main(args, **kwargs):
     }
     a_des, df = helperprocess.set_up_analyst_constraints(df, model_terms)
     # some example argument, these are defualt so the following line is just for claritity
-    args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6, 'instance_number': 1,
+    AMALAN = False
+    if AMALAN:
+
+        args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6, 'instance_number': 1,
             'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 600, 'desicions':a_des, 'is_multi': 1}
+    else:
+
+        args = {'algorithm': 'hs', 'test_percentage': 0, 'test_complexity': 2, 'instance_number': 1,
+                'val_percentage': 0, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 600, 'desicions': a_des,
+                'is_multi': False, 'grad_est': False, 'non_sig_prints':True, 'model_types': [[0]], 'run_bootstrap':0}
     # Fit the model with metacountregressor
     # Step 5: Transform the dataset based on the configuration
     #data_new = helperprocess.transform_dataframe(dataset, config)
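For orientation, the dictionaries above are the keyword arguments that drive the estimator. Below is a minimal sketch of how such an `args` dictionary is typically consumed; `ObjectiveFunction` comes from `solution.py` (changed later in this diff), while the `harmony_search` entry point, file name, and column names follow the package's usual usage pattern and should be treated as assumptions rather than part of this release.

```python
import pandas as pd
from metacountregressor.solution import ObjectiveFunction
from metacountregressor.metaheuristics import harmony_search  # assumed entry point for 'algorithm': 'hs'

# Hypothetical dataset: 'Y' holds the counts, every other column is a candidate explanatory variable.
df = pd.read_csv("example_counts.csv")
y = df[['Y']]
X = df.drop(columns=['Y'])

args = {'algorithm': 'hs', 'test_percentage': 0, 'test_complexity': 2,
        'obj_1': 'bic', '_obj_2': 'RMSE_TEST', 'MAX_TIME': 600, 'is_multi': False}

obj_fun = ObjectiveFunction(X, y, **args)   # builds the model search space from the data
results = harmony_search(obj_fun)           # runs the metaheuristic selected by 'algorithm'
```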
metacountregressor/solution.py
CHANGED
@@ -124,16 +124,17 @@ class ObjectiveFunction(object):
 
     def __init__(self, x_data, y_data, **kwargs):
         self.gbl_best = 1e5
+        self.non_sig_prints = kwargs.get('non_sig_prints', False)
         self.run_numerical_hessian = kwargs.get('r_nu_hess', False)
-        self.run_bootstrap =
+        self.run_bootstrap = kwargs.get('run_bootstrap', False)
         self.linear_regression = kwargs.get('linear_model', False)
-        self.reg_penalty = kwargs.get('reg_penalty',1)
+        self.reg_penalty = kwargs.get('reg_penalty', 1)
         self.power_up_ll = False
         self.nb_parma = 1
         self.bic = None
         self.other_bic = False
         self.test_flag = 1
-        self.no_extra_param =0 #if true, fix dispersion. w
+        self.no_extra_param = 0  # if true, fix dispersion. w
         if self.other_bic:
             print('change this to false latter ')
 
@@ -158,14 +159,14 @@ class ObjectiveFunction(object):
         self.rounding_point = kwargs.get('decimals_in_coeff', 4)
         self.MAE = None
         self.best_obj_1 = 1000000.0
-        self._obj_1 = kwargs.get('_obj_1', 'bic')
-        self._obj_2 = kwargs.get('_obj_2', 'MSE')
+        self._obj_1 = kwargs.get('_obj_1', 'bic')
+        self._obj_2 = kwargs.get('_obj_2', 'MSE')
         self.numerical_hessian_calc = 0  # calculates hessian by statsmodels otherwise scipy
         self.full_model = None
         self.GP_parameter = 0
         self.is_multi = kwargs.get('is_multi', False)
-        self.complexity_level = kwargs.get('complexity_level', 6)
-        self._max_iterations_improvement = kwargs.get("WIC",10000)
+        self.complexity_level = kwargs.get('complexity_level', kwargs.get('test_complexity',6))
+        self._max_iterations_improvement = kwargs.get("WIC", 10000)
         self.generated_sln = set()
         self.ave_mae = 0
         # defalt paramaters for hs #TODO unpack into harmony search class
@@ -173,17 +174,18 @@ class ObjectiveFunction(object):
         self._hms = 20
         self._max_time = self._max_time = kwargs.get('_max_time', kwargs.get('MAX_TIME', 0.8 * 60 * 60 * 24))
         self._hmcr = kwargs.get('_hmcr', .5)
-        self._par = 0.3
+        self._par = 0.3  # dont think this gets useted
         self._mpai = 1
         self._max_imp = kwargs.get('_max_imp', 90000000)
-        self._WIC =
+        self._WIC = kwargs.get("WIC",
+                               10000)  # Number of Iterations without Multiobjective Improvement #tod chuck into solution
         self._panels = None
         self.method_ll = 'Nelder-Mead-BFGS'
 
         self.method_ll = 'L-BFGS-B'  # alternatives 'BFGS_2', 'BFGS
         self.method_ll = kwargs.get('method', 'BFGS_2')
-
-        #self.method_ll = 'Nelder-Mead-BFGS'
+
+        # self.method_ll = 'Nelder-Mead-BFGS'
         self.Keep_Fit = 2
         self.MP = 0
         # Nelder-Mead-BFGS
@@ -192,7 +194,7 @@ class ObjectiveFunction(object):
 
         self.beta_dict = dict
         if 'model_terms' in kwargs:
-
+
             if kwargs.get('model_terms').get('group') is not None:
                 kwargs['group'] = kwargs.get('model_terms').get('group')
 
@@ -208,14 +210,13 @@ class ObjectiveFunction(object):
             if k in acceptable_keys_list:
                 self.__setattr__(k, self.tryeval(kwargs[k]))
 
-
         if 'complexity_level' in kwargs:
             self.complexity_level = kwargs['complexity_level']
 
         if 'instance_name' in kwargs:
             self.instance_name = str(kwargs['instance_name'])
         else:
-
+
             print('no name set, setting name as 0')
             self.instance_name = f"run_{str(0)}"  # set an arbitrary instance number
 
@@ -223,7 +224,8 @@ class ObjectiveFunction(object):
             self.save_state = True
             if not os.path.exists(self.instance_name):
                 if kwargs.get('make_directory', True):
-                    print(
+                    print(
+                        'Making a Directory, if you want to stop from storing the files to this directory set argumet: make_directory:False')
                     os.makedirs(self.instance_name)
         else:
             self.save_state = False
@@ -243,7 +245,6 @@ class ObjectiveFunction(object):
 
         self._input_data(x_data, y_data)
 
-
         if y_data.ndim == 1:
             y_data = pd.DataFrame(y_data)
 
@@ -257,14 +258,16 @@ class ObjectiveFunction(object):
             self.test_percentage = float(kwargs.get('test_percentage', 0))
             self.val_percentage = float(kwargs.get('val_percentage', 0))
             if self.test_percentage == 0:
-                print(
+                print(
+                    'test percentage is 0, please enter arg test_percentage as decimal if intended for multi objective optimisation, eg 0.8')
                 print('continuing single objective')
                 time.sleep(2)
                 self.is_multi = False
 
             if 'panels' in kwargs and not (kwargs.get('panels') == None):
                 if kwargs.get('group') is not None:
-                    self.group_names = np.asarray(
+                    self.group_names = np.asarray(
+                        x_data[kwargs['group']].astype('category').cat._parent.dtype.categories)
 
                     x_data[kwargs['group']] = x_data[kwargs['group']].astype(
                         'category').cat.codes
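The block above label-encodes the grouping column with pandas categoricals before the panel reshaping. A standalone sketch of the same idiom is shown below, using the public `cat.categories` accessor rather than the private `cat._parent.dtype.categories` path that appears in the diff; the column name and values are hypothetical.

```python
import numpy as np
import pandas as pd

x_data = pd.DataFrame({'group': ['NSW', 'VIC', 'NSW', 'QLD']})  # hypothetical grouping column

group_names = np.asarray(x_data['group'].astype('category').cat.categories)  # ['NSW' 'QLD' 'VIC']
x_data['group'] = x_data['group'].astype('category').cat.codes               # [0, 2, 0, 1]
```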
@@ -296,7 +299,7 @@ class ObjectiveFunction(object):
                 ids = np.random.choice(N, training_size, replace=False)
                 id_unique = np.array([i for i in range(N)])
                 ids = id_unique[ids]
-                #todo make sure its split so counts are split
+                # todo make sure its split so counts are split
                 train_idx = [ii for ii in range(len(id_unique)) if id_unique[ii] in ids]
                 test_idx = [ii for ii in range(len(id_unique)) if id_unique[ii] not in ids]
                 df_train = x_data.loc[train_idx, :]
@@ -304,13 +307,12 @@ class ObjectiveFunction(object):
             y_train = y_data.loc[train_idx, :]
             y_test = y_data.loc[test_idx, :]
 
-
-        #self.n_obs = N
+        # self.n_obs = N
         self._characteristics_names = list(self._x_data.columns)
         self._max_group_all_means = 2
 
         exclude_this_test = [4]
-
+
         if 'panels' in kwargs and not (kwargs.get('panels') == None):
             self.panels = np.asarray(df_train[kwargs['panels']])
             self.panels_test = np.asarray(df_test[kwargs['panels']])
@@ -358,70 +360,466 @@ class ObjectiveFunction(object):
                 X.drop(kwargs['group'], axis=1, inplace=True)
                 self.N_test, self.P_test = panel_info.shape
 
-                self.G = 1
-                self._Gnum = self.group_dummies.shape[2]
-                self.group_dummies_test = pd.get_dummies(group)
-                self.group_dummies_test = self.group_dummies_test.values.reshape(self.N_test, self.P_test, -1)
-                K = X.shape[1]
-                self.columns_names = X.columns
-                X = X.values.reshape(self.N_test, self.P_test, K)
-                X = X.astype('float')
-                self.group_halton_test = self.group_halton_test.reshape(self.N_test, self.P_test)[:, 0]
-                Y = Y.values.reshape(self.N_test, self.P_test, 1)
-                Y = Y.astype('float')
-                self._x_data_test = X.copy()
-                self.y_data_test = Y.copy()
-                self.y_data_test = self.y_data_test.astype('float')
+                self.G = 1
+                self._Gnum = self.group_dummies.shape[2]
+                self.group_dummies_test = pd.get_dummies(group)
+                self.group_dummies_test = self.group_dummies_test.values.reshape(self.N_test, self.P_test, -1)
+                K = X.shape[1]
+                self.columns_names = X.columns
+                X = X.values.reshape(self.N_test, self.P_test, K)
+                X = X.astype('float')
+                self.group_halton_test = self.group_halton_test.reshape(self.N_test, self.P_test)[:, 0]
+                Y = Y.values.reshape(self.N_test, self.P_test, 1)
+                Y = Y.astype('float')
+                self._x_data_test = X.copy()
+                self.y_data_test = Y.copy()
+                self.y_data_test = self.y_data_test.astype('float')
+
+                self._samples, self._panels, self._characteristics = self._x_data.shape
+
+
+
+            else:
+                print('No Panels. Grouped Random Paramaters Will not be estimated')
+                self.G = None
+                self._Gnum = 1
+                self._max_group_all_means = 0
+                self.ids = np.asarray(train_idx)
+                self.ids_test = np.asarray(test_idx)
+                groupll = None
+                X, Y, panel, group = self._arrange_long_format(
+                    df_train, y_train, self.ids, self.ids, groupll)
+
+                Xnew, Ynew, panel_info = self._balance_panels(X, Y, panel)
+                self.panel_info = panel_info
+                self.N, self.P = panel_info.shape
+
+                K = Xnew.shape[1]
+                self._characteristics_names = list(Xnew.columns)
+                XX = Xnew.values.reshape(self.N, self.P, K).copy()
+                XX = XX.astype('float')
+                YY = Ynew.values.reshape(self.N, self.P, 1).copy()
+                YY = YY.astype('float')
+                self._x_data = XX.copy()
+                self._y_data = YY.copy()
+
+                if self.is_multi:
+                    X, Y, panel, group = self._arrange_long_format(df_test, y_test, self.ids_test, self.ids_test, None)
+                    if np.max(group) > 50:
+                        exclude_this_test = [4]
+                    else:
+                        exclude_this_test = []
+                    X, Y, panel_info = self._balance_panels(X, Y, panel)
+
+                    self.N_test, self.P_test = panel_info.shape
+                    K = X.shape[1]
+                    self.columns_names = X.columns
+                    X = X.values.reshape(self.N_test, self.P_test, K)
+                    X = X.astype('float')
+                    Y = Y.values.reshape(self.N_test, self.P_test, 1)
+                    Y = Y.astype('float')
+                    self._x_data_test = X.copy()
+                    self.y_data_test = Y.copy()
+
+                self._samples, self._panels, self._characteristics = self._x_data.shape
+
+            # Define the offset into the data
+            self.process_offset()
+            if self.is_multi:
+                self.pareto_printer = Pareto(self._obj_1, self._obj_2, True)
+                self._pareto_population = list()
+
+            self.Ndraws = kwargs.get('Ndraws', 200)
+            self.draws1 = None
+            self.initial_sig = 1  # pass the test of a single model
+            self.pvalue_sig_value = kwargs.get('pvalue_sig_value', .1)
+            self.observations = self._x_data.shape[0]
+            self.minimize_scaler = 1 / self.observations  # scale the minimization function to the observations
+
+            self.batch_size = None
+            # open the file in the write mode
+            self.grab_transforms = 0
+
+            if not isinstance(self._characteristics, int):
+                raise Exception
+            if not isinstance(self._x_data, pd.DataFrame):
+
+                print('Setup Complete...')
+            else:
+                print('No Panels Supplied')
+                print('Setup Complete...')
+            self._characteristics_names = list(self._x_data.columns)
+            # define the variables
+
+            self._transformations = kwargs.get('_transformations', ["no", "log", "sqrt", "arcsinh", "nil"])
+            # self._distribution = ['triangular', 'uniform', 'normal', 'ln_normal', 'tn_normal', 'lindley']
+
+            self._distribution = kwargs.get('_distributions', ['triangular', 'uniform', 'normal', 'tn_normal', 'ln_normal'])
+
+            if self.G is not None:
+                # TODO need to handle this for groups
+
+                self._distribution = ["trad| " + item for item in self._distribution
+                                      ] + ["grpd| " + item for item in self._distribution]
+
+            # output information
+            self.convergence = None
+            self.coeff_names = None
+            self._interactions = None  # was 2
+            self.coeff_ = None
+
+            self.significant = 0
+            # define the states of our explanatory variables
+
+            self._discrete_values = self.define_alphas(self.complexity_level, exclude_this_test,
+                                                       kwargs.get('must_include', []), extra=kwargs.get('decisions', None))
+
+            self._discrete_values = self._discrete_values + \
+                self.define_distributions_analyst(extra=kwargs.get('decisions', None))
+
+            if 'model_types' in kwargs or 'Model' in kwargs:
+                model_type_mapping = {
+                    'POS': 0,
+                    'NB': 1
+                }
+                model_types = kwargs.get('model_types', kwargs.get('Model', [[0, 1]]))
+                converted_model_types = [
+                    [model_type_mapping.get(item, item) for item in sublist]
+                    for sublist in model_types
+                ]
+                model_types = converted_model_types
+                # this should be a list of list like [[0, 1]]
+                # also if it is [['POS', 'NB']] then it will be converted to [0, 1]
+            else:
+
+                model_types = [[0, 1]]  # add 2 for Generalized Poisson
+
+                # model_types = [[0]]
+
+            if self.linear_regression:
+                model_types = [[1]]
+                self.grad_yes = False
+
+                print(f'Linear Model Selected: turning off gradient calculation')
+
+            model_t_dict = {'Poisson': 0,
+                            "NB": 1}
+            if self.linear_regression:
+                # Rename key "NB" to "sigma" if it exists in the dictionary
+                if "NB" in model_t_dict:
+                    model_t_dict["sigma"] = model_t_dict.pop("NB")
+
+            # Retrieve the keys (model names) corresponding to the values in model_types
+            model_keys = [key for key, value in model_t_dict.items() if value in model_types[0]]
+            # Print the formatted result
+            print(f'The type of models possible will consider: {", ".join(model_keys)}')
+            self._discrete_values = self._discrete_values + self.define_poissible_transforms(
+                self._transformations, kwargs.get('decisions', None)) + model_types
+
+            self._model_type_codes = ['p', 'nb',
+                                      'gp', "pl", ["nb-theta", 'nb-dis']]
+            self.update_model_type_codes()
+            self._variable = [True] * len(self._discrete_values)
+            self._lower_bounds = [None] * \
+                len(self._discrete_values)  # TODO have continus
+            self._upper_bounds = [None] * \
+                len(self._discrete_values)  # TODO have continous
+            # model specs
+            self.endog = None
+            # solution parameters
+            self._min_characteristics = kwargs.get('_min_vars', 3)
+            self._max_hurdle = 4
+
+            # Manually fit from analyst specification
+            manual_fit = kwargs.get('Manual_Fit', None)
+            if manual_fit is not None:
+                print('fitting manual')
+                self.process_manual_fit(manual_fit)
+
+            self.solution_analyst = None
+
+    def __init__v(self, x_data, y_data, **kwargs):
+        # Default attributes
+        self.gbl_best = 1e5
+        self.non_sig_prints = kwargs.get('non_sig_prints', False)
+        self.run_numerical_hessian = kwargs.get('r_nu_hess', False)
+        self.run_bootstrap = kwargs.get('run_bootstrap', False)
+        self.linear_regression = kwargs.get('linear_model', False)
+        self.reg_penalty = kwargs.get('reg_penalty', 1)
+        self.power_up_ll = False
+        self.nb_parma = 1
+        self.bic = None
+        self.other_bic = False
+        self.test_flag = 1
+        self.no_extra_param = 0  # Fix dispersion if true
+        self.constant_value = 0
+        self.negative_binomial_value = 1
+        self.verbose_safe = kwargs.get('verbose', 0)
+        self.please_print = kwargs.get('please_print', 0)
+        self.grad_yes = kwargs.get('grad_est', False)
+        self.hess_yes = False
+        self.rounding_point = kwargs.get('decimals_in_coeff', 4)
+        self.best_obj_1 = 1e6
+        self._obj_1 = kwargs.get('_obj_1', 'bic')
+        self._obj_2 = kwargs.get('_obj_2', 'MSE')
+        self.numerical_hessian_calc = 0
+        self.full_model = None
+        self.GP_parameter = 0
+        self.is_multi = kwargs.get('is_multi', False)
+        self.complexity_level = kwargs.get('complexity_level', 6)
+        self._max_iterations_improvement = kwargs.get("WIC", 10000)
+        self.generated_sln = set()
+        self.ave_mae = 0
+
+        # Harmony search parameters
+        self.algorithm = kwargs.get('algorithm', 'hs')
+        self._hms = 20
+        self._max_time = kwargs.get('_max_time', 0.8 * 60 * 60 * 24)
+        self._hmcr = kwargs.get('_hmcr', 0.5)
+        self._par = 0.3
+        self._mpai = 1
+        self._max_imp = kwargs.get('_max_imp', 90000000)
+        self.method_ll = kwargs.get('method', 'BFGS_2')
+        self._max_characteristics = kwargs.get('_max_vars', 90)
+
+        # Beta dictionary
+        self.beta_dict = dict
+
+        # Handle `model_terms` in kwargs
+        if 'model_terms' in kwargs:
+            if kwargs['model_terms'].get('group') is not None:
+                kwargs['group'] = kwargs['model_terms']['group']
+            if kwargs['model_terms'].get('panels') is not None:
+                kwargs['panels'] = kwargs['model_terms']['panels']
+
+        # Acceptable keys
+        acceptable_keys = [
+            '_par', '_max_imp', '_hmcr', 'steps', 'algorithm', '_random_seed', '_max_time',
+            'forcedvariables', '_obj_1', '_obj_2', 'Manuel_Estimate', 'test_percentage', 'is_multi',
+            'val_percentage', 'complexity_level', '_hms', '_mpai', 'group', '_max_characteristics',
+            'zi_force_names'
+        ]
+        for k, v in kwargs.items():
+            if k in acceptable_keys:
+                setattr(self, k, self.tryeval(v))
+
+        # Instance name
+        self.instance_name = str(kwargs.get('instance_name', f"run_0"))
+        if kwargs.get('save_directory', True):
+            self.save_state = True
+            if not os.path.exists(self.instance_name):
+                if kwargs.get('make_directory', True):
+                    print(f"Creating directory: {self.instance_name}")
+                    os.makedirs(self.instance_name)
+        else:
+            self.save_state = False
+
+        # P-value penalty
+        self.pvalue_penalty = float(kwargs.get('pvalue_penalty', 0.5))
+        self.pvalue_exceed = 0
+        self._maximize = False
+
+        # Data processing
+        x_data = sm.add_constant(x_data)
+        if kwargs.get('standardize_data', False):
+            print("Standardizing data")
+            x_data = self.self_standardize_positive(x_data)
+
+        self._input_data(x_data, y_data)
+        if y_data.ndim == 1:
+            y_data = pd.DataFrame(y_data)
+
+        # Handle panels and groups
+        self.handle_panels_and_groups(x_data, y_data, kwargs)
+
+        # Define transformations and distributions
+        self._transformations = kwargs.get('_transformations', ["no", "log", "sqrt", "arcsinh", "nil"])
+        self._distribution = kwargs.get(
+            '_distributions', ['triangular', 'uniform', 'normal', 'tn_normal', 'ln_normal']
+        )
+        if self.G is not None:
+            self._distribution = ["trad| " + dist for dist in self._distribution] + \
+                                 ["grpd| " + dist for dist in self._distribution]
+
+        # Output and model specs
+        self.convergence = None
+        self.coeff_names = None
+        self._interactions = None
+        self.coeff_ = None
+        self.significant = 0
+        self._min_characteristics = kwargs.get('_min_vars', 3)
+        self._max_hurdle = 4
+        self.solution_analyst = None
+
+        # Setup complete
+        print("Setup Complete...")
+
+    def handle_panels_and_groups(self, x_data, y_data, kwargs):
+        """Handles panels and groups for the model."""
+        if 'panels' in kwargs and kwargs['panels'] is not None:
+            self.setup_panels_groups(x_data, y_data, kwargs)
+        else:
+            print("No Panels. Grouped Random Parameters Will Not Be Estimated")
+            self.G = None
+            self._Gnum = 1
+            self._max_group_all_means = 0
+
+
+
+
+
+
+
+
+
+    def __init__old(self, x_data, y_data, **kwargs):
+
+        # 1. GENERAL PARAMETERS
+        self.gbl_best = 1e5
+        self.non_sig_prints = kwargs.get('non_sig_prints', True)
+        self.run_numerical_hessian = kwargs.get('r_nu_hess', False)
+        self.run_bootstrap = kwargs.get('run_bootstrap', False)
+        self.linear_regression = kwargs.get('linear_model', False)
+        self.reg_penalty = kwargs.get('reg_penalty',1)
+        self.power_up_ll = False
+        self.nb_parma = 1
+        self.bic = None
+        self.other_bic = False
+        self.test_flag = 1
+        self.no_extra_param =0 #if true, fix dispersion. w
+        if self.other_bic:
+            print('change this to false latter ')
+
+        # initialize values
+        self.constant_value = 0
+        self.negative_binomial_value = 1
+
+        self.verbose_safe = kwargs.get('verbose', 0)
+        self.please_print = kwargs.get('please_print', 0)
+        self.group_halton = None
+        self.grad_yes = kwargs.get('grad_est', False)
+        self.hess_yes = False
+        self.group_halton_test = None
+        self.panels = None
+        self.group_names = []
+        self.pvalues = None
+        self.Last_Sol = None
+        self.fixed_fit = None
+        self.rdm_fit = None
+        self.rdm_cor_fit = None
+        self.dist_fit = None
+        self.rounding_point = kwargs.get('decimals_in_coeff', 4)
+        self.MAE = None
+        self.best_obj_1 = 1000000.0
+        self._obj_1 = kwargs.get('_obj_1', 'bic')
+        self._obj_2 = kwargs.get('_obj_2', 'MSE')
+        self.numerical_hessian_calc = 0  # calculates hessian by statsmodels otherwise scipy
+        self.full_model = None
+        self.GP_parameter = 0
+        self.is_multi = kwargs.get('is_multi', False)
+        self.complexity_level = kwargs.get('complexity_level', 6)
+        self._max_iterations_improvement = kwargs.get("WIC",10000)
+        self.generated_sln = set()
+        self.ave_mae = 0
+        # defalt paramaters for hs #TODO unpack into harmony search class
+        self.algorithm = kwargs.get('algorithm', 'hs')  # 'sa' 'de' also avialable
+        self._hms = 20
+        self._max_time = self._max_time = kwargs.get('_max_time', kwargs.get('MAX_TIME', 0.8 * 60 * 60 * 24))
+        self._hmcr = kwargs.get('_hmcr', .5)
+        self._par = 0.3 #dont think this gets useted
+        self._mpai = 1
+        self._max_imp = kwargs.get('_max_imp', 90000000)
+        self._WIC = kwargs.get("WIC",10000)  # Number of Iterations without Multiobjective Improvement #tod chuck into solution
+        self._panels = None
+        self.method_ll = 'Nelder-Mead-BFGS'
+
+        self.method_ll = 'L-BFGS-B'  # alternatives 'BFGS_2', 'BFGS
+        self.method_ll = kwargs.get('method', 'BFGS_2')
+
+        #self.method_ll = 'Nelder-Mead-BFGS'
+        self.Keep_Fit = 2
+        self.MP = 0
+        # Nelder-Mead-BFGS
+
+        self._max_characteristics = kwargs.get('_max_vars', 90)
+
+        self.beta_dict = dict
+        if 'model_terms' in kwargs:
+
+            if kwargs.get('model_terms').get('group') is not None:
+                kwargs['group'] = kwargs.get('model_terms').get('group')
+
+            if kwargs.get('model_terms').get('panels') is not None:
+                kwargs['panels'] = kwargs.get('model_terms').get('panels')
+        acceptable_keys_list = ['_par', '_max_imp', '_hmcr', 'steps',
+                                'algorithm', '_random_seed', '_max_time',
+                                'forcedvariables', '_obj_1', '_obj_2', '_par',
+                                'Manuel_Estimate', 'test_percentage', 'is_multi', 'val_percentage'
+                                'complexity_level', '_hms', '_mpai',
+                                'group', '_max_characteristics', 'zi_force_names']
+        for k in kwargs.keys():
+            if k in acceptable_keys_list:
+                self.__setattr__(k, self.tryeval(kwargs[k]))
+
+
+        if 'complexity_level' in kwargs:
+            self.complexity_level = kwargs['complexity_level']
+
+        if 'instance_name' in kwargs:
+            self.instance_name = str(kwargs['instance_name'])
+        else:
+
+            print('no name set, setting name as 0')
+            self.instance_name = f"run_{str(0)}"  # set an arbitrary instance number
+
+        if kwargs.get('save_directory', True):
+            self.save_state = True
+            if not os.path.exists(self.instance_name):
+                if kwargs.get('make_directory', True):
+                    print('Making a Directory, if you want to stop from storing the files to this directory set argumet: make_directory:False')
+                    os.makedirs(self.instance_name)
+        else:
+            self.save_state = False
+        if not hasattr(self, '_obj_1'):
+            print('_obj_1 required, define as bic, aic, ll')
+            raise Exception
+
+        self.pvalue_penalty = float(kwargs.get('pvalue_penalty', 0.5))
+        self.pvalue_exceed = 0
+        self._maximize = False  # do we maximize or minimize?
+
+        x_data = sm.add_constant(x_data)
+        standardize_the_data = 0
+        if standardize_the_data:
+            print('we are standardize the data')
+            x_data = self.self_standardize_positive(x_data)
 
-
+        self._input_data(x_data, y_data)
 
-
 
-
-
-        self.G = None
-        self._Gnum = 1
-        self._max_group_all_means = 0
-        self.ids = np.asarray(train_idx)
-        self.ids_test = np.asarray(test_idx)
-        groupll = None
-        X, Y, panel, group = self._arrange_long_format(
-            df_train, y_train, self.ids, self.ids, groupll)
+        if y_data.ndim == 1:
+            y_data = pd.DataFrame(y_data)
 
-
-
-
+        '''
+        #TODO ADD THIS IN LATER
+        splitter = DataProcessor(x_data, y_data, kwargs)
+        self.copy_class_attributes(splitter) #inherit the self objects
+        '''
 
-        K = Xnew.shape[1]
-        self._characteristics_names = list(Xnew.columns)
-        XX = Xnew.values.reshape(self.N, self.P, K).copy()
-        XX = XX.astype('float')
-        YY = Ynew.values.reshape(self.N, self.P, 1).copy()
-        YY = YY.astype('float')
-        self._x_data = XX.copy()
-        self._y_data = YY.copy()
 
-
-
-
-
-
-
-        X, Y, panel_info = self._balance_panels(X, Y, panel)
-
-        self.N_test, self.P_test = panel_info.shape
-        K = X.shape[1]
-        self.columns_names = X.columns
-        X = X.values.reshape(self.N_test, self.P_test, K)
-        X = X.astype('float')
-        Y = Y.values.reshape(self.N_test, self.P_test, 1)
-        Y = Y.astype('float')
-        self._x_data_test = X.copy()
-        self.y_data_test = Y.copy()
-
-        self._samples, self._panels, self._characteristics = self._x_data.shape
-
+        result = self.handle_data_splitting(x_data, y_data, kwargs)
+        if result[0] is not None:  # Check if splitting was done
+            self.df_train, self.df_test, self.y_train, self.y_test = result
+        #self.n_obs = N
+        self._characteristics_names = list(self._x_data.columns)
+        self._max_group_all_means = 2
 
+        exclude_this_test = [4]
+
+
+        # Other initialization logic...
+        #TODO CHECK THIS IS RIGHT
+        self.process_panels_and_groups(self.df_train, self.df_test, self.y_train, self.y_test, kwargs)
         #Define the offset into the data
         self.process_offset()
         if self.is_multi:
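Among the changes above, the rewritten constructor now accepts `model_types` either as integer codes or as names ('POS'/'NB') and normalises them with a small mapping. A self-contained check of that conversion logic:

```python
model_type_mapping = {'POS': 0, 'NB': 1}
model_types = [['POS', 'NB'], [0]]

converted = [[model_type_mapping.get(item, item) for item in sublist] for sublist in model_types]
print(converted)  # [[0, 1], [0]] -- names are mapped, existing integer codes pass through unchanged
```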
@@ -534,8 +932,7 @@ class ObjectiveFunction(object):
                              len(self._discrete_values)  # TODO have continus
         self._upper_bounds = [None] * \
                              len(self._discrete_values)  # TODO have continous
-
-        self.endog = None
+
         # solution parameters
         self._min_characteristics = kwargs.get('_min_vars', 3)
         self._max_hurdle = 4
@@ -561,7 +958,221 @@ class ObjectiveFunction(object):
         # Update the _model_type_codes list
         self._model_type_codes = replace_nb_with_sigma(self._model_type_codes)
 
+    def process_panels_and_groups(self, df_train, df_test, y_train, y_test, kwargs):
+        """
+        Process panels and groups for grouped random parameters.
+
+        Args:
+            df_train (DataFrame): Training feature data.
+            df_test (DataFrame): Testing feature data.
+            y_train (DataFrame): Training target data.
+            y_test (DataFrame): Testing target data.
+            kwargs (dict): Keyword arguments with configurations.
+
+        Returns:
+            None. Updates instance variables for panel processing.
+        """
+
+
+        if 'panels' in kwargs and kwargs.get('panels') is not None:
+            # Extract panel and group information
+            self.panels = np.asarray(df_train[kwargs['panels']])
+            self.panels_test = np.asarray(df_test[kwargs['panels']])
+            self.ids = np.asarray(df_train[kwargs['panels']])
+            self.ids_test = np.asarray(df_test[kwargs['panels']])
+
+            # Process group information if provided
+            if kwargs.get('group') is not None:
+                groupll = np.asarray(df_train[kwargs['group']].astype('category').cat.codes)
+                group_test = np.asarray(df_test[kwargs['group']].astype('category').cat.codes)
+            else:
+                groupll = None
+
+            # Arrange data in long format
+            X, Y, panel, group = self._arrange_long_format(df_train, y_train, self.ids, self.ids, groupll)
+            self.group_halton = group.copy()
+            self.group_dummies = pd.get_dummies(group)
+
+            # Balance panels for training data
+            Xnew, Ynew, panel_info = self._balance_panels(X, Y, panel)
+            Xnew = pd.DataFrame(Xnew, columns=X.columns)
+            self.panel_info = panel_info
+            self.N, self.P = panel_info.shape
+
+            # Drop panel and group columns
+            Xnew.drop(kwargs['panels'], axis=1, inplace=True)
+            Xnew.drop(kwargs['group'], axis=1, inplace=True)
+
+            # Reshape data
+            K = Xnew.shape[1]
+            self._characteristics_names = list(Xnew.columns)
+            XX = Xnew.values.reshape(self.N, self.P, K).astype('float')
+            YY = Ynew.values.reshape(self.N, self.P, 1).astype('float')
+
+            self._x_data = XX.copy()
+            self._y_data = YY.copy()
+
+            # Process test data
+            X, Y, panel, group = self._arrange_long_format(df_test, y_test, self.ids_test, self.panels_test, group_test)
+            if np.max(group) > 50:
+                exclude_this_test = [4]
+                self._max_group_all_means = 0
+            else:
+                exclude_this_test = []
+
+            self.group_halton_test = group.copy()
+            X, Y, panel_info = self._balance_panels(X, Y, panel)
+            X.drop(kwargs['panels'], axis=1, inplace=True)
+            X.drop(kwargs['group'], axis=1, inplace=True)
+
+            # Reshape test data
+            self.N_test, self.P_test = panel_info.shape
+            self.G = 1
+            self._Gnum = self.group_dummies.shape[2]
+            self.group_dummies_test = pd.get_dummies(group).values.reshape(self.N_test, self.P_test, -1)
+
+            K = X.shape[1]
+            self.columns_names = X.columns
+            X = X.values.reshape(self.N_test, self.P_test, K).astype('float')
+            Y = Y.values.reshape(self.N_test, self.P_test, 1).astype('float')
+
+            self._x_data_test = X.copy()
+            self.y_data_test = Y.copy()
+            self.y_data_test = self.y_data_test.astype('float')
+
+            # Update shape attributes
+            self._samples, self._panels, self._characteristics = self._x_data.shape
+
+        else:
+            print('No Panels. Grouped Random Parameters Will not be estimated')
+            # Handle case with no panels
+            self.G = None
+            self._Gnum = 1
+            self._max_group_all_means = 0
+            self.ids = np.asarray(df_train.index)
+            self.ids_test = np.asarray(df_test.index)
+
+            # Arrange and balance training data
+            groupll = None
+            X, Y, panel, group = self._arrange_long_format(df_train, y_train, self.ids, self.ids, groupll)
+            Xnew, Ynew, panel_info = self._balance_panels(X, Y, panel)
+            self.panel_info = panel_info
+            self.N, self.P = panel_info.shape
+
+            K = Xnew.shape[1]
+            self._characteristics_names = list(Xnew.columns)
+            XX = Xnew.values.reshape(self.N, self.P, K).astype('float')
+            YY = Ynew.values.reshape(self.N, self.P, 1).astype('float')
+
+            self._x_data = XX.copy()
+            self._y_data = YY.copy()
+
+            # Arrange and balance test data if multi-objective optimization is enabled
+            if self.is_multi:
+                X, Y, panel, group = self._arrange_long_format(df_test, y_test, self.ids_test, self.ids_test, None)
+                if np.max(group) > 50:
+                    exclude_this_test = [4]
+                else:
+                    exclude_this_test = []
+
+                X, Y, panel_info = self._balance_panels(X, Y, panel)
+                self.N_test, self.P_test = panel_info.shape
+                K = X.shape[1]
+                self.columns_names = X.columns
+                X = X.values.reshape(self.N_test, self.P_test, K).astype('float')
+                Y = Y.values.reshape(self.N_test, self.P_test, 1).astype('float')
+
+                self._x_data_test = X.copy()
+                self.y_data_test = Y.copy()
+
+            # Update shape attributes
+            self._samples, self._panels, self._characteristics = self._x_data.shape
+
+    def handle_data_splitting(self, x_data, y_data, kwargs):
+        """
+        Handle data splitting for training and testing based on objectives and panels.
+
+        Args:
+            x_data (DataFrame): Input feature data.
+            y_data (DataFrame): Input target data.
+            kwargs (dict): Dictionary of configuration options.
+
+        Returns:
+            tuple: (df_train, df_test, y_train, y_test) - Split datasets.
+        """
+
+
+        # Check if the objectives involve multi-objective metrics
+        if self._obj_1 == 'MAE' or self._obj_2 in ["MAE", "RMSE", "MSE", "RMSE_IN", "RMSE_TEST"]:
+            # Retrieve test and validation percentages
+            self.test_percentage = float(kwargs.get('test_percentage', 0))
+            self.val_percentage = float(kwargs.get('val_percentage', 0))
+
+            # Handle zero test percentage
+            if self.test_percentage == 0:
+                print("Test percentage is 0. Please provide 'test_percentage' as a decimal (e.g., 0.8) for multi-objective optimization.")
+                print("Continuing with single-objective optimization.")
+                time.sleep(2)
+                self.is_multi = False
+
+            # Handle panels if provided
+            if 'panels' in kwargs and kwargs.get('panels') is not None:
+                # Handle group information if provided
+                if kwargs.get('group') is not None:
+                    self.group_names = np.asarray(x_data[kwargs['group']].astype('category').cat.categories)
+                    x_data[kwargs['group']] = x_data[kwargs['group']].astype('category').cat.codes
+
+                # Set complexity level
+                self.complexity_level = 6
+
+                # Prepare test and training dataset splits based on panels
+                try:
+                    x_data[kwargs['panels']] = x_data[kwargs['panels']].rank(method='dense').astype(int)
+                    x_data[kwargs['panels']] -= x_data[kwargs['panels']].min() - 1
 
+                    N = len(np.unique(x_data[kwargs['panels']].values))
+                    id_unique = np.unique(x_data[kwargs['panels']].values)
+                except KeyError:
+                    N = len(np.unique(x_data[kwargs['panels']]))
+                    id_unique = np.unique(x_data[kwargs['panels']].values)
+
+                # Calculate training size and split IDs
+                training_size = int((1 - self.test_percentage - self.val_percentage) * N)
+                ids = np.random.choice(N, training_size, replace=False)
+                ids = id_unique[ids]
+
+                # Create training and testing indices
+                train_idx = [ii for ii, id_val in enumerate(x_data[kwargs['panels']]) if id_val in ids]
+                test_idx = [ii for ii, id_val in enumerate(x_data[kwargs['panels']]) if id_val not in ids]
+
+                # Split datasets
+                df_train = x_data.loc[train_idx, :]
+                df_test = x_data.loc[test_idx, :]
+                y_train = y_data.loc[train_idx, :]
+                y_test = y_data.loc[test_idx, :]
+            else:
+                # Handle case when panels are not provided
+                N = len(x_data)
+                training_size = int((1 - self.test_percentage - self.val_percentage) * N)
+                ids = np.random.choice(N, training_size, replace=False)
+                id_unique = np.array([i for i in range(N)])
+                ids = id_unique[ids]
+
+                # Create training and testing indices
+                train_idx = [ii for ii in range(len(id_unique)) if id_unique[ii] in ids]
+                test_idx = [ii for ii in range(len(id_unique)) if id_unique[ii] not in ids]
+
+                # Split datasets
+                df_train = x_data.loc[train_idx, :]
+                df_test = x_data.loc[test_idx, :]
+                y_train = y_data.loc[train_idx, :]
+                y_test = y_data.loc[test_idx, :]
+
+            return df_train, df_test, y_train, y_test
+
+        else:
+            print("No multi-objective metrics detected. No data splitting performed.")
+            return None, None, None, None
     def over_ride_self(self, **kwargs):
         """
         Dynamically sets attributes on the instance based on the provided keyword arguments.
@@ -1194,7 +1805,7 @@ class ObjectiveFunction(object):
 
         self.coeff_ = self.convert_coefficients(self.coeff_, model)
         self.coeff_ = [self.round_with_padding(x, self.rounding_point) for x in self.coeff_]
-        self.stderr = [self.round_with_padding(x,
+        self.stderr = [self.round_with_padding(x, self.rounding_point) for x in self.stderr]
         self.zvalues = [self.round_with_padding(
             x, 2) for x in self.zvalues]
 
@@ -2774,27 +3385,6 @@ class ObjectiveFunction(object):
     def get_termination_iter(self):
         return self._max_iterations_improvement
 
-    def score(self, params):
-        """
-        Poisson model score (gradient) vector of the log-likelihood
-        Parameters
-        ----------
-        params : array_like
-            The parameters of the model
-        Returns
-        -------
-        score : ndarray, 1-D
-            The score vector of the model, i.e. the first derivative of the
-            loglikelihood function, evaluated at `params`
-        Notes
-        -----
-        .. math:: \\frac{\\partial\\ln L}{\\partial\\beta}=\\sum_{i=1}^{n}\\left(y_{i}-\\lambda_{i}\\right)x_{i}
-        where the loglinear model is assumed
-        .. math:: \\ln\\lambda_{i}=x_{i}\\beta
-        """
-        X = self.exog
-        L = np.exp(np.dot(X, params))
-        return np.dot(self.endog - L, X)
 
     def GenPos_Score(self, params, y, mu, X, p=0, obs_specific=False):
 
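The deleted `score` method implemented the textbook Poisson gradient, d ln L / d beta = sum_i (y_i - lambda_i) * x_i with ln lambda_i = x_i beta, as its docstring stated. Since the class no longer carries it, a self-contained sketch of the same computation is kept here for reference:

```python
import numpy as np

def poisson_score(params, X, y):
    """Gradient of the Poisson log-likelihood: sum_i (y_i - exp(x_i @ params)) * x_i."""
    lam = np.exp(X @ params)
    return (y - lam) @ X

X = np.array([[1.0, 0.5], [1.0, 1.5], [1.0, 2.0]])
y = np.array([1.0, 3.0, 4.0])
print(poisson_score(np.zeros(2), X, y))  # [5. 9.]
```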
@@ -4041,48 +4631,7 @@ class ObjectiveFunction(object):
         eVd = np.exp(np.clip(eta, None, EXP_UPPER_LIMIT))
         return eVd
 
-
-        from scipy._lib._util import _lazywhere
-        from statsmodels.discrete.discrete_model import Logit
-        self.k_inflate = k_inflate
-        self.exog = exog.to_numpy()
-        self.endog = y_values.values.ravel()
-        exog = exog.to_numpy()
-        exog_infl = exog_infl.to_numpy()
-
-        def _argcheck(self, mu, alpha, p):
-            return (mu >= 0) & (alpha == alpha) & (p > 0)
-
-        def loglik_obs_poisson(params, y):
-            """
-            Loglikelihood for observations of Poisson model
-
-            Parameters
-            ----------
-            params : array_like
-                The parameters of the model.
-
-            Returns
-            -------
-            loglike : array_like
-                The log likelihood for each observation of the model evaluated
-                at `params`. See Notes
-
-            Notes
-            -----
-            .. math:: \\ln L_{i}=\\left[-\\lambda_{i}+y_{i}x_{i}^{\\prime}\\beta-\\ln y_{i}!\\right]
-
-            for observations :math:`i=1,...,n`
-            """
-            offset = getattr(self, "offset", 0)
-            exposure = getattr(self, "exposure", 0)
-            XB = np.dot(self.exog, params) + offset + exposure
-
-            # np.sum(stats.poisson.logpmf(endog, np.exp(XB)))
-            return -np.exp(XB) + y * XB - sc.gammaln(y + 1)
-
-
-
+
     def dpoisl(self, x, theta, log=False):
         # if theta < 0:
         #     raise ValueError("theta must be positive!")
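The dead code removed here included a per-observation Poisson log-likelihood, ln L_i = -lambda_i + y_i * x_i'beta - ln(y_i!), per its docstring. An equivalent standalone sketch using scipy's `gammaln`, matching the removed formula:

```python
import numpy as np
from scipy.special import gammaln

def loglik_obs_poisson(params, X, y, offset=0.0, exposure=0.0):
    """Per-observation Poisson log-likelihood: -exp(XB) + y * XB - ln(y!)."""
    XB = X @ params + offset + exposure
    return -np.exp(XB) + y * XB - gammaln(y + 1)

X = np.array([[1.0, 0.5], [1.0, 1.5]])
y = np.array([1.0, 2.0])
print(loglik_obs_poisson(np.array([0.1, 0.2]), X, y))
```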
@@ -5703,7 +6252,9 @@ class ObjectiveFunction(object):
         coeff_ = optim_res['x']
         penalty = 0
         stderr_opg = None
-        if self.
+        if self.run_bootstrap:
+            stderr_opg = self.stderr
+        elif self.run_numerical_hessian:
 
             stderr_opg = self.stderr
 
@@ -5717,7 +6268,8 @@ class ObjectiveFunction(object):
         else:
             covariance = np.diag(np.ones(len(optim_res.x)))
             covariance = self.handle_covariance(covariance)
-
+        self
+        covariance = np.clip(covariance, 0, self.N)
         stderr = np.sqrt(np.diag(covariance))
         if stderr_opg is not None:
             stderr = np.minimum(stderr, stderr_opg)
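The hunk above clips the covariance matrix, takes the square root of its diagonal as the standard errors, and keeps the smaller of those and any OPG/bootstrap estimate. A minimal numeric illustration of that step; the clip bound here is an arbitrary stand-in for `self.N`:

```python
import numpy as np

covariance = np.array([[0.04, -0.01],
                       [-0.01, 0.09]])
covariance = np.clip(covariance, 0, 100)   # guard against negative or exploding entries
stderr = np.sqrt(np.diag(covariance))      # standard errors from the diagonal
stderr_opg = np.array([0.15, 0.35])
stderr = np.minimum(stderr, stderr_opg)    # keep the smaller of the two estimates
print(stderr)                              # [0.15 0.3 ]
```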
@@ -6114,7 +6666,7 @@ class ObjectiveFunction(object):
             bounds=bounds,
             tol=tol,
             mod=mod,
-            n_bootstraps=
+            n_bootstraps=6
         )
         self.stderr = std_errors
 
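The call above now hard-codes `n_bootstraps=6` for the internal bootstrap of standard errors. For reference, a generic nonparametric bootstrap of coefficient standard errors looks like the sketch below; `fit_fn` and the draw count are placeholders, not the package's internal routine.

```python
import numpy as np

def bootstrap_stderr(X, y, fit_fn, n_bootstraps=6, seed=0):
    """Refit on resampled rows and take the standard deviation of the coefficient draws."""
    rng = np.random.default_rng(seed)
    n = len(y)
    draws = []
    for _ in range(n_bootstraps):
        idx = rng.integers(0, n, size=n)       # sample observations with replacement
        draws.append(fit_fn(X[idx], y[idx]))   # fit_fn returns a coefficient vector
    return np.std(np.asarray(draws), axis=0)
```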
@@ -6260,6 +6812,7 @@ class ObjectiveFunction(object):
             optimization_result, simple_fit=False, is_dispersion=dispersion
         )
 
+
         # Validation metrics if test data is available (in-sample and out-of-sample MAE)
         in_sample_mae = None
         out_sample_mae = None
@@ -7575,9 +8128,11 @@ class ObjectiveFunction(object):
 
 
         else:
-            if self.significant == 1 and obj_1['layout'] is not None and obj_1['pval_exceed'] == 0:
-                self.summary_alternative(model=dispersion, solution=obj_1)
 
+            if self.significant == 1 and obj_1['layout'] is not None:
+                self.summary_alternative(long_print = self.non_sig_prints, model=dispersion, solution=obj_1)
+            elif self.significant == 1 and obj_1['layout'] is not None and obj_1['pval_exceed'] == 0:
+                self.summary_alternative(model=dispersion, solution=obj_1)
         return obj_1, model_nature
 
     def get_X_tril(self):
{metacountregressor-1.0.10.dist-info → metacountregressor-1.0.12.dist-info}/RECORD
CHANGED
@@ -4,18 +4,18 @@ metacountregressor/app_main.py,sha256=vY3GczTbGbBRalbzMkl_9jVW7RMgEOc6z2Dr1IZJv9
 metacountregressor/data_split_helper.py,sha256=M2fIMdIO8znUaYhx5wlacRyNWdQjNYu1z1wkE-kFUYU,3373
 metacountregressor/halton.py,sha256=jhovA45UBoZYU9g-hl6Lb2sBIx_ZBTNdPrpgkzR9fng,9463
 metacountregressor/helperprocess.py,sha256=wW45-i31zy6rwaXt5PZt0GyR83PzF30jc9Wl4SQtnUI,26372
-metacountregressor/main.py,sha256=
+metacountregressor/main.py,sha256=cuGmhSSDjqpYTJC1ktvbqKnGi-8L2kTf9cPLJJiPOJw,24083
 metacountregressor/main_old.py,sha256=eTS4ygq27MnU-dZ_j983Ucb-D5XfbVF8OJQK2hVVLZc,24123
 metacountregressor/metaheuristics.py,sha256=gVqJRNiHOa48-dHZxaJNgu2OLiYOpSYvWHJ1VFPqFWY,107817
 metacountregressor/pareto_file.py,sha256=whySaoPAUWYjyI8zo0hwAOa3rFk6SIUlHSpqZiLur0k,23096
 metacountregressor/pareto_logger__plot.py,sha256=mEU2QN4wmsM7t39GJ_XhJ_jjsdl09JOmG0U2jICrAkI,30037
 metacountregressor/setup.py,sha256=5UcQCCLR8Fm5odA3MX78WwahavxFq4mVD6oq0IuQvAY,936
 metacountregressor/single_objective_finder.py,sha256=jVG7GJBqzSP4_riYr-kMMKy_LE3SlGmKMunNhHYxgRg,8011
-metacountregressor/solution.py,sha256=
+metacountregressor/solution.py,sha256=71cD5m0pHWazMMbWdTb7n4s8M2Or3U_d2uBKfpOGeQA,348665
 metacountregressor/test_code.py,sha256=_7Emm2JbhK_NVhxoqMhshN2JeHZtihZuSDl3Jpe7Ajk,1641
 metacountregressor/test_generated_paper2.py,sha256=pwOoRzl1jJIIOUAAvbkT6HmmTQ81mwpsshn9SLdKOg8,3927
-metacountregressor-1.0.
-metacountregressor-1.0.
-metacountregressor-1.0.
-metacountregressor-1.0.
-metacountregressor-1.0.
+metacountregressor-1.0.12.dist-info/licenses/LICENSE.txt,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+metacountregressor-1.0.12.dist-info/METADATA,sha256=47-B3YhWfePxRhSbs3m7vdLcJQuSZkgTX996lZQjZog,23620
+metacountregressor-1.0.12.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
+metacountregressor-1.0.12.dist-info/top_level.txt,sha256=zGG7UC5WIpr76gsFUpwJ4En2aCcoNTONBaS3OewwjR0,19
+metacountregressor-1.0.12.dist-info/RECORD,,
{metacountregressor-1.0.10.dist-info → metacountregressor-1.0.12.dist-info}/licenses/LICENSE.txt
RENAMED
File without changes