metacountregressor 0.1.88__py3-none-any.whl → 0.1.89__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,7 +28,137 @@ def convert_df_columns_to_binary_and_wide(df):
28
28
  return df
29
29
 
30
30
 
31
+
32
+
33
+
34
+
35
def process_arguments(**kwargs):
    """Load the CSV files that describe a modelling run.

    Three CSV files are read with ``pd.read_csv`` and returned together so
    that the caller can build the arguments for the run from them.

    Keyword Args:
        problem_data: Path to the CSV describing the problem. Defaults to
            ``'problem_data.csv'``. ``main`` reads columns such as ``'Y'``,
            ``'Problem'``, ``'Grouped'`` and ``'Panel'`` from this frame.
        decision_constraints: Path to the CSV holding the analyst's
            decisions. Defaults to ``'decisions.csv'``. The historical,
            misspelled key ``decison_constraints`` is still honoured so
            existing callers keep working; the correctly spelled key wins
            when both are supplied.

    Returns:
        dict: ``{'data': DataFrame, 'analyst': DataFrame, 'hyper': DataFrame}``
        where ``'hyper'`` is always loaded from ``'setup_hyper.csv'`` in the
        current working directory.

    Raises:
        FileNotFoundError: If any of the three CSV files does not exist
            (raised by ``pd.read_csv``).
    """
    # NOTE: a disabled prototype that built per-column ``data_info`` from a
    # ``dataset_file`` argument used to be parked here inside a string
    # literal. It never executed, so it has been dropped.
    problem_path = kwargs.get('problem_data', 'problem_data.csv')

    # Prefer the correctly spelled key, then fall back to the legacy typo
    # ('decison_constraints') before using the default file name.
    decisions_path = kwargs.get(
        'decision_constraints',
        kwargs.get('decison_constraints', 'decisions.csv'),
    )

    return {
        'data': pd.read_csv(problem_path),
        'analyst': pd.read_csv(decisions_path),
        'hyper': pd.read_csv('setup_hyper.csv'),
    }
96
+
97
def process_package_arguments():
    """Placeholder for building run arguments from package-level settings.

    Not implemented yet: the function takes no input and returns ``None``,
    so callers must not rely on the result.

    Returns:
        None
    """
    # TODO: implement; the previous body only created an unused local dict.
    return None
101
+
102
+
31
103
  def main(args, **kwargs):
104
+
105
+ '''METACOUNT REGRESSOR TESTING ENVIRONMENT'''
106
+
107
+ '''
108
+ TESTING_ENV = False
109
+ if TESTING_ENV:
110
+
111
+ import statsmodels.api as sm
112
+
113
+ data = sm.datasets.sunspots.load_pandas().data
114
+ # print(data.exog)
115
+ data_exog = data['YEAR']
116
+ data_exog = sm.add_constant(data_exog)
117
+ data_endog = data['SUNACTIVITY']
118
+
119
+ # Instantiate a gamma family model with the default link function.
120
+ import numpy as np
121
+
122
+ gamma_model = sm.NegativeBinomial(data_endog, data_exog)
123
+ gamma_results = gamma_model.fit()
124
+
125
+ print(gamma_results.summary())
126
+
127
+ # NOW LET's COMPARE THIS TO METACOUNT REGRESSOR
128
+ import metacountregressor
129
+ from importlib.metadata import version
130
+ print(version('metacountregressor'))
131
+ import pandas as pd
132
+ import numpy as np
133
+ from metacountregressor.solution import ObjectiveFunction
134
+ from metacountregressor.metaheuristics import (harmony_search,
135
+ differential_evolution,
136
+ simulated_annealing)
137
+
138
+ # Model Decisions,
139
+ manual_fit_spec = {
140
+
141
+ 'fixed_terms': ['const', 'YEAR'],
142
+ 'rdm_terms': [],
143
+ 'rdm_cor_terms': [],
144
+ 'grouped_terms': [],
145
+ 'hetro_in_means': [],
146
+ 'transformations': ['no', 'no'],
147
+ 'dispersion': 1 # Negative Binomial
148
+ }
149
+
150
+ # Arguments
151
+ arguments = {
152
+ 'algorithm': 'hs',
153
+ 'test_percentage': 0,
154
+ 'test_complexity': 6,
155
+ 'instance_number': 'name',
156
+ 'Manual_Fit': manual_fit_spec
157
+ }
158
+ obj_fun = ObjectiveFunction(data_exog, data_endog, **arguments)
159
+ '''
160
+
161
+
32
162
  print('the args is:', args)
33
163
  print('the kwargs is', kwargs)
34
164
 
@@ -44,13 +174,25 @@ def main(args, **kwargs):
44
174
  X = df
45
175
  y = df['FREQ'] # Frequency of crashes
46
176
  X['Offset'] = np.log(df['AADT']) # Explicitley define how to offset the data, no offset otherwise
177
+ df['Offset'] = np.log(df['AADT'])
47
178
  # Drop Y, selected offset term and ID as there are no panels
48
179
  X = df.drop(columns=['FREQ', 'ID', 'AADT'])
49
-
180
+ # Step 0: Process Data
181
+ model_terms = {
182
+ 'Y': 'FREQ', # Replace 'FREQ' with the name of your dependent variable
183
+ 'group': None, # Replace 'group_column' with the name of your grouping column (or None if not used)
184
+ 'panels': None, # Replace 'panel_column' with the name of your panel column (or None if not used)
185
+ 'Offset': 'Offset' # Replace None with the name of your offset column if using one
186
+ }
187
+ a_des, df = helperprocess.set_up_analyst_constraints(df, model_terms)
50
188
  # Example arguments; these are the defaults, so the following line is only for clarity
51
189
  args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6, 'instance_number': 1,
52
- 'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6}
190
+ 'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6, 'desicions':a_des}
53
191
  # Fit the model with metacountregressor
192
+ # Step 5: Transform the dataset based on the configuration
193
+ #data_new = helperprocess.transform_dataframe(dataset, config)
194
+ y = df[['Y']]
195
+ X = df.drop(columns=['Y'])
54
196
  obj_fun = ObjectiveFunction(X, y, **args)
55
197
  # replace with other metaheuristics if desired
56
198
  results = harmony_search(obj_fun)
@@ -92,8 +234,8 @@ def main(args, **kwargs):
92
234
  'rdm_cor_terms': [],
93
235
  'grouped_terms': [],
94
236
  'hetro_in_means': [],
95
- 'transformations': ['no', 'log', 'log', 'no', 'no', 'no', 'no'],
96
- 'dispersion': 1
237
+ 'transformations': ['no', 'log', 'no', 'no', 'no', 'no', 'no'],
238
+ 'dispersion': 0
97
239
  }
98
240
 
99
241
  keep = ['Constant', 'US', 'RSMS', 'MCV', 'RSHS', 'AADT', 'Curve50', 'Offset']
@@ -102,13 +244,27 @@ def main(args, **kwargs):
102
244
  elif dataset == 4:
103
245
  manual_fit_spec = {
104
246
  'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION'],
105
- 'rdm_terms': ['Expose:normal', 'INTPM:normal', 'CPM:normal', 'HISNOW:normal'],
247
+ 'rdm_terms': ['EXPOSE:normal', 'INTPM:normal', 'CPM:normal', 'HISNOW:normal'],
248
+ 'rdm_cor_terms': [],
249
+ 'grouped_terms': [],
250
+ 'hetro_in_means': [],
251
+ 'transformations': ['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
252
+ 'dispersion': 1
253
+ }
254
+ '''
255
+ manual_fit_spec = {
256
+ 'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION', 'EXPOSE', 'INTPM', 'CPM', 'HISNOW'],
257
+ 'rdm_terms': [],
106
258
  'rdm_cor_terms': [],
107
259
  'grouped_terms': [],
108
260
  'hetro_in_means': [],
109
261
  'transformations': ['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
110
262
  'dispersion': 1
111
263
  }
264
+ '''
265
+
266
+
267
+ '''
112
268
  print('overriding this delete, just want to test the NB')
113
269
  manual_fit_spec = {
114
270
  'fixed_terms': ['const'],
@@ -119,7 +275,7 @@ def main(args, **kwargs):
119
275
  'transformations': ['no'],
120
276
  'dispersion': 1
121
277
  }
122
-
278
+ '''
123
279
  df = pd.read_csv('./data/Ex-16-3.csv') # read in the data
124
280
  y_df = df[['FREQ']].copy() # only consider crashes
125
281
  y_df.rename(columns={"FREQ": "Y"}, inplace=True)
@@ -192,6 +348,17 @@ def main(args, **kwargs):
192
348
  x_df = helperprocess.interactions(x_df, drop_this_perc=0.8)
193
349
  x_df['county'] = group_grab
194
350
 
351
+ print('benchmark specification')
352
+ manual_fit_spec = {
353
+ 'fixed_terms': ['const', 'monthly_AADT', 'segment_length', 'speed', 'paved_shoulder', 'curve'],
354
+ 'rdm_terms': [],
355
+ 'rdm_cor_terms': [],
356
+ 'grouped_terms': ['DP01:normal', 'DX32:normal'],
357
+ 'hetro_in_means': [],
358
+ 'transformations': ['no', 'no', 'no', 'no', 'no', 'no'],
359
+ 'dispersion': 0
360
+ }
361
+
195
362
  elif dataset == 9:
196
363
  df = pd.read_csv('panel_synth.csv') # read in the data
197
364
  y_df = df[['Y']].copy() # only consider crashes
@@ -216,8 +383,32 @@ def main(args, **kwargs):
216
383
  keep = ['group', 'constant', 'element_ID']
217
384
 
218
385
  x_df = helperprocess.interactions(x_df, keep)
219
- else: # the dataset has been selected in the program as something else
220
- print('TODO add in dataset')
386
+
387
+
388
+ elif dataset ==10: # the dataset has been selected in the program as something else
389
+ data_info = process_arguments(**args)
390
+ data_info['hyper']
391
+ data_info['analyst']
392
+ data_info['data']['Y']
393
+ #data_info['data']['Group'][0]
394
+ #data_info['data']['Panel'][0]
395
+ args['decisions'] = data_info['analyst']
396
+ print('check the args of the decions')
397
+ if type(data_info['data']['Grouped'][0]) == str and len(data_info['data']['Grouped'][0]) >1:
398
+ args['group'] = data_info['data']['Grouped'][0]
399
+ args['ID'] = data_info['data']['Grouped'][0]
400
+ if type(data_info['data']['Panel'][0]) == str and len(data_info['data']['Panel'][0])>1:
401
+ args['panels'] = data_info['data']['Panel'][0]
402
+
403
+ df = pd.read_csv(str(data_info['data']['Problem'][0]))
404
+ x_df = df.drop(columns=[data_info['data']['Y'][0]])
405
+ y_df = df[[data_info['data']['Y'][0]]]
406
+ y_df.rename(columns={data_info['data']['Y'][0]: "Y"}, inplace=True)
407
+ print('test') #FIXME
408
+ else:
409
+ print('PROCESS THE PACKAGE ARGUMENTS SIMULIAR TO HOW ONE WOULD DEFINE THE ENVIRONMENT')
410
+ data_info =process_package_arguments()
411
+
221
412
 
222
413
  if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
223
414
  if manual_fit_spec is None:
@@ -236,6 +427,8 @@ def main(args, **kwargs):
236
427
  args['panels'] = 'ind_id'
237
428
  args['ID'] = 'ind_id'
238
429
 
430
+
431
+
239
432
  args['complexity_level'] = args.get('complexity_level', 6)
240
433
 
241
434
 
@@ -321,55 +514,63 @@ if __name__ == '__main__':
321
514
  parser = argparse.ArgumentParser(prog='main',
322
515
  epilog=main.__doc__,
323
516
  formatter_class=argparse.RawDescriptionHelpFormatter, conflict_handler='resolve')
324
-
325
- parser.add_argument('-line', type=int, default=44,
326
- help='line to read in csv to pass in argument')
327
-
328
- if vars(parser.parse_args())['line'] is not None:
329
- reader = csv.DictReader(open('set_data.csv', 'r'))
330
- args = list()
331
- line_number_obs = 0
332
- for dictionary in reader: # TODO find a way to handle multiple args
333
- args = dictionary
334
- if line_number_obs == int(vars(parser.parse_args())['line']):
335
- break
336
- line_number_obs += 1
337
- args = dict(args)
338
-
339
- for key, value in args.items():
340
- try:
341
- # Attempt to parse the string value to a Python literal if value is a string.
342
- if isinstance(value, str):
343
- value = ast.literal_eval(value)
344
- except (ValueError, SyntaxError):
345
- # If there's a parsing error, value remains as the original string.
346
- pass
347
-
348
- # Add the argument to the parser with the potentially updated value.
349
- parser.add_argument(f'-{key}', default=value)
350
-
351
- for i, action in enumerate(parser._optionals._actions):
352
- if "-algorithm" in action.option_strings:
353
- parser._optionals._actions[i].help = "optimization algorithm"
354
-
355
- override = True
356
- if override:
357
- print('todo turn off, in testing phase')
358
- parser.add_argument('-problem_number', default='4')
359
- print('did it make it')
360
- if 'algorithm' not in args:
361
- parser.add_argument('-algorithm', type=str, default='hs',
362
- help='optimization algorithm')
363
- elif 'Manual_Fit' not in args:
364
- parser.add_argument('-Manual_Fit', action='store_false', default=None,
365
- help='To fit a model manually if desired.')
366
-
367
- parser.add_argument('-seperate_out_factors', action='store_false', default=False,
368
- help='Trie of wanting to split data that is potentially categorical as binary'
369
- ' we want to split the data for processing')
370
- parser.add_argument('-supply_csv', type = str, help = 'enter the name of the csv, please include it as a full directorys')
517
+
518
+
519
+ BATCH_JOB = False
520
+
521
+ if BATCH_JOB:
522
+ parser.add_argument('-dataset_file', default='data/Ex-16-3.csv', help='supply the path to the dataset')
523
+
524
+ parser.add_argument('-line', type=int, default=1,
525
+ help='line to read in csv to pass in argument')
526
+
527
+ if vars(parser.parse_args())['line'] is not None:
528
+ reader = csv.DictReader(open('set_data.csv', 'r'))
529
+ args = list()
530
+ line_number_obs = 0
531
+ for dictionary in reader: # TODO find a way to handle multiple args
532
+ args = dictionary
533
+ if line_number_obs == int(vars(parser.parse_args())['line']):
534
+ break
535
+ line_number_obs += 1
536
+ args = dict(args)
537
+
538
+
539
+ for key, value in args.items():
540
+ try:
541
+ # Attempt to parse the string value to a Python literal if value is a string.
542
+ if isinstance(value, str):
543
+ value = ast.literal_eval(value)
544
+ except (ValueError, SyntaxError):
545
+ # If there's a parsing error, value remains as the original string.
546
+ pass
547
+
548
+ # Add the argument to the parser with the potentially updated value.
549
+ parser.add_argument(f'-{key}', default=value)
550
+
551
+ for i, action in enumerate(parser._optionals._actions):
552
+ if "-algorithm" in action.option_strings:
553
+ parser._optionals._actions[i].help = "optimization algorithm"
554
+
555
+ override = True
556
+ if override:
557
+ print('WARNING: TESTING ENVIRONMENT, TURN OFF FOR RELEASE')
558
+ parser.add_argument('-problem_number', default='10')
559
+
560
+ if 'algorithm' not in args:
561
+ parser.add_argument('-algorithm', type=str, default='hs',
562
+ help='optimization algorithm')
563
+ elif 'Manual_Fit' not in args:
564
+ parser.add_argument('-Manual_Fit', action='store_false', default=None,
565
+ help='To fit a model manually if desired.')
566
+
567
+ parser.add_argument('-seperate_out_factors', action='store_false', default=False,
568
+ help='Trie of wanting to split data that is potentially categorical as binary'
569
+ ' we want to split the data for processing')
570
+ parser.add_argument('-supply_csv', type = str, help = 'enter the name of the csv, please include it as a full directories')
371
571
 
372
572
  else: # DIDN"T SPECIFY LINES TRY EACH ONE MANNUALY
573
+ print("RUNNING WITH ARGS")
373
574
  parser.add_argument('-com', type=str, default='MetaCode',
374
575
  help='line to read csv')
375
576
 
@@ -20,8 +20,8 @@ try:
20
20
  from .solution import ObjectiveFunction
21
21
  except:
22
22
  print('Exception relative import')
23
- from metacountregressor.pareto_file import Pareto, Solution
24
- from metacountregressor.solution import ObjectiveFunction
23
+ from pareto_file import Pareto, Solution
24
+ from solution import ObjectiveFunction
25
25
 
26
26
 
27
27
  HarmonySearchResults = namedtuple('HarmonySearchResults',
@@ -72,7 +72,7 @@ def dict_mean(dict_list,
72
72
  mean_dict[key] = sum(d[key] for d in dict_list) / len(dict_list)
73
73
  return mean_dict
74
74
  else:
75
-
75
+ mean_dict = {}
76
76
  for key in dict_list[0].keys():
77
77
  if key in ignore:
78
78
  continue
@@ -402,6 +402,7 @@ class DifferentialEvolution(object):
402
402
  self._obj_fun._obj_1 = 'bic'
403
403
 
404
404
  self._pop_size = kwargs.get('_pop_size', 20)
405
+ print('Population size is', self._pop_size)
405
406
  if not isinstance(self._pop_size, int):
406
407
  raise ValueError("_pop_size must be an integer")
407
408
  elif self._pop_size <= 3:
@@ -618,7 +619,7 @@ class DifferentialEvolution(object):
618
619
  1)
619
620
 
620
621
  if len(self._pareto_population) == 1:
621
- print('the size of the population is only 1')
622
+ print('Pareto Population Size is only 1')
622
623
  if self.pf.check_dominance([obj_trial[self.pf.obj_key_1], obj_trial[self.pf.obj_key_2]],
623
624
  [self._population[j][self.pf.obj_key_1], self._population[j][
624
625
  self.pf.obj_key_2]]): # if solution dominates existing #FIXME some error here true but not entering
@@ -787,7 +788,7 @@ class SimulatedAnnealing(object):
787
788
  self.accept = 0
788
789
  self.profiler = []
789
790
  self.update_t = self.cooling_linear_m
790
- self.get_direcotory()
791
+ self.get_directory()
791
792
  self._crossover_perc = float(kwargs.get('_crossover_perc', 0.2)) or float(kwargs.get('_cr', 0.2))
792
793
  self._obj_fun = objective_function
793
794
  if objective_function.is_multi: # TODO Define more specific objectives in the intialiser
@@ -801,7 +802,7 @@ class SimulatedAnnealing(object):
801
802
  self.pf = Pareto(self.obj_1, self.obj_2, False)
802
803
  self._sa_memory = list()
803
804
 
804
- def get_direcotory(self):
805
+ def get_directory(self):
805
806
  # checking if the directory demo_folder2
806
807
  # exist or not.
807
808
  if not os.path.isdir(self.instance_number):
@@ -1004,7 +1005,7 @@ class SimulatedAnnealing(object):
1004
1005
  elif num_of_changeablePARMs == 0:
1005
1006
  rdm_i = random.choice(range(len(prmVect)))
1006
1007
  if self._obj_fun.get_num_discrete_values(rdm_i) <= 1:
1007
- print('hold gimct')
1008
+ print('retry')
1008
1009
 
1009
1010
  while self._obj_fun.get_num_discrete_values(rdm_i) <= 1:
1010
1011
  rdm_i = random.randint(0, self._obj_fun.get_num_parameters() - 1)
@@ -1046,7 +1047,7 @@ class SimulatedAnnealing(object):
1046
1047
  get_rdm_j = random.randint(0, self._obj_fun.get_num_discrete_values(rdm_i) - 1)
1047
1048
  if (self._obj_fun.get_num_discrete_values(
1048
1049
  rdm_i) - 1) < 1: # TODO: remove this is just a test
1049
- print('fucking fix this sln algorithm')
1050
+
1050
1051
  break
1051
1052
  new_nbr_i = self._obj_fun.get_value(rdm_i, get_rdm_j)
1052
1053
  neighbour[rdm_i] = new_nbr_i
@@ -1237,15 +1238,25 @@ class HarmonySearch(object):
1237
1238
  Initialize HS with the specified objective function. Note that this objective function must implement ObjectiveFunctionInterface.
1238
1239
  """
1239
1240
  self._obj_fun = objective_function
1241
+ ## NEW CODE: trying to extract the parameters from kwargs
1242
+ self._hms = kwargs.get('_hms', 20)
1243
+ self._par = kwargs.get('_par', .30)
1244
+ self.F = kwargs.get('_AI', 2) # mutation scale
1245
+ self.iter = kwargs.get('_max_iter', 10000)
1246
+ self.cr = kwargs.get('_crossover_perc') or kwargs.get('_cr', 0.2)
1247
+ self.instance_number = str(kwargs.get('instance_number', 1))
1248
+
1249
+
1250
+
1240
1251
  # for printing basics metrics
1241
- self.print_verbose = True
1252
+ self.print_verbose = kwargs.get('verbose', False)
1242
1253
  # harmony_memory stores the best hms harmonies
1243
1254
  self._harmony_memory = list()
1244
1255
  # harmony_history stores all hms harmonies every nth improvisations (i.e., one 'generation')
1245
1256
  self._harmony_history = list()
1246
1257
  # saves the best fitness
1247
1258
  self.instance_number = str(objective_function.instance_number)
1248
- self.get_direcotory()
1259
+ self.get_directory()
1249
1260
  self._harmony_trace_best = list()
1250
1261
  self._harmony_trace_incumbent = list()
1251
1262
  if self._obj_fun.is_multi: # TODO Define more specific objectives in the intialiser
@@ -1261,7 +1272,7 @@ class HarmonySearch(object):
1261
1272
 
1262
1273
  self.pf = Pareto(self.obj_1, self.obj_2, False)
1263
1274
 
1264
- def get_direcotory(self):
1275
+ def get_directory(self):
1265
1276
  # checking if the directory demo_folder2
1266
1277
  # exist or not.
1267
1278
  if not os.path.isdir(self.instance_number):
@@ -8,7 +8,7 @@ with codecs.open("README.rst", encoding='utf8') as fh:
8
8
  setuptools.setup(name='metacountregressor',
9
9
  version='0.1.63',
10
10
  description='Extensions for a Python package for \
11
- GPU-accelerated estimation of mixed logit models.',
11
+ estimation of data count models.',
12
12
  long_description=long_description,
13
13
  long_description_content_type="text/x-rst",
14
14
  url='https://github.com/zahern/CountDataEstimation',
@@ -20,5 +20,6 @@ setuptools.setup(name='metacountregressor',
20
20
  python_requires='>=3.10',
21
21
  install_requires=[
22
22
  'numpy>=1.13.1',
23
- 'scipy>=1.0.0'
23
+ 'scipy>=1.0.0',
24
+ 'latextable'
24
25
  ])