metacountregressor 0.1.73__py3-none-any.whl → 0.1.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,258 @@
+ import warnings
+ import argparse
+ import csv
+ import faulthandler
+ import ast
+ from typing import Any
+ import cProfile
+ import numpy as np
+ import pandas as pd
+ from pandas import DataFrame
+ from pandas.io.parsers import TextFileReader
+ import helperprocess
+ from metaheuristics import (differential_evolution,
+                             harmony_search,
+                             simulated_annealing)
+ from solution import ObjectiveFunction
+
+
+ warnings.simplefilter("ignore")
+
+ faulthandler.enable()
+
+
+ def convert_df_columns_to_binary_and_wide(df):
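+     # One-hot encode every column into wide binary indicator columns,
+     # dropping the first level of each to avoid perfect collinearity.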
+     columns = list(df.columns)
+
+     df = pd.get_dummies(df, columns=columns, drop_first=True)
+     return df
+
+
+ def process_arguments():
+     '''
+     Turn the setup CSV files into the relevant arguments.
+     '''
+     try:
+         data_characteristic = pd.read_csv('problem_data.csv')
+         analyst_d = pd.read_csv('decisions.csv')
+         hyper = pd.read_csv('setup_hyper.csv')
+     except Exception as e:
+         print(e)
+         print('Files have not been set up yet.')
+         print('Run the app first.')
+         exit()
+
+     new_data = {'data': data_characteristic,
+                 'analyst': analyst_d,
+                 'hyper': hyper}
+     return new_data
+
+
+ def main(args, **kwargs):
+     '''METACOUNT REGRESSOR TESTING ENVIRONMENT'''
+
+     print('the args is:', args)
+     print('the kwargs is', kwargs)
+
+     # remove junk files if specified
+     helperprocess.remove_files(args.get('removeFiles', True))
+
+     # do we want to run a test
+
+     data_info = process_arguments()
+     data_info['hyper']
+     data_info['analyst']
+     data_info['data']['Y']
+     # data_info['data']['Group'][0]
+     # data_info['data']['Panel'][0]
+     args['decisions'] = data_info['analyst']
+     grouped_c = data_info['data']['Grouped'][0]
+     if isinstance(data_info['data']['Grouped'][0], str):
+         args['group'] = data_info['data']['Grouped'][0]
+         args['ID'] = data_info['data']['Panel'][0]
+     if isinstance(data_info['data']['Panel'][0], str):
+         args['panels'] = data_info['data']['Panel'][0]
+
+     df = pd.read_csv(str(data_info['data']['Problem'][0]))
+     x_df = df.drop(columns=[data_info['data']['Y'][0]])
+     # drop the string-typed columns of x_df, but keep the grouping column args['group']
+     exclude_column = args.get('group')
+     columns_to_keep = x_df.dtypes != 'object'
+     columns_to_keep |= (x_df.columns == exclude_column)
+     x_df = x_df.loc[:, columns_to_keep]
+     y_df = df[[data_info['data']['Y'][0]]]
+     y_df.rename(columns={data_info['data']['Y'][0]: "Y"}, inplace=True)
+
+     manual_fit_spec = None  # TODO: add in manual fit
+     if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
+         if manual_fit_spec is None:
+             args['Manual_Fit'] = None
+         else:
+             print('fitting manually')
+             args['Manual_Fit'] = manual_fit_spec
+     args.setdefault('Manual_Fit', None)  # the algorithm branches below read args['Manual_Fit']
+     if args['problem_number'] == str(8) or args['problem_number'] == 8:
+         print('Maine County Dataset.')
+         args['group'] = 'county'
+         args['panels'] = 'element_ID'
+         args['ID'] = 'element_ID'
+         args['_max_characteristics'] = 55
+     elif args['problem_number'] == str(9) or args['problem_number'] == 9:
+         args['group'] = 'group'
+         args['panels'] = 'ind_id'
+         args['ID'] = 'ind_id'
+
+     args['complexity_level'] = args.get('complexity_level', 6)
+
+     # Initialize AnalystSpecs to None if not manually provided
+     args['AnalystSpecs'] = args.get('AnalystSpecs', None)
+
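+     # Dispatch on the requested metaheuristic: 'sa' (simulated annealing),
+     # 'hs' (harmony search) or 'de' (differential evolution). Each branch builds its
+     # own hyperparameter dict before constructing the ObjectiveFunction.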
+     if args['algorithm'] == 'sa':
+         args_hyperparameters = {'alpha': float(args['temp_scale']),
+                                 'STEPS_PER_TEMP': int(args['steps']),
+                                 'INTL_ACPT': 0.5,
+                                 '_crossover_perc': args['crossover'],
+                                 'MAX_ITERATIONS': int(args['_max_imp']),
+                                 '_num_intl_slns': 25,
+                                 'Manual_Fit': args['Manual_Fit'],
+                                 'MP': int(args['MP'])}
+         helperprocess.entries_to_remove(('crossover', '_max_imp', '_hms', '_hmcr', '_par'), args)
+         print(args)
+
+         obj_fun = ObjectiveFunction(x_df, y_df, **args)
+
+         results = simulated_annealing(obj_fun, None, **args_hyperparameters)
+
+         helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))
+
+         if args['dual_complexities']:
+             args['complexity_level'] = args['secondary_complexity']
+             obj_fun = ObjectiveFunction(x_df, y_df, **args)
+             results = simulated_annealing(obj_fun, None, **args_hyperparameters)
+             helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))
+
+     elif args['algorithm'] == 'hs':
+         args['_mpai'] = 1
+
+         obj_fun = ObjectiveFunction(x_df, y_df, **args)
+         args_hyperparameters = {
+             'Manual_Fit': args['Manual_Fit'],
+             'MP': int(args['MP'])
+         }
+
+         results = harmony_search(obj_fun, None, **args_hyperparameters)
+         helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))
+
+         if args.get('dual_complexities', 0):
+             args['complexity_level'] = args['secondary_complexity']
+             obj_fun = ObjectiveFunction(x_df, y_df, **args)
+             results = harmony_search(obj_fun, None, **args_hyperparameters)
+             helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))
+
+     elif args['algorithm'] == 'de':
+         # variables that are forced to be included in the model
+         args['must_include'] = args.get('force', [])
+
+         args_hyperparameters = {'_AI': args.get('_AI', 2),
+                                 '_crossover_perc': float(args['crossover']),
+                                 '_max_iter': int(args['_max_imp']),
+                                 '_pop_size': int(args['_hms']),
+                                 'instance_number': int(args['line']),
+                                 'Manual_Fit': args['Manual_Fit'],
+                                 'MP': int(args['MP'])}
+
+         helperprocess.entries_to_remove(('crossover', '_max_imp', '_hms', '_hmcr', '_par'), args)
+         obj_fun = ObjectiveFunction(x_df, y_df, **args)
+
+         results = differential_evolution(obj_fun, None, **args_hyperparameters)
+
+         helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))
+
+         if args['dual_complexities']:
+             args['complexity_level'] = args['secondary_complexity']
+             obj_fun = ObjectiveFunction(x_df, y_df, **args)
+             results = differential_evolution(obj_fun, None, **args_hyperparameters)
+             helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))  # TODO: fix this
+
+
+ if __name__ == '__main__':
+     """Load in the command line args."""
+     alg_parser = argparse.ArgumentParser(prog='algorithm', epilog='algorithm specific arguments')
+     alg_parser.add_argument('-AI', default=2, help='adjustment index for the allowable movement of the algorithm')
+     alg_parser.print_help()
+     parser = argparse.ArgumentParser(prog='main',
+                                      epilog=main.__doc__,
+                                      formatter_class=argparse.RawDescriptionHelpFormatter,
+                                      conflict_handler='resolve')
+
+     parser.add_argument('-line', type=int, default=1,
+                         help='line of set_data.csv to read the run arguments from')
+
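+     # Each row of set_data.csv holds one run configuration; the -line argument
+     # selects which row is loaded and turned into parser defaults below.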
+     if vars(parser.parse_args())['line'] is not None:
+         reader = csv.DictReader(open('set_data.csv', 'r'))
+         args = list()
+         line_number_obs = 0
+         for dictionary in reader:  # TODO: find a way to handle multiple args
+             args = dictionary
+             if line_number_obs == int(vars(parser.parse_args())['line']):
+                 break
+             line_number_obs += 1
+         args = dict(args)
+
+         for key, value in args.items():
+             try:
+                 # Attempt to parse the string value to a Python literal if value is a string.
+                 if isinstance(value, str):
+                     value = ast.literal_eval(value)
+             except (ValueError, SyntaxError):
+                 # If there's a parsing error, the value remains the original string.
+                 pass
+
+             # Add the argument to the parser with the potentially updated value.
+             parser.add_argument(f'-{key}', default=value)
+
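+         # Note: this reaches into argparse internals (parser._optionals._actions)
+         # to relabel the help text of the -algorithm option added above.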
+         for i, action in enumerate(parser._optionals._actions):
+             if "-algorithm" in action.option_strings:
+                 parser._optionals._actions[i].help = "optimization algorithm"
+
+         override = True
+         if override:
+             print('todo turn off, in testing phase')
+             parser.add_argument('-problem_number', default='10')
+             print('did it make it')
+         if 'algorithm' not in args:
+             parser.add_argument('-algorithm', type=str, default='hs',
+                                 help='optimization algorithm')
+         elif 'Manual_Fit' not in args:
+             parser.add_argument('-Manual_Fit', action='store_false', default=None,
+                                 help='To fit a model manually if desired.')
+
+         parser.add_argument('-seperate_out_factors', action='store_false', default=False,
+                             help='True if potentially categorical data should be split out'
+                                  ' into binary columns before processing')
+         parser.add_argument('-supply_csv', type=str,
+                             help='name of the csv file to load; include the full directory path')
+
+     else:  # didn't specify a line; try each one manually
+         parser.add_argument('-com', type=str, default='MetaCode',
+                             help='line to read csv')
+
+     # Check the args.
+     parser.print_help()
+     args = vars(parser.parse_args())
+     print(type(args))
+     # TODO: add chi-squared and degrees of freedom to the estimation and compare them; this needs to be done in solution
+
+     # Profile the main run.
+     profiler = cProfile.Profile()
+     profiler.runcall(main, args)
+     profiler.print_stats(sort='time')
+     # TODO: MAX_TIME
+
@@ -0,0 +1,90 @@
+ import numpy as np
+ import pandas as pd
+
+
+ class DataProcessor:
+     def __init__(self, x_data, y_data, kwargs):
+         self._obj_1 = kwargs.get('_obj_1')
+         self._obj_2 = kwargs.get('_obj_2')
+         self.test_percentage = float(kwargs.get('test_percentage', 0))
+         self.val_percentage = float(kwargs.get('val_percentage', 0))
+         self.is_multi = self.test_percentage != 0
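+         # Assumption: a non-zero test split marks the run as multi-objective,
+         # i.e. out-of-sample fit is evaluated alongside in-sample fit.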
+         self._x_data = x_data
+         self._y_data = y_data
+         self._process_data(kwargs)
+
+     def _process_data(self, kwargs):
+         if self._obj_1 == 'MAE' or self._obj_2 in ["MAE", 'RMSE', 'MSE', 'RMSE_IN', 'RMSE_TEST']:
+             self._handle_special_conditions(kwargs)
+         else:
+             self._standard_data_partition()
+
+         self._characteristics_names = list(self._x_data.columns)
+         self._max_group_all_means = 1
+         self._exclude_this_test = [4]
+
+     def _handle_special_conditions(self, kwargs):
+         if 'panels' in kwargs:
+             self._process_panels_data(kwargs)
+         else:
+             self._standard_data_partition()
+
+     def _process_panels_data(self, kwargs):
+         group_key = kwargs['group']
+         panels_key = kwargs['panels']
+
+         # Process groups and panels
+         self._x_data[group_key] = self._x_data[group_key].astype('category').cat.codes
+         try:
+             self._x_data[panels_key] = self._x_data[panels_key].rank(method='dense').astype(int)
+             self._x_data[panels_key] -= self._x_data[panels_key].min() - 1
+         except KeyError:
+             pass
+
+         # Create training and test datasets
+         unique_ids = np.unique(self._x_data[panels_key])
+         training_size = int((1 - self.test_percentage - self.val_percentage) * len(unique_ids))
+         training_ids = np.random.choice(unique_ids, training_size, replace=False)
+
+         train_idx = self._x_data.index[self._x_data[panels_key].isin(training_ids)]
+         test_idx = self._x_data.index[~self._x_data[panels_key].isin(training_ids)]
+
+         self._create_datasets(train_idx, test_idx)
+
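+     # _process_panels_data samples whole panel IDs so that all rows of a panel stay in
+     # the same partition; _standard_data_partition below samples individual rows instead.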
+     def _standard_data_partition(self):
+         total_samples = len(self._x_data)
+         training_size = int((1 - self.test_percentage - self.val_percentage) * total_samples)
+         training_indices = np.random.choice(total_samples, training_size, replace=False)
+
+         train_idx = np.array([i for i in range(total_samples) if i in training_indices])
+         test_idx = np.array([i for i in range(total_samples) if i not in training_indices])
+
+         self._create_datasets(train_idx, test_idx)
+
+     def _create_datasets(self, train_idx, test_idx):
+         self.df_train = self._x_data.loc[train_idx, :]
+         self.df_test = self._x_data.loc[test_idx, :]
+         self.y_train = self._y_data.loc[train_idx, :]
+         self.y_test = self._y_data.loc[test_idx, :]
+
+         self._x_data_test = self.df_test.copy()
+         self._y_data_test = self.y_test.astype('float').copy()
+         self._x_data = self.df_train.copy()
+         self._y_data = self.y_train.astype('float').copy()
+
+         # Handle different shapes
+         if self._x_data.ndim == 2:  # Typical DataFrame
+             self._samples, self._characteristics = self._x_data.shape
+             self._panels = None
+         elif self._x_data.ndim == 3:  # 3D structure, e.g., Panel or similar
+             self._samples, self._panels, self._characteristics = self._x_data.shape
+