metacountregressor 0.1.73__py3-none-any.whl → 0.1.83__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metacountregressor/app_main.py +258 -0
- metacountregressor/data_split_helper.py +90 -0
- metacountregressor/helperprocess.py +372 -5
- metacountregressor/main.py +297 -117
- metacountregressor/metaheuristics.py +43 -31
- metacountregressor/setup.py +3 -2
- metacountregressor/solution.py +734 -832
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/METADATA +256 -35
- metacountregressor-0.1.83.dist-info/RECORD +20 -0
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/WHEEL +1 -1
- metacountregressor-0.1.73.dist-info/RECORD +0 -18
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/LICENSE.txt +0 -0
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,258 @@ metacountregressor/app_main.py (new file)

```python
import warnings
import argparse
import csv
import faulthandler
import ast
from typing import Any
import cProfile
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas.io.parsers import TextFileReader
import helperprocess
from metaheuristics import (differential_evolution,
                            harmony_search,
                            simulated_annealing)
from solution import ObjectiveFunction


warnings.simplefilter("ignore")

faulthandler.enable()


def convert_df_columns_to_binary_and_wide(df):
    columns = list(df.columns)
    df = pd.get_dummies(df, columns=columns, drop_first=True)
    return df
```
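For illustration, here is what `convert_df_columns_to_binary_and_wide` produces on a toy frame (a standalone sketch, not part of the package; the toy column names are invented):

```python
import pandas as pd

toy = pd.DataFrame({'road': ['urban', 'rural', 'urban'],
                    'lanes': ['two', 'four', 'two']})
# Every column is one-hot encoded into wide indicator columns;
# drop_first=True drops one level per source column, keeping the
# encoding full-rank (no redundant dummy).
wide = pd.get_dummies(toy, columns=list(toy.columns), drop_first=True)
print(wide.columns.tolist())  # ['road_urban', 'lanes_two']
```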
app_main.py, continued:

```python
def process_arguments():
    '''Turn the setup CSV files into the relevant args.'''
    try:
        data_characteristic = pd.read_csv('problem_data.csv')
        analyst_d = pd.read_csv('decisions.csv')
        hyper = pd.read_csv('setup_hyper.csv')
    except Exception as e:
        print(e)
        print('Files Have Not Been Set Up Yet...')
        print('Run the App')
        exit()

    new_data = {'data': data_characteristic,
                'analyst': analyst_d,
                'hyper': hyper}
    return new_data
```
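`process_arguments` expects three setup CSVs next to the script. Only `problem_data.csv` has a schema visible in this diff (`main` below reads its `Problem`, `Y`, `Grouped`, and `Panel` columns); the layouts of `decisions.csv` and `setup_hyper.csv` are app-defined and not recoverable here, so this sketch only stubs them:

```python
import pandas as pd

# Column names taken from how main() indexes data_info['data'] below;
# the values are invented placeholders.
pd.DataFrame({'Problem': ['my_counts.csv'],  # path to the modelling dataset
              'Y': ['CRASH_COUNT'],          # name of the count column
              'Grouped': ['county'],         # grouping column, if any
              'Panel': ['element_ID']}       # panel id column, if any
             ).to_csv('problem_data.csv', index=False)

# These two are read whole and passed through; empty stubs simply let
# process_arguments() succeed.
pd.DataFrame().to_csv('decisions.csv', index=False)
pd.DataFrame().to_csv('setup_hyper.csv', index=False)
```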
app_main.py, continued:

```python
def main(args, **kwargs):
    '''METACOUNT REGRESSOR TESTING ENVIRONMENT'''

    print('the args is:', args)
    print('the kwargs is', kwargs)

    # Removing junk files if specified.
    helperprocess.remove_files(args.get('removeFiles', True))

    data_info = process_arguments()
    args['decisions'] = data_info['analyst']
    if isinstance(data_info['data']['Grouped'][0], str):
        args['group'] = data_info['data']['Grouped'][0]
        args['ID'] = data_info['data']['Panel'][0]
    if isinstance(data_info['data']['Panel'][0], str):
        args['panels'] = data_info['data']['Panel'][0]

    df = pd.read_csv(str(data_info['data']['Problem'][0]))
    x_df = df.drop(columns=[data_info['data']['Y'][0]])
    # Drop the string-typed columns of x_df, but keep the grouping column.
    exclude_column = args.get('group')  # .get avoids a KeyError when no group was set
    columns_to_keep = x_df.dtypes != 'object'
    columns_to_keep |= (x_df.columns == exclude_column)
    x_df = x_df.loc[:, columns_to_keep]
    y_df = df[[data_info['data']['Y'][0]]]
    y_df = y_df.rename(columns={data_info['data']['Y'][0]: "Y"})  # avoid inplace rename on a slice
```
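The boolean-mask pattern above keeps every numeric column plus the named grouping column, whatever its dtype. A standalone illustration (toy data, not from the package):

```python
import pandas as pd

x_df = pd.DataFrame({'flow': [100, 200],     # numeric: kept
                     'county': ['A', 'B'],   # string, but the group column: kept
                     'notes': ['x', 'y']})   # string: dropped
exclude_column = 'county'
columns_to_keep = x_df.dtypes != 'object'            # True for numeric columns
columns_to_keep |= (x_df.columns == exclude_column)  # OR the group column back in
print(x_df.loc[:, columns_to_keep].columns.tolist())  # ['flow', 'county']
```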
app_main.py, continued:

```python
    manual_fit_spec = None  # TODO: add in manual fit
    if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
        if manual_fit_spec is None:
            args['Manual_Fit'] = None
        else:
            print('fitting manually')
            args['Manual_Fit'] = manual_fit_spec
    if args['problem_number'] == str(8) or args['problem_number'] == 8:
        print('Maine County Dataset.')
        args['group'] = 'county'
        args['panels'] = 'element_ID'
        args['ID'] = 'element_ID'
        args['_max_characteristics'] = 55
    elif args['problem_number'] == str(9) or args['problem_number'] == 9:
        args['group'] = 'group'
        args['panels'] = 'ind_id'
        args['ID'] = 'ind_id'

    args['complexity_level'] = args.get('complexity_level', 6)

    # Initialize AnalystSpecs to None if not manually provided.
    args['AnalystSpecs'] = args.get('AnalystSpecs', None)

    if args['algorithm'] == 'sa':
        args_hyperparameters = {'alpha': float(args['temp_scale']),
                                'STEPS_PER_TEMP': int(args['steps']),
                                'INTL_ACPT': 0.5,
                                '_crossover_perc': args['crossover'],
                                'MAX_ITERATIONS': int(args['_max_imp']),
                                '_num_intl_slns': 25,
                                'Manual_Fit': args.get('Manual_Fit'),  # key is absent unless Keep_Fit == 2
                                'MP': int(args['MP'])}
        helperprocess.entries_to_remove(('crossover', '_max_imp', '_hms', '_hmcr', '_par'), args)
        print(args)

        obj_fun = ObjectiveFunction(x_df, y_df, **args)

        results = simulated_annealing(obj_fun, None, **args_hyperparameters)

        helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))

        if args.get('dual_complexities', 0):
            args['complexity_level'] = args['secondary_complexity']
            obj_fun = ObjectiveFunction(x_df, y_df, **args)
            results = simulated_annealing(obj_fun, None, **args_hyperparameters)
            helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))

    elif args['algorithm'] == 'hs':
        args['_mpai'] = 1

        obj_fun = ObjectiveFunction(x_df, y_df, **args)
        args_hyperparameters = {
            'Manual_Fit': args.get('Manual_Fit'),
            'MP': int(args['MP'])
        }

        results = harmony_search(obj_fun, None, **args_hyperparameters)
        helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))

        if args.get('dual_complexities', 0):
            args['complexity_level'] = args['secondary_complexity']
            obj_fun = ObjectiveFunction(x_df, y_df, **args)
            results = harmony_search(obj_fun, None, **args_hyperparameters)
            helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))

    elif args['algorithm'] == 'de':
        # Force variables.
        args['must_include'] = args.get('force', [])

        args_hyperparameters = {'_AI': args.get('_AI', 2),
                                '_crossover_perc': float(args['crossover']),
                                '_max_iter': int(args['_max_imp']),
                                '_pop_size': int(args['_hms']),
                                'instance_number': int(args['line']),
                                'Manual_Fit': args.get('Manual_Fit'),
                                'MP': int(args['MP'])}

        helperprocess.entries_to_remove(('crossover', '_max_imp', '_hms', '_hmcr', '_par'), args)
        obj_fun = ObjectiveFunction(x_df, y_df, **args)

        results = differential_evolution(obj_fun, None, **args_hyperparameters)

        helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))

        if args.get('dual_complexities', 0):
            args['complexity_level'] = args['secondary_complexity']
            obj_fun = ObjectiveFunction(x_df, y_df, **args)
            results = differential_evolution(obj_fun, None, **args_hyperparameters)
            helperprocess.results_printer(results, args['algorithm'], int(args['is_multi']))  # TODO: fix this
```
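The `if/elif` chain above dispatches on `args['algorithm']` to `simulated_annealing`, `harmony_search`, or `differential_evolution`. Below is a minimal, untested sketch of driving the harmony-search path directly; the key names are inferred from the reads above, the values are illustrative, and the setup CSVs read by `process_arguments` must already exist:

```python
args = {
    'algorithm': 'hs',       # routes to the harmony_search branch
    'problem_number': 10,    # anything but 8/9 skips the dataset overrides
    'Keep_Fit': 1,           # anything but 2 skips the manual-fit branch
    'removeFiles': False,    # skip the junk-file cleanup
    'MP': 0,
    'is_multi': 1,
    'dual_complexities': 0,  # single complexity pass only
}
main(args)
```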
app_main.py, continued:

```python
if __name__ == '__main__':
    """Loading in command line args."""
    alg_parser = argparse.ArgumentParser(prog='algorithm', epilog='algorithm specific arguments')
    alg_parser.add_argument('-AI', default=2, help='adjustment index for the allowable movement of the algorithm')
    alg_parser.print_help()
    parser = argparse.ArgumentParser(prog='main',
                                     epilog=main.__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     conflict_handler='resolve')

    parser.add_argument('-line', type=int, default=1,
                        help='line to read in csv to pass in argument')

    if vars(parser.parse_args())['line'] is not None:
        reader = csv.DictReader(open('set_data.csv', 'r'))
        args = list()
        line_number_obs = 0
        for dictionary in reader:  # TODO: find a way to handle multiple args
            args = dictionary
            if line_number_obs == int(vars(parser.parse_args())['line']):
                break
            line_number_obs += 1
        args = dict(args)

        for key, value in args.items():
            try:
                # Attempt to parse the string value to a Python literal if value is a string.
                if isinstance(value, str):
                    value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # If there's a parsing error, value remains the original string.
                pass

            # Add the argument to the parser with the potentially updated value.
            parser.add_argument(f'-{key}', default=value)

        for i, action in enumerate(parser._optionals._actions):
            if "-algorithm" in action.option_strings:
                parser._optionals._actions[i].help = "optimization algorithm"

        override = True
        if override:
            print('todo turn off, in testing phase')
            parser.add_argument('-problem_number', default='10')
        if 'algorithm' not in args:
            parser.add_argument('-algorithm', type=str, default='hs',
                                help='optimization algorithm')
        if 'Manual_Fit' not in args:  # independent default, not an elif of the algorithm check
            parser.add_argument('-Manual_Fit', action='store_false', default=None,
                                help='To fit a model manually if desired.')

        parser.add_argument('-seperate_out_factors', action='store_false', default=False,
                            help='True if wanting to split potentially categorical data'
                                 ' into binary (wide) form for processing')
        parser.add_argument('-supply_csv', type=str,
                            help='enter the name of the csv; please include the full directory path')

    else:  # Didn't specify a line; try each one manually.
        parser.add_argument('-com', type=str, default='MetaCode',
                            help='line to read csv')

    # Check the args.
    parser.print_help()
    args = vars(parser.parse_args())
    print(type(args))
    # TODO: add chi2 and df in estimation and compare degrees of freedom; this needs to be done in solution.

    # Print the args.
    profiler = cProfile.Profile()
    profiler.runcall(main, args)
    profiler.print_stats(sort='time')
    # TODO: MAX_TIME
```
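Each value pulled from `set_data.csv` arrives as a string; the `ast.literal_eval` loop above coerces values that parse as Python literals and leaves the rest alone, so the argparse defaults carry real types. A standalone illustration of that coercion rule:

```python
import ast

def coerce(value):
    # Mirrors the loop above: parse literals, fall back to the raw string.
    try:
        return ast.literal_eval(value) if isinstance(value, str) else value
    except (ValueError, SyntaxError):
        return value

print(coerce('2'))       # 2 (int)
print(coerce('0.25'))    # 0.25 (float)
print(coerce('[1, 2]'))  # [1, 2] (list)
print(coerce('hs'))      # 'hs' (not a literal, stays a string)
```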
@@ -0,0 +1,90 @@ metacountregressor/data_split_helper.py (new file)

```python
import numpy as np
import pandas as pd


class DataProcessor:
    def __init__(self, x_data, y_data, kwargs):
        self._obj_1 = kwargs.get('_obj_1')
        self._obj_2 = kwargs.get('_obj_2')
        self.test_percentage = float(kwargs.get('test_percentage', 0))
        self.val_percentage = float(kwargs.get('val_percentage', 0))
        self.is_multi = self.test_percentage != 0
        self._x_data = x_data
        self._y_data = y_data
        self._process_data(kwargs)

    def _process_data(self, kwargs):
        if self._obj_1 == 'MAE' or self._obj_2 in ['MAE', 'RMSE', 'MSE', 'RMSE_IN', 'RMSE_TEST']:
            self._handle_special_conditions(kwargs)
        else:
            self._standard_data_partition()

        self._characteristics_names = list(self._x_data.columns)
        self._max_group_all_means = 1
        self._exclude_this_test = [4]

    def _handle_special_conditions(self, kwargs):
        if 'panels' in kwargs:
            self._process_panels_data(kwargs)
        else:
            self._standard_data_partition()

    def _process_panels_data(self, kwargs):
        group_key = kwargs['group']
        panels_key = kwargs['panels']

        # Process groups and panels
        self._x_data[group_key] = self._x_data[group_key].astype('category').cat.codes
        try:
            self._x_data[panels_key] = self._x_data[panels_key].rank(method='dense').astype(int)
            self._x_data[panels_key] -= self._x_data[panels_key].min() - 1
        except KeyError:
            pass

        # Create training and test datasets
        unique_ids = np.unique(self._x_data[panels_key])
        training_size = int((1 - self.test_percentage - self.val_percentage) * len(unique_ids))
        training_ids = np.random.choice(unique_ids, training_size, replace=False)

        train_idx = self._x_data.index[self._x_data[panels_key].isin(training_ids)]
        test_idx = self._x_data.index[~self._x_data[panels_key].isin(training_ids)]

        self._create_datasets(train_idx, test_idx)
```
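`_process_panels_data` splits by panel id rather than by row, so every observation of a panel lands on the same side of the train/test boundary. The same idea in isolation (toy data; the class above applies this to `self._x_data`):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'panel': [1, 1, 2, 2, 3, 3], 'x': range(6)})

test_percentage, val_percentage = 0.3, 0.0
unique_ids = np.unique(df['panel'])
training_size = int((1 - test_percentage - val_percentage) * len(unique_ids))
training_ids = rng.choice(unique_ids, training_size, replace=False)

train = df[df['panel'].isin(training_ids)]   # whole panels, never split rows
test = df[~df['panel'].isin(training_ids)]
```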
data_split_helper.py, continued:

```python
    def _standard_data_partition(self):
        total_samples = len(self._x_data)
        training_size = int((1 - self.test_percentage - self.val_percentage) * total_samples)
        training_indices = np.random.choice(total_samples, training_size, replace=False)

        # Vectorised membership test instead of a quadratic list comprehension.
        in_train = np.isin(np.arange(total_samples), training_indices)
        train_idx = np.arange(total_samples)[in_train]
        test_idx = np.arange(total_samples)[~in_train]

        self._create_datasets(train_idx, test_idx)

    def _create_datasets(self, train_idx, test_idx):
        self.df_train = self._x_data.loc[train_idx, :]
        self.df_test = self._x_data.loc[test_idx, :]
        self.y_train = self._y_data.loc[train_idx, :]
        self.y_test = self._y_data.loc[test_idx, :]

        self._x_data_test = self.df_test.copy()
        self._y_data_test = self.y_test.astype('float').copy()
        self._x_data = self.df_train.copy()
        self._y_data = self.y_train.astype('float').copy()

        # Handle different shapes
        if self._x_data.ndim == 2:  # Typical DataFrame
            self._samples, self._characteristics = self._x_data.shape
            self._panels = None
        elif self._x_data.ndim == 3:  # 3D structure, e.g., Panel or similar
            self._samples, self._panels, self._characteristics = self._x_data.shape
```
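A minimal usage sketch for `DataProcessor` (column names and values invented; an `'MAE'` objective plus a `'panels'` key routes the split through `_process_panels_data`):

```python
import pandas as pd

x = pd.DataFrame({'group': ['a', 'a', 'b', 'b'],
                  'panel': [1, 1, 2, 2],
                  'flow': [10., 20., 30., 40.]})
y = pd.DataFrame({'Y': [0, 1, 2, 1]})

dp = DataProcessor(x, y, {'_obj_1': 'MAE', 'test_percentage': 0.25,
                          'group': 'group', 'panels': 'panel'})
print(dp.df_train.shape, dp.df_test.shape)  # one panel in train, one in test
```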