metacountregressor 0.1.113__py3-none-any.whl → 0.1.117__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metacountregressor/app_main.py +9 -4
- metacountregressor/helperprocess.py +267 -5
- metacountregressor/main.py +172 -61
- metacountregressor/metaheuristics.py +20 -9
- metacountregressor/setup.py +3 -2
- metacountregressor/solution.py +483 -131
- {metacountregressor-0.1.113.dist-info → metacountregressor-0.1.117.dist-info}/METADATA +21 -7
- {metacountregressor-0.1.113.dist-info → metacountregressor-0.1.117.dist-info}/RECORD +11 -11
- {metacountregressor-0.1.113.dist-info → metacountregressor-0.1.117.dist-info}/WHEEL +1 -1
- {metacountregressor-0.1.113.dist-info → metacountregressor-0.1.117.dist-info}/LICENSE.txt +0 -0
- {metacountregressor-0.1.113.dist-info → metacountregressor-0.1.117.dist-info}/top_level.txt +0 -0
metacountregressor/app_main.py
CHANGED
@@ -69,15 +69,20 @@ def main(args, **kwargs):
     #data_info['data']['Group'][0]
     #data_info['data']['Panel'][0]
     args['decisions'] = data_info['analyst']
-
-    if
+    grouped_c = data_info['data']['Grouped'][0]
+    if isinstance(data_info['data']['Grouped'][0],str):
         args['group'] = data_info['data']['Grouped'][0]
-        args['ID'] = data_info['data']['
-    if
+        args['ID'] = data_info['data']['Panel'][0]
+    if isinstance(data_info['data']['Panel'][0],str):
         args['panels'] = data_info['data']['Panel'][0]
 
     df = pd.read_csv(str(data_info['data']['Problem'][0]))
     x_df = df.drop(columns=[data_info['data']['Y'][0]])
+    # drop the columns of x_df where column is string exclude the column stype args['group']
+    exclude_column = args['group']
+    columns_to_keep = x_df.dtypes != 'object'
+    columns_to_keep |= (x_df.columns == exclude_column)
+    x_df = x_df.loc[:, columns_to_keep]
     y_df = df[[data_info['data']['Y'][0]]]
     y_df.rename(columns={data_info['data']['Y'][0]: "Y"}, inplace=True)
 
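Note on the block added above: it keeps every non-string column of `x_df` plus the grouping column, even when that column holds strings. A minimal standalone sketch of the same pandas pattern (the column names are illustrative, not from the package):

```python
import pandas as pd

# Toy frame: 'county' stands in for the string-typed group column, 'notes' for string noise.
x_df = pd.DataFrame({'AADT': [1200, 800], 'county': ['A', 'B'], 'notes': ['x', 'y']})
exclude_column = 'county'  # plays the role of args['group']

columns_to_keep = x_df.dtypes != 'object'            # numeric columns only
columns_to_keep |= (x_df.columns == exclude_column)   # but always keep the group column
print(x_df.loc[:, columns_to_keep].columns.tolist())  # ['AADT', 'county']
```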
metacountregressor/helperprocess.py
CHANGED

@@ -1,10 +1,28 @@
+from os.path import exists
 import numpy as np
 import pandas as pd
 import csv
 import matplotlib.pyplot as plt
+from scipy import stats as st
+from sklearn.preprocessing import StandardScaler
+
 
 plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
 
+
+
+
+
+from itertools import product
+
+# Function to create a list of dictionaries from a parameter grid
+def generate_param_combinations(param_grid):
+    keys = param_grid.keys()
+    values = param_grid.values()
+    combinations = [dict(zip(keys, v)) for v in product(*values)]
+    return combinations
+
+
 ##Select the best Features Based on RF
 def select_features(X_train, y_train, n_f=16):
     try:
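The new `generate_param_combinations` helper is a thin wrapper around `itertools.product`. A small usage sketch against the 0.1.117 `helperprocess` module; the grid keys and values below are illustrative only, not package defaults:

```python
from metacountregressor import helperprocess

# Hypothetical parameter grid.
grid = {'algorithm': ['hs', 'de'], 'test_percentage': [0.15, 0.2]}
for combo in helperprocess.generate_param_combinations(grid):
    print(combo)
# {'algorithm': 'hs', 'test_percentage': 0.15}
# {'algorithm': 'hs', 'test_percentage': 0.2}
# {'algorithm': 'de', 'test_percentage': 0.15}
# {'algorithm': 'de', 'test_percentage': 0.2}
```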
@@ -77,6 +95,7 @@ def findCorrelation(corr, cutoff=0.9, exact=None): """
     findCorrelation(R1, cutoff=0.6, exact=True) # ['x1', 'x5', 'x4']
     """
 
+
     def _findCorrelation_fast(corr, avg, cutoff):
 
         combsAboveCutoff = corr.where(lambda x: (np.tril(x) == 0) & (x > cutoff)).stack().index
@@ -151,6 +170,230 @@ def remove_files(yes=1):
         os.remove('pop_log.csv')
 
 
+# Function to process the DataFrame
+'''
+Example usuage
+# Configuration dictionary
+config = {
+    'Age': {
+        'type': 'bin',
+        'bins': [0, 18, 35, 50, 100],
+        'labels': ['Child', 'YoungAdult', 'MiddleAged', 'Senior'],
+        'prefix': 'Age_Binned'
+    },
+    'Income': {
+        'type': 'bin',
+        'bins': [0, 2000, 5000, 10000],
+        'labels': ['Low', 'Medium', 'High'],
+        'prefix': 'Income_Binned'
+    },
+    'Gender': {
+        'type': 'one-hot',
+        'prefix': 'Gender'
+    },
+    'Score': {
+        'type': 'none'
+    }
+}
+'''
+def null_handler(vari):
+    if vari in locals():
+        return vari
+    else:
+        print(f'{vari} does not exist, setting None..')
+        return None
+
+
+def set_up_analyst_constraints(data_characteristic, model_terms, variable_decisions_alt = None):
+
+
+    name_data_characteristics = data_characteristic.columns.tolist()
+    # Get non-None values as a list
+    non_none_terms = [value for value in model_terms.values() if value is not None]
+    # how to make name_data_characteristics - non_none_terms
+
+    result = [item for item in name_data_characteristics if item not in non_none_terms]
+    distu = ['normal', 'uniform', 'triangular']
+    tra = ['no', 'sqrt', 'arcsinh']
+    if model_terms.get('group') is None:
+        print('cant have grouped rpm, removing level 4 from every item')
+        MAKE_ALL_4_FALSE = True
+    else:
+        MAKE_ALL_4_FALSE = False
+
+    variable_decisions = {
+        name: {
+            'levels': list(range(6)),
+            'Distributions': distu,
+            'Transformations': tra
+        }
+        for name in result
+    }
+    # Override elements in the original dictionary with the alt dictionary
+    if variable_decisions_alt is not None:
+        for key, alt_value in variable_decisions_alt.items():
+            if key in variable_decisions:
+                # Update the existing entry
+                variable_decisions[key].update(alt_value)
+            else:
+                # Add new entry if it doesn't exist
+                variable_decisions[key] = alt_value
+    # Prepare the data for the DataFrame
+    rows = []
+    for column_name, details in variable_decisions.items():
+        # Create a row dictionary
+        row = {'Column': column_name}
+
+        # Add levels as True/False for Level 0 through Level 5
+        for level in range(6): # Assuming Level 0 to Level 5
+
+            if level == 4 and MAKE_ALL_4_FALSE:
+                row[f'Level {level}'] = False
+            else:
+                row[f'Level {level}'] = level in details['levels']
+
+        # Add distributions and transformations directly
+
+        # Add distributions and transformations as comma-separated strings
+        row['Distributions'] = str(details['Distributions'])
+        row['Transformations'] = str(details['Transformations'])
+
+        rows.append(row)
+
+    # Create the DataFrame
+    df = pd.DataFrame(rows)
+
+    data_new = data_characteristic.rename(columns={v: k for k, v in model_terms.items() if v in data_characteristic.columns})
+    return df, data_new
+
+# Function to guess Low, Medium, High ranges
+def guess_low_medium_high(column_name, series):
+    # Compute the tertiles (33rd and 66th percentiles)
+    #print('did it make it...')
+    #mode_value = st.mode(series) # Get the most frequent value
+    #i dont think this works cayse its not a seriers any other way
+    is_binary = series.isin([0, 1]).all()
+    if is_binary:
+        return {
+            'type': 'binary',
+            'bins': [0,1],
+            'labels': ['Off', 'On'],
+            'prefix': f'{column_name}'
+
+        }
+
+    # series = pd.to_numeric(series, errors='coerce').fillna(mode_value)
+    low_threshold = np.quantile(series, 0.33)
+    high_threshold = np.quantile(series,0.66)
+
+    # Define the bins and labels
+    bins = [np.min(series) - 1, low_threshold, high_threshold, np.max(series)]
+    # Handle duplicate bins by adjusting labels
+    if len(set(bins)) < len(bins): # Check for duplicate bin edges
+        if low_threshold == high_threshold:
+            # Collapse to two bins (Low and High)
+            bins = [np.min(series) - 1, low_threshold, np.max(series)]
+            labels = ['Low', 'High']
+        else:
+            # Collapse to three unique bins
+            bins = sorted(set(bins)) # Remove duplicate edges
+            labels = [f'Bin {i + 1}' for i in range(len(bins) - 1)]
+    else:
+        # Standard case: Low, Medium, High
+        labels = ['Low', 'Medium', 'High']
+
+    return {
+        'type': 'bin',
+        'bins': bins,
+        'labels': labels,
+        'prefix': f'{column_name}'
+    }
+
+def transform_dataframe(df, config):
+    output_df = pd.DataFrame()
+
+    for column, settings in config.items():
+        if settings['type'] == 'bin':
+            # Apply binning
+            # Get unique bins (remove duplicates)
+            unique_bins = sorted(set(settings['bins']))
+
+            # Adjust labels if necessary
+            if len(unique_bins) - 1 != len(settings['labels']):
+                print(f"Adjusting labels to match bins: {len(unique_bins) - 1} bins detected.")
+                labels = [f'Bin {i+1}' for i in range(len(unique_bins) - 1)]
+            else:
+                labels = settings['labels']
+
+            # Perform the binning
+            binned_d = pd.cut(
+                df[column],
+                bins=unique_bins, # Deduplicated bins
+                labels=labels, # Adjusted or original labels
+                right=False # Adjust based on whether to include the right edge
+            )
+            # One-hot encode the binned column
+            binned_dummies = pd.get_dummies(binned_d, prefix=settings['prefix'])
+            output_df = pd.concat([output_df, binned_dummies], axis=1)
+
+        elif settings['type'] == 'one-hot':
+            # One-hot encode the column
+            one_hot_dummies = pd.get_dummies(df[column], prefix=settings.get('prefix', column))
+            output_df = pd.concat([output_df, one_hot_dummies], axis=1)
+
+        elif settings['type'] == 'continuous':
+            # Apply function to continuous data
+            data = df[column]
+            if 'bounds' in settings:
+                # Apply bounds filtering
+                lower, upper = settings['bounds']
+                data = data[(data >= lower) & (data <= upper)]
+            if 'apply_func' in settings:
+                # Apply custom function
+                data = data.apply(settings['apply_func'])
+            output_df[column] = data
+
+        elif settings['type'] == 'none':
+            # Leave the column unchanged
+            if column in df.columns:
+
+                output_df = pd.concat([output_df, df[[column]]], axis=1)
+            else:
+                print(f'config variable {column} is not in the data. Ignoring ...')
+    return output_df
+
+# Helper function to guess column type and update `config`
+def guess_column_type(column_name, series):
+
+    if series.empty:
+        raise ValueError(f"The column {column_name} contains no numeric data.")
+
+    if series.dtype == 'object' or series.dtype.name == 'category':
+        # If the column is categorical (e.g., strings), assume one-hot encoding
+        return {'type': 'one-hot', 'prefix': column_name}
+    elif pd.api.types.is_numeric_dtype(series):
+        unique_values = series.nunique()
+
+        if unique_values < 5:
+            return {'type': 'one-hot', 'prefix': column_name}
+
+        elif np.max(series) - np.min(series) > 20:
+            print('made it through here')
+            # If there are few unique values, assume binning with default bins
+            return guess_low_medium_high(column_name,series)
+        else:
+            # # Otherwise, assume continuous data with normalization
+            # Otherwise, fallback to continuous standardization
+            return {
+                'type': 'continuous',
+                'apply_func': (lambda x: (x - series.mean()) / series.std()) # Z-Score Standardization
+            }
+    else:
+        # Default fallback (leave the column unchanged)
+        return {'type': 'none'}
+
+
+
 def as_wide_factor(x_df, yes=1, min_factor=2, max_factor=8, keep_original=0, exclude=[]):
     if not yes:
         return x_df
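A usage sketch for the helpers added above, assuming the 0.1.117 `helperprocess` API exactly as it appears in this hunk; the toy columns are hypothetical:

```python
import numpy as np
import pandas as pd
from metacountregressor import helperprocess

# Hypothetical data: a wide-ranging numeric column and a two-level column.
df = pd.DataFrame({
    'AADT': np.arange(100, 600, 10),  # 50 values, range > 20 -> tertile binning
    'LANES': [2, 4] * 25,             # fewer than 5 unique values -> one-hot
})

# Guess a treatment per column, then transform in one pass.
config = {c: helperprocess.guess_column_type(c, df[c]) for c in df.columns}
df_new = helperprocess.transform_dataframe(df, config)
print(df_new.columns.tolist())
# ['AADT_Low', 'AADT_Medium', 'AADT_High', 'LANES_2', 'LANES_4']
```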
@@ -173,7 +416,7 @@ def PCA_code(X, n_components=5):
 
 
 def interactions(df, keep=None, drop_this_perc=0.6, interact = False):
-
+    full_columns = df.columns
     if interact:
         interactions_list = []
         for i, var_i in enumerate(df.columns):
@@ -199,14 +442,31 @@ def interactions(df, keep=None, drop_this_perc=0.6, interact = False):
         df = pd.concat([df, df_interactions], axis=1, sort=False)
 
     # second
-
+    # Remove `keep` columns from the correlation matrix
+    if keep is not None:
+        missing_columns = [col for col in keep if col not in df.columns]
+
+        if missing_columns:
+            print(f"The following columns are not in the DataFrame and will be ignored: {missing_columns}")
+            keep = [col for col in keep if col not in missing_columns]
+        df_corr = df.drop(columns=keep, errors='ignore', inplace=False) # Exclude `keep` columns
+    else:
+        df_corr = df
+
+    # Compute the absolute correlation matrix
+    corr_matrix = df_corr.corr().abs()
+
+    # Keep only the upper triangle of the correlation matrix
     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
 
-    # Find features with correlation greater than
+    # Find features with correlation greater than the threshold
     to_drop = [column for column in upper.columns if any(upper[column] > drop_this_perc)]
+
+    # Ensure `keep` columns are not dropped
     if keep is not None:
-        to_drop = [column for column in to_drop if column not in
-
+        to_drop = [column for column in to_drop if column not in full_columns]
+
+    # Drop the identified features
     df.drop(to_drop, axis=1, inplace=True)
 
     return df
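The reworked `interactions()` now builds the correlation matrix only from columns outside `keep` and drops one member of each highly correlated pair using the upper triangle. A standalone sketch of that filter (toy column names, threshold 0.6 as in the default `drop_this_perc`):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.normal(size=200)
# 'speed' and 'speed_kmh' are near-duplicates; 'curve' is independent (names are illustrative).
df = pd.DataFrame({'speed': a,
                   'speed_kmh': a * 1.6 + 0.01 * rng.normal(size=200),
                   'curve': rng.normal(size=200)})

corr_matrix = df.corr().abs()
# Upper triangle only, so each pair is inspected once.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [c for c in upper.columns if any(upper[c] > 0.6)]
print(to_drop)  # ['speed_kmh'] -- the later member of the correlated pair
```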
@@ -330,3 +590,5 @@ def entries_to_remove(entries, the_dict):
     for key in entries:
         if key in the_dict:
             del the_dict[key]
+
+
metacountregressor/main.py
CHANGED
@@ -28,12 +28,65 @@ def convert_df_columns_to_binary_and_wide(df):
     return df
 
 
-
+
+
+
+
+def process_arguments(**kwargs):
     '''
     TRYING TO TURN THE CSV FILES INTO RELEVANT ARGS
     '''
-
-
+    #dataset
+    '''
+    if kwargs.get('dataset_file', False
+    ):
+        dataset = pd.read_csv(kwargs.get('dataset_file'))
+        named_data_headers = dataset.columns.tolist()
+        decision_constants = {name: list(range(7)) for name in named_data_headers}
+        data_info = {
+
+
+            'AADT': {
+                'type': 'continuous',
+                'bounds': [0.0, np.infty],
+                'discrete': False,
+                'apply_func': (lambda x: np.log(x + 1)),
+            },
+            'SPEED': {
+                'type': 'continuous',
+                'bounds': [0, 100],
+                'enforce_bounds': True,
+                'discrete': True
+            },
+            'TIME': {
+                'type': 'continuous',
+                'bounds': [0, 23.999],
+                'discrete': False
+            }
+        }
+        #remove ID CoLUMNS from dataset
+        dataset = dataset.drop(columns = [
+            'ID'
+        ])
+        for c in dataset.columns:
+            if c not in data_info.keys():
+                data_info[c] = {'type': 'categorical'}
+
+        data_new =helperprocess.transform_dataframe(dataset,data_info)
+
+        update_constant = kwargs.get('analyst_constraints')
+        #update the decision_constraints
+    '''
+    data_characteristic = pd.read_csv(kwargs.get('problem_data', 'problem_data.csv'))
+    # Extract the column as a list of characteristic names
+    #name_data_characteristics = data_characteristic.columns.tolist()
+
+    # Create the dictionary
+    #decision_constraints = {name: list(range(7)) for name in name_data_characteristics}
+
+    #print('this gets all the features, I need to remove...')
+
+    analyst_d = pd.read_csv(kwargs.get('decison_constraints', 'decisions.csv'))
     hyper = pd.read_csv('setup_hyper.csv')
 
     new_data = {'data': data_characteristic,
@@ -41,7 +94,14 @@ def process_arguments():
                 'hyper': hyper}
     return new_data
 
+def process_package_arguments():
+
+    new_data = {}
+    pass
+
+
 def main(args, **kwargs):
+
     '''METACOUNT REGRESSOR TESTING ENVIRONMENT'''
 
     '''
@@ -114,13 +174,25 @@ def main(args, **kwargs):
     X = df
     y = df['FREQ'] # Frequency of crashes
     X['Offset'] = np.log(df['AADT']) # Explicitley define how to offset the data, no offset otherwise
+    df['Offset'] = np.log(df['AADT'])
     # Drop Y, selected offset term and ID as there are no panels
     X = df.drop(columns=['FREQ', 'ID', 'AADT'])
-
+    # Step 0: Process Data
+    model_terms = {
+        'Y': 'FREQ', # Replace 'FREQ' with the name of your dependent variable
+        'group': None, # Replace 'group_column' with the name of your grouping column (or None if not used)
+        'panels': None, # Replace 'panel_column' with the name of your panel column (or None if not used)
+        'Offset': 'Offset' # Replace None with the name of your offset column if using one
+    }
+    a_des, df = helperprocess.set_up_analyst_constraints(df, model_terms)
     # some example argument, these are defualt so the following line is just for claritity
     args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6, 'instance_number': 1,
-            'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6}
+            'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6, 'desicions':a_des}
     # Fit the model with metacountregressor
+    # Step 5: Transform the dataset based on the configuration
+    #data_new = helperprocess.transform_dataframe(dataset, config)
+    y = df[['Y']]
+    X = df.drop(columns=['Y'])
     obj_fun = ObjectiveFunction(X, y, **args)
     # replace with other metaheuristics if desired
     results = harmony_search(obj_fun)
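Putting this hunk together, the intended flow appears to be: derive the analyst constraints from `model_terms`, pass them through `args`, and hand `X`/`y` to the solver. A sketch under that reading; the imports assume the package layout listed at the top of this diff, and note the args key is spelled `'desicions'` in this hunk while app_main.py uses `'decisions'`:

```python
import numpy as np
import pandas as pd
from metacountregressor import helperprocess
from metacountregressor.solution import ObjectiveFunction
from metacountregressor.metaheuristics import harmony_search

df = pd.read_csv('./data/Ex-16-3.csv')   # dataset path used elsewhere in main.py
df['Offset'] = np.log(df['AADT'])

model_terms = {'Y': 'FREQ', 'group': None, 'panels': None, 'Offset': 'Offset'}
a_des, df = helperprocess.set_up_analyst_constraints(df, model_terms)  # also renames FREQ -> Y

y = df[['Y']]
X = df.drop(columns=['Y'])
args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6,
        'instance_number': 1, 'val_percentage': 0.15, 'obj_1': 'bic',
        '_obj_2': 'RMSE_TEST', 'MAX_TIME': 6,
        'desicions': a_des}   # key spelled as in this hunk
obj_fun = ObjectiveFunction(X, y, **args)
results = harmony_search(obj_fun)
```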
@@ -162,8 +234,8 @@ def main(args, **kwargs):
         'rdm_cor_terms': [],
         'grouped_terms': [],
         'hetro_in_means': [],
-        'transformations': ['no', 'log', '
-        'dispersion':
+        'transformations': ['no', 'log', 'no', 'no', 'no', 'no', 'no'],
+        'dispersion': 0
     }
 
     keep = ['Constant', 'US', 'RSMS', 'MCV', 'RSHS', 'AADT', 'Curve50', 'Offset']
@@ -172,13 +244,27 @@ def main(args, **kwargs):
     elif dataset == 4:
         manual_fit_spec = {
             'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION'],
-            'rdm_terms': ['
+            'rdm_terms': ['EXPOSE:normal', 'INTPM:normal', 'CPM:normal', 'HISNOW:normal'],
             'rdm_cor_terms': [],
             'grouped_terms': [],
             'hetro_in_means': [],
             'transformations': ['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
             'dispersion': 1
         }
+        '''
+        manual_fit_spec = {
+            'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION', 'EXPOSE', 'INTPM', 'CPM', 'HISNOW'],
+            'rdm_terms': [],
+            'rdm_cor_terms': [],
+            'grouped_terms': [],
+            'hetro_in_means': [],
+            'transformations': ['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
+            'dispersion': 1
+        }
+        '''
+
+
+        '''
         print('overriding this delete, just want to test the NB')
         manual_fit_spec = {
             'fixed_terms': ['const'],
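The manual specifications in this file encode random and grouped parameters as `'VARIABLE:distribution'` strings (`'EXPOSE:normal'` above, `'DP01:normal'` further down). A hypothetical spec illustrating the convention; the variable names are placeholders, not columns from any of the bundled datasets:

```python
# Hypothetical manual specification; the variable names are placeholders.
manual_fit_spec = {
    'fixed_terms': ['const', 'X1'],               # fixed-effect columns
    'rdm_terms': ['X2:normal', 'X3:triangular'],  # random parameters as 'VARIABLE:distribution'
    'rdm_cor_terms': [],                          # correlated random parameters
    'grouped_terms': [],                          # grouped random parameters (require a group column)
    'hetro_in_means': [],                         # heterogeneity-in-means terms
    'transformations': ['no', 'no', 'no', 'no'],  # e.g. 'no', 'log', 'sqrt', 'arcsinh'
    'dispersion': 1                               # 1 pairs with the "test the NB" spec above; 0 appears elsewhere
}
```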
@@ -189,7 +275,7 @@ def main(args, **kwargs):
             'transformations': ['no'],
             'dispersion': 1
         }
-
+        '''
         df = pd.read_csv('./data/Ex-16-3.csv') # read in the data
         y_df = df[['FREQ']].copy() # only consider crashes
         y_df.rename(columns={"FREQ": "Y"}, inplace=True)
@@ -262,6 +348,17 @@ def main(args, **kwargs):
         x_df = helperprocess.interactions(x_df, drop_this_perc=0.8)
         x_df['county'] = group_grab
 
+        print('benchmark specification')
+        manual_fit_spec = {
+            'fixed_terms': ['const', 'monthly_AADT', 'segment_length', 'speed', 'paved_shoulder', 'curve'],
+            'rdm_terms': [],
+            'rdm_cor_terms': [],
+            'grouped_terms': ['DP01:normal', 'DX32:normal'],
+            'hetro_in_means': [],
+            'transformations': ['no', 'no', 'no', 'no', 'no', 'no'],
+            'dispersion': 0
+        }
+
     elif dataset == 9:
         df = pd.read_csv('panel_synth.csv') # read in the data
         y_df = df[['Y']].copy() # only consider crashes
@@ -286,19 +383,21 @@ def main(args, **kwargs):
         keep = ['group', 'constant', 'element_ID']
 
         x_df = helperprocess.interactions(x_df, keep)
-
-
+
+
+    elif dataset ==10: # the dataset has been selected in the program as something else
+        data_info = process_arguments(**args)
         data_info['hyper']
         data_info['analyst']
         data_info['data']['Y']
         #data_info['data']['Group'][0]
         #data_info['data']['Panel'][0]
         args['decisions'] = data_info['analyst']
-
-        if
+        print('check the args of the decions')
+        if type(data_info['data']['Grouped'][0]) == str and len(data_info['data']['Grouped'][0]) >1:
             args['group'] = data_info['data']['Grouped'][0]
             args['ID'] = data_info['data']['Grouped'][0]
-        if
+        if type(data_info['data']['Panel'][0]) == str and len(data_info['data']['Panel'][0])>1:
             args['panels'] = data_info['data']['Panel'][0]
 
         df = pd.read_csv(str(data_info['data']['Problem'][0]))
@@ -306,6 +405,10 @@ def main(args, **kwargs):
         y_df = df[[data_info['data']['Y'][0]]]
         y_df.rename(columns={data_info['data']['Y'][0]: "Y"}, inplace=True)
         print('test') #FIXME
+    else:
+        print('PROCESS THE PACKAGE ARGUMENTS SIMULIAR TO HOW ONE WOULD DEFINE THE ENVIRONMENT')
+        data_info =process_package_arguments()
+
 
     if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
         if manual_fit_spec is None:
@@ -411,55 +514,63 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(prog='main',
                                      epilog=main.__doc__,
                                      formatter_class=argparse.RawDescriptionHelpFormatter, conflict_handler='resolve')
-    if
-    parser.
-    [old lines 414-460 removed; their content is not shown in this diff view]
+
+
+    BATCH_JOB = False
+
+    if BATCH_JOB:
+        parser.add_argument('-dataset_file', default='data/Ex-16-3.csv', help='supply the path to the dataset')
+
+        parser.add_argument('-line', type=int, default=1,
+                            help='line to read in csv to pass in argument')
+
+        if vars(parser.parse_args())['line'] is not None:
+            reader = csv.DictReader(open('set_data.csv', 'r'))
+            args = list()
+            line_number_obs = 0
+            for dictionary in reader: # TODO find a way to handle multiple args
+                args = dictionary
+                if line_number_obs == int(vars(parser.parse_args())['line']):
+                    break
+                line_number_obs += 1
+            args = dict(args)
+
+
+            for key, value in args.items():
+                try:
+                    # Attempt to parse the string value to a Python literal if value is a string.
+                    if isinstance(value, str):
+                        value = ast.literal_eval(value)
+                except (ValueError, SyntaxError):
+                    # If there's a parsing error, value remains as the original string.
+                    pass
+
+                # Add the argument to the parser with the potentially updated value.
+                parser.add_argument(f'-{key}', default=value)
+
+            for i, action in enumerate(parser._optionals._actions):
+                if "-algorithm" in action.option_strings:
+                    parser._optionals._actions[i].help = "optimization algorithm"
+
+            override = True
+            if override:
+                print('WARNING: TESTING ENVIRONMENT, TURN OFF FOR RELEASE')
+                parser.add_argument('-problem_number', default='10')
+
+            if 'algorithm' not in args:
+                parser.add_argument('-algorithm', type=str, default='hs',
+                                    help='optimization algorithm')
+            elif 'Manual_Fit' not in args:
+                parser.add_argument('-Manual_Fit', action='store_false', default=None,
+                                    help='To fit a model manually if desired.')
+
+            parser.add_argument('-seperate_out_factors', action='store_false', default=False,
+                                help='Trie of wanting to split data that is potentially categorical as binary'
+                                     ' we want to split the data for processing')
+            parser.add_argument('-supply_csv', type = str, help = 'enter the name of the csv, please include it as a full directories')
 
         else: # DIDN"T SPECIFY LINES TRY EACH ONE MANNUALY
+            print("RUNNING WITH ARGS")
             parser.add_argument('-com', type=str, default='MetaCode',
                                 help='line to read csv')
 
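The new BATCH_JOB path reads one row of set_data.csv and runs `ast.literal_eval` on each value so numbers and lists survive the CSV round trip. A minimal sketch of that coercion step (the row contents are illustrative):

```python
import ast

# Pretend this row came from csv.DictReader over set_data.csv (values are illustrative).
row = {'algorithm': 'hs', 'test_percentage': '0.15', 'MAX_TIME': '6', 'Keep_Fit': '2'}

args = {}
for key, value in row.items():
    try:
        # '0.15' -> 0.15, '6' -> 6, '[1, 2]' -> [1, 2]; plain words raise and stay strings.
        args[key] = ast.literal_eval(value) if isinstance(value, str) else value
    except (ValueError, SyntaxError):
        args[key] = value

print(args)  # {'algorithm': 'hs', 'test_percentage': 0.15, 'MAX_TIME': 6, 'Keep_Fit': 2}
```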