metacountregressor 0.1.73__py3-none-any.whl → 0.1.83__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metacountregressor/app_main.py +258 -0
- metacountregressor/data_split_helper.py +90 -0
- metacountregressor/helperprocess.py +372 -5
- metacountregressor/main.py +297 -117
- metacountregressor/metaheuristics.py +43 -31
- metacountregressor/setup.py +3 -2
- metacountregressor/solution.py +734 -832
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/METADATA +256 -35
- metacountregressor-0.1.83.dist-info/RECORD +20 -0
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/WHEEL +1 -1
- metacountregressor-0.1.73.dist-info/RECORD +0 -18
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/LICENSE.txt +0 -0
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,145 @@
+from os.path import exists
 import numpy as np
 import pandas as pd
 import csv
 import matplotlib.pyplot as plt
+from scipy import stats as st
+from sklearn.preprocessing import StandardScaler
+
 
 plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
 
 
+
+
+
+from itertools import product
+
+# Function to create a list of dictionaries from a parameter grid
+def generate_param_combinations(param_grid):
+    keys = param_grid.keys()
+    values = param_grid.values()
+    combinations = [dict(zip(keys, v)) for v in product(*values)]
+    return combinations
+
+
+# Select the best features based on univariate regression scores
+def select_features(X_train, y_train, n_f=16):
+    try:
+        from sklearn.feature_selection import SelectKBest
+        from sklearn.feature_selection import f_regression
+        feature_names = X_train.columns
+        # configure to select the top n_f features
+        fs = SelectKBest(score_func=f_regression, k=n_f)
+
+        # learn relationship from training data
+        fs.fit(X_train, y_train)
+
+        mask = fs.get_support()  # Boolean array of selected features
+        selected_features = [feature for selected, feature in zip(mask, feature_names) if selected]
+        X_train = X_train[selected_features]
+    except ImportError:
+        print('import error, not performing feature selection')
+        fs = X_train.columns  # TODO check if this is actually getting the names
+
+    return X_train, fs
+
+
+# Cuts off correlated data
+
+
+def findCorrelation(corr, cutoff=0.9, exact=None):
+    """
+    This function is the Python implementation of the R function
+    `findCorrelation()`.
+
+    Relies on numpy and pandas, so must have them pre-installed.
+
+    It searches through a correlation matrix and returns a list of column names
+    to remove to reduce pairwise correlations.
+
+    For the documentation of the R function, see
+    https://www.rdocumentation.org/packages/caret/topics/findCorrelation
+    and for the source code of `findCorrelation()`, see
+    https://github.com/topepo/caret/blob/master/pkg/caret/R/findCorrelation.R
+
+    -----------------------------------------------------------------------------
+
+    Parameters:
+    -----------
+    corr: pandas dataframe.
+        A correlation matrix as a pandas dataframe.
+    cutoff: float, default: 0.9.
+        A numeric value for the pairwise absolute correlation cutoff.
+    exact: bool, default: None
+        A boolean value that determines whether the average correlations are
+        recomputed at each step.
+    -----------------------------------------------------------------------------
+    Returns:
+    --------
+    list of column names
+    -----------------------------------------------------------------------------
+    Example:
+    --------
+    R1 = pd.DataFrame({
+        'x1': [1.0, 0.86, 0.56, 0.32, 0.85],
+        'x2': [0.86, 1.0, 0.01, 0.74, 0.32],
+        'x3': [0.56, 0.01, 1.0, 0.65, 0.91],
+        'x4': [0.32, 0.74, 0.65, 1.0, 0.36],
+        'x5': [0.85, 0.32, 0.91, 0.36, 1.0]
+    }, index=['x1', 'x2', 'x3', 'x4', 'x5'])
+
+    findCorrelation(R1, cutoff=0.6, exact=False)  # ['x4', 'x5', 'x1', 'x3']
+    findCorrelation(R1, cutoff=0.6, exact=True)   # ['x1', 'x5', 'x4']
+    """
+
+    def _findCorrelation_fast(corr, avg, cutoff):
+
+        combsAboveCutoff = corr.where(lambda x: (np.tril(x) == 0) & (x > cutoff)).stack().index
+
+        rowsToCheck = combsAboveCutoff.get_level_values(0)
+        colsToCheck = combsAboveCutoff.get_level_values(1)
+
+        msk = avg[colsToCheck] > avg[rowsToCheck].values
+        deletecol = pd.unique(np.r_[colsToCheck[msk], rowsToCheck[~msk]]).tolist()
+
+        return deletecol
+
+    def _findCorrelation_exact(corr, avg, cutoff):
+
+        x = corr.loc[(*[avg.sort_values(ascending=False).index] * 2,)]
+
+        if (x.dtypes.values[:, None] == ['int64', 'int32', 'int16', 'int8']).any():
+            x = x.astype(float)
+
+        x.values[(*[np.arange(len(x))] * 2,)] = np.nan
+
+        deletecol = []
+        for ix, i in enumerate(x.columns[:-1]):
+            for j in x.columns[ix + 1:]:
+                if x.loc[i, j] > cutoff:
+                    if x[i].mean() > x[j].mean():
+                        deletecol.append(i)
+                        x.loc[i] = x[i] = np.nan
+                    else:
+                        deletecol.append(j)
+                        x.loc[j] = x[j] = np.nan
+        return deletecol
+
+    # Dispatch to the exact or fast variant, mirroring the R implementation
+    acorr = corr.abs()
+    avg = acorr.mean()
+    if exact or (exact is None and corr.shape[1] < 100):
+        return _findCorrelation_exact(acorr, avg, cutoff)
+    else:
+        return _findCorrelation_fast(acorr, avg, cutoff)
+
+
+"""Function to coerce object columns to numeric data types"""
+def clean_data_types(df):
+    for col in df.columns:
+        if df[col].dtype == 'object':
+            # Attempt to convert the column to numeric type
+            df[col] = pd.to_numeric(df[col], errors='coerce')
+    return df
+
+
 def drop_correlations(x_df, percentage=0.85):
     cor_matrix = x_df.corr().abs()
     upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))  # type: ignore
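Note (illustrative, not part of the diff): a minimal sketch of how two of the helpers added above might be exercised, assuming they continue to live in metacountregressor/helperprocess.py; the parameter grid and the toy DataFrame below are invented for demonstration.

    import pandas as pd
    from metacountregressor.helperprocess import generate_param_combinations, clean_data_types

    # Expand a small parameter grid into one dict per combination.
    grid = {'dist': ['normal', 'uniform'], 'transform': ['no', 'sqrt']}
    for combo in generate_param_combinations(grid):
        print(combo)  # e.g. {'dist': 'normal', 'transform': 'no'}

    # Coerce object-typed columns to numeric; unparsable values become NaN.
    raw = pd.DataFrame({'count': ['3', '5', 'x'], 'offset': [1.0, 2.0, 3.0]})
    print(clean_data_types(raw).dtypes)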
@@ -36,6 +170,220 @@ def remove_files(yes=1):
     os.remove('pop_log.csv')
 
 
+# Function to process the DataFrame
+'''
+Example usage
+# Configuration dictionary
+config = {
+    'Age': {
+        'type': 'bin',
+        'bins': [0, 18, 35, 50, 100],
+        'labels': ['Child', 'YoungAdult', 'MiddleAged', 'Senior'],
+        'prefix': 'Age_Binned'
+    },
+    'Income': {
+        'type': 'bin',
+        'bins': [0, 2000, 5000, 10000],
+        'labels': ['Low', 'Medium', 'High'],
+        'prefix': 'Income_Binned'
+    },
+    'Gender': {
+        'type': 'one-hot',
+        'prefix': 'Gender'
+    },
+    'Score': {
+        'type': 'none'
+    }
+}
+'''
+def null_handler(vari):
+    if vari in locals():
+        return vari
+    else:
+        print(f'{vari} does not exist, setting None..')
+        return None
+
+
+def set_up_analyst_constraints(data_characteristic, model_terms, variable_decisions_alt=None):
+
+    name_data_characteristics = data_characteristic.columns.tolist()
+    # Get non-None values as a list
+    non_none_terms = [value for value in model_terms.values() if value is not None]
+
+    # Characteristics that are not already claimed by a model term
+    result = [item for item in name_data_characteristics if item not in non_none_terms]
+    distu = ['normal', 'uniform', 'triangular']
+    tra = ['no', 'sqrt', 'arcsinh']
+    if model_terms.get('group') is None:
+        print("can't have grouped rpm, removing level 4 from every item")
+        MAKE_ALL_4_FALSE = True
+    else:
+        MAKE_ALL_4_FALSE = False
+
+    variable_decisions = {
+        name: {
+            'levels': list(range(6)),
+            'Distributions': distu,
+            'Transformations': tra
+        }
+        for name in result
+    }
+    # Override elements in the original dictionary with the alt dictionary
+    if variable_decisions_alt is not None:
+        for key, alt_value in variable_decisions_alt.items():
+            if key in variable_decisions:
+                # Update the existing entry
+                variable_decisions[key].update(alt_value)
+            else:
+                # Add new entry if it doesn't exist
+                variable_decisions[key] = alt_value
+    # Prepare the data for the DataFrame
+    rows = []
+    for column_name, details in variable_decisions.items():
+        # Create a row dictionary
+        row = {'Column': column_name}
+
+        # Add levels as True/False for Level 0 through Level 5
+        for level in range(6):  # Assuming Level 0 to Level 5
+            if level == 4 and MAKE_ALL_4_FALSE:
+                row[f'Level {level}'] = False
+            else:
+                row[f'Level {level}'] = level in details['levels']
+
+        # Add distributions and transformations as comma-separated strings
+        row['Distributions'] = str(details['Distributions'])
+        row['Transformations'] = str(details['Transformations'])
+
+        rows.append(row)
+
+    # Create the DataFrame
+    df = pd.DataFrame(rows)
+
+    data_new = data_characteristic.rename(columns={v: k for k, v in model_terms.items() if v in data_characteristic.columns})
+    return df, data_new
+
+# Function to guess Low, Medium, High ranges
+def guess_low_medium_high(column_name, series):
+    # Compute the tertiles (33rd and 66th percentiles)
+    low_threshold = np.quantile(series, 0.33)
+    high_threshold = np.quantile(series, 0.66)
+
+    # Define the bins and labels
+    bins = [np.min(series) - 1, low_threshold, high_threshold, np.max(series)]
+    # Handle duplicate bins by adjusting labels
+    if len(set(bins)) < len(bins):  # Check for duplicate bin edges
+        if low_threshold == high_threshold:
+            # Collapse to two bins (Low and High)
+            bins = [np.min(series) - 1, low_threshold, np.max(series)]
+            labels = ['Low', 'High']
+        else:
+            # Collapse to the unique bin edges
+            bins = sorted(set(bins))  # Remove duplicate edges
+            labels = [f'Bin {i + 1}' for i in range(len(bins) - 1)]
+    else:
+        # Standard case: Low, Medium, High
+        labels = ['Low', 'Medium', 'High']
+
+    return {
+        'type': 'bin',
+        'bins': bins,
+        'labels': labels,
+        'prefix': f'{column_name}'
+    }
+
+def transform_dataframe(df, config):
+    output_df = pd.DataFrame()
+
+    for column, settings in config.items():
+        if settings['type'] == 'bin':
+            # Apply binning: get unique bins (remove duplicates)
+            unique_bins = sorted(set(settings['bins']))
+
+            # Adjust labels if necessary
+            if len(unique_bins) - 1 != len(settings['labels']):
+                print(f"Adjusting labels to match bins: {len(unique_bins) - 1} bins detected.")
+                labels = [f'Bin {i + 1}' for i in range(len(unique_bins) - 1)]
+            else:
+                labels = settings['labels']
+
+            # Perform the binning
+            binned_d = pd.cut(
+                df[column],
+                bins=unique_bins,  # Deduplicated bins
+                labels=labels,     # Adjusted or original labels
+                right=False        # Adjust based on whether to include the right edge
+            )
+            # One-hot encode the binned column
+            binned_dummies = pd.get_dummies(binned_d, prefix=settings['prefix'])
+            output_df = pd.concat([output_df, binned_dummies], axis=1)
+
+        elif settings['type'] == 'one-hot':
+            # One-hot encode the column
+            one_hot_dummies = pd.get_dummies(df[column], prefix=settings.get('prefix', column))
+            output_df = pd.concat([output_df, one_hot_dummies], axis=1)
+
+        elif settings['type'] == 'continuous':
+            # Apply function to continuous data
+            data = df[column]
+            if 'bounds' in settings:
+                # Apply bounds filtering
+                lower, upper = settings['bounds']
+                data = data[(data >= lower) & (data <= upper)]
+            if 'apply_func' in settings:
+                # Apply custom function
+                data = data.apply(settings['apply_func'])
+            output_df[column] = data
+
+        elif settings['type'] == 'none':
+            # Leave the column unchanged
+            if column in df.columns:
+                output_df = pd.concat([output_df, df[[column]]], axis=1)
+            else:
+                print(f'config variable {column} is not in the data. Ignoring ...')
+    return output_df
+
+# Helper function to guess column type and update `config`
+def guess_column_type(column_name, series):
+
+    if series.empty:
+        raise ValueError(f"The column {column_name} contains no numeric data.")
+
+    if series.dtype == 'object' or series.dtype.name == 'category':
+        # If the column is categorical (e.g., strings), assume one-hot encoding
+        return {'type': 'one-hot', 'prefix': column_name}
+    elif pd.api.types.is_numeric_dtype(series):
+        unique_values = series.nunique()
+
+        if unique_values < 5:
+            # Few unique values: treat as categorical and one-hot encode
+            return {'type': 'one-hot', 'prefix': column_name}
+
+        elif np.max(series) - np.min(series) > 20:
+            # Wide numeric range: bin into Low/Medium/High with guessed tertiles
+            return guess_low_medium_high(column_name, series)
+        else:
+            # Otherwise, fall back to continuous standardization
+            return {
+                'type': 'continuous',
+                'apply_func': (lambda x: (x - series.mean()) / series.std())  # Z-score standardization
+            }
+    else:
+        # Default fallback (leave the column unchanged)
+        return {'type': 'none'}
+
+
+
 def as_wide_factor(x_df, yes=1, min_factor=2, max_factor=8, keep_original=0, exclude=[]):
     if not yes:
         return x_df
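Note (illustrative, not part of the diff): roughly how the config-driven transform added above could be used, reusing the config format from the embedded docstring. The toy DataFrame and column names are invented, and the import path assumes these helpers remain in metacountregressor/helperprocess.py.

    import pandas as pd
    from metacountregressor.helperprocess import guess_column_type, transform_dataframe

    df = pd.DataFrame({
        'Age': [12, 25, 33, 47, 51, 64, 70, 18],              # wide numeric range -> binned Low/Medium/High
        'Gender': ['M', 'F', 'F', 'M', 'F', 'M', 'M', 'F'],   # categorical -> one-hot encoded
        'Score': [0.2, 0.4, 0.1, 0.9, 0.5, 0.3, 0.7, 0.6],    # narrow numeric range -> z-score standardized
    })

    # Guess a per-column config, then expand the frame into model-ready columns.
    config = {col: guess_column_type(col, df[col]) for col in df.columns}
    wide = transform_dataframe(df, config)
    print(wide.columns.tolist())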
@@ -58,7 +406,7 @@ def PCA_code(X, n_components=5):
 
 
 def interactions(df, keep=None, drop_this_perc=0.6, interact = False):
-
+    full_columns = df.columns
     if interact:
         interactions_list = []
         for i, var_i in enumerate(df.columns):
@@ -84,14 +432,31 @@ def interactions(df, keep=None, drop_this_perc=0.6, interact = False):
         df = pd.concat([df, df_interactions], axis=1, sort=False)
 
     # second
-
+    # Remove `keep` columns from the correlation matrix
+    if keep is not None:
+        missing_columns = [col for col in keep if col not in df.columns]
+
+        if missing_columns:
+            print(f"The following columns are not in the DataFrame and will be ignored: {missing_columns}")
+            keep = [col for col in keep if col not in missing_columns]
+        df_corr = df.drop(columns=keep, errors='ignore', inplace=False)  # Exclude `keep` columns
+    else:
+        df_corr = df
+
+    # Compute the absolute correlation matrix
+    corr_matrix = df_corr.corr().abs()
+
+    # Keep only the upper triangle of the correlation matrix
     upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
 
-    # Find features with correlation greater than
+    # Find features with correlation greater than the threshold
     to_drop = [column for column in upper.columns if any(upper[column] > drop_this_perc)]
+
+    # Ensure original (pre-interaction) columns are not dropped
     if keep is not None:
-        to_drop = [column for column in to_drop if column not in keep]
-
+        to_drop = [column for column in to_drop if column not in full_columns]
+
+    # Drop the identified features
     df.drop(to_drop, axis=1, inplace=True)
 
     return df
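Note (illustrative, not part of the diff): a hedged sketch of calling the reworked interactions() helper. Only fragments of the function appear in this diff, so this is a sketch under that caveat; the column names below are invented.

    import numpy as np
    import pandas as pd
    from metacountregressor.helperprocess import interactions

    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.normal(size=(50, 3)), columns=['AADT', 'LENGTH', 'SPEED'])

    # Build pairwise interaction terms, then prune highly correlated ones.
    # Original columns are protected from dropping, and `keep` columns are
    # excluded from the correlation matrix before the threshold check.
    out = interactions(df, keep=['AADT'], drop_this_perc=0.6, interact=True)
    print(out.columns.tolist())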
@@ -215,3 +580,5 @@ def entries_to_remove(entries, the_dict):
     for key in entries:
         if key in the_dict:
             del the_dict[key]
+
+