metacountregressor 0.1.73__py3-none-any.whl → 0.1.83__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,145 @@
+ from os.path import exists
  import numpy as np
  import pandas as pd
  import csv
  import matplotlib.pyplot as plt
+ from scipy import stats as st
+ from sklearn.preprocessing import StandardScaler
+

  plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')


+
+
+ from itertools import product
+
+ # Function to create a list of dictionaries from a parameter grid
+ def generate_param_combinations(param_grid):
+     keys = param_grid.keys()
+     values = param_grid.values()
+     combinations = [dict(zip(keys, v)) for v in product(*values)]
+     return combinations
+
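A minimal usage sketch of the helper above (the grid keys are illustrative, not taken from the package):

    param_grid = {'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}
    combos = generate_param_combinations(param_grid)
    # combos -> [{'learning_rate': 0.01, 'max_depth': 3}, {'learning_rate': 0.01, 'max_depth': 5},
    #            {'learning_rate': 0.1, 'max_depth': 3}, {'learning_rate': 0.1, 'max_depth': 5}]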
+
+ ## Select the best features based on a univariate F-test (SelectKBest)
+ def select_features(X_train, y_train, n_f=16):
+     try:
+         from sklearn.feature_selection import SelectKBest
+         from sklearn.feature_selection import f_regression
+         feature_names = X_train.columns
+         # configure the selector to keep the n_f highest-scoring features
+         fs = SelectKBest(score_func=f_regression, k=n_f)
+
+         # learn the relationship from the training data
+         fs.fit(X_train, y_train)
+
+         mask = fs.get_support()  # Boolean array of selected features
+         selected_features = [feature for keep, feature in zip(mask, feature_names) if keep]
+         X_train = X_train[selected_features]
+     except ImportError:
+         print('import error, not performing feature selection')
+         fs = X_train.columns  # TODO: check if this is actually getting the names
+
+     return X_train, fs
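A hedged usage sketch, assuming X_train is a pandas DataFrame of predictors and y_train the target column (the names are illustrative):

    X_reduced, selector = select_features(X_train, y_train, n_f=10)
    print(X_reduced.columns.tolist())  # the 10 highest-scoring predictors, if sklearn is available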
+
+
+ # Cuts off correlated data
+ def findCorrelation(corr, cutoff=0.9, exact=None):
+     """
+     This function is the Python implementation of the R function
+     `findCorrelation()`.
+
+     Relies on numpy and pandas, so must have them pre-installed.
+
+     It searches through a correlation matrix and returns a list of column names
+     to remove to reduce pairwise correlations.
+
+     For the documentation of the R function, see
+     https://www.rdocumentation.org/packages/caret/topics/findCorrelation
+     and for the source code of `findCorrelation()`, see
+     https://github.com/topepo/caret/blob/master/pkg/caret/R/findCorrelation.R
+
+     -----------------------------------------------------------------------------
+
+     Parameters:
+     -----------
+     corr: pandas dataframe.
+         A correlation matrix as a pandas dataframe.
+     cutoff: float, default: 0.9.
+         A numeric value for the pairwise absolute correlation cutoff.
+     exact: bool, default: None
+         A boolean value that determines whether the average correlations are
+         recomputed at each step.
+     -----------------------------------------------------------------------------
+     Returns:
+     --------
+     list of column names
+     -----------------------------------------------------------------------------
+     Example:
+     --------
+     R1 = pd.DataFrame({
+         'x1': [1.0, 0.86, 0.56, 0.32, 0.85],
+         'x2': [0.86, 1.0, 0.01, 0.74, 0.32],
+         'x3': [0.56, 0.01, 1.0, 0.65, 0.91],
+         'x4': [0.32, 0.74, 0.65, 1.0, 0.36],
+         'x5': [0.85, 0.32, 0.91, 0.36, 1.0]
+     }, index=['x1', 'x2', 'x3', 'x4', 'x5'])
+
+     findCorrelation(R1, cutoff=0.6, exact=False)  # ['x4', 'x5', 'x1', 'x3']
+     findCorrelation(R1, cutoff=0.6, exact=True)   # ['x1', 'x5', 'x4']
+     """
+
+     def _findCorrelation_fast(corr, avg, cutoff):
+
+         combsAboveCutoff = corr.where(lambda x: (np.tril(x) == 0) & (x > cutoff)).stack().index
+
+         rowsToCheck = combsAboveCutoff.get_level_values(0)
+         colsToCheck = combsAboveCutoff.get_level_values(1)
+
+         msk = avg[colsToCheck] > avg[rowsToCheck].values
+         deletecol = pd.unique(np.r_[colsToCheck[msk], rowsToCheck[~msk]]).tolist()
+
+         return deletecol
+
+     def _findCorrelation_exact(corr, avg, cutoff):
+
+         x = corr.loc[(*[avg.sort_values(ascending=False).index] * 2,)]
+
+         if (x.dtypes.values[:, None] == ['int64', 'int32', 'int16', 'int8']).any():
+             x = x.astype(float)
+
+         x.values[(*[np.arange(len(x))] * 2,)] = np.nan
+
+         deletecol = []
+         for ix, i in enumerate(x.columns[:-1]):
+             for j in x.columns[ix + 1:]:
+                 if x.loc[i, j] > cutoff:
+                     if x[i].mean() > x[j].mean():
+                         deletecol.append(i)
+                         x.loc[i] = x[i] = np.nan
+                     else:
+                         deletecol.append(j)
+                         x.loc[j] = x[j] = np.nan
+         return deletecol
+
+     # Work on absolute correlations and dispatch to the exact or fast variant
+     acorr = corr.abs()
+     avg = acorr.mean()
+     if exact or (exact is None and corr.shape[1] < 100):
+         return _findCorrelation_exact(acorr, avg, cutoff)
+     else:
+         return _findCorrelation_fast(acorr, avg, cutoff)
+
+
+ def clean_data_types(df):
+     """Coerce object columns to numeric, setting unparseable values to NaN."""
+     for col in df.columns:
+         if df[col].dtype == 'object':
+             # Attempt to convert the column to a numeric type
+             df[col] = pd.to_numeric(df[col], errors='coerce')
+     return df
+
+
  def drop_correlations(x_df, percentage=0.85):
      cor_matrix = x_df.corr().abs()
      upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))  # type: ignore
@@ -36,6 +170,220 @@ def remove_files(yes=1):
      os.remove('pop_log.csv')


+ # Function to process the DataFrame
+ '''
+ Example usage
+ # Configuration dictionary
+ config = {
+     'Age': {
+         'type': 'bin',
+         'bins': [0, 18, 35, 50, 100],
+         'labels': ['Child', 'YoungAdult', 'MiddleAged', 'Senior'],
+         'prefix': 'Age_Binned'
+     },
+     'Income': {
+         'type': 'bin',
+         'bins': [0, 2000, 5000, 10000],
+         'labels': ['Low', 'Medium', 'High'],
+         'prefix': 'Income_Binned'
+     },
+     'Gender': {
+         'type': 'one-hot',
+         'prefix': 'Gender'
+     },
+     'Score': {
+         'type': 'none'
+     }
+ }
+ '''
+ def null_handler(vari):
+     # Return the value if it is set; otherwise report it and fall back to None
+     if vari is not None:
+         return vari
+     else:
+         print(f'{vari} does not exist, setting it to None..')
+         return None
+
+
+
+ def set_up_analyst_constraints(data_characteristic, model_terms, variable_decisions_alt=None):
+
+     name_data_characteristics = data_characteristic.columns.tolist()
+     # Get the non-None model terms as a list
+     non_none_terms = [value for value in model_terms.values() if value is not None]
+     # Candidate variables are the data columns not already used as model terms
+     result = [item for item in name_data_characteristics if item not in non_none_terms]
+     distu = ['normal', 'uniform', 'triangular']
+     tra = ['no', 'sqrt', 'arcsinh']
+     if model_terms.get('group') is None:
+         print("can't have a grouped random-parameters model, removing Level 4 from every item")
+         MAKE_ALL_4_FALSE = True
+     else:
+         MAKE_ALL_4_FALSE = False
+
+     variable_decisions = {
+         name: {
+             'levels': list(range(6)),
+             'Distributions': distu,
+             'Transformations': tra
+         }
+         for name in result
+     }
+     # Override elements in the original dictionary with the alt dictionary
+     if variable_decisions_alt is not None:
+         for key, alt_value in variable_decisions_alt.items():
+             if key in variable_decisions:
+                 # Update the existing entry
+                 variable_decisions[key].update(alt_value)
+             else:
+                 # Add a new entry if it doesn't exist
+                 variable_decisions[key] = alt_value
+     # Prepare the data for the DataFrame
+     rows = []
+     for column_name, details in variable_decisions.items():
+         # Create a row dictionary
+         row = {'Column': column_name}
+
+         # Add levels as True/False for Level 0 through Level 5
+         for level in range(6):
+             if level == 4 and MAKE_ALL_4_FALSE:
+                 row[f'Level {level}'] = False
+             else:
+                 row[f'Level {level}'] = level in details['levels']
+
+         # Add distributions and transformations as strings
+         row['Distributions'] = str(details['Distributions'])
+         row['Transformations'] = str(details['Transformations'])
+
+         rows.append(row)
+
+     # Create the DataFrame
+     df = pd.DataFrame(rows)
+
+     data_new = data_characteristic.rename(columns={v: k for k, v in model_terms.items() if v in data_characteristic.columns})
+     return df, data_new
+
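A usage sketch under stated assumptions: only the 'group' key is inspected explicitly by the function above, so the other model_terms keys and the DataFrame name are hypothetical placeholders.

    model_terms = {'dependent': 'Crashes', 'group': None}
    constraints_df, renamed_data = set_up_analyst_constraints(data, model_terms)
    # constraints_df: one row per candidate column with Level 0-5 flags plus the allowed
    # Distributions and Transformations; renamed_data: the input data with any columns
    # matching a model_terms value renamed to that term's key.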
+ # Function to guess Low, Medium, High ranges
+ def guess_low_medium_high(column_name, series):
+     # Compute the tertiles (33rd and 66th percentiles)
+     # mode_value = st.mode(series)  # Get the most frequent value
+     # series = pd.to_numeric(series, errors='coerce').fillna(mode_value)
+     low_threshold = np.quantile(series, 0.33)
+     high_threshold = np.quantile(series, 0.66)
+
+     # Define the bins and labels
+     bins = [np.min(series) - 1, low_threshold, high_threshold, np.max(series)]
+     # Handle duplicate bins by adjusting labels
+     if len(set(bins)) < len(bins):  # Check for duplicate bin edges
+         if low_threshold == high_threshold:
+             # Collapse to two bins (Low and High)
+             bins = [np.min(series) - 1, low_threshold, np.max(series)]
+             labels = ['Low', 'High']
+         else:
+             # Collapse to the unique bin edges
+             bins = sorted(set(bins))  # Remove duplicate edges
+             labels = [f'Bin {i + 1}' for i in range(len(bins) - 1)]
+     else:
+         # Standard case: Low, Medium, High
+         labels = ['Low', 'Medium', 'High']
+
+     return {
+         'type': 'bin',
+         'bins': bins,
+         'labels': labels,
+         'prefix': f'{column_name}'
+     }
+
+ def transform_dataframe(df, config):
+     output_df = pd.DataFrame()
+
+     for column, settings in config.items():
+         if settings['type'] == 'bin':
+             # Apply binning: get unique bin edges (remove duplicates)
+             unique_bins = sorted(set(settings['bins']))
+
+             # Adjust labels if necessary
+             if len(unique_bins) - 1 != len(settings['labels']):
+                 print(f"Adjusting labels to match bins: {len(unique_bins) - 1} bins detected.")
+                 labels = [f'Bin {i + 1}' for i in range(len(unique_bins) - 1)]
+             else:
+                 labels = settings['labels']
+
+             # Perform the binning
+             binned_d = pd.cut(
+                 df[column],
+                 bins=unique_bins,   # Deduplicated bins
+                 labels=labels,      # Adjusted or original labels
+                 right=False         # Adjust based on whether to include the right edge
+             )
+             # One-hot encode the binned column
+             binned_dummies = pd.get_dummies(binned_d, prefix=settings['prefix'])
+             output_df = pd.concat([output_df, binned_dummies], axis=1)
+
+         elif settings['type'] == 'one-hot':
+             # One-hot encode the column
+             one_hot_dummies = pd.get_dummies(df[column], prefix=settings.get('prefix', column))
+             output_df = pd.concat([output_df, one_hot_dummies], axis=1)
+
+         elif settings['type'] == 'continuous':
+             # Apply optional bounds and a custom function to continuous data
+             data = df[column]
+             if 'bounds' in settings:
+                 # Apply bounds filtering
+                 lower, upper = settings['bounds']
+                 data = data[(data >= lower) & (data <= upper)]
+             if 'apply_func' in settings:
+                 # Apply the custom function
+                 data = data.apply(settings['apply_func'])
+             output_df[column] = data
+
+         elif settings['type'] == 'none':
+             # Leave the column unchanged
+             if column in df.columns:
+                 output_df = pd.concat([output_df, df[[column]]], axis=1)
+             else:
+                 print(f'config variable {column} is not in the data. Ignoring ...')
+     return output_df
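Under the example configuration shown earlier in this file, a call could look like this (the DataFrame values are illustrative):

    df = pd.DataFrame({'Age': [12, 40, 67],
                       'Income': [1500, 4200, 9000],
                       'Gender': ['F', 'M', 'F'],
                       'Score': [0.2, 0.5, 0.9]})
    encoded = transform_dataframe(df, config)  # config as in the example block above
    # encoded holds one-hot columns such as Age_Binned_Child and Gender_F, plus Score unchanged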
+
+ # Helper function to guess a column's type and build its `config` entry
+ def guess_column_type(column_name, series):
+
+     if series.empty:
+         raise ValueError(f"The column {column_name} contains no numeric data.")
+
+     if series.dtype == 'object' or series.dtype.name == 'category':
+         # If the column is categorical (e.g., strings), assume one-hot encoding
+         return {'type': 'one-hot', 'prefix': column_name}
+     elif pd.api.types.is_numeric_dtype(series):
+         unique_values = series.nunique()
+
+         if unique_values < 5:
+             # Few unique values: treat as categorical and one-hot encode
+             return {'type': 'one-hot', 'prefix': column_name}
+         elif np.max(series) - np.min(series) > 20:
+             # Wide numeric range: bin into Low/Medium/High
+             return guess_low_medium_high(column_name, series)
+         else:
+             # Otherwise, fall back to continuous standardization
+             return {
+                 'type': 'continuous',
+                 'apply_func': (lambda x: (x - series.mean()) / series.std())  # Z-score standardization
+             }
+     else:
+         # Default fallback (leave the column unchanged)
+         return {'type': 'none'}
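The two guess helpers can be chained with transform_dataframe to auto-encode a frame; a brief sketch, assuming df holds only the columns to be encoded:

    config = {col: guess_column_type(col, df[col]) for col in df.columns}
    encoded = transform_dataframe(df, config)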
+
+
  def as_wide_factor(x_df, yes=1, min_factor=2, max_factor=8, keep_original=0, exclude=[]):
      if not yes:
          return x_df
@@ -58,7 +406,7 @@ def PCA_code(X, n_components=5):


  def interactions(df, keep=None, drop_this_perc=0.6, interact = False):
-
+     full_columns = df.columns
      if interact:
          interactions_list = []
          for i, var_i in enumerate(df.columns):
@@ -84,14 +432,31 @@ def interactions(df, keep=None, drop_this_perc=0.6, interact = False):
          df = pd.concat([df, df_interactions], axis=1, sort=False)

      # second
-     corr_matrix = df.corr().abs()
+     # Remove `keep` columns from the correlation matrix
+     if keep is not None:
+         missing_columns = [col for col in keep if col not in df.columns]
+
+         if missing_columns:
+             print(f"The following columns are not in the DataFrame and will be ignored: {missing_columns}")
+             keep = [col for col in keep if col not in missing_columns]
+         df_corr = df.drop(columns=keep, errors='ignore', inplace=False)  # Exclude `keep` columns
+     else:
+         df_corr = df
+
+     # Compute the absolute correlation matrix
+     corr_matrix = df_corr.corr().abs()
+
+     # Keep only the upper triangle of the correlation matrix
      upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

-     # Find features with correlation greater than 0.6
+     # Find features with correlation greater than the threshold
      to_drop = [column for column in upper.columns if any(upper[column] > drop_this_perc)]
+
+     # Only drop newly created interaction terms; columns already in the original frame are retained
      if keep is not None:
-         to_drop = [column for column in to_drop if column not in keep]
-         # Drop features
+         to_drop = [column for column in to_drop if column not in full_columns]
+
+     # Drop the identified features
      df.drop(to_drop, axis=1, inplace=True)

      return df
  return df
@@ -215,3 +580,5 @@ def entries_to_remove(entries, the_dict):
      for key in entries:
          if key in the_dict:
              del the_dict[key]
+
+