metacountregressor 0.1.121__py3-none-any.whl → 0.1.123__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
metacountregressor/helperprocess.py

@@ -2,6 +2,7 @@ import numpy as np
  import pandas as pd
  import csv
  import matplotlib.pyplot as plt
+ from sklearn.preprocessing import StandardScaler

  plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')

@@ -151,6 +152,99 @@ def remove_files(yes=1):
      os.remove('pop_log.csv')


+ # Function to process the DataFrame
+ '''
+ Example usage
+ # Configuration dictionary
+ config = {
+     'Age': {
+         'type': 'bin',
+         'bins': [0, 18, 35, 50, 100],
+         'labels': ['Child', 'YoungAdult', 'MiddleAged', 'Senior'],
+         'prefix': 'Age_Binned'
+     },
+     'Income': {
+         'type': 'bin',
+         'bins': [0, 2000, 5000, 10000],
+         'labels': ['Low', 'Medium', 'High'],
+         'prefix': 'Income_Binned'
+     },
+     'Gender': {
+         'type': 'one-hot',
+         'prefix': 'Gender'
+     },
+     'Score': {
+         'type': 'none'
+     }
+ }
+ '''
+
+
+ def transform_dataframe(df, config):
+     output_df = pd.DataFrame()
+
+     for column, settings in config.items():
+         if settings['type'] == 'bin':
+             # Apply binning
+             binned = pd.cut(
+                 df[column],
+                 bins=settings['bins'],
+                 labels=settings['labels'],
+                 right=False
+             )
+             # One-hot encode the binned column
+             binned_dummies = pd.get_dummies(binned, prefix=settings['prefix'])
+             output_df = pd.concat([output_df, binned_dummies], axis=1)
+
+         elif settings['type'] == 'one-hot':
+             # One-hot encode the column
+             one_hot_dummies = pd.get_dummies(df[column], prefix=settings.get('prefix', column))
+             output_df = pd.concat([output_df, one_hot_dummies], axis=1)
+
+         elif settings['type'] == 'continuous':
+             # Apply function to continuous data
+             data = df[column]
+             if 'bounds' in settings:
+                 # Apply bounds filtering
+                 lower, upper = settings['bounds']
+                 data = data[(data >= lower) & (data <= upper)]
+             if 'apply_func' in settings:
+                 # Apply custom function
+                 data = data.apply(settings['apply_func'])
+             output_df[column] = data
+
+         elif settings['type'] == 'none':
+             # Leave the column unchanged
+             output_df = pd.concat([output_df, df[[column]]], axis=1)
+
+     return output_df
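To make the docstring concrete, here is a minimal, self-contained sketch of driving the new helper; the sample rows are invented, and the config mirrors the docstring example above:

```python
import pandas as pd

from metacountregressor.helperprocess import transform_dataframe

# Invented sample data covering each transform type
df = pd.DataFrame({
    'Age': [12, 25, 40, 70],
    'Gender': ['M', 'F', 'F', 'M'],
    'Score': [0.5, 0.9, 0.3, 0.7],
})

config = {
    'Age': {'type': 'bin', 'bins': [0, 18, 35, 50, 100],
            'labels': ['Child', 'YoungAdult', 'MiddleAged', 'Senior'],
            'prefix': 'Age_Binned'},
    'Gender': {'type': 'one-hot', 'prefix': 'Gender'},
    'Score': {'type': 'none'},
}

out = transform_dataframe(df, config)
# One dummy column per Age bin, Gender_F/Gender_M, and Score passed through
print(out.columns.tolist())
```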
+
+ # Helper function to guess column type and update `config`
+ def guess_column_type(column_name, series):
+     if series.dtype == 'object' or series.dtype.name == 'category':
+         # If the column is categorical (e.g., strings), assume one-hot encoding
+         return {'type': 'one-hot', 'prefix': column_name}
+     elif pd.api.types.is_numeric_dtype(series):
+         unique_values = series.nunique()
+         if unique_values < 10:
+             # If there are few unique values, assume binning with default bins
+             min_val, max_val = series.min(), series.max()
+             bins = np.linspace(min_val, max_val, num=unique_values + 1)
+             labels = [f'Bin_{i}' for i in range(1, len(bins))]
+             return {'type': 'bin', 'bins': bins, 'labels': labels, 'prefix': f'{column_name}_Binned'}
+         else:
+             # Otherwise, fall back to continuous z-score standardization
+             return {
+                 'type': 'continuous',
+                 'apply_func': (lambda x: (x - series.mean()) / series.std())  # Z-score standardization
+             }
+     else:
+         # Default fallback (leave the column unchanged)
+         return {'type': 'none'}
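And a sketch of letting `guess_column_type` seed the config instead of writing it by hand; the DataFrame is invented to exercise each branch:

```python
import numpy as np
import pandas as pd

from metacountregressor.helperprocess import guess_column_type, transform_dataframe

df = pd.DataFrame({
    'Gender': ['M', 'F'] * 10,           # object dtype -> one-hot
    'Lanes': [2, 4] * 10,                # 2 unique values (< 10) -> binned
    'AADT': np.linspace(500, 9000, 20),  # 20 unique values -> continuous z-score
})

config = {col: guess_column_type(col, df[col]) for col in df.columns}
transformed = transform_dataframe(df, config)
```

One caveat worth noting: because `pd.cut` is called with `right=False`, the maximum of a binned column falls outside the last interval and comes back as an all-zero dummy row; widening the final bin edge slightly would avoid that.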
+
+
+
  def as_wide_factor(x_df, yes=1, min_factor=2, max_factor=8, keep_original=0, exclude=[]):
      if not yes:
          return x_df
@@ -330,3 +424,5 @@ def entries_to_remove(entries, the_dict):
      for key in entries:
          if key in the_dict:
              del the_dict[key]
+
+
metacountregressor/main.py

@@ -28,12 +28,60 @@ def convert_df_columns_to_binary_and_wide(df):
      return df


- def process_arguments():
+ def process_arguments(**kwargs):
      '''
      TRYING TO TURN THE CSV FILES INTO RELEVANT ARGS
      '''
-     data_characteristic = pd.read_csv('problem_data.csv')
-     analyst_d = pd.read_csv('decisions.csv')
+     # dataset
+     if kwargs.get('dataset_file', False):
+         dataset = pd.read_csv(kwargs.get('dataset_file'))
+         named_data_headers = dataset.columns.tolist()
+         decision_constants = {name: list(range(7)) for name in named_data_headers}
+         data_info = {
+             'AADT': {
+                 'type': 'continuous',
+                 'bounds': [0.0, np.inf],
+                 'discrete': False,
+                 'apply_func': (lambda x: np.log(x + 1)),
+             },
+             'SPEED': {
+                 'type': 'continuous',
+                 'bounds': [0, 100],
+                 'enforce_bounds': True,
+                 'discrete': True
+             },
+             'TIME': {
+                 'type': 'continuous',
+                 'bounds': [0, 23.999],
+                 'discrete': False
+             }
+         }
+         # Remove ID columns from the dataset
+         dataset = dataset.drop(columns=['ID'])
+         for c in dataset.columns:
+             if c not in data_info.keys():
+                 data_info[c] = {'type': 'categorical'}
+
+         data_new = helperprocess.transform_dataframe(dataset, data_info)
+
+     update_constant = kwargs.get('analyst_constraints')
+     # update the decision_constraints
+
+     data_characteristic = pd.read_csv(kwargs.get('problem_data', 'problem_data.csv'))
+     # Extract the column as a list of characteristic names
+     name_data_characteristics = data_characteristic.columns.tolist()
+
+     # Create the dictionary
+     decision_constraints = {name: list(range(7)) for name in name_data_characteristics}
+
+     print('this gets all the features, I need to remove...')
+
+     analyst_d = pd.read_csv(kwargs.get('decison_constraints', 'decisions.csv'))
      hyper = pd.read_csv('setup_hyper.csv')

      new_data = {'data': data_characteristic,
@@ -41,7 +89,7 @@ def process_arguments():
                  'hyper': hyper}
      return new_data

- def process_package_argumemnts():
+ def process_package_arguments():

      new_data = {}
      pass
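Taken together, the hunks above turn `process_arguments` into a kwargs-driven entry point. A sketch of a call site, assuming the referenced CSVs exist (the paths are illustrative; note that the `decison_constraints` key must be spelled exactly as the code expects):

```python
new_data = process_arguments(
    dataset_file='data/Ex-16-3.csv',      # enables the dataset branch
    problem_data='problem_data.csv',      # same as the default fallback
    decison_constraints='decisions.csv',  # spelling matches the code above
    analyst_constraints=None,
)
hyper, analyst = new_data['hyper'], new_data['analyst']
```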
@@ -319,8 +367,8 @@ def main(args, **kwargs):
          x_df = helperprocess.interactions(x_df, keep)


-     else:  # the dataset has been selected in the program as something else
-         data_info = process_arguments()
+     elif dataset == 10:  # the dataset has been selected in the program as something else
+         data_info = process_arguments(**args)
          data_info['hyper']
          data_info['analyst']
          data_info['data']['Y']
@@ -339,6 +387,10 @@ def main(args, **kwargs):
          y_df = df[[data_info['data']['Y'][0]]]
          y_df.rename(columns={data_info['data']['Y'][0]: "Y"}, inplace=True)
          print('test')  # FIXME
+     else:
+         print('PROCESS THE PACKAGE ARGUMENTS SIMILAR TO HOW ONE WOULD DEFINE THE ENVIRONMENT')
+         data_info = process_package_arguments()
+

      if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
          if manual_fit_spec is None:
@@ -449,6 +501,8 @@ if __name__ == '__main__':
      BATCH_JOB = True

      if BATCH_JOB:
+         parser.add_argument('-dataset_file', default='data/Ex-16-3.csv', help='supply the path to the dataset')
+
          parser.add_argument('-line', type=int, default=1,
                              help='line to read in csv to pass in argument')

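On the batch-job path, the new `-dataset_file` flag rides along with the existing argparse options, e.g. `python main.py -dataset_file data/Ex-16-3.csv -line 1` (the script name is assumed here; the diff does not show which entry point parses these arguments).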
@@ -463,6 +517,7 @@ if __name__ == '__main__':
          line_number_obs += 1
      args = dict(args)

+
      for key, value in args.items():
          try:
              # Attempt to parse the string value to a Python literal if value is a string.
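The loop body is cut off at the hunk boundary. The comment points at literal parsing, for which `ast.literal_eval` is the standard tool; a minimal sketch, assuming that is roughly what the elided code does:

```python
import ast

args = {'line': '1', 'com': 'MetaCode', 'override': 'True'}  # illustrative values
for key, value in args.items():
    try:
        # Attempt to parse the string value to a Python literal if value is a string.
        if isinstance(value, str):
            args[key] = ast.literal_eval(value)  # '1' -> 1, 'True' -> True
    except (ValueError, SyntaxError):
        # Bare words like 'MetaCode' are not valid literals; keep them as strings
        pass
```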
@@ -479,7 +534,7 @@ if __name__ == '__main__':
          if "-algorithm" in action.option_strings:
              parser._optionals._actions[i].help = "optimization algorithm"

-     override = False
+     override = True
      if override:
          print('WARNING: TESTING ENVIRONMENT, TURN OFF FOR RELEASE')
          parser.add_argument('-problem_number', default='10')
@@ -494,9 +549,10 @@ if __name__ == '__main__':
          parser.add_argument('-seperate_out_factors', action='store_false', default=False,
                              help='True if wanting to split data that is potentially categorical as binary,'
                                   ' i.e. we want to split the data for processing')
-         parser.add_argument('-supply_csv', type = str, help = 'enter the name of the csv, please include it as a full directorys')
+         parser.add_argument('-supply_csv', type=str, help='enter the name of the csv; please include the full directory path')

      else:  # DIDN'T SPECIFY LINES, TRY EACH ONE MANUALLY
+         print("RUNNING WITH ARGS")
          parser.add_argument('-com', type=str, default='MetaCode',
                              help='line to read csv')

metacountregressor-0.1.123.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: metacountregressor
- Version: 0.1.121
+ Version: 0.1.123
  Summary: Extensions for a Python package for estimation of count models.
  Home-page: https://github.com/zahern/CountDataEstimation
  Author: Zeke Ahern
metacountregressor-0.1.123.dist-info/RECORD

@@ -3,8 +3,8 @@ metacountregressor/_device_cust.py,sha256=759fnKmTYccJm4Lpi9_1reurh6OB9d6q9soPR0
  metacountregressor/app_main.py,sha256=vY3GczTbGbBRalbzMkl_9jVW7RMgEOc6z2Dr1IZJv9c,10014
  metacountregressor/data_split_helper.py,sha256=M2fIMdIO8znUaYhx5wlacRyNWdQjNYu1z1wkE-kFUYU,3373
  metacountregressor/halton.py,sha256=jhovA45UBoZYU9g-hl6Lb2sBIx_ZBTNdPrpgkzR9fng,9463
- metacountregressor/helperprocess.py,sha256=Sc5gJ7ffFlkya5B5KQwE33xxXuIQyF6OaYtSikLa3pQ,12968
- metacountregressor/main.py,sha256=37yw2weAhaDR-wH83QC4Jy8SeUFIHpxqhO9YPwgmRi4,20764
+ metacountregressor/helperprocess.py,sha256=4aSoyKP1GfzjwCzZ_dXlTbokOiMt_8sbzB6_tu0GPDg,16290
+ metacountregressor/main.py,sha256=A3XGwbwhhKVgMxnEgbAmMpgYaWkS8Rk30-cYs3FxvEk,22713
  metacountregressor/main_old.py,sha256=eTS4ygq27MnU-dZ_j983Ucb-D5XfbVF8OJQK2hVVLZc,24123
  metacountregressor/metaheuristics.py,sha256=Kkx1Jfox6NBlm5zVrI26Vc_NI7NFQSS9dinrZU9SpV8,105871
  metacountregressor/pareto_file.py,sha256=whySaoPAUWYjyI8zo0hwAOa3rFk6SIUlHSpqZiLur0k,23096
@@ -13,8 +13,8 @@ metacountregressor/setup.py,sha256=8w6IqX0tJsbYrOI1BJLIJCIvOnunKli5I9fsF5PhHv4,9
  metacountregressor/single_objective_finder.py,sha256=jVG7GJBqzSP4_riYr-kMMKy_LE3SlGmKMunNhHYxgRg,8011
  metacountregressor/solution.py,sha256=OJqB00cvGMLFei6RsjphPamOdLm3EWOOzK7k-uVbvFY,277671
  metacountregressor/test_generated_paper2.py,sha256=pwOoRzl1jJIIOUAAvbkT6HmmTQ81mwpsshn9SLdKOg8,3927
- metacountregressor-0.1.121.dist-info/LICENSE.txt,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
- metacountregressor-0.1.121.dist-info/METADATA,sha256=c-c5mHUC6gdf2JEq-DWBuw0F1gAp-Cq0pQeYVLKG_y8,23415
- metacountregressor-0.1.121.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- metacountregressor-0.1.121.dist-info/top_level.txt,sha256=zGG7UC5WIpr76gsFUpwJ4En2aCcoNTONBaS3OewwjR0,19
- metacountregressor-0.1.121.dist-info/RECORD,,
+ metacountregressor-0.1.123.dist-info/LICENSE.txt,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ metacountregressor-0.1.123.dist-info/METADATA,sha256=e4jQ9vtFxhHtA98q1Vd8PJ9gJiIz91iSUKgGPt78kg8,23415
+ metacountregressor-0.1.123.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ metacountregressor-0.1.123.dist-info/top_level.txt,sha256=zGG7UC5WIpr76gsFUpwJ4En2aCcoNTONBaS3OewwjR0,19
+ metacountregressor-0.1.123.dist-info/RECORD,,