metacountregressor 0.1.119__tar.gz → 0.1.121__tar.gz
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/PKG-INFO +1 -1
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/helperprocess.py +96 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/main.py +64 -8
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor.egg-info/PKG-INFO +1 -1
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/LICENSE.txt +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/README.rst +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/__init__.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/_device_cust.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/app_main.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/data_split_helper.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/halton.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/main_old.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/metaheuristics.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/pareto_file.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/pareto_logger__plot.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/setup.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/single_objective_finder.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/solution.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/test_generated_paper2.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor.egg-info/SOURCES.txt +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor.egg-info/dependency_links.txt +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor.egg-info/not-zip-safe +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor.egg-info/requires.txt +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor.egg-info/top_level.txt +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/setup.cfg +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/setup.py +0 -0
- {metacountregressor-0.1.119 → metacountregressor-0.1.121}/tests/test.py +0 -0
{metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/helperprocess.py
RENAMED
@@ -2,6 +2,7 @@ import numpy as np
 import pandas as pd
 import csv
 import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler
 
 plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
 
@@ -151,6 +152,99 @@ def remove_files(yes=1):
     os.remove('pop_log.csv')
 
 
+# Function to process the DataFrame
+'''
+Example usage
+# Configuration dictionary
+config = {
+    'Age': {
+        'type': 'bin',
+        'bins': [0, 18, 35, 50, 100],
+        'labels': ['Child', 'YoungAdult', 'MiddleAged', 'Senior'],
+        'prefix': 'Age_Binned'
+    },
+    'Income': {
+        'type': 'bin',
+        'bins': [0, 2000, 5000, 10000],
+        'labels': ['Low', 'Medium', 'High'],
+        'prefix': 'Income_Binned'
+    },
+    'Gender': {
+        'type': 'one-hot',
+        'prefix': 'Gender'
+    },
+    'Score': {
+        'type': 'none'
+    }
+}
+'''
+
+
+def transform_dataframe(df, config):
+    output_df = pd.DataFrame()
+
+    for column, settings in config.items():
+        if settings['type'] == 'bin':
+            # Apply binning
+            binned = pd.cut(
+                df[column],
+                bins=settings['bins'],
+                labels=settings['labels'],
+                right=False
+            )
+            # One-hot encode the binned column
+            binned_dummies = pd.get_dummies(binned, prefix=settings['prefix'])
+            output_df = pd.concat([output_df, binned_dummies], axis=1)
+
+        elif settings['type'] == 'one-hot':
+            # One-hot encode the column
+            one_hot_dummies = pd.get_dummies(df[column], prefix=settings.get('prefix', column))
+            output_df = pd.concat([output_df, one_hot_dummies], axis=1)
+
+        elif settings['type'] == 'continuous':
+            # Apply function to continuous data
+            data = df[column]
+            if 'bounds' in settings:
+                # Apply bounds filtering
+                lower, upper = settings['bounds']
+                data = data[(data >= lower) & (data <= upper)]
+            if 'apply_func' in settings:
+                # Apply custom function
+                data = data.apply(settings['apply_func'])
+            output_df[column] = data
+
+        elif settings['type'] == 'none':
+            # Leave the column unchanged
+            output_df = pd.concat([output_df, df[[column]]], axis=1)
+
+    return output_df
+
+# Helper function to guess column type and update `config`
+def guess_column_type(column_name, series):
+    if series.dtype == 'object' or series.dtype.name == 'category':
+        # If the column is categorical (e.g., strings), assume one-hot encoding
+        return {'type': 'one-hot', 'prefix': column_name}
+    elif pd.api.types.is_numeric_dtype(series):
+        unique_values = series.nunique()
+        if unique_values < 10:
+            # If there are few unique values, assume binning with default bins
+            min_val, max_val = series.min(), series.max()
+            bins = np.linspace(min_val, max_val, num=unique_values + 1)
+            labels = [f'Bin_{i}' for i in range(1, len(bins))]
+            return {'type': 'bin', 'bins': bins, 'labels': labels, 'prefix': f'{column_name}_Binned'}
+        else:
+            # Otherwise, fall back to continuous z-score standardization
+            return {
+                'type': 'continuous',
+                'apply_func': (lambda x: (x - series.mean()) / series.std())  # Z-score standardization
+            }
+    else:
+        # Default fallback (leave the column unchanged)
+        return {'type': 'none'}
+
+
+
 def as_wide_factor(x_df, yes=1, min_factor=2, max_factor=8, keep_original=0, exclude=[]):
     if not yes:
         return x_df
@@ -330,3 +424,5 @@ def entries_to_remove(entries, the_dict):
     for key in entries:
         if key in the_dict:
             del the_dict[key]
+
+
{metacountregressor-0.1.119 → metacountregressor-0.1.121}/metacountregressor/main.py
RENAMED
@@ -28,12 +28,60 @@ def convert_df_columns_to_binary_and_wide(df):
     return df
 
 
-def process_arguments():
+def process_arguments(**kwargs):
     '''
     TRYING TO TURN THE CSV FILES INTO RELEVANT ARGS
     '''
-
-
+    # dataset
+    if kwargs.get('dataset_file', False):
+        dataset = pd.read_csv(kwargs.get('dataset_file'))
+        named_data_headers = dataset.columns.tolist()
+        decision_constants = {name: list(range(7)) for name in named_data_headers}
+        data_info = {
+            'AADT': {
+                'type': 'continuous',
+                'bounds': [0.0, np.infty],
+                'discrete': False,
+                'apply_func': (lambda x: np.log(x + 1)),
+            },
+            'SPEED': {
+                'type': 'continuous',
+                'bounds': [0, 100],
+                'enforce_bounds': True,
+                'discrete': True
+            },
+            'TIME': {
+                'type': 'continuous',
+                'bounds': [0, 23.999],
+                'discrete': False
+            }
+        }
+        # remove ID columns from the dataset
+        dataset = dataset.drop(columns=['ID'])
+        for c in dataset.columns:
+            if c not in data_info.keys():
+                data_info[c] = {'type': 'categorical'}
+
+        data_new = helperprocess.transform_dataframe(dataset, data_info)
+
+        update_constant = kwargs.get('analyst_constraints')
+        # update the decision constraints
+
+    data_characteristic = pd.read_csv(kwargs.get('problem_data', 'problem_data.csv'))
+    # Extract the column as a list of characteristic names
+    name_data_characteristics = data_characteristic.columns.tolist()
+
+    # Create the dictionary
+    decision_constraints = {name: list(range(7)) for name in name_data_characteristics}
+
+    print('this gets all the features, I need to remove...')
+
+    analyst_d = pd.read_csv(kwargs.get('decison_constraints', 'decisions.csv'))
     hyper = pd.read_csv('setup_hyper.csv')
 
     new_data = {'data': data_characteristic,
@@ -41,7 +89,7 @@ def process_arguments():
                 'hyper': hyper}
     return new_data
 
-def
+def process_package_arguments():
 
     new_data = {}
     pass
@@ -319,8 +367,8 @@ def main(args, **kwargs):
         x_df = helperprocess.interactions(x_df, keep)
 
 
-
-    data_info = process_arguments()
+    elif dataset == 10:  # the dataset has been selected in the program as something else
+        data_info = process_arguments(**args)
     data_info['hyper']
     data_info['analyst']
     data_info['data']['Y']
@@ -339,6 +387,10 @@ def main(args, **kwargs):
         y_df = df[[data_info['data']['Y'][0]]]
         y_df.rename(columns={data_info['data']['Y'][0]: "Y"}, inplace=True)
         print('test')  # FIXME
+    else:
+        print('PROCESS THE PACKAGE ARGUMENTS SIMILAR TO HOW ONE WOULD DEFINE THE ENVIRONMENT')
+        data_info = process_package_arguments()
+
 
     if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
         if manual_fit_spec is None:
@@ -449,6 +501,8 @@ if __name__ == '__main__':
     BATCH_JOB = True
 
     if BATCH_JOB:
+        parser.add_argument('-dataset_file', default='data/Ex-16-3.csv', help='supply the path to the dataset')
+
         parser.add_argument('-line', type=int, default=1,
                             help='line to read in csv to pass in argument')
 
@@ -463,6 +517,7 @@ if __name__ == '__main__':
             line_number_obs += 1
         args = dict(args)
 
+
        for key, value in args.items():
            try:
                # Attempt to parse the string value to a Python literal if value is a string.
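
The loop above coerces each batch argument from its CSV string form into a Python value. The diff only shows the comment, not the parser, so the following is a sketch of that pattern under the assumption that ast.literal_eval does the parsing:

    import ast

    args = {'line': '1', 'algorithm': "'hs'", 'problem_number': '10'}
    for key, value in args.items():
        try:
            # Parse the string value to a Python literal where possible.
            args[key] = ast.literal_eval(value) if isinstance(value, str) else value
        except (ValueError, SyntaxError):
            pass  # keep non-literal strings (e.g. bare words) as-is
    print(args)  # {'line': 1, 'algorithm': 'hs', 'problem_number': 10}
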
@@ -479,7 +534,7 @@ if __name__ == '__main__':
            if "-algorithm" in action.option_strings:
                parser._optionals._actions[i].help = "optimization algorithm"
 
-        override =
+        override = True
        if override:
            print('WARNING: TESTING ENVIRONMENT, TURN OFF FOR RELEASE')
            parser.add_argument('-problem_number', default='10')
@@ -494,9 +549,10 @@ if __name__ == '__main__':
        parser.add_argument('-seperate_out_factors', action='store_false', default=False,
                            help='True if wanting to split data that is potentially categorical as binary'
                                 ' we want to split the data for processing')
-        parser.add_argument('-supply_csv', type = str, help = 'enter the name of the csv, please include it as a full
+        parser.add_argument('-supply_csv', type=str, help='enter the name of the csv, please include the full directory path')
 
    else:  # DIDN'T SPECIFY LINES, TRY EACH ONE MANUALLY
+        print("RUNNING WITH ARGS")
        parser.add_argument('-com', type=str, default='MetaCode',
                            help='line to read csv')