metacountregressor 0.1.73__py3-none-any.whl → 0.1.83__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- metacountregressor/app_main.py +258 -0
- metacountregressor/data_split_helper.py +90 -0
- metacountregressor/helperprocess.py +372 -5
- metacountregressor/main.py +297 -117
- metacountregressor/metaheuristics.py +43 -31
- metacountregressor/setup.py +3 -2
- metacountregressor/solution.py +734 -832
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/METADATA +256 -35
- metacountregressor-0.1.83.dist-info/RECORD +20 -0
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/WHEEL +1 -1
- metacountregressor-0.1.73.dist-info/RECORD +0 -18
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/LICENSE.txt +0 -0
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.83.dist-info}/top_level.txt +0 -0
metacountregressor/main.py (CHANGED)
```diff
@@ -9,14 +9,12 @@ import numpy as np
 import pandas as pd
 from pandas import DataFrame
 from pandas.io.parsers import TextFileReader
-
 import helperprocess
 from metaheuristics import (differential_evolution,
                             harmony_search,
                             simulated_annealing)
 from solution import ObjectiveFunction
 
-from test_motor import *
 
 warnings.simplefilter("ignore")
 
@@ -30,14 +28,144 @@ def convert_df_columns_to_binary_and_wide(df):
     return df
 
 
+
+
+
+
+def process_arguments(**kwargs):
+    '''
+    TRYING TO TURN THE CSV FILES INTO RELEVANT ARGS
+    '''
+    #dataset
+    '''
+    if kwargs.get('dataset_file', False
+                  ):
+        dataset = pd.read_csv(kwargs.get('dataset_file'))
+        named_data_headers = dataset.columns.tolist()
+        decision_constants = {name: list(range(7)) for name in named_data_headers}
+        data_info = {
+
+
+            'AADT': {
+                'type': 'continuous',
+                'bounds': [0.0, np.infty],
+                'discrete': False,
+                'apply_func': (lambda x: np.log(x + 1)),
+            },
+            'SPEED': {
+                'type': 'continuous',
+                'bounds': [0, 100],
+                'enforce_bounds': True,
+                'discrete': True
+            },
+            'TIME': {
+                'type': 'continuous',
+                'bounds': [0, 23.999],
+                'discrete': False
+            }
+        }
+        #remove ID CoLUMNS from dataset
+        dataset = dataset.drop(columns = [
+            'ID'
+        ])
+        for c in dataset.columns:
+            if c not in data_info.keys():
+                data_info[c] = {'type': 'categorical'}
+
+        data_new =helperprocess.transform_dataframe(dataset,data_info)
+
+        update_constant = kwargs.get('analyst_constraints')
+        #update the decision_constraints
+    '''
+    data_characteristic = pd.read_csv(kwargs.get('problem_data', 'problem_data.csv'))
+    # Extract the column as a list of characteristic names
+    #name_data_characteristics = data_characteristic.columns.tolist()
+
+    # Create the dictionary
+    #decision_constraints = {name: list(range(7)) for name in name_data_characteristics}
+
+    #print('this gets all the features, I need to remove...')
+
+    analyst_d = pd.read_csv(kwargs.get('decison_constraints', 'decisions.csv'))
+    hyper = pd.read_csv('setup_hyper.csv')
+
+    new_data = {'data': data_characteristic,
+                'analyst':analyst_d,
+                'hyper': hyper}
+    return new_data
+
+def process_package_arguments():
+
+    new_data = {}
+    pass
+
+
 def main(args, **kwargs):
+
+    '''METACOUNT REGRESSOR TESTING ENVIRONMENT'''
+
+    '''
+    TESTING_ENV = False
+    if TESTING_ENV:
+
+        import statsmodels.api as sm
+
+        data = sm.datasets.sunspots.load_pandas().data
+        # print(data.exog)
+        data_exog = data['YEAR']
+        data_exog = sm.add_constant(data_exog)
+        data_endog = data['SUNACTIVITY']
+
+        # Instantiate a gamma family model with the default link function.
+        import numpy as np
+
+        gamma_model = sm.NegativeBinomial(data_endog, data_exog)
+        gamma_results = gamma_model.fit()
+
+        print(gamma_results.summary())
+
+        # NOW LET's COMPARE THIS TO METACOUNT REGRESSOR
+        import metacountregressor
+        from importlib.metadata import version
+        print(version('metacountregressor'))
+        import pandas as pd
+        import numpy as np
+        from metacountregressor.solution import ObjectiveFunction
+        from metacountregressor.metaheuristics import (harmony_search,
+                                                       differential_evolution,
+                                                       simulated_annealing)
+
+        # Model Decisions,
+        manual_fit_spec = {
+
+            'fixed_terms': ['const', 'YEAR'],
+            'rdm_terms': [],
+            'rdm_cor_terms': [],
+            'grouped_terms': [],
+            'hetro_in_means': [],
+            'transformations': ['no', 'no'],
+            'dispersion': 1 # Negative Binomial
+        }
+
+        # Arguments
+        arguments = {
+            'algorithm': 'hs',
+            'test_percentage': 0,
+            'test_complexity': 6,
+            'instance_number': 'name',
+            'Manual_Fit': manual_fit_spec
+        }
+        obj_fun = ObjectiveFunction(data_exog, data_endog, **arguments)
+    '''
+
+
     print('the args is:', args)
     print('the kwargs is', kwargs)
 
     # removing junk files if specicified
     helperprocess.remove_files(args.get('removeFiles', True))
 
-    # do we want
+    # do we want to run a test
    if args.get('com', False) == 'MetaCode':
         print('Testing the Python Package') # TODO add in python package import
         # Read data from CSV file
```
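For orientation, the new `process_arguments` helper reads the run configuration from three CSV files (`problem_data.csv`, `decisions.csv`, and `setup_hyper.csv`) and bundles them into a dict. A minimal sketch of calling it follows; the keyword names, including the misspelled `decison_constraints`, are as they appear in the hunk above, while the import path and the CSV contents are assumptions (the file layouts are not shown in this diff):

```python
# Sketch only: exercises process_arguments() as added in the hunk above.
# The three CSV files must exist; their column layouts are not shown in this diff.
from metacountregressor.main import process_arguments  # assumed import path

data_info = process_arguments(
    problem_data='problem_data.csv',       # dataset characteristics
    decison_constraints='decisions.csv',   # analyst constraints (spelling as in source)
)

print(data_info['data'].head())     # problem/dataset description
print(data_info['analyst'].head())  # analyst decision constraints
print(data_info['hyper'].head())    # hyperparameters, read from setup_hyper.csv
```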
```diff
@@ -46,13 +174,25 @@ def main(args, **kwargs):
         X = df
         y = df['FREQ'] # Frequency of crashes
         X['Offset'] = np.log(df['AADT']) # Explicitley define how to offset the data, no offset otherwise
+        df['Offset'] = np.log(df['AADT'])
         # Drop Y, selected offset term and ID as there are no panels
         X = df.drop(columns=['FREQ', 'ID', 'AADT'])
-
+        # Step 0: Process Data
+        model_terms = {
+            'Y': 'FREQ',  # Replace 'FREQ' with the name of your dependent variable
+            'group': None,  # Replace 'group_column' with the name of your grouping column (or None if not used)
+            'panels': None,  # Replace 'panel_column' with the name of your panel column (or None if not used)
+            'Offset': 'Offset'  # Replace None with the name of your offset column if using one
+        }
+        a_des, df = helperprocess.set_up_analyst_constraints(df, model_terms)
         # some example argument, these are defualt so the following line is just for claritity
         args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6, 'instance_number': 1,
-                'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6}
+                'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6, 'desicions':a_des}
         # Fit the model with metacountregressor
+        # Step 5: Transform the dataset based on the configuration
+        #data_new = helperprocess.transform_dataframe(dataset, config)
+        y = df[['Y']]
+        X = df.drop(columns=['Y'])
         obj_fun = ObjectiveFunction(X, y, **args)
         # replace with other metaheuristics if desired
         results = harmony_search(obj_fun)
```
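The fitting flow itself keeps its shape: build an `ObjectiveFunction` from a design matrix and a count outcome, then hand it to a metaheuristic. A runnable sketch of that flow, mirroring the calls in the hunk above; the file path and column names come from the examples used elsewhere in this diff and are illustrative:

```python
import numpy as np
import pandas as pd

from metacountregressor.metaheuristics import harmony_search
from metacountregressor.solution import ObjectiveFunction

df = pd.read_csv('data/Ex-16-3.csv')   # example file referenced in this diff
y = df[['FREQ']].copy()                # crash counts as the outcome
df['Offset'] = np.log(df['AADT'])      # explicit exposure offset
X = df.drop(columns=['FREQ', 'AADT'])  # drop the outcome and the raw exposure

args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6,
        'instance_number': 1, 'val_percentage': 0.15,
        'obj_1': 'bic', '_obj_2': 'RMSE_TEST', 'MAX_TIME': 6}

obj_fun = ObjectiveFunction(X, y, **args)
results = harmony_search(obj_fun)      # or differential_evolution / simulated_annealing
print(results)
```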
```diff
@@ -64,6 +204,7 @@ def main(args, **kwargs):
     print('the dataset is', dataset)
     manual_fit_spec = args.get('Manual_Fit', None)
     if dataset == 1:
+        print('Stage 5 A Short.')
         df = pd.read_csv('./data/1848.csv') # read in the data
         y_df = df[['FSI']] # only consider crashes
         y_df.rename(columns={"FSI": "Y"}, inplace=True)
@@ -71,6 +212,7 @@ def main(args, **kwargs):
         x_df = helperprocess.as_wide_factor(x_df)
 
     elif dataset == 3:
+        print('Stage 5 A Data Complete.')
         x_df = pd.read_csv('./data/Stage5A_1848_All_Initial_Columns.csv') # drop the ID columns
         drop_these = ['Id', 'ID', 'old', 'G_N']
         for i in drop_these:
@@ -92,8 +234,8 @@ def main(args, **kwargs):
             'rdm_cor_terms': [],
             'grouped_terms': [],
             'hetro_in_means': [],
-            'transformations': ['no', 'log', '
-            'dispersion':
+            'transformations': ['no', 'log', 'no', 'no', 'no', 'no', 'no'],
+            'dispersion': 0
         }
 
         keep = ['Constant', 'US', 'RSMS', 'MCV', 'RSHS', 'AADT', 'Curve50', 'Offset']
@@ -102,14 +244,38 @@ def main(args, **kwargs):
     elif dataset == 4:
         manual_fit_spec = {
             'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION'],
-            'rdm_terms': ['
+            'rdm_terms': ['EXPOSE:normal', 'INTPM:normal', 'CPM:normal', 'HISNOW:normal'],
+            'rdm_cor_terms': [],
+            'grouped_terms': [],
+            'hetro_in_means': [],
+            'transformations': ['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
+            'dispersion': 1
+        }
+        '''
+        manual_fit_spec = {
+            'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION', 'EXPOSE', 'INTPM', 'CPM', 'HISNOW'],
+            'rdm_terms': [],
             'rdm_cor_terms': [],
             'grouped_terms': [],
             'hetro_in_means': [],
             'transformations': ['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
             'dispersion': 1
         }
+        '''
+
 
+        '''
+        print('overriding this delete, just want to test the NB')
+        manual_fit_spec = {
+            'fixed_terms': ['const'],
+            'rdm_terms': [],
+            'rdm_cor_terms': [],
+            'grouped_terms': [],
+            'hetro_in_means': [],
+            'transformations': ['no'],
+            'dispersion': 1
+        }
+        '''
         df = pd.read_csv('./data/Ex-16-3.csv') # read in the data
         y_df = df[['FREQ']].copy() # only consider crashes
         y_df.rename(columns={"FREQ": "Y"}, inplace=True)
```
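Several of the branches above pin a model down via `manual_fit_spec`. For reference, a sketch of that dictionary's shape as it is used in this file: random terms are written as `'name:distribution'` strings, `transformations` carries one entry per term, and `dispersion` is `1` where the source comments say Negative Binomial (other codes are not documented in this diff). The values here are illustrative:

```python
# Shape of manual_fit_spec as used throughout main.py; values are illustrative.
manual_fit_spec = {
    'fixed_terms': ['const', 'LOWPRE'],  # terms with fixed (non-random) coefficients
    'rdm_terms': ['EXPOSE:normal'],      # random parameters as 'name:distribution'
    'rdm_cor_terms': [],                 # correlated random parameters
    'grouped_terms': [],                 # group-level random parameters, e.g. 'DP01:normal'
    'hetro_in_means': [],                # heterogeneity-in-means terms
    'transformations': ['no', 'no'],     # one entry per term, e.g. 'no' or 'log'
    'dispersion': 1                      # 1 is commented as Negative Binomial in this file
}

args = {'algorithm': 'hs', 'Manual_Fit': manual_fit_spec}  # forwarded to ObjectiveFunction
```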
```diff
@@ -118,7 +284,7 @@ def main(args, **kwargs):
         x_df['Offset'] = np.log(1 + x_df['AADT'] * x_df['LENGTH'] * 365 / 100000000)
         x_df = x_df.drop(columns=['AADT', 'LENGTH'])
 
-        if args
+        if args.get('seperate_out_factors', 0):
 
             x_df = helperprocess.as_wide_factor(x_df, keep_original=0,
                                                 exclude=['INTECHAG', 'CURVES', 'MIMEDSH', 'MXMEDSH', 'SPEED'])
@@ -159,7 +325,39 @@ def main(args, **kwargs):
             'transformations': ['no', 'no', 'no', 'no'],
             'dispersion': 0
         }
-
+    elif dataset == 8:
+        print('Main County')
+        df = pd.read_csv('./data/rural_int.csv') # read in the data
+        y_df = df[['crashes']].copy() # only consider crashes
+        y_df.rename(columns={"crashes": "Y"}, inplace=True)
+        panels = df['orig_ID']
+        try:
+            x_df = df.drop(columns=['crashes', 'year', 'orig_ID',
+                                    'jurisdiction', 'town', 'maint_region', 'weather_station', 'dummy_winter_2']) # was dropped postcode
+            print('dropping for test')
+            x_df = x_df.drop(columns=['month', 'inj.fat', 'PDO'])
+            x_df = x_df.drop(columns = [ 'zonal_ID', 'ln_AADT', 'ln_seg'])
+            x_df['rumble_install_year'] = x_df['rumble_install_year'].astype('category').cat.codes
+            x_df.rename(columns={"rumble_install_year": "has_rumble"}, inplace=True)
+        except Exception as e:
+            print(e)
+            x_df = df.drop(columns=['Y']) # was dropped postcode
+
+        group_grab = x_df['county']
+        x_df = x_df.drop(columns =['county'])
+        x_df = helperprocess.interactions(x_df, drop_this_perc=0.8)
+        x_df['county'] = group_grab
+
+        print('benchmark specification')
+        manual_fit_spec = {
+            'fixed_terms': ['const', 'monthly_AADT', 'segment_length', 'speed', 'paved_shoulder', 'curve'],
+            'rdm_terms': [],
+            'rdm_cor_terms': [],
+            'grouped_terms': ['DP01:normal', 'DX32:normal'],
+            'hetro_in_means': [],
+            'transformations': ['no', 'no', 'no', 'no', 'no', 'no'],
+            'dispersion': 0
+        }
 
     elif dataset == 9:
         df = pd.read_csv('panel_synth.csv') # read in the data
```
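Dataset 8 is the first branch to combine panel data with group-level random parameters (`grouped_terms` such as `'DP01:normal'`). The column wiring that supports it is set later in this file; a condensed sketch, with `x_df`/`y_df` as prepared in the branch above and only a placeholder for the remaining hyperparameters:

```python
# Group/panel wiring for the Maine county data, mirroring assignments made
# later in this file; 'county' and 'element_ID' are columns of the design data.
from metacountregressor.solution import ObjectiveFunction

args = {'algorithm': 'hs'}      # plus the usual hyperparameters
args['group'] = 'county'        # grouping column used by grouped_terms
args['panels'] = 'element_ID'   # panel (repeated-observation) column
args['ID'] = 'element_ID'       # observation identifier

obj_fun = ObjectiveFunction(x_df, y_df, **args)
```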
```diff
@@ -185,65 +383,32 @@ def main(args, **kwargs):
         keep = ['group', 'constant', 'element_ID']
 
         x_df = helperprocess.interactions(x_df, keep)
-    else:  # the dataset has been selected in the program as something else
-        from tkinter import Tk
-        from tkinter.filedialog import askopenfilename
-
-        ASK_ANALALYST = 0
-        if ASK_ANALALYST:
-            root = Tk()
-            root.withdraw()
-            # Prompt the user to select a directory
-            directory = askopenfilename(title="Select File For Analysis")
-            skip_lines = int(input("Select the number of lines to skip, (numeric): "))
-            df = pd.read_csv(directory, skip_rows=skip_lines)
-        else:
-            df = pd.read_csv('data/rqc40516_MotorcycleQUT_engineer_crash.csv', skiprows=5)
-            df['CRASH_SPEED_LIMIT'] = df['CRASH_SPEED_LIMIT'].str.replace(' km/h', '').astype(int)
-
-            # Clean data types
-            df = clean_data_types(df)
-
-            # Encode categorical variables
-            categories = ['CRASH_SEVERITY', 'CRASH_TYPE', 'CRASH_NATURE', 'CRASH_ATMOSPHERIC_CONDITION']
-            df = pd.get_dummies(df, columns=categories)
-
-            # Select only numeric columns
-            numeric_types = ['int32', 'uint8', 'bool', 'int64', 'float64']
-            df = df.select_dtypes(include=numeric_types)
 
-            # Check for missing values and fill with column mean
-            missing_values_count = df['CASUALTY_TOTAL'].isnull().sum()
-            df.fillna(df.mean())
 
-
-
-
-
-
-
-
-        ]
+    elif dataset ==10: # the dataset has been selected in the program as something else
+        data_info = process_arguments(**args)
+        data_info['hyper']
+        data_info['analyst']
+        data_info['data']['Y']
+        #data_info['data']['Group'][0]
+        #data_info['data']['Panel'][0]
+        args['decisions'] = data_info['analyst']
+        print('check the args of the decions')
+        if type(data_info['data']['Grouped'][0]) == str and len(data_info['data']['Grouped'][0]) >1:
+            args['group'] = data_info['data']['Grouped'][0]
+            args['ID'] = data_info['data']['Grouped'][0]
+        if type(data_info['data']['Panel'][0]) == str and len(data_info['data']['Panel'][0])>1:
+            args['panels'] = data_info['data']['Panel'][0]
+
+        df = pd.read_csv(str(data_info['data']['Problem'][0]))
+        x_df = df.drop(columns=[data_info['data']['Y'][0]])
+        y_df = df[[data_info['data']['Y'][0]]]
+        y_df.rename(columns={data_info['data']['Y'][0]: "Y"}, inplace=True)
+        print('test') #FIXME
+    else:
+        print('PROCESS THE PACKAGE ARGUMENTS SIMULIAR TO HOW ONE WOULD DEFINE THE ENVIRONMENT')
+        data_info =process_package_arguments()
 
-    # Filter out excluded columns
-    df = df[[col for col in df.columns if not any(ex in col for ex in EXCLUDE)]]
-
-    # Prepare target variable
-
-    # Check for finite values and compute correlations
-    finite_check = df.apply(np.isfinite).all()
-    df_clean = df.loc[:, finite_check]
-    corr = df_clean.corr()
-
-    # Identify and remove highly correlated features
-    hc = findCorrelation(corr, cutoff=0.5)
-    trimmed_df = df_clean.drop(columns=hc)
-
-    # Feature selection
-    df_cleaner, fs = select_features(trimmed_df, y)
-    x_df = df_cleaner
-    y_df = y.to_frame(name="Y")
-    # y_df.rename(columns={"CASUALTY_TOTAL": "Y"}, inplace=True)
 
     if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
         if manual_fit_spec is None:
```
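The new `dataset == 10` branch expects the `data` frame returned by `process_arguments` to expose at least `Problem` (a path to the dataset CSV), `Y`, `Grouped`, and `Panel` columns, judging from the lookups above. A sketch of a matching one-row `problem_data.csv`; the values are illustrative, and the `analyst` and `hyper` CSVs are read separately:

```python
import pandas as pd

# One-row problem description matching the lookups in the dataset == 10 branch.
problem = pd.DataFrame([{
    'Problem': 'data/Ex-16-3.csv',  # dataset to load
    'Y': 'FREQ',                    # dependent-variable column in that dataset
    'Grouped': '',                  # grouping column; left blank when unused
    'Panel': '',                    # panel column; left blank when unused
}])
problem.to_csv('problem_data.csv', index=False)
```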
```diff
@@ -251,8 +416,8 @@ def main(args, **kwargs):
         else:
             print('fitting manually')
             args['Manual_Fit'] = manual_fit_spec
-
     if args['problem_number'] == str(8) or args['problem_number'] == 8:
+        print('Maine County Dataset.')
         args['group'] = 'county'
         args['panels'] = 'element_ID'
         args['ID'] = 'element_ID'
@@ -262,11 +427,13 @@ def main(args, **kwargs):
         args['panels'] = 'ind_id'
         args['ID'] = 'ind_id'
 
+
+
     args['complexity_level'] = args.get('complexity_level', 6)
 
-
-    AnalystSpecs
-    args['AnalystSpecs'] = AnalystSpecs
+
+    # Initialize AnalystSpecs to None if not manually provided
+    args['AnalystSpecs'] = args.get('AnalystSpecs', None)
 
     if args['algorithm'] == 'sa':
         args_hyperparameters = {'alpha': float(args['temp_scale']),
@@ -312,7 +479,7 @@ def main(args, **kwargs):
 
 
     elif args['algorithm'] == 'de':
-        # force
+        # force variables
         args['must_include'] = args.get('force', [])
 
         args_hyperparameters = {'_AI': args.get('_AI', 2),
@@ -321,7 +488,6 @@ def main(args, **kwargs):
                                 , '_pop_size': int(args['_hms']), 'instance_number': int(args['line'])
                                 , 'Manual_Fit': args['Manual_Fit'],
                                 'MP': int(args['MP'])
-
                                 }
 
     args_hyperparameters = dict(args_hyperparameters)
@@ -347,50 +513,64 @@ if __name__ == '__main__':
     alg_parser.print_help()
     parser = argparse.ArgumentParser(prog='main',
                                      epilog=main.__doc__,
-                                     formatter_class=argparse.RawDescriptionHelpFormatter)
-
-
-
-
-    if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                                     formatter_class=argparse.RawDescriptionHelpFormatter, conflict_handler='resolve')
+
+
+    BATCH_JOB = False
+
+    if BATCH_JOB:
+        parser.add_argument('-dataset_file', default='data/Ex-16-3.csv', help='supply the path to the dataset')
+
+        parser.add_argument('-line', type=int, default=1,
+                            help='line to read in csv to pass in argument')
+
+        if vars(parser.parse_args())['line'] is not None:
+            reader = csv.DictReader(open('set_data.csv', 'r'))
+            args = list()
+            line_number_obs = 0
+            for dictionary in reader:  # TODO find a way to handle multiple args
+                args = dictionary
+                if line_number_obs == int(vars(parser.parse_args())['line']):
+                    break
+                line_number_obs += 1
+            args = dict(args)
+
+
+            for key, value in args.items():
+                try:
+                    # Attempt to parse the string value to a Python literal if value is a string.
+                    if isinstance(value, str):
+                        value = ast.literal_eval(value)
+                except (ValueError, SyntaxError):
+                    # If there's a parsing error, value remains as the original string.
+                    pass
+
+                # Add the argument to the parser with the potentially updated value.
+                parser.add_argument(f'-{key}', default=value)
+
+            for i, action in enumerate(parser._optionals._actions):
+                if "-algorithm" in action.option_strings:
+                    parser._optionals._actions[i].help = "optimization algorithm"
+
+            override = True
+            if override:
+                print('WARNING: TESTING ENVIRONMENT, TURN OFF FOR RELEASE')
+                parser.add_argument('-problem_number', default='10')
+
+            if 'algorithm' not in args:
+                parser.add_argument('-algorithm', type=str, default='hs',
+                                    help='optimization algorithm')
+            elif 'Manual_Fit' not in args:
+                parser.add_argument('-Manual_Fit', action='store_false', default=None,
+                                    help='To fit a model manually if desired.')
+
+            parser.add_argument('-seperate_out_factors', action='store_false', default=False,
+                                help='Trie of wanting to split data that is potentially categorical as binary'
+                                     ' we want to split the data for processing')
+            parser.add_argument('-supply_csv', type = str, help = 'enter the name of the csv, please include it as a full directories')
+
+        else:  # DIDN"T SPECIFY LINES TRY EACH ONE MANNUALY
+            print("RUNNING WITH ARGS")
     parser.add_argument('-com', type=str, default='MetaCode',
                         help='line to read csv')
 
```
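The batch path above selects one row of `set_data.csv` by the `-line` index and turns every cell of that row into an argparse default, running string values through `ast.literal_eval` so numbers and lists survive the CSV round trip. A condensed, standalone sketch of the same pattern, with file and option names as in the hunk:

```python
import argparse
import ast
import csv

parser = argparse.ArgumentParser(prog='main', conflict_handler='resolve')
parser.add_argument('-line', type=int, default=1, help='row of set_data.csv to use')
line = parser.parse_known_args()[0].line

# Pick the requested row; each of its cells becomes one argument default.
row = {}
with open('set_data.csv', 'r') as f:
    for i, candidate in enumerate(csv.DictReader(f)):
        row = candidate
        if i == line:
            break

for key, value in row.items():
    try:
        if isinstance(value, str):
            value = ast.literal_eval(value)  # '0.15' -> 0.15, '[1, 2]' -> [1, 2]
    except (ValueError, SyntaxError):
        pass                                 # keep unparseable cells as plain strings
    parser.add_argument(f'-{key}', default=value)

args = vars(parser.parse_args())
print(args)
```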
```diff
@@ -398,7 +578,7 @@ if __name__ == '__main__':
     parser.print_help()
     args = vars(parser.parse_args())
     print(type(args))
-    # TODO add in chi 2 and df in estimation and compare degrees of freedom
+    # TODO add in chi 2 and df in estimation and compare degrees of freedom this needs to be done in solution
 
     # Print the args.
     profiler = cProfile.Profile()
```