metacountregressor 0.1.78__py3-none-any.whl → 0.1.83__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metacountregressor/app_main.py +258 -0
- metacountregressor/helperprocess.py +257 -5
- metacountregressor/main.py +269 -61
- metacountregressor/metaheuristics.py +22 -11
- metacountregressor/setup.py +3 -2
- metacountregressor/solution.py +555 -214
- {metacountregressor-0.1.78.dist-info → metacountregressor-0.1.83.dist-info}/METADATA +256 -35
- {metacountregressor-0.1.78.dist-info → metacountregressor-0.1.83.dist-info}/RECORD +11 -10
- {metacountregressor-0.1.78.dist-info → metacountregressor-0.1.83.dist-info}/WHEEL +1 -1
- {metacountregressor-0.1.78.dist-info → metacountregressor-0.1.83.dist-info}/LICENSE.txt +0 -0
- {metacountregressor-0.1.78.dist-info → metacountregressor-0.1.83.dist-info}/top_level.txt +0 -0
metacountregressor/main.py
CHANGED
@@ -28,7 +28,137 @@ def convert_df_columns_to_binary_and_wide(df):
     return df


+
+
+
+
+def process_arguments(**kwargs):
+    '''
+    TRYING TO TURN THE CSV FILES INTO RELEVANT ARGS
+    '''
+    # dataset
+    '''
+    if kwargs.get('dataset_file', False
+    ):
+        dataset = pd.read_csv(kwargs.get('dataset_file'))
+        named_data_headers = dataset.columns.tolist()
+        decision_constants = {name: list(range(7)) for name in named_data_headers}
+        data_info = {
+
+
+            'AADT': {
+                'type': 'continuous',
+                'bounds': [0.0, np.infty],
+                'discrete': False,
+                'apply_func': (lambda x: np.log(x + 1)),
+            },
+            'SPEED': {
+                'type': 'continuous',
+                'bounds': [0, 100],
+                'enforce_bounds': True,
+                'discrete': True
+            },
+            'TIME': {
+                'type': 'continuous',
+                'bounds': [0, 23.999],
+                'discrete': False
+            }
+        }
+        # remove ID columns from dataset
+        dataset = dataset.drop(columns=[
+            'ID'
+        ])
+        for c in dataset.columns:
+            if c not in data_info.keys():
+                data_info[c] = {'type': 'categorical'}
+
+        data_new = helperprocess.transform_dataframe(dataset, data_info)
+
+    update_constant = kwargs.get('analyst_constraints')
+    # update the decision_constraints
+    '''
+    data_characteristic = pd.read_csv(kwargs.get('problem_data', 'problem_data.csv'))
+    # Extract the column as a list of characteristic names
+    #name_data_characteristics = data_characteristic.columns.tolist()
+
+    # Create the dictionary
+    #decision_constraints = {name: list(range(7)) for name in name_data_characteristics}
+
+    #print('this gets all the features, I need to remove...')
+
+    analyst_d = pd.read_csv(kwargs.get('decison_constraints', 'decisions.csv'))
+    hyper = pd.read_csv('setup_hyper.csv')
+
+    new_data = {'data': data_characteristic,
+                'analyst': analyst_d,
+                'hyper': hyper}
+    return new_data
+
+def process_package_arguments():
+
+    new_data = {}
+    pass
+
+
 def main(args, **kwargs):
+
+    '''METACOUNT REGRESSOR TESTING ENVIRONMENT'''
+
+    '''
+    TESTING_ENV = False
+    if TESTING_ENV:
+
+        import statsmodels.api as sm
+
+        data = sm.datasets.sunspots.load_pandas().data
+        # print(data.exog)
+        data_exog = data['YEAR']
+        data_exog = sm.add_constant(data_exog)
+        data_endog = data['SUNACTIVITY']
+
+        # Instantiate a gamma family model with the default link function.
+        import numpy as np
+
+        gamma_model = sm.NegativeBinomial(data_endog, data_exog)
+        gamma_results = gamma_model.fit()
+
+        print(gamma_results.summary())
+
+        # NOW LET'S COMPARE THIS TO METACOUNT REGRESSOR
+        import metacountregressor
+        from importlib.metadata import version
+        print(version('metacountregressor'))
+        import pandas as pd
+        import numpy as np
+        from metacountregressor.solution import ObjectiveFunction
+        from metacountregressor.metaheuristics import (harmony_search,
+                                                       differential_evolution,
+                                                       simulated_annealing)
+
+        # Model Decisions
+        manual_fit_spec = {
+
+            'fixed_terms': ['const', 'YEAR'],
+            'rdm_terms': [],
+            'rdm_cor_terms': [],
+            'grouped_terms': [],
+            'hetro_in_means': [],
+            'transformations': ['no', 'no'],
+            'dispersion': 1  # Negative Binomial
+        }
+
+        # Arguments
+        arguments = {
+            'algorithm': 'hs',
+            'test_percentage': 0,
+            'test_complexity': 6,
+            'instance_number': 'name',
+            'Manual_Fit': manual_fit_spec
+        }
+        obj_fun = ObjectiveFunction(data_exog, data_endog, **arguments)
+    '''
+
+
     print('the args is:', args)
     print('the kwargs is', kwargs)

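
The new process_arguments builds a per-column data_info configuration ('type', 'bounds', an optional 'apply_func') and hands it to helperprocess.transform_dataframe, with any column not listed defaulting to categorical. Since transform_dataframe itself is not shown in this diff, the following is only a minimal sketch of how such a configuration could be consumed; apply_data_info is a hypothetical stand-in, not the package's function.

import numpy as np
import pandas as pd

def apply_data_info(df, data_info):
    """Hypothetical stand-in for helperprocess.transform_dataframe: applies a
    per-column config of the shape built in process_arguments."""
    out = df.copy()
    for col, info in data_info.items():
        if col not in out.columns:
            continue
        if info.get('type') == 'categorical':
            out = pd.get_dummies(out, columns=[col], prefix=col)  # one-hot encode
            continue
        func = info.get('apply_func')
        if func is not None:
            out[col] = out[col].apply(func)       # e.g. np.log(x + 1) for AADT
        if info.get('enforce_bounds') and 'bounds' in info:
            lo, hi = info['bounds']
            out[col] = out[col].clip(lo, hi)      # e.g. SPEED clamped to [0, 100]
    return out

df = pd.DataFrame({'AADT': [120, 4500], 'SPEED': [60, 120]})
config = {'AADT': {'type': 'continuous', 'apply_func': lambda x: np.log(x + 1)},
          'SPEED': {'type': 'continuous', 'bounds': [0, 100], 'enforce_bounds': True}}
print(apply_data_info(df, config))
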
@@ -44,13 +174,25 @@ def main(args, **kwargs):
     X = df
     y = df['FREQ']  # Frequency of crashes
     X['Offset'] = np.log(df['AADT'])  # Explicitly define how to offset the data, no offset otherwise
+    df['Offset'] = np.log(df['AADT'])
     # Drop Y, selected offset term and ID as there are no panels
     X = df.drop(columns=['FREQ', 'ID', 'AADT'])
-
+    # Step 0: Process Data
+    model_terms = {
+        'Y': 'FREQ',  # Replace 'FREQ' with the name of your dependent variable
+        'group': None,  # Replace 'group_column' with the name of your grouping column (or None if not used)
+        'panels': None,  # Replace 'panel_column' with the name of your panel column (or None if not used)
+        'Offset': 'Offset'  # Replace None with the name of your offset column if using one
+    }
+    a_des, df = helperprocess.set_up_analyst_constraints(df, model_terms)
     # some example arguments; these are the defaults, so the following line is just for clarity
     args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6, 'instance_number': 1,
-            'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6}
+            'val_percentage': 0.15, 'obj_1': 'bic', '_obj_2': 'RMSE_TEST', "MAX_TIME": 6, 'desicions': a_des}
     # Fit the model with metacountregressor
+    # Step 5: Transform the dataset based on the configuration
+    #data_new = helperprocess.transform_dataframe(dataset, config)
+    y = df[['Y']]
+    X = df.drop(columns=['Y'])
     obj_fun = ObjectiveFunction(X, y, **args)
     # replace with other metaheuristics if desired
     results = harmony_search(obj_fun)
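
Pieced together from the lines above, the end-to-end calling pattern this release moves to reads as follows. Everything here is lifted from the diff's own example (import paths assumed from the package layout); note that the solver argument is spelled 'desicions' in the source while the dataset-10 branch later writes args['decisions'], and the sketch keeps the source spelling.

import numpy as np
import pandas as pd
from metacountregressor import helperprocess
from metacountregressor.solution import ObjectiveFunction
from metacountregressor.metaheuristics import harmony_search

df = pd.read_csv('data/Ex-16-3.csv')              # a count dataset with FREQ and AADT
df['Offset'] = np.log(df['AADT'])
model_terms = {'Y': 'FREQ', 'group': None, 'panels': None, 'Offset': 'Offset'}
a_des, df = helperprocess.set_up_analyst_constraints(df, model_terms)

args = {'algorithm': 'hs', 'test_percentage': 0.15, 'test_complexity': 6,
        'instance_number': 1, 'val_percentage': 0.15, 'obj_1': 'bic',
        '_obj_2': 'RMSE_TEST', 'MAX_TIME': 6, 'desicions': a_des}  # key spelled as in the source
y = df[['Y']]                                     # dependent variable
X = df.drop(columns=['Y'])                        # design matrix
obj_fun = ObjectiveFunction(X, y, **args)
results = harmony_search(obj_fun)                 # or differential_evolution / simulated_annealing
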
@@ -92,8 +234,8 @@ def main(args, **kwargs):
         'rdm_cor_terms': [],
         'grouped_terms': [],
         'hetro_in_means': [],
-        'transformations': ['no', 'log', '
-        'dispersion':
+        'transformations': ['no', 'log', 'no', 'no', 'no', 'no', 'no'],
+        'dispersion': 0
     }

     keep = ['Constant', 'US', 'RSMS', 'MCV', 'RSHS', 'AADT', 'Curve50', 'Offset']
@@ -102,14 +244,38 @@ def main(args, **kwargs):
     elif dataset == 4:
         manual_fit_spec = {
             'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION'],
-            'rdm_terms': ['
+            'rdm_terms': ['EXPOSE:normal', 'INTPM:normal', 'CPM:normal', 'HISNOW:normal'],
+            'rdm_cor_terms': [],
+            'grouped_terms': [],
+            'hetro_in_means': [],
+            'transformations': ['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
+            'dispersion': 1
+        }
+        '''
+        manual_fit_spec = {
+            'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION', 'EXPOSE', 'INTPM', 'CPM', 'HISNOW'],
+            'rdm_terms': [],
             'rdm_cor_terms': [],
             'grouped_terms': [],
             'hetro_in_means': [],
             'transformations': ['no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
             'dispersion': 1
         }
+        '''
+

+        '''
+        print('overriding this delete, just want to test the NB')
+        manual_fit_spec = {
+            'fixed_terms': ['const'],
+            'rdm_terms': [],
+            'rdm_cor_terms': [],
+            'grouped_terms': [],
+            'hetro_in_means': [],
+            'transformations': ['no'],
+            'dispersion': 1
+        }
+        '''
         df = pd.read_csv('./data/Ex-16-3.csv')  # read in the data
         y_df = df[['FREQ']].copy()  # only consider crashes
         y_df.rename(columns={"FREQ": "Y"}, inplace=True)
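
Every manual_fit_spec in this file follows one schema: term lists, a transformations list whose length matches the number of fitted terms, and a dispersion code (1 is labeled Negative Binomial in the diff's own comments). A small sanity check, under the assumption, consistent with each example shown here, that the transformations cover the fixed, random, and correlated-random terms:

def check_manual_fit_spec(spec):
    """Sanity-check a manual_fit_spec: one transformation per fitted term.
    The counting rule is assumed from the examples in this diff, not documented."""
    n_terms = (len(spec['fixed_terms']) + len(spec['rdm_terms'])
               + len(spec['rdm_cor_terms']))
    if len(spec['transformations']) != n_terms:
        raise ValueError(
            f"expected {n_terms} transformations, got {len(spec['transformations'])}")

spec = {
    'fixed_terms': ['const', 'LOWPRE', 'GBRPM', 'FRICTION'],
    'rdm_terms': ['EXPOSE:normal', 'INTPM:normal', 'CPM:normal', 'HISNOW:normal'],
    'rdm_cor_terms': [],
    'grouped_terms': [],
    'hetro_in_means': [],
    'transformations': ['no'] * 8,
    'dispersion': 1,  # labeled Negative Binomial in the diff's comments
}
check_manual_fit_spec(spec)  # passes: 4 fixed + 4 random terms = 8 transformations
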
@@ -118,7 +284,7 @@ def main(args, **kwargs):
     x_df['Offset'] = np.log(1 + x_df['AADT'] * x_df['LENGTH'] * 365 / 100000000)
     x_df = x_df.drop(columns=['AADT', 'LENGTH'])

-    if args
+    if args.get('seperate_out_factors', 0):

         x_df = helperprocess.as_wide_factor(x_df, keep_original=0,
                                             exclude=['INTECHAG', 'CURVES', 'MIMEDSH', 'MXMEDSH', 'SPEED'])
@@ -173,8 +339,8 @@ def main(args, **kwargs):
             x_df = x_df.drop(columns=['zonal_ID', 'ln_AADT', 'ln_seg'])
             x_df['rumble_install_year'] = x_df['rumble_install_year'].astype('category').cat.codes
             x_df.rename(columns={"rumble_install_year": "has_rumble"}, inplace=True)
-
-
+        except Exception as e:
+            print(e)
         x_df = df.drop(columns=['Y'])  # was dropped postcode

         group_grab = x_df['county']
@@ -182,6 +348,17 @@ def main(args, **kwargs):
         x_df = helperprocess.interactions(x_df, drop_this_perc=0.8)
         x_df['county'] = group_grab

+        print('benchmark specification')
+        manual_fit_spec = {
+            'fixed_terms': ['const', 'monthly_AADT', 'segment_length', 'speed', 'paved_shoulder', 'curve'],
+            'rdm_terms': [],
+            'rdm_cor_terms': [],
+            'grouped_terms': ['DP01:normal', 'DX32:normal'],
+            'hetro_in_means': [],
+            'transformations': ['no', 'no', 'no', 'no', 'no', 'no'],
+            'dispersion': 0
+        }
+
     elif dataset == 9:
         df = pd.read_csv('panel_synth.csv')  # read in the data
         y_df = df[['Y']].copy()  # only consider crashes
@@ -206,8 +383,32 @@ def main(args, **kwargs):
         keep = ['group', 'constant', 'element_ID']

         x_df = helperprocess.interactions(x_df, keep)
-
-
+
+
+    elif dataset == 10:  # the dataset has been selected in the program as something else
+        data_info = process_arguments(**args)
+        data_info['hyper']
+        data_info['analyst']
+        data_info['data']['Y']
+        #data_info['data']['Group'][0]
+        #data_info['data']['Panel'][0]
+        args['decisions'] = data_info['analyst']
+        print('check the args of the decisions')
+        if type(data_info['data']['Grouped'][0]) == str and len(data_info['data']['Grouped'][0]) > 1:
+            args['group'] = data_info['data']['Grouped'][0]
+            args['ID'] = data_info['data']['Grouped'][0]
+        if type(data_info['data']['Panel'][0]) == str and len(data_info['data']['Panel'][0]) > 1:
+            args['panels'] = data_info['data']['Panel'][0]
+
+        df = pd.read_csv(str(data_info['data']['Problem'][0]))
+        x_df = df.drop(columns=[data_info['data']['Y'][0]])
+        y_df = df[[data_info['data']['Y'][0]]]
+        y_df.rename(columns={data_info['data']['Y'][0]: "Y"}, inplace=True)
+        print('test')  # FIXME
+    else:
+        print('PROCESS THE PACKAGE ARGUMENTS SIMILAR TO HOW ONE WOULD DEFINE THE ENVIRONMENT')
+        data_info = process_package_arguments()
+

     if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
         if manual_fit_spec is None:
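
The new dataset == 10 branch is driven entirely by the CSVs returned from process_arguments, and it guards the optional Grouped and Panel columns with a string-length check before wiring them into args. A runnable sketch of that guard over a hypothetical one-row frame, written with isinstance rather than the source's type(...) == str comparison:

import pandas as pd

# Hypothetical single-row "problem data" frame of the shape the branch expects.
data = pd.DataFrame([{'Problem': 'data/Ex-16-3.csv', 'Y': 'FREQ',
                      'Grouped': 'county', 'Panel': ''}])

args = {}
# A non-empty string marks the column as in use; '' (or NaN) leaves it unset.
if isinstance(data['Grouped'][0], str) and len(data['Grouped'][0]) > 1:
    args['group'] = args['ID'] = data['Grouped'][0]
if isinstance(data['Panel'][0], str) and len(data['Panel'][0]) > 1:
    args['panels'] = data['Panel'][0]
print(args)   # {'group': 'county', 'ID': 'county'}
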
@@ -215,7 +416,6 @@ def main(args, **kwargs):
     else:
         print('fitting manually')
         args['Manual_Fit'] = manual_fit_spec
-
     if args['problem_number'] == str(8) or args['problem_number'] == 8:
         print('Maine County Dataset.')
         args['group'] = 'county'
@@ -227,6 +427,8 @@ def main(args, **kwargs):
         args['panels'] = 'ind_id'
         args['ID'] = 'ind_id'

+
+
     args['complexity_level'] = args.get('complexity_level', 6)


@@ -312,57 +514,63 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(prog='main',
                                      epilog=main.__doc__,
                                      formatter_class=argparse.RawDescriptionHelpFormatter, conflict_handler='resolve')
-
-
-
-
-    if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    parser.
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+    BATCH_JOB = False
+
+    if BATCH_JOB:
+        parser.add_argument('-dataset_file', default='data/Ex-16-3.csv', help='supply the path to the dataset')
+
+        parser.add_argument('-line', type=int, default=1,
+                            help='line to read in csv to pass in argument')
+
+        if vars(parser.parse_args())['line'] is not None:
+            reader = csv.DictReader(open('set_data.csv', 'r'))
+            args = list()
+            line_number_obs = 0
+            for dictionary in reader:  # TODO find a way to handle multiple args
+                args = dictionary
+                if line_number_obs == int(vars(parser.parse_args())['line']):
+                    break
+                line_number_obs += 1
+            args = dict(args)
+
+
+            for key, value in args.items():
+                try:
+                    # Attempt to parse the string value to a Python literal if value is a string.
+                    if isinstance(value, str):
+                        value = ast.literal_eval(value)
+                except (ValueError, SyntaxError):
+                    # If there's a parsing error, value remains the original string.
+                    pass
+
+                # Add the argument to the parser with the potentially updated value.
+                parser.add_argument(f'-{key}', default=value)
+
+            for i, action in enumerate(parser._optionals._actions):
+                if "-algorithm" in action.option_strings:
+                    parser._optionals._actions[i].help = "optimization algorithm"
+
+            override = True
+            if override:
+                print('WARNING: TESTING ENVIRONMENT, TURN OFF FOR RELEASE')
+                parser.add_argument('-problem_number', default='10')
+
+            if 'algorithm' not in args:
+                parser.add_argument('-algorithm', type=str, default='hs',
+                                    help='optimization algorithm')
+            elif 'Manual_Fit' not in args:
+                parser.add_argument('-Manual_Fit', action='store_false', default=None,
+                                    help='To fit a model manually if desired.')
+
+            parser.add_argument('-seperate_out_factors', action='store_false', default=False,
+                                help='True if wanting to split potentially categorical data'
+                                     ' into binary columns for processing')
+            parser.add_argument('-supply_csv', type=str, help='enter the name of the csv; please include the full directory path')

     else:  # DIDN'T SPECIFY LINES, TRY EACH ONE MANUALLY
+        print("RUNNING WITH ARGS")
         parser.add_argument('-com', type=str, default='MetaCode',
                             help='line to read csv')

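
The BATCH_JOB path reads one row of set_data.csv and promotes each cell to an argparse default, with ast.literal_eval turning numeric or list-valued cells into typed Python values while plain strings pass through unchanged. A self-contained sketch of that row-to-arguments step (the CSV content here is invented for illustration):

import ast
import csv
import io

# Hypothetical stand-in for set_data.csv: a header row plus one row per job line.
csv_text = "algorithm,test_percentage,MAX_TIME\nhs,0.15,6\n"

def read_job_line(text, line=0):
    """Return the chosen row with each cell parsed to a Python literal when possible."""
    rows = list(csv.DictReader(io.StringIO(text)))
    raw = rows[line]
    parsed = {}
    for key, value in raw.items():
        try:
            parsed[key] = ast.literal_eval(value)  # '0.15' -> 0.15, '6' -> 6
        except (ValueError, SyntaxError):
            parsed[key] = value                    # 'hs' stays a string
    return parsed

print(read_job_line(csv_text))  # {'algorithm': 'hs', 'test_percentage': 0.15, 'MAX_TIME': 6}
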
@@ -370,7 +578,7 @@ if __name__ == '__main__':
     parser.print_help()
     args = vars(parser.parse_args())
     print(type(args))
-    # TODO add in chi 2 and df in estimation and compare degrees of freedom
+    # TODO add in chi 2 and df in estimation and compare degrees of freedom; this needs to be done in solution

     # Print the args.
     profiler = cProfile.Profile()
metacountregressor/metaheuristics.py
CHANGED

@@ -20,8 +20,8 @@ try:
     from .solution import ObjectiveFunction
 except:
     print('Exception relative import')
-    from
-    from
+    from pareto_file import Pareto, Solution
+    from solution import ObjectiveFunction


 HarmonySearchResults = namedtuple('HarmonySearchResults',
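
The completed import block is the usual package-or-script fallback: try the relative import first and fall back to top-level modules when the file is run directly (the source catches a bare except rather than ImportError). A runnable stand-in for the mechanism, using a stdlib module in place of the package's own:

try:
    from .json import loads   # fails outside a package: no known parent package
except ImportError:
    from json import loads    # falls back to the top-level module

print(loads('{"ok": true}'))   # {'ok': True}
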
@@ -72,7 +72,7 @@ def dict_mean(dict_list,
         mean_dict[key] = sum(d[key] for d in dict_list) / len(dict_list)
         return mean_dict
     else:
-
+        mean_dict = {}
         for key in dict_list[0].keys():
             if key in ignore:
                 continue
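
The added mean_dict = {} matters because the else branch assigns into mean_dict key by key, so without the initialization the first assignment raises NameError. A sketch of what the branch computes, with the signature assumed from the visible fragment:

def dict_mean(dict_list, ignore=()):
    """Key-wise mean over a list of dicts, skipping keys in `ignore`.
    A sketch consistent with the hunk; the package's full function is not shown."""
    mean_dict = {}                       # the initialization added in this release
    for key in dict_list[0].keys():
        if key in ignore:
            continue
        mean_dict[key] = sum(d[key] for d in dict_list) / len(dict_list)
    return mean_dict

print(dict_mean([{'bic': 100, 'id': 1}, {'bic': 120, 'id': 2}], ignore=('id',)))
# {'bic': 110.0}
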
@@ -402,6 +402,7 @@ class DifferentialEvolution(object):
         self._obj_fun._obj_1 = 'bic'

         self._pop_size = kwargs.get('_pop_size', 20)
+        print('Population size is', self._pop_size)
         if not isinstance(self._pop_size, int):
             raise ValueError("_pop_size must be an integer")
         elif self._pop_size <= 3:
@@ -618,7 +619,7 @@ class DifferentialEvolution(object):
             1)

         if len(self._pareto_population) == 1:
-            print('
+            print('Pareto Population Size is only 1')
         if self.pf.check_dominance([obj_trial[self.pf.obj_key_1], obj_trial[self.pf.obj_key_2]],
                                    [self._population[j][self.pf.obj_key_1], self._population[j][
                                        self.pf.obj_key_2]]):  # if solution dominates existing  # FIXME some error here: true but not entering
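
Pareto.check_dominance is called here but its implementation is not part of this diff. For a two-objective minimization such as ('bic', 'RMSE_TEST'), the conventional test is "no worse in every objective and strictly better in at least one"; the sketch below is illustrative, not the package's code:

def check_dominance(a, b):
    """True if objective vector `a` Pareto-dominates `b` (minimization).
    Illustrative only; Pareto.check_dominance itself is not shown in the diff."""
    no_worse = all(x <= y for x, y in zip(a, b))
    strictly_better = any(x < y for x, y in zip(a, b))
    return no_worse and strictly_better

print(check_dominance([100.0, 0.5], [110.0, 0.5]))  # True: better BIC, equal RMSE
print(check_dominance([100.0, 0.9], [110.0, 0.5]))  # False: a trade-off, no dominance
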
@@ -787,7 +788,7 @@ class SimulatedAnnealing(object):
         self.accept = 0
         self.profiler = []
         self.update_t = self.cooling_linear_m
-        self.
+        self.get_directory()
         self._crossover_perc = float(kwargs.get('_crossover_perc', 0.2)) or float(kwargs.get('_cr', 0.2))
         self._obj_fun = objective_function
         if objective_function.is_multi:  # TODO Define more specific objectives in the initialiser
@@ -801,7 +802,7 @@ class SimulatedAnnealing(object):
             self.pf = Pareto(self.obj_1, self.obj_2, False)
         self._sa_memory = list()

-    def
+    def get_directory(self):
         # checking if the directory demo_folder2
         # exists or not.
         if not os.path.isdir(self.instance_number):
@@ -1004,7 +1005,7 @@ class SimulatedAnnealing(object):
         elif num_of_changeablePARMs == 0:
             rdm_i = random.choice(range(len(prmVect)))
             if self._obj_fun.get_num_discrete_values(rdm_i) <= 1:
-                print('
+                print('retry')

                 while self._obj_fun.get_num_discrete_values(rdm_i) <= 1:
                     rdm_i = random.randint(0, self._obj_fun.get_num_parameters() - 1)
@@ -1046,7 +1047,7 @@ class SimulatedAnnealing(object):
             get_rdm_j = random.randint(0, self._obj_fun.get_num_discrete_values(rdm_i) - 1)
             if (self._obj_fun.get_num_discrete_values(
                     rdm_i) - 1) < 1:  # TODO: remove, this is just a test
-
+
                 break
             new_nbr_i = self._obj_fun.get_value(rdm_i, get_rdm_j)
             neighbour[rdm_i] = new_nbr_i
@@ -1237,15 +1238,25 @@ class HarmonySearch(object):
         Initialize HS with the specified objective function. Note that this objective function must implement ObjectiveFunctionInterface.
         """
         self._obj_fun = objective_function
+        ## NEW CODE, TRYING TO EXTRACT OUT THE PARAMETERS
+        self._hms = kwargs.get('_hms', 20)
+        self._par = kwargs.get('_par', .30)
+        self.F = kwargs.get('_AI', 2)  # mutation scale
+        self.iter = kwargs.get('_max_iter', 10000)
+        self.cr = kwargs.get('_crossover_perc') or kwargs.get('_cr', 0.2)
+        self.instance_number = str(kwargs.get('instance_number', 1))
+
+
+
         # for printing basic metrics
-        self.print_verbose =
+        self.print_verbose = kwargs.get('verbose', False)
         # harmony_memory stores the best hms harmonies
         self._harmony_memory = list()
         # harmony_history stores all hms harmonies every nth improvisation (i.e., one 'generation')
         self._harmony_history = list()
         # saves the best fitness
         self.instance_number = str(objective_function.instance_number)
-        self.
+        self.get_directory()
         self._harmony_trace_best = list()
         self._harmony_trace_incumbent = list()
         if self._obj_fun.is_multi:  # TODO Define more specific objectives in the initialiser
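
One subtlety in the new kwargs extraction: kwargs.get('_crossover_perc') or kwargs.get('_cr', 0.2) treats any falsy value, including an explicit _crossover_perc=0.0, as missing and silently substitutes the _cr default. (Note also that instance_number is assigned twice in this __init__; the later objective_function.instance_number wins.) A sketch of the behavior next to a stricter alternative:

def cr_as_written(**kwargs):
    # Mirrors the diff: falsy values (0, 0.0, '') fall through to the _cr default.
    return kwargs.get('_crossover_perc') or kwargs.get('_cr', 0.2)

def cr_explicit(**kwargs):
    # Stricter alternative: only substitute when the key is genuinely absent.
    if '_crossover_perc' in kwargs:
        return kwargs['_crossover_perc']
    return kwargs.get('_cr', 0.2)

print(cr_as_written(_crossover_perc=0.0))  # 0.2 -- the explicit 0.0 is discarded
print(cr_explicit(_crossover_perc=0.0))    # 0.0
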
@@ -1261,7 +1272,7 @@ class HarmonySearch(object):

         self.pf = Pareto(self.obj_1, self.obj_2, False)

-    def
+    def get_directory(self):
         # checking if the directory demo_folder2
         # exists or not.
         if not os.path.isdir(self.instance_number):
metacountregressor/setup.py
CHANGED
@@ -8,7 +8,7 @@ with codecs.open("README.rst", encoding='utf8') as fh:
 setuptools.setup(name='metacountregressor',
                  version='0.1.63',
                  description='Extensions for a Python package for \
-
+                 estimation of data count models.',
                  long_description=long_description,
                  long_description_content_type="text/x-rst",
                  url='https://github.com/zahern/CountDataEstimation',
@@ -20,5 +20,6 @@ setuptools.setup(name='metacountregressor',
                  python_requires='>=3.10',
                  install_requires=[
                      'numpy>=1.13.1',
-                     'scipy>=1.0.0'
+                     'scipy>=1.0.0',
+                     'latextable'
                  ])