metacountregressor 0.1.73__py3-none-any.whl → 0.1.78__py3-none-any.whl
- metacountregressor/data_split_helper.py +90 -0
- metacountregressor/helperprocess.py +115 -0
- metacountregressor/main.py +41 -69
- metacountregressor/metaheuristics.py +25 -24
- metacountregressor/solution.py +189 -628
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.78.dist-info}/METADATA +1 -1
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.78.dist-info}/RECORD +10 -9
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.78.dist-info}/WHEEL +1 -1
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.78.dist-info}/LICENSE.txt +0 -0
- {metacountregressor-0.1.73.dist-info → metacountregressor-0.1.78.dist-info}/top_level.txt +0 -0
metacountregressor/data_split_helper.py
ADDED
@@ -0,0 +1,90 @@
+import numpy as np
+import pandas as pd
+
+
+
+
+class DataProcessor:
+    def __init__(self, x_data, y_data, kwargs):
+        self._obj_1 = kwargs.get('_obj_1')
+        self._obj_2 = kwargs.get('_obj_2')
+        self.test_percentage = float(kwargs.get('test_percentage', 0))
+        self.val_percentage = float(kwargs.get('val_percentage', 0))
+        self.is_multi = self.test_percentage != 0
+        self._x_data = x_data
+        self._y_data = y_data
+        self._process_data(kwargs)
+
+    def _process_data(self, kwargs):
+        if self._obj_1 == 'MAE' or self._obj_2 in ["MAE", 'RMSE', 'MSE', 'RMSE_IN', 'RMSE_TEST']:
+            self._handle_special_conditions(kwargs)
+        else:
+            self._standard_data_partition()
+
+        self._characteristics_names = list(self._x_data.columns)
+        self._max_group_all_means = 1
+        self._exclude_this_test = [4]
+
+    def _handle_special_conditions(self, kwargs):
+        if 'panels' in kwargs:
+            self._process_panels_data(kwargs)
+        else:
+            self._standard_data_partition()
+
+    def _process_panels_data(self, kwargs):
+        group_key = kwargs['group']
+        panels_key = kwargs['panels']
+
+        # Process groups and panels
+        self._x_data[group_key] = self._x_data[group_key].astype('category').cat.codes
+        try:
+            self._x_data[panels_key] = self._x_data[panels_key].rank(method='dense').astype(int)
+            self._x_data[panels_key] -= self._x_data[panels_key].min() - 1
+        except KeyError:
+            pass
+
+        # Create training and test datasets
+        unique_ids = np.unique(self._x_data[panels_key])
+        training_size = int((1 - self.test_percentage - self.val_percentage) * len(unique_ids))
+        training_ids = np.random.choice(unique_ids, training_size, replace=False)
+
+        train_idx = self._x_data.index[self._x_data[panels_key].isin(training_ids)]
+        test_idx = self._x_data.index[~self._x_data[panels_key].isin(training_ids)]
+
+        self._create_datasets(train_idx, test_idx)
+
+    def _standard_data_partition(self):
+        total_samples = len(self._x_data)
+        training_size = int((1 - self.test_percentage - self.val_percentage) * total_samples)
+        training_indices = np.random.choice(total_samples, training_size, replace=False)
+
+        train_idx = np.array([i for i in range(total_samples) if i in training_indices])
+        test_idx = np.array([i for i in range(total_samples) if i not in training_indices])
+
+        self._create_datasets(train_idx, test_idx)
+
+    def _create_datasets(self, train_idx, test_idx):
+        self.df_train = self._x_data.loc[train_idx, :]
+        self.df_test = self._x_data.loc[test_idx, :]
+        self.y_train = self._y_data.loc[train_idx, :]
+        self.y_test = self._y_data.loc[test_idx, :]
+
+        self._x_data_test = self.df_test.copy()
+        self._y_data_test = self.y_test.astype('float').copy()
+        self._x_data = self.df_train.copy()
+        self._y_data = self.y_train.astype('float').copy()
+
+        # Handle different shapes
+        if self._x_data.ndim == 2:  # Typical DataFrame
+            self._samples, self._characteristics = self._x_data.shape
+            self._panels = None
+        elif self._x_data.ndim == 3:  # 3D structure, e.g., Panel or similar
+            self._samples, self._panels, self._characteristics = self._x_data.shape
+
+
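For orientation, a minimal sketch of how the new DataProcessor might be exercised. The import path, column names and toy values are illustrative assumptions; only the keyword names (_obj_1, _obj_2, test_percentage, val_percentage, group, panels) come from the __init__ above. Because _obj_2 is an out-of-sample measure and 'panels' is supplied, whole panels are held out for testing rather than individual rows:

import pandas as pd
from metacountregressor.data_split_helper import DataProcessor  # assumed import path

# toy panel data: three panels of two observations each (illustrative names)
x = pd.DataFrame({'county':   [1, 1, 2, 2, 3, 3],
                  'panel_id': [1, 1, 2, 2, 3, 3],
                  'aadt':     [500, 520, 900, 880, 300, 310]})
y = pd.DataFrame({'Y': [0, 1, 3, 2, 0, 0]})

dp = DataProcessor(x, y, {'_obj_1': 'bic', '_obj_2': 'MAE',
                          'test_percentage': 0.3, 'val_percentage': 0.0,
                          'group': 'county', 'panels': 'panel_id'})
print(dp.df_train.shape, dp.df_test.shape)  # one whole panel lands in the test split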
metacountregressor/helperprocess.py
CHANGED
@@ -5,6 +5,121 @@ import matplotlib.pyplot as plt

plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')

+##Select the best Features Based on RF
+def select_features(X_train, y_train, n_f=16):
+    try:
+        from sklearn.feature_selection import SelectKBest
+        from sklearn.feature_selection import f_regression
+        feature_names = X_train.columns
+        # configure to select all features
+        fs = SelectKBest(score_func=f_regression, k=16)
+
+        # learn relationship from training data
+        fs.fit(X_train, y_train)
+
+        mask = fs.get_support()  # Boolean array of selected features
+        selected_features = [feature for bool, feature in zip(mask, feature_names) if bool]
+        X_train = X_train[selected_features]
+    except:
+        print('import error, not performing feature selection')
+        fs = X_train.columns  # TODO check if this is actually getting the names
+
+    return X_train, fs
+
+
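A small, self-contained illustration of the SelectKBest/f_regression pattern used by select_features above (the synthetic data and column names are made up for the example, not taken from the package):

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 5)), columns=[f'x{i}' for i in range(5)])
y = 2 * X['x0'] - X['x3'] + rng.normal(scale=0.1, size=100)  # only x0 and x3 matter

fs = SelectKBest(score_func=f_regression, k=2).fit(X, y)
kept = X.columns[fs.get_support()].tolist()  # boolean mask -> column names
print(kept)  # should recover ['x0', 'x3']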
+#Cutts off correlated data
+
+
+
+
+
+def findCorrelation(corr, cutoff=0.9, exact=None):
+    """
+    This function is the Python implementation of the R function
+    `findCorrelation()`.
+
+    Relies on numpy and pandas, so must have them pre-installed.
+
+    It searches through a correlation matrix and returns a list of column names
+    to remove to reduce pairwise correlations.
+
+    For the documentation of the R function, see
+    https://www.rdocumentation.org/packages/caret/topics/findCorrelation
+    and for the source code of `findCorrelation()`, see
+    https://github.com/topepo/caret/blob/master/pkg/caret/R/findCorrelation.R
+
+    -----------------------------------------------------------------------------
+
+    Parameters:
+    -----------
+    corr: pandas dataframe.
+        A correlation matrix as a pandas dataframe.
+    cutoff: float, default: 0.9.
+        A numeric value for the pairwise absolute correlation cutoff
+    exact: bool, default: None
+        A boolean value that determines whether the average correlations be
+        recomputed at each step
+    -----------------------------------------------------------------------------
+    Returns:
+    --------
+    list of column names
+    -----------------------------------------------------------------------------
+    Example:
+    --------
+    R1 = pd.DataFrame({
+        'x1': [1.0, 0.86, 0.56, 0.32, 0.85],
+        'x2': [0.86, 1.0, 0.01, 0.74, 0.32],
+        'x3': [0.56, 0.01, 1.0, 0.65, 0.91],
+        'x4': [0.32, 0.74, 0.65, 1.0, 0.36],
+        'x5': [0.85, 0.32, 0.91, 0.36, 1.0]
+    }, index=['x1', 'x2', 'x3', 'x4', 'x5'])
+
+    findCorrelation(R1, cutoff=0.6, exact=False)  # ['x4', 'x5', 'x1', 'x3']
+    findCorrelation(R1, cutoff=0.6, exact=True)   # ['x1', 'x5', 'x4']
+    """
+
+    def _findCorrelation_fast(corr, avg, cutoff):
+
+        combsAboveCutoff = corr.where(lambda x: (np.tril(x) == 0) & (x > cutoff)).stack().index
+
+        rowsToCheck = combsAboveCutoff.get_level_values(0)
+        colsToCheck = combsAboveCutoff.get_level_values(1)
+
+        msk = avg[colsToCheck] > avg[rowsToCheck].values
+        deletecol = pd.unique(np.r_[colsToCheck[msk], rowsToCheck[~msk]]).tolist()
+
+        return deletecol
+
+    def _findCorrelation_exact(corr, avg, cutoff):
+
+        x = corr.loc[(*[avg.sort_values(ascending=False).index] * 2,)]
+
+        if (x.dtypes.values[:, None] == ['int64', 'int32', 'int16', 'int8']).any():
+            x = x.astype(float)
+
+        x.values[(*[np.arange(len(x))] * 2,)] = np.nan
+
+        deletecol = []
+        for ix, i in enumerate(x.columns[:-1]):
+            for j in x.columns[ix + 1:]:
+                if x.loc[i, j] > cutoff:
+                    if x[i].mean() > x[j].mean():
+                        deletecol.append(i)
+                        x.loc[i] = x[i] = np.nan
+                    else:
+                        deletecol.append(j)
+                        x.loc[j] = x[j] = np.nan
+
+
+
+
+"""Funtion to Convert Data to Binaries """
+def clean_data_types(df):
+    for col in df.columns:
+        if df[col].dtype == 'object':
+            # Attempt to convert the column to numeric type
+            df[col] = pd.to_numeric(df[col], errors='coerce')
+    return df
+

def drop_correlations(x_df, percentage=0.85):
    cor_matrix = x_df.corr().abs()
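These helpers all serve the same preprocessing goal: compute a correlation matrix, decide which member of each highly correlated pair to discard, and drop it before model search. Below is a simplified, pandas-only stand-in for that idea (the upper-triangle cutoff rule); it is illustrative only and not the package's exact findCorrelation/drop_correlations logic:

import numpy as np
import pandas as pd

def drop_highly_correlated(x_df, cutoff=0.85):
    # upper triangle of the absolute correlation matrix
    corr = x_df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    # drop any column correlated above the cutoff with an earlier column
    to_drop = [col for col in upper.columns if (upper[col] > cutoff).any()]
    return x_df.drop(columns=to_drop)

rng = np.random.default_rng(1)
a = rng.normal(size=200)
df = pd.DataFrame({'a': a,
                   'b': a * 0.98 + rng.normal(scale=0.05, size=200),
                   'c': rng.normal(size=200)})
print(drop_highly_correlated(df, cutoff=0.85).columns.tolist())  # ['a', 'c']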
metacountregressor/main.py
CHANGED
@@ -9,14 +9,12 @@ import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas.io.parsers import TextFileReader
-
import helperprocess
from metaheuristics import (differential_evolution,
                            harmony_search,
                            simulated_annealing)
from solution import ObjectiveFunction

-from test_motor import *

warnings.simplefilter("ignore")

@@ -37,7 +35,7 @@ def main(args, **kwargs):
    # removing junk files if specicified
    helperprocess.remove_files(args.get('removeFiles', True))

-    # do we want
+    # do we want to run a test
    if args.get('com', False) == 'MetaCode':
        print('Testing the Python Package')  # TODO add in python package import
        # Read data from CSV file
@@ -64,6 +62,7 @@ def main(args, **kwargs):
    print('the dataset is', dataset)
    manual_fit_spec = args.get('Manual_Fit', None)
    if dataset == 1:
+        print('Stage 5 A Short.')
        df = pd.read_csv('./data/1848.csv')  # read in the data
        y_df = df[['FSI']]  # only consider crashes
        y_df.rename(columns={"FSI": "Y"}, inplace=True)
@@ -71,6 +70,7 @@ def main(args, **kwargs):
        x_df = helperprocess.as_wide_factor(x_df)

    elif dataset == 3:
+        print('Stage 5 A Data Complete.')
        x_df = pd.read_csv('./data/Stage5A_1848_All_Initial_Columns.csv')  # drop the ID columns
        drop_these = ['Id', 'ID', 'old', 'G_N']
        for i in drop_these:
@@ -159,7 +159,28 @@ def main(args, **kwargs):
            'transformations': ['no', 'no', 'no', 'no'],
            'dispersion': 0
        }
-
+    elif dataset == 8:
+        print('Main County')
+        df = pd.read_csv('./data/rural_int.csv')  # read in the data
+        y_df = df[['crashes']].copy()  # only consider crashes
+        y_df.rename(columns={"crashes": "Y"}, inplace=True)
+        panels = df['orig_ID']
+        try:
+            x_df = df.drop(columns=['crashes', 'year', 'orig_ID',
+                                    'jurisdiction', 'town', 'maint_region', 'weather_station', 'dummy_winter_2'])  # was dropped postcode
+            print('dropping for test')
+            x_df = x_df.drop(columns=['month', 'inj.fat', 'PDO'])
+            x_df = x_df.drop(columns=['zonal_ID', 'ln_AADT', 'ln_seg'])
+            x_df['rumble_install_year'] = x_df['rumble_install_year'].astype('category').cat.codes
+            x_df.rename(columns={"rumble_install_year": "has_rumble"}, inplace=True)
+
+        except:
+            x_df = df.drop(columns=['Y'])  # was dropped postcode
+
+        group_grab = x_df['county']
+        x_df = x_df.drop(columns=['county'])
+        x_df = helperprocess.interactions(x_df, drop_this_perc=0.8)
+        x_df['county'] = group_grab

    elif dataset == 9:
        df = pd.read_csv('panel_synth.csv')  # read in the data
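Note how the new dataset-8 branch sets the 'county' grouping column aside before calling helperprocess.interactions and re-attaches it afterwards, so the group ID never enters the interaction terms. A toy version of that hold-out-and-reattach pattern is sketched below; the frame and the interaction step are stand-ins, not the package's own function:

import pandas as pd

df = pd.DataFrame({'county': [1, 1, 2], 'aadt': [500.0, 620.0, 300.0], 'lanes': [2, 4, 2]})

group = df['county']                                     # set the grouping column aside
feats = df.drop(columns=['county'])
feats['aadt_x_lanes'] = feats['aadt'] * feats['lanes']   # stand-in for helperprocess.interactions
feats['county'] = group                                  # re-attach the group ID afterwards
print(feats.columns.tolist())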
@@ -186,64 +207,7 @@ def main(args, **kwargs):

        x_df = helperprocess.interactions(x_df, keep)
    else:  # the dataset has been selected in the program as something else
-
-        from tkinter.filedialog import askopenfilename
-
-        ASK_ANALALYST = 0
-        if ASK_ANALALYST:
-            root = Tk()
-            root.withdraw()
-            # Prompt the user to select a directory
-            directory = askopenfilename(title="Select File For Analysis")
-            skip_lines = int(input("Select the number of lines to skip, (numeric): "))
-            df = pd.read_csv(directory, skip_rows=skip_lines)
-        else:
-            df = pd.read_csv('data/rqc40516_MotorcycleQUT_engineer_crash.csv', skiprows=5)
-            df['CRASH_SPEED_LIMIT'] = df['CRASH_SPEED_LIMIT'].str.replace(' km/h', '').astype(int)
-
-        # Clean data types
-        df = clean_data_types(df)
-
-        # Encode categorical variables
-        categories = ['CRASH_SEVERITY', 'CRASH_TYPE', 'CRASH_NATURE', 'CRASH_ATMOSPHERIC_CONDITION']
-        df = pd.get_dummies(df, columns=categories)
-
-        # Select only numeric columns
-        numeric_types = ['int32', 'uint8', 'bool', 'int64', 'float64']
-        df = df.select_dtypes(include=numeric_types)
-
-        # Check for missing values and fill with column mean
-        missing_values_count = df['CASUALTY_TOTAL'].isnull().sum()
-        df.fillna(df.mean())
-
-        # Remove unnecessary columns
-        df.drop(columns=['CRASH_REF_NUMBER'], inplace=True)
-        y = df['CASUALTY_TOTAL']
-        # Define columns to exclude from the analysis
-        EXCLUDE = [
-            'LONGITUDE', 'YEAR', 'DCA', 'ID', 'LATIT', 'NAME', 'SEVERITY',
-            "CASUALTY", "CRASH_FIN_YEAR", "CRASH_HOUR", "MOPED"
-        ]
-
-        # Filter out excluded columns
-        df = df[[col for col in df.columns if not any(ex in col for ex in EXCLUDE)]]
-
-        # Prepare target variable
-
-        # Check for finite values and compute correlations
-        finite_check = df.apply(np.isfinite).all()
-        df_clean = df.loc[:, finite_check]
-        corr = df_clean.corr()
-
-        # Identify and remove highly correlated features
-        hc = findCorrelation(corr, cutoff=0.5)
-        trimmed_df = df_clean.drop(columns=hc)
-
-        # Feature selection
-        df_cleaner, fs = select_features(trimmed_df, y)
-        x_df = df_cleaner
-        y_df = y.to_frame(name="Y")
-        # y_df.rename(columns={"CASUALTY_TOTAL": "Y"}, inplace=True)
+        print('TODO add in dataset')

    if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
        if manual_fit_spec is None:
@@ -253,6 +217,7 @@ def main(args, **kwargs):
            args['Manual_Fit'] = manual_fit_spec

    if args['problem_number'] == str(8) or args['problem_number'] == 8:
+        print('Maine County Dataset.')
        args['group'] = 'county'
        args['panels'] = 'element_ID'
        args['ID'] = 'element_ID'
@@ -264,9 +229,9 @@ def main(args, **kwargs):

    args['complexity_level'] = args.get('complexity_level', 6)

-
-    AnalystSpecs
-    args['AnalystSpecs'] = AnalystSpecs
+
+    # Initialize AnalystSpecs to None if not manually provided
+    args['AnalystSpecs'] = args.get('AnalystSpecs', None)

    if args['algorithm'] == 'sa':
        args_hyperparameters = {'alpha': float(args['temp_scale']),
@@ -312,7 +277,7 @@ def main(args, **kwargs):


    elif args['algorithm'] == 'de':
-        # force
+        # force variables
        args['must_include'] = args.get('force', [])

        args_hyperparameters = {'_AI': args.get('_AI', 2),
@@ -321,7 +286,6 @@ def main(args, **kwargs):
                                , '_pop_size': int(args['_hms']), 'instance_number': int(args['line'])
                                , 'Manual_Fit': args['Manual_Fit'],
                                'MP': int(args['MP'])
-
                                }

        args_hyperparameters = dict(args_hyperparameters)
@@ -347,7 +311,7 @@ if __name__ == '__main__':
        alg_parser.print_help()
    parser = argparse.ArgumentParser(prog='main',
                                     epilog=main.__doc__,
-                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+                                     formatter_class=argparse.RawDescriptionHelpFormatter, conflict_handler='resolve')

    parser.add_argument('-line', type=int, default=44,
                        help='line to read in csv to pass in argument')
@@ -362,6 +326,7 @@ if __name__ == '__main__':
                break
            line_number_obs += 1
        args = dict(args)
+
        for key, value in args.items():
            try:
                # Attempt to parse the string value to a Python literal if value is a string.
@@ -378,6 +343,13 @@ if __name__ == '__main__':
            if "-algorithm" in action.option_strings:
                parser._optionals._actions[i].help = "optimization algorithm"

+        override = True
+        if override:
+            print('todo turn off, in testing phase')
+            parser.add_argument('-problem_number', default='8')
+            print('did it make it')
+
+
        if 'algorithm' not in args:
            parser.add_argument('-algorithm', type=str, default='hs',
                                help='optimization algorithm')
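Two of the changes above work together: the parser is now constructed with conflict_handler='resolve', which is what lets the temporary testing override re-register -problem_number without argparse raising a conflict. A standalone sketch of that behaviour (the option name and defaults are just for illustration):

import argparse

parser = argparse.ArgumentParser(conflict_handler='resolve')
parser.add_argument('-problem_number', default='3')
# re-adding the same option replaces the earlier definition instead of raising an error
parser.add_argument('-problem_number', default='8')
print(parser.parse_args([]).problem_number)  # -> '8'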
@@ -390,7 +362,7 @@ if __name__ == '__main__':
                            ' we want to split the data for processing')
        parser.add_argument('-supply_csv', type = str, help = 'enter the name of the csv, please include it as a full directorys')

-    else:  # DIDN"T SPECIFY LINES TRY EACH ONE
+    else:  # DIDN"T SPECIFY LINES TRY EACH ONE MANNUALY
        parser.add_argument('-com', type=str, default='MetaCode',
                            help='line to read csv')

metacountregressor/metaheuristics.py
CHANGED
@@ -15,8 +15,14 @@ from datetime import datetime
import numpy as np
import pandas as pd

-
-from .
+try:
+    from .pareto_file import Pareto, Solution
+    from .solution import ObjectiveFunction
+except:
+    print('Exception relative import')
+    from metacountregressor.pareto_file import Pareto, Solution
+    from metacountregressor.solution import ObjectiveFunction
+

HarmonySearchResults = namedtuple('HarmonySearchResults',
                                  ['elapsed_time', 'best_harmony', 'best_fitness', 'harmony_memories',
@@ -32,7 +38,7 @@ DifferentialEvolutionMulti = namedtuple('DifferentialEvolutionMulti',
                                        ['elapsed_time', 'best_solutions', 'population_solutions'])


-#helper function to plot the bic
+# helper function to plot the bic
def _plot(x, y, z, xlabel=None, ylabel=None, zlabel=None, filename=None):
    from matplotlib import pyplot as plt

@@ -54,7 +60,8 @@ def _plot(x, y, z, xlabel=None, ylabel=None, zlabel=None, filename=None):
    plt.savefig('bic.png')
    plt.show()

-
+
+# helper function to grab dictionary means
def dict_mean(dict_list,
              ignore=None):
    if ignore is None:
@@ -204,8 +211,7 @@ def different_evolution(objective_function, initial_slns=None, **kwargs):


def differential_evolution(objective_function, initial_slns=None, **kwargs):
-
-    raise Exception
+
    start = datetime.now()

    man = None
@@ -220,11 +226,8 @@ def differential_evolution(objective_function, initial_slns=None, **kwargs):
        de = Mutlithreaded_Meta(objective_function, **kwargs)
        best, pare = de.run_mp(initial_slns=initial_slns, mod_init=man)
    else:
-
        print('Not Multi Threaded')
-
        de = DifferentialEvolution(objective_function, **kwargs)
-
        best, pare = de.differential_evolution_run(initial_slns=initial_slns, mod_init=man)

    end = datetime.now()
@@ -393,12 +396,10 @@ class DifferentialEvolution(object):
    """

    def __init__(self, objective_function, **kwargs):
-
-        if not isinstance(objective_function, ObjectiveFunction):
-            raise TypeError
        self._obj_fun = objective_function
        if self._obj_fun._obj_1 is None:
-
+            print('no objective found, automatically selecting BIC')
+            self._obj_fun._obj_1 = 'bic'

        self._pop_size = kwargs.get('_pop_size', 20)
        if not isinstance(self._pop_size, int):
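With the isinstance check on ObjectiveFunction removed, the constructor now effectively duck-types its objective: any object exposing the attributes the optimizer touches can be passed, and a missing primary objective falls back to 'bic'. A rough sketch of that fallback in isolation (MinimalObjective is a hypothetical stand-in, not a class from the package):

class MinimalObjective:
    # only the attributes this hunk references; the real interface is much larger
    def __init__(self):
        self._obj_1 = None   # no objective selected yet
        self._obj_2 = None
        self.is_multi = False

obj = MinimalObjective()
if obj._obj_1 is None:       # mirrors the new default in __init__
    obj._obj_1 = 'bic'
print(obj._obj_1)            # -> 'bic'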
@@ -406,7 +407,7 @@ class DifferentialEvolution(object):
        elif self._pop_size <= 3:
            raise ValueError("_pop_size must be greater than 4")

-        self.F = kwargs.get('_AI', 2)  #
+        self.F = kwargs.get('_AI', 2)  # mutation scale
        self.iter = kwargs.get('_max_iter', 10000)
        self.cr = kwargs.get('_crossover_perc') or kwargs.get('_cr', 0.2)
        self.instance_number = str(kwargs.get('instance_number', 1))
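For context on the new '# mutation scale' comment: in the classic DE/rand/1 scheme, F scales the difference vector used to perturb a base candidate. The snippet below shows only that textbook step with made-up vectors; it is not the package's own mutation routine:

import numpy as np

F = 2.0                                   # mutation scale, read from kwargs['_AI'] above
a = np.array([1.0, 4.0])                  # base candidate
b, c = np.array([2.0, 1.0]), np.array([0.5, 3.0])
mutant = a + F * (b - c)                  # DE/rand/1: base + F * difference vector
print(mutant)                             # [4. 0.]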
@@ -415,12 +416,9 @@ class DifferentialEvolution(object):
        self._population = list()
        self.it_process = 1
        if objective_function.is_multi:
-
            self.obj_1 = objective_function._obj_1
            self.obj_2 = objective_function._obj_2
-
            self.pf = Pareto(self.obj_1, self.obj_2, True)
-
            self._pareto_population = list()
        else:
            self.obj_1 = objective_function._obj_1
@@ -555,7 +553,6 @@ class DifferentialEvolution(object):
        average_iteration = 0
        iterations_without_improvement = 0

-
        start_time = datetime.now()
        if self._obj_fun.use_random_seed():
            self._obj_fun.set_random_seed()
@@ -949,10 +946,9 @@ class SimulatedAnnealing(object):
            output_step.append(a)
            output_energy.append(b)
            output_best_energy.append(c)
-

-        return {'elapsed_time': elapsed_time, 'Iteration': iteration}  #TODO make this reachavble
-        #return output_step, output_energy, output_best_energy, self.best_energy, self.best_struct
+        return {'elapsed_time': elapsed_time, 'Iteration': iteration}  # TODO make this reachavble
+        # return output_step, output_energy, output_best_energy, self.best_energy, self.best_struct

    def _get_neighbour(self, current, mutations=None):
        neighbour = copy.deepcopy(current)
@@ -963,7 +959,6 @@ class SimulatedAnnealing(object):

        # number of paramaters in the model #TODO get the last value if 2

-
        num_of_changeablePARMs = 0

        self._obj_fun.nbr_routine(current)
@@ -1242,7 +1237,8 @@ class HarmonySearch(object):
        Initialize HS with the specified objective function. Note that this objective function must implement ObjectiveFunctionInterface.
        """
        self._obj_fun = objective_function
-
+        # for printing basics metrics
+        self.print_verbose = True
        # harmony_memory stores the best hms harmonies
        self._harmony_memory = list()
        # harmony_history stores all hms harmonies every nth improvisations (i.e., one 'generation')
@@ -1294,7 +1290,7 @@ class HarmonySearch(object):
    def does_it_appear(self, new):
        for d in self._harmony_memory:
            if self.mixed_list_chescker(d['layout'], new):
-                #print('same sln appears in population')
+                # print('same sln appears in population')
                return True

        return False
@@ -1314,6 +1310,7 @@ class HarmonySearch(object):
            self._obj_fun.set_random_seed()
        # fill harmony_memory using random parameter values by default, but with initial_harmonies if provided
        self._initialize(initial_harmonies, mod_init)
+        if self.print_verbose: print('Initialization complete')
        if self.pf.get_objective_is_multi():
            self._pareto_harmony_memory = self.pf.non_dominant_sorting(self._harmony_memory)
            generation_best = self._pareto_harmony_memory[0]
@@ -1333,6 +1330,9 @@ class HarmonySearch(object):
               iterations_without_improvement < self._obj_fun.get_termination_iter()):
            # generate new harmony
            elapsed_time = (datetime.now() - start_time).total_seconds()
+            if self.print_verbose:
+                print('Time: ', elapsed_time)
+                print('Improvisation: ', num_imp)
            harmony = list()

            for i in range(0, self._obj_fun.get_num_parameters()):
@@ -1374,6 +1374,7 @@ class HarmonySearch(object):
                               self.pf.get_objective_is_multi())
            num_imp += 1
            if iterations_without_improvement == 0:  # if there is any kind of improvement updae the logs
+                if self.print_verbose: print('improvement found at improvisation', num_imp)
                if self.pf.get_objective_is_multi():
                    try:
                        logger(num_imp, fitness, self._harmony_memory, True, self.get_instance_name(),