metacountregressor 0.1.71__tar.gz → 0.1.76__tar.gz

Files changed (26)
  1. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/PKG-INFO +1 -1
  2. metacountregressor-0.1.76/metacountregressor/data_split_helper.py +90 -0
  3. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/helperprocess.py +115 -0
  4. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/main.py +41 -69
  5. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/metaheuristics.py +25 -24
  6. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/solution.py +189 -628
  7. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor.egg-info/PKG-INFO +1 -1
  8. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor.egg-info/SOURCES.txt +1 -0
  9. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/setup.py +0 -1
  10. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/LICENSE.txt +0 -0
  11. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/README.rst +0 -0
  12. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/__init__.py +0 -0
  13. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/_device_cust.py +0 -0
  14. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/halton.py +0 -0
  15. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/main_old.py +0 -0
  16. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/pareto_file.py +0 -0
  17. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/pareto_logger__plot.py +0 -0
  18. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/setup.py +0 -0
  19. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/single_objective_finder.py +0 -0
  20. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor/test_generated_paper2.py +0 -0
  21. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor.egg-info/dependency_links.txt +0 -0
  22. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor.egg-info/not-zip-safe +0 -0
  23. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor.egg-info/requires.txt +0 -0
  24. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/metacountregressor.egg-info/top_level.txt +0 -0
  25. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/setup.cfg +0 -0
  26. {metacountregressor-0.1.71 → metacountregressor-0.1.76}/tests/test.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: metacountregressor
- Version: 0.1.71
+ Version: 0.1.76
  Summary: Extensions for a Python package for estimation of count models.
  Home-page: https://github.com/zahern/CountDataEstimation
  Author: Zeke Ahern
@@ -0,0 +1,90 @@
+ import numpy as np
+ import pandas as pd
+
+
+ class DataProcessor:
+     def __init__(self, x_data, y_data, kwargs):
+         self._obj_1 = kwargs.get('_obj_1')
+         self._obj_2 = kwargs.get('_obj_2')
+         self.test_percentage = float(kwargs.get('test_percentage', 0))
+         self.val_percentage = float(kwargs.get('val_percentage', 0))
+         self.is_multi = self.test_percentage != 0
+         self._x_data = x_data
+         self._y_data = y_data
+         self._process_data(kwargs)
+
+     def _process_data(self, kwargs):
+         if self._obj_1 == 'MAE' or self._obj_2 in ["MAE", 'RMSE', 'MSE', 'RMSE_IN', 'RMSE_TEST']:
+             self._handle_special_conditions(kwargs)
+         else:
+             self._standard_data_partition()
+
+         self._characteristics_names = list(self._x_data.columns)
+         self._max_group_all_means = 1
+         self._exclude_this_test = [4]
+
+     def _handle_special_conditions(self, kwargs):
+         if 'panels' in kwargs:
+             self._process_panels_data(kwargs)
+         else:
+             self._standard_data_partition()
+
+     def _process_panels_data(self, kwargs):
+         group_key = kwargs['group']
+         panels_key = kwargs['panels']
+
+         # Process groups and panels
+         self._x_data[group_key] = self._x_data[group_key].astype('category').cat.codes
+         try:
+             self._x_data[panels_key] = self._x_data[panels_key].rank(method='dense').astype(int)
+             self._x_data[panels_key] -= self._x_data[panels_key].min() - 1
+         except KeyError:
+             pass
+
+         # Create training and test datasets
+         unique_ids = np.unique(self._x_data[panels_key])
+         training_size = int((1 - self.test_percentage - self.val_percentage) * len(unique_ids))
+         training_ids = np.random.choice(unique_ids, training_size, replace=False)
+
+         train_idx = self._x_data.index[self._x_data[panels_key].isin(training_ids)]
+         test_idx = self._x_data.index[~self._x_data[panels_key].isin(training_ids)]
+
+         self._create_datasets(train_idx, test_idx)
+
+     def _standard_data_partition(self):
+         total_samples = len(self._x_data)
+         training_size = int((1 - self.test_percentage - self.val_percentage) * total_samples)
+         training_indices = np.random.choice(total_samples, training_size, replace=False)
+
+         train_idx = np.array([i for i in range(total_samples) if i in training_indices])
+         test_idx = np.array([i for i in range(total_samples) if i not in training_indices])
+
+         self._create_datasets(train_idx, test_idx)
+
+     def _create_datasets(self, train_idx, test_idx):
+         self.df_train = self._x_data.loc[train_idx, :]
+         self.df_test = self._x_data.loc[test_idx, :]
+         self.y_train = self._y_data.loc[train_idx, :]
+         self.y_test = self._y_data.loc[test_idx, :]
+
+         self._x_data_test = self.df_test.copy()
+         self._y_data_test = self.y_test.astype('float').copy()
+         self._x_data = self.df_train.copy()
+         self._y_data = self.y_train.astype('float').copy()
+
+         # Handle different shapes
+         if self._x_data.ndim == 2:  # typical DataFrame
+             self._samples, self._characteristics = self._x_data.shape
+             self._panels = None
+         elif self._x_data.ndim == 3:  # 3D structure, e.g., panel data
+             self._samples, self._panels, self._characteristics = self._x_data.shape
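
For orientation, here is a minimal usage sketch of the new helper. The CSV path and column names are taken from the dataset-1 example in main.py below; the import path and the exact kwargs are assumptions based on this hunk, not documented API:

    import pandas as pd
    from metacountregressor.data_split_helper import DataProcessor  # assumed import path

    df = pd.read_csv('./data/1848.csv')
    y = df[['FSI']].rename(columns={'FSI': 'Y'})   # response
    x = df.drop(columns=['FSI'])                   # features

    # An out-of-sample objective such as 'RMSE_TEST' routes through the split logic;
    # with no 'panels' key, _standard_data_partition() holds out 20% of the rows.
    dp = DataProcessor(x, y, {'_obj_1': 'bic', '_obj_2': 'RMSE_TEST',
                              'test_percentage': 0.2, 'val_percentage': 0})
    print(dp.df_train.shape, dp.df_test.shape)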
@@ -5,6 +5,121 @@ import matplotlib.pyplot as plt
  
  plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-dark.mplstyle')
  
+ # Select the best features (univariate F-test via SelectKBest)
+ def select_features(X_train, y_train, n_f=16):
+     try:
+         from sklearn.feature_selection import SelectKBest, f_regression
+         feature_names = X_train.columns
+         # configure to select the top n_f features
+         fs = SelectKBest(score_func=f_regression, k=n_f)
+
+         # learn relationship from training data
+         fs.fit(X_train, y_train)
+
+         mask = fs.get_support()  # boolean array of selected features
+         selected_features = [feature for keep, feature in zip(mask, feature_names) if keep]
+         X_train = X_train[selected_features]
+     except ImportError:
+         print('import error, not performing feature selection')
+         fs = X_train.columns  # TODO check if this is actually getting the names
+
+     return X_train, fs
+
+
+ # Cuts off correlated data
+ def findCorrelation(corr, cutoff=0.9, exact=None):
+     """
+     This function is the Python implementation of the R function
+     `findCorrelation()`.
+
+     Relies on numpy and pandas, so both must be pre-installed.
+
+     It searches through a correlation matrix and returns a list of column names
+     to remove in order to reduce pairwise correlations.
+
+     For the documentation of the R function, see
+     https://www.rdocumentation.org/packages/caret/topics/findCorrelation
+     and for the source code of `findCorrelation()`, see
+     https://github.com/topepo/caret/blob/master/pkg/caret/R/findCorrelation.R
+
+     -----------------------------------------------------------------------------
+     Parameters:
+     -----------
+     corr: pandas dataframe.
+         A correlation matrix as a pandas dataframe.
+     cutoff: float, default: 0.9.
+         A numeric value for the pairwise absolute correlation cutoff.
+     exact: bool, default: None
+         A boolean value that determines whether the average correlations are
+         recomputed at each step.
+     -----------------------------------------------------------------------------
+     Returns:
+     --------
+     list of column names
+     -----------------------------------------------------------------------------
+     Example:
+     --------
+     R1 = pd.DataFrame({
+         'x1': [1.0, 0.86, 0.56, 0.32, 0.85],
+         'x2': [0.86, 1.0, 0.01, 0.74, 0.32],
+         'x3': [0.56, 0.01, 1.0, 0.65, 0.91],
+         'x4': [0.32, 0.74, 0.65, 1.0, 0.36],
+         'x5': [0.85, 0.32, 0.91, 0.36, 1.0]
+     }, index=['x1', 'x2', 'x3', 'x4', 'x5'])
+
+     findCorrelation(R1, cutoff=0.6, exact=False)  # ['x4', 'x5', 'x1', 'x3']
+     findCorrelation(R1, cutoff=0.6, exact=True)   # ['x1', 'x5', 'x4']
+     """
+
+     def _findCorrelation_fast(corr, avg, cutoff):
+         combsAboveCutoff = corr.where(lambda x: (np.tril(x) == 0) & (x > cutoff)).stack().index
+
+         rowsToCheck = combsAboveCutoff.get_level_values(0)
+         colsToCheck = combsAboveCutoff.get_level_values(1)
+
+         msk = avg[colsToCheck] > avg[rowsToCheck].values
+         deletecol = pd.unique(np.r_[colsToCheck[msk], rowsToCheck[~msk]]).tolist()
+
+         return deletecol
+
+     def _findCorrelation_exact(corr, avg, cutoff):
+         x = corr.loc[(*[avg.sort_values(ascending=False).index] * 2,)]
+
+         if (x.dtypes.values[:, None] == ['int64', 'int32', 'int16', 'int8']).any():
+             x = x.astype(float)
+
+         x.values[(*[np.arange(len(x))] * 2,)] = np.nan
+
+         deletecol = []
+         for ix, i in enumerate(x.columns[:-1]):
+             for j in x.columns[ix + 1:]:
+                 if x.loc[i, j] > cutoff:
+                     if x[i].mean() > x[j].mean():
+                         deletecol.append(i)
+                         x.loc[i] = x[i] = np.nan
+                     else:
+                         deletecol.append(j)
+                         x.loc[j] = x[j] = np.nan
+         return deletecol
+
+     acorr = corr.abs()
+     avg = acorr.mean()
+
+     # Choose the search strategy as in the R reference: the exact search for
+     # small matrices or when explicitly requested, the fast pairwise one otherwise.
+     if exact or (exact is None and corr.shape[1] < 100):
+         return _findCorrelation_exact(acorr, avg, cutoff)
+     return _findCorrelation_fast(acorr, avg, cutoff)
+
+
+ def clean_data_types(df):
+     """Coerce object columns to numeric where possible (non-numeric values become NaN)."""
+     for col in df.columns:
+         if df[col].dtype == 'object':
+             # Attempt to convert the column to a numeric type
+             df[col] = pd.to_numeric(df[col], errors='coerce')
+     return df
+
  
  def drop_correlations(x_df, percentage=0.85):
      cor_matrix = x_df.corr().abs()
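
Taken together, the new helpers support a small pruning pipeline: coerce types, drop one column from each highly correlated pair, then keep the best remaining features. A sketch on synthetic data (all data values invented; only the helper names come from the hunk above):

    import numpy as np
    import pandas as pd
    import helperprocess

    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.normal(size=(200, 6)), columns=list('abcdef'))
    df['b'] = df['a'] * 0.95 + rng.normal(scale=0.1, size=200)  # near-duplicate of 'a'
    y = pd.Series(rng.poisson(2.0, size=200), name='Y')

    df = helperprocess.clean_data_types(df)                 # object columns -> numeric
    drop = helperprocess.findCorrelation(df.corr(), cutoff=0.9)
    X, fs = helperprocess.select_features(df.drop(columns=drop), y, n_f=4)
    print(drop, list(X.columns))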
@@ -9,14 +9,12 @@ import numpy as np
  import pandas as pd
  from pandas import DataFrame
  from pandas.io.parsers import TextFileReader
-
  import helperprocess
  from metaheuristics import (differential_evolution,
                              harmony_search,
                              simulated_annealing)
  from solution import ObjectiveFunction
  
- from test_motor import *
  
  warnings.simplefilter("ignore")
  
@@ -37,7 +35,7 @@ def main(args, **kwargs):
  # removing junk files if specified
  helperprocess.remove_files(args.get('removeFiles', True))
  
- # do we want tto run a test
+ # do we want to run a test
  if args.get('com', False) == 'MetaCode':
      print('Testing the Python Package')  # TODO add in python package import
      # Read data from CSV file
@@ -64,6 +62,7 @@ def main(args, **kwargs):
  print('the dataset is', dataset)
  manual_fit_spec = args.get('Manual_Fit', None)
  if dataset == 1:
+     print('Stage 5 A Short.')
      df = pd.read_csv('./data/1848.csv')  # read in the data
      y_df = df[['FSI']]  # only consider crashes
      y_df.rename(columns={"FSI": "Y"}, inplace=True)
@@ -71,6 +70,7 @@ def main(args, **kwargs):
      x_df = helperprocess.as_wide_factor(x_df)
  
  elif dataset == 3:
+     print('Stage 5 A Data Complete.')
      x_df = pd.read_csv('./data/Stage5A_1848_All_Initial_Columns.csv')  # drop the ID columns
      drop_these = ['Id', 'ID', 'old', 'G_N']
      for i in drop_these:
@@ -159,7 +159,28 @@ def main(args, **kwargs):
          'transformations': ['no', 'no', 'no', 'no'],
          'dispersion': 0
      }
-
+ elif dataset == 8:
+     print('Maine County')
+     df = pd.read_csv('./data/rural_int.csv')  # read in the data
+     y_df = df[['crashes']].copy()  # only consider crashes
+     y_df.rename(columns={"crashes": "Y"}, inplace=True)
+     panels = df['orig_ID']
+     try:
+         x_df = df.drop(columns=['crashes', 'year', 'orig_ID',
+                                 'jurisdiction', 'town', 'maint_region',
+                                 'weather_station', 'dummy_winter_2'])  # postcode was dropped earlier
+         print('dropping for test')
+         x_df = x_df.drop(columns=['month', 'inj.fat', 'PDO'])
+         x_df = x_df.drop(columns=['zonal_ID', 'ln_AADT', 'ln_seg'])
+         x_df['rumble_install_year'] = x_df['rumble_install_year'].astype('category').cat.codes
+         x_df.rename(columns={"rumble_install_year": "has_rumble"}, inplace=True)
+     except KeyError:
+         x_df = df.drop(columns=['Y'])  # postcode was dropped earlier
+
+     group_grab = x_df['county']
+     x_df = x_df.drop(columns=['county'])
+     x_df = helperprocess.interactions(x_df, drop_this_perc=0.8)
+     x_df['county'] = group_grab
  
  elif dataset == 9:
      df = pd.read_csv('panel_synth.csv')  # read in the data
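
The rumble_install_year handling above uses the standard pandas idiom for turning a raw column into integer category codes. One caveat worth noting, shown on toy values (not the real data): .cat.codes maps missing values to -1 and each distinct year to its own code, so the result is only a clean 0/1 'has_rumble' flag when the column has exactly two states:

    import pandas as pd

    s = pd.Series([2015, None, 2018, None])     # install year; NaN = never installed
    codes = s.astype('category').cat.codes      # -> 0, -1, 1, -1
    has_rumble = (codes >= 0).astype(int)       # explicit 0/1 indicator
    print(has_rumble.tolist())                  # [1, 0, 1, 0]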
@@ -186,64 +207,7 @@ def main(args, **kwargs):
  
      x_df = helperprocess.interactions(x_df, keep)
  else:  # the dataset has been selected in the program as something else
-     from tkinter import Tk
-     from tkinter.filedialog import askopenfilename
-
-     ASK_ANALALYST = 0
-     if ASK_ANALALYST:
-         root = Tk()
-         root.withdraw()
-         # Prompt the user to select a directory
-         directory = askopenfilename(title="Select File For Analysis")
-         skip_lines = int(input("Select the number of lines to skip, (numeric): "))
-         df = pd.read_csv(directory, skip_rows=skip_lines)
-     else:
-         df = pd.read_csv('data/rqc40516_MotorcycleQUT_engineer_crash.csv', skiprows=5)
-     df['CRASH_SPEED_LIMIT'] = df['CRASH_SPEED_LIMIT'].str.replace(' km/h', '').astype(int)
-
-     # Clean data types
-     df = clean_data_types(df)
-
-     # Encode categorical variables
-     categories = ['CRASH_SEVERITY', 'CRASH_TYPE', 'CRASH_NATURE', 'CRASH_ATMOSPHERIC_CONDITION']
-     df = pd.get_dummies(df, columns=categories)
-
-     # Select only numeric columns
-     numeric_types = ['int32', 'uint8', 'bool', 'int64', 'float64']
-     df = df.select_dtypes(include=numeric_types)
-
-     # Check for missing values and fill with column mean
-     missing_values_count = df['CASUALTY_TOTAL'].isnull().sum()
-     df.fillna(df.mean())
-
-     # Remove unnecessary columns
-     df.drop(columns=['CRASH_REF_NUMBER'], inplace=True)
-     y = df['CASUALTY_TOTAL']
-     # Define columns to exclude from the analysis
-     EXCLUDE = [
-         'LONGITUDE', 'YEAR', 'DCA', 'ID', 'LATIT', 'NAME', 'SEVERITY',
-         "CASUALTY", "CRASH_FIN_YEAR", "CRASH_HOUR", "MOPED"
-     ]
-
-     # Filter out excluded columns
-     df = df[[col for col in df.columns if not any(ex in col for ex in EXCLUDE)]]
-
-     # Prepare target variable
-
-     # Check for finite values and compute correlations
-     finite_check = df.apply(np.isfinite).all()
-     df_clean = df.loc[:, finite_check]
-     corr = df_clean.corr()
-
-     # Identify and remove highly correlated features
-     hc = findCorrelation(corr, cutoff=0.5)
-     trimmed_df = df_clean.drop(columns=hc)
-
-     # Feature selection
-     df_cleaner, fs = select_features(trimmed_df, y)
-     x_df = df_cleaner
-     y_df = y.to_frame(name="Y")
-     # y_df.rename(columns={"CASUALTY_TOTAL": "Y"}, inplace=True)
+     print('TODO add in dataset')
  
  if args['Keep_Fit'] == str(2) or args['Keep_Fit'] == 2:
      if manual_fit_spec is None:
@@ -253,6 +217,7 @@ def main(args, **kwargs):
      args['Manual_Fit'] = manual_fit_spec
  
  if args['problem_number'] == str(8) or args['problem_number'] == 8:
+     print('Maine County Dataset.')
      args['group'] = 'county'
      args['panels'] = 'element_ID'
      args['ID'] = 'element_ID'
@@ -264,9 +229,9 @@ def main(args, **kwargs):
  
  args['complexity_level'] = args.get('complexity_level', 6)
  
- # if no manual input ALGORITHMS DEPEND ON The SET_DATA_CSV TO DEFINE HYPERPARAMATERS
- AnalystSpecs = None
- args['AnalystSpecs'] = AnalystSpecs
+
+ # Initialize AnalystSpecs to None if not manually provided
+ args['AnalystSpecs'] = args.get('AnalystSpecs', None)
  
  if args['algorithm'] == 'sa':
      args_hyperparameters = {'alpha': float(args['temp_scale']),
@@ -312,7 +277,7 @@ def main(args, **kwargs):
  
  
  elif args['algorithm'] == 'de':
-     # force tvariablese
+     # force variables
      args['must_include'] = args.get('force', [])
  
      args_hyperparameters = {'_AI': args.get('_AI', 2),
@@ -321,7 +286,6 @@ def main(args, **kwargs):
                              , '_pop_size': int(args['_hms']), 'instance_number': int(args['line'])
                              , 'Manual_Fit': args['Manual_Fit'],
                              'MP': int(args['MP'])
-
                              }
  
  args_hyperparameters = dict(args_hyperparameters)
@@ -347,7 +311,7 @@ if __name__ == '__main__':
  alg_parser.print_help()
  parser = argparse.ArgumentParser(prog='main',
                                   epilog=main.__doc__,
-                                  formatter_class=argparse.RawDescriptionHelpFormatter)
+                                  formatter_class=argparse.RawDescriptionHelpFormatter, conflict_handler='resolve')
  
  parser.add_argument('-line', type=int, default=44,
                      help='line to read in csv to pass in argument')
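
The new conflict_handler='resolve' is what lets the testing override further down re-register -problem_number without argparse raising ArgumentError; with 'resolve', a later registration simply replaces the earlier one. A minimal standalone illustration (flag values invented):

    import argparse

    parser = argparse.ArgumentParser(conflict_handler='resolve')
    parser.add_argument('-problem_number', default='4')
    parser.add_argument('-problem_number', default='8')  # replaces, no ArgumentError
    print(parser.parse_args([]).problem_number)          # -> '8'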
@@ -362,6 +326,7 @@ if __name__ == '__main__':
          break
      line_number_obs += 1
  args = dict(args)
+
  for key, value in args.items():
      try:
          # Attempt to parse the string value to a Python literal if value is a string.
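
The parsing loop referenced here is the usual ast.literal_eval pattern: strings that look like Python literals become real values, and anything else is kept as a plain string by the except branch. A self-contained sketch (toy dictionary; the real loop's exception handling is assumed):

    import ast

    raw = {'line': '44', 'force': "['const']", 'com': 'MetaCode'}
    args = {}
    for key, value in raw.items():
        try:
            args[key] = ast.literal_eval(value)   # '44' -> 44, "['const']" -> ['const']
        except (ValueError, SyntaxError):
            args[key] = value                     # 'MetaCode' stays a string
    print(args)  # {'line': 44, 'force': ['const'], 'com': 'MetaCode'}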
@@ -378,6 +343,13 @@ if __name__ == '__main__':
  if "-algorithm" in action.option_strings:
      parser._optionals._actions[i].help = "optimization algorithm"
  
+ override = True
+ if override:
+     print('todo turn off, in testing phase')
+     parser.add_argument('-problem_number', default='8')
+     print('did it make it')
+
+
  if 'algorithm' not in args:
      parser.add_argument('-algorithm', type=str, default='hs',
                          help='optimization algorithm')
@@ -390,7 +362,7 @@ if __name__ == '__main__':
                      ' we want to split the data for processing')
  parser.add_argument('-supply_csv', type=str, help='enter the name of the csv; please include the full directory')
  
- else: # DIDN"T SPECIFY LINES TRY EACH ONE MANNUALLY
+ else:  # DIDN'T SPECIFY LINES; TRY EACH ONE MANUALLY
  parser.add_argument('-com', type=str, default='MetaCode',
                      help='line to read csv')
@@ -15,8 +15,14 @@ from datetime import datetime
  import numpy as np
  import pandas as pd
  
- from .pareto_file import Pareto, Solution
- from .solution import ObjectiveFunction
+ try:
+     from .pareto_file import Pareto, Solution
+     from .solution import ObjectiveFunction
+ except ImportError:
+     print('Relative import failed; falling back to absolute imports')
+     from metacountregressor.pareto_file import Pareto, Solution
+     from metacountregressor.solution import ObjectiveFunction
+
  
  HarmonySearchResults = namedtuple('HarmonySearchResults',
                                    ['elapsed_time', 'best_harmony', 'best_fitness', 'harmony_memories',
@@ -32,7 +38,7 @@ DifferentialEvolutionMulti = namedtuple('DifferentialEvolutionMulti',
                                          ['elapsed_time', 'best_solutions', 'population_solutions'])
  
  
- #helper function to plot the bic
+ # helper function to plot the bic
  def _plot(x, y, z, xlabel=None, ylabel=None, zlabel=None, filename=None):
      from matplotlib import pyplot as plt
  
@@ -54,7 +60,8 @@ def _plot(x, y, z, xlabel=None, ylabel=None, zlabel=None, filename=None):
      plt.savefig('bic.png')
      plt.show()
  
- #helper function to grab dictionary means
+
+ # helper function to grab dictionary means
  def dict_mean(dict_list,
                ignore=None):
      if ignore is None:
@@ -204,8 +211,7 @@ def different_evolution(objective_function, initial_slns=None, **kwargs):
  
  
  def differential_evolution(objective_function, initial_slns=None, **kwargs):
-     if not isinstance(objective_function, ObjectiveFunction):
-         raise Exception
+
      start = datetime.now()
  
      man = None
@@ -220,11 +226,8 @@ def differential_evolution(objective_function, initial_slns=None, **kwargs):
      de = Mutlithreaded_Meta(objective_function, **kwargs)
      best, pare = de.run_mp(initial_slns=initial_slns, mod_init=man)
  else:
-
      print('Not Multi Threaded')
-
      de = DifferentialEvolution(objective_function, **kwargs)
-
      best, pare = de.differential_evolution_run(initial_slns=initial_slns, mod_init=man)
  
  end = datetime.now()
@@ -393,12 +396,10 @@ class DifferentialEvolution(object):
  """
  
  def __init__(self, objective_function, **kwargs):
-
-     if not isinstance(objective_function, ObjectiveFunction):
-         raise TypeError
      self._obj_fun = objective_function
      if self._obj_fun._obj_1 is None:
-         raise Exception
+         print('no objective found, automatically selecting BIC')
+         self._obj_fun._obj_1 = 'bic'
  
      self._pop_size = kwargs.get('_pop_size', 20)
      if not isinstance(self._pop_size, int):
@@ -406,7 +407,7 @@ class DifferentialEvolution(object):
      elif self._pop_size <= 3:
          raise ValueError("_pop_size must be at least 4")
  
-     self.F = kwargs.get('_AI', 2)  # mustation scale
+     self.F = kwargs.get('_AI', 2)  # mutation scale
      self.iter = kwargs.get('_max_iter', 10000)
      self.cr = kwargs.get('_crossover_perc') or kwargs.get('_cr', 0.2)
      self.instance_number = str(kwargs.get('instance_number', 1))
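
For context on these hyperparameters: in classic differential evolution, F scales the mutation step and cr is the per-gene crossover rate, and DE/rand/1 needs three distinct donors besides the target, which is why _pop_size must be at least 4. A textbook DE/rand/1/bin generation over real vectors (illustrative only; this package mutates model specifications, not raw floats):

    import numpy as np

    def de_rand_1_bin(pop, fitness, F=0.8, cr=0.2, rng=np.random.default_rng(0)):
        """One generation of DE/rand/1/bin on a (pop_size, dim) array."""
        new_pop = pop.copy()
        for i in range(len(pop)):
            others = [j for j in range(len(pop)) if j != i]
            a, b, c = pop[rng.choice(others, 3, replace=False)]
            mutant = a + F * (b - c)                    # mutation, scaled by F
            cross = rng.random(pop.shape[1]) < cr       # crossover mask at rate cr
            cross[rng.integers(pop.shape[1])] = True    # guarantee one mutated gene
            trial = np.where(cross, mutant, pop[i])
            if fitness(trial) <= fitness(pop[i]):       # greedy selection (minimization)
                new_pop[i] = trial
        return new_pop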
@@ -415,12 +416,9 @@ class DifferentialEvolution(object):
  self._population = list()
  self.it_process = 1
  if objective_function.is_multi:
-
      self.obj_1 = objective_function._obj_1
      self.obj_2 = objective_function._obj_2
-
      self.pf = Pareto(self.obj_1, self.obj_2, True)
-
      self._pareto_population = list()
  else:
      self.obj_1 = objective_function._obj_1
@@ -555,7 +553,6 @@ class DifferentialEvolution(object):
  average_iteration = 0
  iterations_without_improvement = 0
  
-
  start_time = datetime.now()
  if self._obj_fun.use_random_seed():
      self._obj_fun.set_random_seed()
@@ -949,10 +946,9 @@ class SimulatedAnnealing(object):
      output_step.append(a)
      output_energy.append(b)
      output_best_energy.append(c)
-
  
- return {'elapsed_time': elapsed_time, 'Iteration': iteration} #TODO make this reachavble
- #return output_step, output_energy, output_best_energy, self.best_energy, self.best_struct
+ return {'elapsed_time': elapsed_time, 'Iteration': iteration}  # TODO make this reachable
+ # return output_step, output_energy, output_best_energy, self.best_energy, self.best_struct
  
  def _get_neighbour(self, current, mutations=None):
      neighbour = copy.deepcopy(current)
@@ -963,7 +959,6 @@ class SimulatedAnnealing(object):
  
  # number of parameters in the model  # TODO get the last value if 2
  
-
  num_of_changeablePARMs = 0
  
  self._obj_fun.nbr_routine(current)
@@ -1242,7 +1237,8 @@ class HarmonySearch(object):
  Initialize HS with the specified objective function. Note that this objective function must implement ObjectiveFunctionInterface.
  """
  self._obj_fun = objective_function
-
+ # for printing basic metrics
+ self.print_verbose = True
  # harmony_memory stores the best hms harmonies
  self._harmony_memory = list()
  # harmony_history stores all hms harmonies every nth improvisations (i.e., one 'generation')
@@ -1294,7 +1290,7 @@ class HarmonySearch(object):
  def does_it_appear(self, new):
      for d in self._harmony_memory:
          if self.mixed_list_chescker(d['layout'], new):
-             #print('same sln appears in population')
+             # print('same sln appears in population')
              return True
  
      return False
@@ -1314,6 +1310,7 @@ class HarmonySearch(object):
      self._obj_fun.set_random_seed()
  # fill harmony_memory using random parameter values by default, but with initial_harmonies if provided
  self._initialize(initial_harmonies, mod_init)
+ if self.print_verbose: print('Initialization complete')
  if self.pf.get_objective_is_multi():
      self._pareto_harmony_memory = self.pf.non_dominant_sorting(self._harmony_memory)
      generation_best = self._pareto_harmony_memory[0]
@@ -1333,6 +1330,9 @@ class HarmonySearch(object):
         iterations_without_improvement < self._obj_fun.get_termination_iter()):
      # generate new harmony
      elapsed_time = (datetime.now() - start_time).total_seconds()
+     if self.print_verbose:
+         print('Time: ', elapsed_time)
+         print('Improvisation: ', num_imp)
      harmony = list()
  
      for i in range(0, self._obj_fun.get_num_parameters()):
@@ -1374,6 +1374,7 @@ class HarmonySearch(object):
                 self.pf.get_objective_is_multi())
  num_imp += 1
  if iterations_without_improvement == 0:  # if there is any kind of improvement, update the logs
+     if self.print_verbose: print('improvement found at improvisation', num_imp)
      if self.pf.get_objective_is_multi():
          try:
              logger(num_imp, fitness, self._harmony_memory, True, self.get_instance_name(),