PyPI - psyke - Versions diffs - 0.8.9.dev48__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl - Mend

psyke 0.8.9.dev48__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

psyke/__init__.py +112 -24
psyke/clustering/__init__.py +4 -0
psyke/clustering/cream/__init__.py +2 -6
psyke/clustering/exact/__init__.py +10 -7
psyke/clustering/utils.py +0 -1
psyke/extraction/__init__.py +6 -2
psyke/extraction/cart/{predictor.py → CartPredictor.py} +52 -7
psyke/extraction/cart/FairTree.py +205 -0
psyke/extraction/cart/FairTreePredictor.py +56 -0
psyke/extraction/cart/__init__.py +27 -52
psyke/extraction/hypercubic/__init__.py +58 -7
psyke/extraction/hypercubic/creepy/__init__.py +14 -6
psyke/extraction/hypercubic/ginger/__init__.py +100 -0
psyke/extraction/hypercubic/gridex/__init__.py +6 -48
psyke/extraction/hypercubic/gridrex/__init__.py +2 -2
psyke/extraction/hypercubic/hypercube.py +33 -26
psyke/extraction/hypercubic/iter/__init__.py +5 -0
psyke/extraction/hypercubic/strategy.py +13 -9
psyke/extraction/real/__init__.py +21 -22
psyke/extraction/real/utils.py +2 -2
psyke/extraction/trepan/__init__.py +19 -15
psyke/genetic/__init__.py +0 -0
psyke/genetic/fgin/__init__.py +74 -0
psyke/genetic/gin/__init__.py +144 -0
psyke/hypercubepredictor.py +4 -2
psyke/tuning/pedro/__init__.py +4 -2
psyke/utils/logic.py +4 -8
{psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/METADATA +39 -19
psyke-1.0.4.dev10.dist-info/RECORD +46 -0
{psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/WHEEL +1 -1
{psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info/licenses}/LICENSE +2 -1
psyke-0.8.9.dev48.dist-info/RECORD +0 -40
{psyke-0.8.9.dev48.dist-info → psyke-1.0.4.dev10.dist-info}/top_level.txt +0 -0

psyke/__init__.py CHANGED Viewed

@@ -5,16 +5,20 @@ from enum import Enum
 import numpy as np
 import pandas as pd
+from matplotlib import pyplot as plt
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, f1_score, accuracy_score, \
     adjusted_rand_score, adjusted_mutual_info_score, v_measure_score, fowlkes_mallows_score
+from tuprolog.solve.prolog import prolog_solver
 from psyke.schema import DiscreteFeature
 from psyke.utils import get_default_random_seed, Target, get_int_precision
-from tuprolog.theory import Theory
+from tuprolog.theory import Theory, mutable_theory
 from typing import Iterable
 import logging
+from psyke.utils.logic import get_in_rule, data_to_struct, get_not_in_rule
 logging.basicConfig(level=logging.WARN)
 logger = logging.getLogger('psyke')
@@ -52,7 +56,7 @@ class EvaluableModel(object):
         """
         Predicts the output values of every sample in dataset.
-        :param dataframe: is the set of instances to predict.
+        :param dataframe: the set of instances to predict.
         :return: a list of predictions.
         """
         return self.__convert(self._predict(dataframe))
@@ -61,7 +65,7 @@ class EvaluableModel(object):
         raise NotImplementedError('predict')
     def __convert(self, ys: Iterable) -> Iterable:
-        if self.normalization is not None and not isinstance([p for p in ys if p is not None][0], str):
+        if self.normalization is not None and len(ys) > 0 and not isinstance([p for p in ys if p is not None][0], str):
             m, s = self.normalization[list(self.normalization.keys())[-1]]
             ys = [prediction if prediction is None else prediction * s + m for prediction in ys]
         return ys
@@ -85,7 +89,7 @@ class EvaluableModel(object):
     def score(self, dataframe: pd.DataFrame, predictor=None, fidelity: bool = False, completeness: bool = True,
               brute: bool = False, criterion: str = 'corners', n: int = 2,
               task: EvaluableModel.Task = Task.CLASSIFICATION,
-              scoring_function: Iterable[EvaluableModel.Score] = [ClassificationScore.ACCURACY]):
+              scoring_function: Iterable[EvaluableModel.Score] = (ClassificationScore.ACCURACY, )):
         extracted = np.array(
             self.predict(dataframe.iloc[:, :-1]) if not brute else
             self.brute_predict(dataframe.iloc[:, :-1], criterion, n)
@@ -151,42 +155,113 @@ class Extractor(EvaluableModel, ABC):
     def __init__(self, predictor, discretization: Iterable[DiscreteFeature] = None, normalization=None):
         super().__init__(discretization, normalization)
         self.predictor = predictor
+        self.theory = None
     def extract(self, dataframe: pd.DataFrame) -> Theory:
         """
         Extracts rules from the underlying predictor.
-        :param dataframe: is the set of instances to be used for the extraction.
+        :param dataframe: the set of instances to be used for the extraction.
         :return: the theory created from the extracted rules.
         """
         raise NotImplementedError('extract')
-    def predict_why(self, data: dict[str, float], verbose=True):
+    def predict_why(self, data: dict[str, float], verbose: bool = True):
         """
         Provides a prediction and the corresponding explanation.
-        :param data: is the instance to predict.
-        :param verbose: if the explanation has to be printed.
+        :param data: the instance to predict.
+        :param verbose: if True the explanation is printed.
         """
         raise NotImplementedError('predict_why')
-    def predict_counter(self, data: dict[str, float], verbose=True, only_first=True):
+    def predict_counter(self, data: dict[str, float], verbose: bool = True, only_first: bool = True):
         """
         Provides a prediction and counterfactual explanations.
-        :param data: is the instance to predict.
-        :param verbose: if the counterfactual explanation has to be printed.
-        :param only_first: if only the closest counterfactual explanation is provided for each distinct class.
+        :param data: the instance to predict.
+        :param verbose: if True the counterfactual explanation is printed.
+        :param only_first: if True only the closest counterfactual explanation is provided for each distinct class.
         """
         raise NotImplementedError('predict_counter')
+    def plot_fairness(self, dataframe: pd.DataFrame, groups: dict[str, list], colormap='seismic_r', filename=None,
+                      figsize=(5, 4)):
+        """
+        Provides a visual estimation of the fairness exhibited by an extractor with respect to the specified groups.
+        :param dataframe: the set of instances to be used for the estimation.
+        :param groups: the set of relevant groups to consider.
+        :param colormap: the colormap to use for the plot.
+        :param filename: if not None, name used to save the plot.
+        :param figsize: size of the plot.
+        """
+        counts = {group: len(dataframe[idx_g]) for group, idx_g in groups.items()}
+        output = {'labels': []}
+        for group in groups:
+            output[group] = []
+        for i, clause in enumerate(self.theory.clauses):
+            if len(dataframe) == 0:
+                break
+            solver = prolog_solver(static_kb=mutable_theory(clause).assertZ(get_in_rule()).assertZ(get_not_in_rule()))
+            idx = np.array([query.is_yes for query in
+                            [solver.solveOnce(data_to_struct(data)) for _, data in dataframe.iterrows()]])
+            # print(f'Rule {i + 1}. Outcome {clause.head.args[-1]}. Affecting', end='')
+            output['labels'].append(str(clause.head.args[-1]))
+            for group, idx_g in groups.items():
+                # print(f' {len(dataframe[idx & idx_g]) / counts[group]:.2f}%{group}', end='')
+                output[group].append(len(dataframe[idx & idx_g]) / counts[group])
+            dataframe = dataframe[~idx]
+            groups = {group: indices[~idx] for group, indices in groups.items()}
+            # print(f'. Left {len(dataframe)} instances')
+        binary = len(set(output['labels'])) == 2
+        labels = sorted(set(output['labels']))
+        data = np.vstack([output[group] for group in groups]).T * 100
+        if binary:
+            data[np.array(output['labels']) == labels[0]] *= -1
+        plt.figure(figsize=figsize)
+        plt.imshow(data, cmap=colormap, vmin=-100 if binary else 0, vmax=100)
+        plt.gca().set_xticks(range(len(groups)), labels=groups.keys())
+        plt.gca().set_yticks(range(len(output['labels'])),
+                             labels=[f'Rule {i + 1}\n{l}' for i, l in enumerate(output['labels'])])
+        plt.xlabel('Groups')
+        plt.ylabel('Rules')
+        plt.title("Rule set impact on groups")
+        for i in range(len(output['labels'])):
+            for j in range(len(groups)):
+                plt.gca().text(j, i, f'{abs(data[i, j]):.2f}%', ha="center", va="center", color="k")
+        plt.gca().set_xticks([i + .5 for i in range(len(groups))], minor=True)
+        plt.gca().set_yticks([i + .5 for i in range(len(output['labels']))], minor=True)
+        plt.gca().grid(which='minor', color='k', linestyle='-', linewidth=.8)
+        plt.gca().tick_params(which='minor', bottom=False, left=False)
+        cbarticks = np.linspace(-100 if binary else 0, 100, 9 if binary else 11, dtype=int)
+        cbar = plt.colorbar(fraction=0.046, label='Affected samples (%)', ticks=cbarticks)
+        if binary:
+            ticklabels = [str(-i) if i < 0 else str(i) for i in cbarticks]
+            ticklabels[0] += f' {labels[0]}'
+            ticklabels[-1] += f' {labels[-1]}'
+            cbar.ax.set_yticklabels(ticklabels)
+        plt.tight_layout()
+        if filename is not None:
+            plt.savefig(filename, dpi=500)
+        plt.show()
+    def make_fair(self, features: Iterable[str]):
+        raise NotImplementedError(f'Fairness for {type(self).__name__} is not supported at the moment')
     def mae(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
             n: int = 3) -> float:
         """
         Calculates the predictions' MAE w.r.t. the instances given as input.
-        :param dataframe: is the set of instances to be used to calculate the mean absolute error.
+        :param dataframe: the set of instances to be used to calculate the mean absolute error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion: creterion for brute prediction.
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the mean absolute error (MAE) of the predictions.
         """
@@ -198,10 +273,10 @@ class Extractor(EvaluableModel, ABC):
         """
         Calculates the predictions' MSE w.r.t. the instances given as input.
-        :param dataframe: is the set of instances to be used to calculate the mean squared error.
+        :param dataframe: the set of instances to be used to calculate the mean squared error.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion: creterion for brute prediction.
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the mean squared error (MSE) of the predictions.
         """
@@ -213,10 +288,10 @@ class Extractor(EvaluableModel, ABC):
         """
         Calculates the predictions' R2 score w.r.t. the instances given as input.
-        :param dataframe: is the set of instances to be used to calculate the R2 score.
+        :param dataframe: the set of instances to be used to calculate the R2 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion: creterion for brute prediction.
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the R2 score of the predictions.
         """
@@ -224,14 +299,14 @@ class Extractor(EvaluableModel, ABC):
                           Extractor.Task.REGRESSION, [Extractor.RegressionScore.R2])[Extractor.RegressionScore.R2][-1]
     def accuracy(self, dataframe: pd.DataFrame, predictor=None, brute: bool = False, criterion: str = 'center',
-            n: int = 3) -> float:
+                 n: int = 3) -> float:
         """
         Calculates the predictions' accuracy classification score w.r.t. the instances given as input.
-        :param dataframe: is the set of instances to be used to calculate the accuracy classification score.
+        :param dataframe: the set of instances to be used to calculate the accuracy classification score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion: creterion for brute prediction.
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the accuracy classification score of the predictions.
         """
@@ -244,10 +319,10 @@ class Extractor(EvaluableModel, ABC):
         """
         Calculates the predictions' F1 score w.r.t. the instances given as input.
-        :param dataframe: is the set of instances to be used to calculate the F1 score.
+        :param dataframe: the set of instances to be used to calculate the F1 score.
         :param predictor: if provided, its predictions on the dataframe are taken instead of the dataframe instances.
         :param brute: if True, a brute prediction is executed.
-        :param criterion: creterion for brute prediction.
+        :param criterion: criterion for brute prediction.
         :param n: number of points for brute prediction with 'perimeter' criterion.
         :return: the F1 score of the predictions.
         """
@@ -319,6 +394,19 @@ class Extractor(EvaluableModel, ABC):
         from psyke.extraction.hypercubic.hex import HEx
         return HEx(predictor, grid, min_examples, threshold, output, discretization, normalization, seed)
+    @staticmethod
+    def ginger(predictor, features: Iterable[str], sigmas: Iterable[float], max_slices: int, min_rules: int = 1,
+               max_poly: int = 1, alpha: float = 0.5, indpb: float = 0.5, tournsize: int = 3, metric: str = 'R2',
+               n_gen: int = 50, n_pop: int = 50, threshold=None, valid=None, output=Target.REGRESSION,
+               normalization: dict[str, tuple[float, float]] = None,
+               seed: int = get_default_random_seed()) -> Extractor:
+        """
+        Creates a new GInGER extractor.
+        """
+        from psyke.extraction.hypercubic.ginger import GInGER
+        return GInGER(predictor, features, sigmas, max_slices, min_rules, max_poly, alpha, indpb, tournsize, metric,
+                      n_gen, n_pop, threshold, valid, output, normalization, seed)
     @staticmethod
     def gridrex(predictor, grid, min_examples: int = 250, threshold: float = 0.1,
                 normalization: dict[str, tuple[float, float]] = None,
@@ -331,7 +419,7 @@ class Extractor(EvaluableModel, ABC):
     @staticmethod
     def creepy(predictor, clustering, depth: int, error_threshold: float, output: Target = Target.CONSTANT,
-               gauss_components: int = 2, ranks: [(str, float)] = [], ignore_threshold: float = 0.0,
+               gauss_components: int = 2, ranks: Iterable[(str, float)] = tuple(), ignore_threshold: float = 0.0,
                discretization=None, normalization: dict[str, tuple[float, float]] = None,
                seed: int = get_default_random_seed()) -> Extractor:
         """

psyke/clustering/__init__.py CHANGED Viewed

@@ -10,6 +10,10 @@ class HyperCubeClustering(HyperCubePredictor, Clustering, ABC):
     def __init__(self, output: Target = Target.CONSTANT, discretization=None, normalization=None):
         HyperCubePredictor.__init__(self, output=output, discretization=discretization, normalization=normalization)
+        self._protected_features = []
     def get_hypercubes(self) -> Iterable[HyperCube]:
         raise NotImplementedError('get_hypercubes')
+    def make_fair(self, features: Iterable[str]):
+        self._protected_features = features

psyke/clustering/cream/__init__.py CHANGED Viewed

@@ -46,11 +46,7 @@ class CREAM(ExACT):
     def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         to_split = [(self.error_threshold * 10, 1, 1, surrounding)]
         while len(to_split) > 0:
-            to_split.sort(reverse=True)
-            (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
-            gauss_params = select_gaussian_mixture(data, self.gauss_components)
-            gauss_pred = gauss_params[2].predict(data)
+            node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split)
             cubes = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
             if len(cubes) < 1:
                 continue
@@ -65,4 +61,4 @@ class CREAM(ExACT):
                     (error, depth + 1, np.random.uniform(), n) for (n, error) in
                     zip(node.children, [right[0].diversity, left[0].diversity]) if error > self.error_threshold
                 ]
-        return self._node_to_cubes(surrounding)
+        return self._node_to_cubes(surrounding)

psyke/clustering/exact/__init__.py CHANGED Viewed

@@ -54,13 +54,13 @@ class ExACT(HyperCubeClustering, ABC):
         dbscan_pred = DBSCAN(eps=select_dbscan_epsilon(data, clusters)).fit_predict(data.iloc[:, :-1])
         return HyperCube.create_surrounding_cube(
             dataframe.iloc[np.where(dbscan_pred == Counter(dbscan_pred).most_common(1)[0][0])],
-            True, self._output
+            True, self._output, self._protected_features
         )
     def fit(self, dataframe: pd.DataFrame):
         np.random.seed(self.seed)
         self._predictor.fit(dataframe.iloc[:, :-1], dataframe.iloc[:, -1])
-        self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output)
+        self._surrounding = HyperCube.create_surrounding_cube(dataframe, True, self._output, self._protected_features)
         self._hypercubes = self._iterate(Node(dataframe, self._surrounding))
     def get_hypercubes(self) -> Iterable[HyperCube]:
@@ -79,14 +79,17 @@ class ExACT(HyperCubeClustering, ABC):
             enumerate(dataframe.iloc[:, -1].unique())
         ).items()}}) if isinstance(dataframe.iloc[0, -1], str) else dataframe
+    def _get_gauss_predictions(self, to_split):
+        to_split.sort(reverse=True)
+        (_, depth, _, node) = to_split.pop()
+        data = ExACT._remove_string_label(node.dataframe)
+        gauss_params = select_gaussian_mixture(data.drop(self._protected_features, axis=1), self.gauss_components)
+        return node, depth, gauss_params[2].predict(data.drop(self._protected_features, axis=1)), gauss_params
     def _iterate(self, surrounding: Node) -> Iterable[HyperCube]:
         to_split = [(self.error_threshold * 10, 1, 1, surrounding)]
         while len(to_split) > 0:
-            to_split.sort(reverse=True)
-            (_, depth, _, node) = to_split.pop()
-            data = ExACT._remove_string_label(node.dataframe)
-            gauss_params = select_gaussian_mixture(data, self.gauss_components)
-            gauss_pred = gauss_params[2].predict(data)
+            node, depth, gauss_pred, gauss_params = self._get_gauss_predictions(to_split)
             cubes, indices = self.__eligible_cubes(gauss_pred, node, gauss_params[1])
             cubes = [(c.volume(), len(idx), i, idx, c) for i, (c, idx) in enumerate(zip(cubes, indices))
                      if (idx is not None) and (not node.cube.equal(c))]

psyke/clustering/utils.py CHANGED Viewed

@@ -11,7 +11,6 @@ def select_gaussian_mixture(data: pd.DataFrame, max_components) -> tuple[float,
     try:
         models = [GaussianMixture(n_components=n).fit(data) for n in components if n <= len(data)]
     except ValueError:
-        print(data)
         print(len(data))
     return min([(m.bic(data) / (i + 2), (i + 2), m) for i, m in enumerate(models)])

psyke/extraction/__init__.py CHANGED Viewed

@@ -11,11 +11,15 @@ class PedagogicalExtractor(Extractor, ABC):
     def __init__(self, predictor, discretization=None, normalization=None):
         Extractor.__init__(self, predictor=predictor, discretization=discretization, normalization=normalization)
-    def extract(self, dataframe: pd.DataFrame) -> Theory:
+    def _substitute_output(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         new_y = pd.DataFrame(self.predictor.predict(dataframe.iloc[:, :-1])).set_index(dataframe.index)
         data = dataframe.iloc[:, :-1].copy().join(new_y)
         data.columns = dataframe.columns
-        return self._extract(data)
+        return data
+    def extract(self, dataframe: pd.DataFrame) -> Theory:
+        self.theory = self._extract(self._substitute_output(dataframe))
+        return self.theory
     def _extract(self, dataframe: pd.DataFrame) -> Theory:
         raise NotImplementedError('extract')

psyke/extraction/cart/{predictor.py → CartPredictor.py} RENAMED Viewed

@@ -1,11 +1,14 @@
-from collections import Iterable
+from collections.abc import Iterable
 from typing import Union, Any
 import numpy as np
+import pandas as pd
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-from psyke.schema import Value, LessThan, GreaterThan, SchemaException
+from tuprolog.core import clause, Var, Struct
+from tuprolog.theory import Theory, mutable_theory
-LeafConstraints = dict[str, list[Value]]
-LeafSequence = Iterable[tuple[LeafConstraints, Any]]
+from psyke.extraction.cart import LeafConstraints, LeafSequence
+from psyke.schema import LessThan, GreaterThan, SchemaException, DiscreteFeature
+from psyke.utils.logic import create_variable_list, create_head, create_term
 class CartPredictor:
@@ -14,11 +17,12 @@ class CartPredictor:
     """
     def __init__(self, predictor: Union[DecisionTreeClassifier, DecisionTreeRegressor] = DecisionTreeClassifier(),
-                 normalization=None):
+                 discretization=None, normalization=None):
         self._predictor = predictor
+        self.discretization = discretization
         self.normalization = normalization
-    def __get_constraints(self, nodes: Iterable[(int, bool)]) -> LeafConstraints:
+    def __get_constraints(self, nodes: Iterable[tuple[int, bool]]) -> LeafConstraints:
         thresholds = [self._predictor.tree_.threshold[i[0]] for i in nodes]
         features = [self._predictor.feature_names_in_[self._predictor.tree_.feature[node[0]]] for node in nodes]
         conditions = [node[1] for node in nodes]
@@ -48,7 +52,7 @@ class CartPredictor:
         else:
             return self._predictor.tree_.value[node]
-    def __path(self, node: int, path=None) -> Iterable[(int, bool)]:
+    def __path(self, node: int, path=None) -> Iterable[tuple[int, bool]]:
         path = [] if path is None else path
         if node == 0:
             return path
@@ -62,6 +66,47 @@ class CartPredictor:
     def predict(self, data) -> Iterable:
         return self._predictor.predict(data)
+    @staticmethod
+    def _simplify_nodes(nodes: list) -> Iterable:
+        simplified = [nodes.pop(0)]
+        while len(nodes) > 0:
+            first_node = nodes[0][0]
+            for k, conditions in first_node.items():
+                for condition in conditions:
+                    if all(k in node[0] and condition in node[0][k] for node in nodes):
+                        [node[0][k].remove(condition) for node in nodes]
+            simplified.append(nodes.pop(0))
+        return [({k: v for k, v in rule.items() if v != []}, prediction) for rule, prediction in simplified]
+    def _create_body(self, variables: dict[str, Var], conditions: LeafConstraints) -> Iterable[Struct]:
+        results = []
+        for feature_name, cond_list in conditions.items():
+            for condition in cond_list:
+                feature: DiscreteFeature = [d for d in self.discretization if feature_name in d.admissible_values][0] \
+                    if self.discretization else None
+                results.append(create_term(variables[feature_name], condition) if feature is None else
+                               create_term(variables[feature.name],
+                                           feature.admissible_values[feature_name],
+                                           isinstance(condition, GreaterThan)))
+        return results
+    def create_theory(self, data: pd.DataFrame, simplify: bool = True) -> Theory:
+        new_theory = mutable_theory()
+        nodes = [node for node in self]
+        nodes = self._simplify_nodes(nodes) if simplify else nodes
+        for (constraints, prediction) in nodes:
+            if self.normalization is not None and data.columns[-1] in self.normalization:
+                m, s = self.normalization[data.columns[-1]]
+                prediction = prediction * s + m
+            variables = create_variable_list(self.discretization, data)
+            new_theory.assertZ(
+                clause(
+                    create_head(data.columns[-1], list(variables.values()), prediction),
+                    self._create_body(variables, constraints)
+                )
+            )
+        return new_theory
     @property
     def predictor(self) -> Union[DecisionTreeClassifier, DecisionTreeRegressor]:
         return self._predictor

psyke/extraction/cart/FairTree.py ADDED Viewed

@@ -0,0 +1,205 @@
+import numpy as np
+from collections import Counter
+from sklearn.metrics import accuracy_score, r2_score
class Node:
    """
    A node of a fair decision tree.
    Internal nodes carry a split (feature, threshold) and two children; leaves carry only a value.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        """
        :param feature: name of the column tested by an internal node.
        :param threshold: samples with feature <= threshold go to the left child, the others to the right one.
        :param left: child node for the samples satisfying the test.
        :param right: child node for the samples not satisfying the test.
        :param value: prediction stored in a leaf (None for internal nodes).
        """
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        """
        :return: True if the node stores a prediction, i.e., its value is not None.
        """
        return self.value is not None


class FairTree:
    """
    Base class for binary decision trees whose split gain is reduced by a penalty computed on the
    distribution of the protected attributes inside the two children of every candidate split.
    Subclasses provide the quality function, the leaf estimator and the score used for pruning.
    """

    # Number of evenly spaced quantiles evaluated as candidate thresholds for every feature.
    _N_QUANTILES = 25

    def __init__(self, max_depth=3, max_leaves=None, criterion=None, min_samples_split=2, lambda_penalty=0.0,
                 protected_attr=None):
        """
        :param max_depth: maximum depth of the tree.
        :param max_leaves: maximum amount of leaves; if None the amount of leaves is not bounded.
        :param criterion: name of the quality criterion (interpreted by the subclasses).
        :param min_samples_split: nodes with fewer samples than this are not split.
        :param lambda_penalty: weight of the disparity penalty subtracted from the gain of a split.
        :param protected_attr: name(s) of the protected columns; they are never used to split.
        """
        self.max_depth = max_depth
        self.max_leaves = max_leaves
        self.min_samples_split = min_samples_split
        self.lambda_penalty = lambda_penalty
        self.protected_attr = protected_attr
        self.criterion = criterion
        self.root = None
        self.n_leaves = 0
        self.quality_function = None

    def fit(self, X, y):
        """
        Grows the tree on the given data, then prunes it until at most max_leaves leaves are left.
        :param X: dataframe with the input features (protected columns included).
        :param y: pandas object with the expected outputs, aligned with X.
        :return: the fitted tree itself.
        """
        self.n_leaves = 0
        self.root = self._grow_tree(X, y, depth=0)
        # max_leaves=None (the default) means "no bound": comparing an int with None would raise TypeError.
        while self.max_leaves is not None and self.n_leaves > self.max_leaves:
            if not self.prune_least_important_leaf(X, y):
                break  # nothing can be merged any more, n_leaves is already accurate
            self.n_leaves -= 1
        return self

    @staticmethod
    def _estimate_output(y):
        """
        :param y: outputs of the samples falling into a leaf.
        :return: the prediction to store in the leaf (defined by the subclasses).
        """
        raise NotImplementedError

    def score(self, X, y):
        """
        :return: the quality of the predictions on X w.r.t. y, higher is better (defined by the subclasses).
        """
        raise NotImplementedError

    def predict(self, X):
        """
        :param X: dataframe with the instances to predict.
        :return: a numpy array with one prediction per row of X.
        """
        return np.array([self._traverse_tree(x, self.root) for _, x in X.iterrows()])

    def _traverse_tree(self, x, node):
        """
        Follows the splits from the given node down to a leaf and returns the leaf value for sample x.
        """
        if node.is_leaf_node():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def _new_leaf(self, y):
        """
        Creates a leaf for the given outputs and updates the leaf counter.
        """
        self.n_leaves += 1
        return Node(value=self._estimate_output(y))

    def _grow_tree(self, X, y, depth):
        """
        Recursively builds the subtree for the given samples.
        A leaf is created when the maximum depth or the leaf budget is reached, when the samples are too few
        or share the same output, or when no valid split exists.
        """
        if depth >= self.max_depth or X.shape[0] < self.min_samples_split or len(set(y.values.flatten())) == 1 or \
                (self.max_leaves is not None and self.n_leaves >= self.max_leaves):
            return self._new_leaf(y)

        best_feature, best_threshold = self._best_split(X, y)
        if best_feature is None:
            return self._new_leaf(y)

        left_idxs = X[best_feature] <= best_threshold
        right_idxs = X[best_feature] > best_threshold
        left = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)
        return Node(best_feature, best_threshold, left, right)

    @staticmethod
    def generate_thresholds(X, y):
        """
        Provides the midpoints between consecutive (sorted) feature values having different outputs.
        :param X: values of a single feature.
        :param y: outputs corresponding to the values in X.
        :return: a numpy array with the candidate thresholds.
        """
        order = np.argsort(X)
        X = np.array(X)[order]
        y = np.array(y)[order]
        midpoints = (X[:-1] + X[1:]) / 2.0
        return midpoints[y[:-1] != y[1:]]

    def _protected_columns(self):
        """
        :return: the protected attributes as a list of column names (empty if none was given).
        """
        if self.protected_attr is None:
            return []
        if isinstance(self.protected_attr, str):
            # a bare string would otherwise be matched by substring in membership tests
            return [self.protected_attr]
        return list(self.protected_attr)

    def _best_split(self, X, y):
        """
        Searches the (feature, threshold) pair with the highest fair gain among the non-protected features.
        Candidate thresholds are the distinct values of an evenly spaced grid of quantiles of each feature.
        :return: the best feature and threshold, or (None, None) if no split leaves both children non-empty.
        """
        protected_columns = self._protected_columns()
        protected = X[protected_columns] if protected_columns else None
        quantiles = np.linspace(0, 1, num=self._N_QUANTILES)

        best_gain = -float('inf')
        split_idx, split_threshold = None, None
        for feature in X.columns:
            if feature in protected_columns:
                continue
            column = X[feature]
            for threshold in np.unique(np.quantile(column, quantiles)):
                left_idxs = column <= threshold
                right_idxs = column > threshold
                if left_idxs.sum() == 0 or right_idxs.sum() == 0:
                    continue
                gain = self._fair_gain(y, left_idxs, right_idxs, protected)
                # strict comparison: among equally good splits the first one found is kept
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature
                    split_threshold = threshold
        return split_idx, split_threshold

    @staticmethod
    def _disparity(group):
        """
        Measures how unevenly the protected groups are represented in a set of samples.
        :param group: protected values of the samples (a Series, or a DataFrame with one row per sample).
        :return: the difference between the shares of the most and the least represented group,
            0.0 if at most one group is present.
        """
        values = np.asarray(group)
        # A 2-D input has one row per sample: count the rows.
        # Iterating a DataFrame directly would count its column names instead.
        keys = [tuple(row) for row in values] if values.ndim == 2 else values.tolist()
        counts = Counter(keys)
        if len(counts) <= 1:
            return 0.0
        shares = np.array(list(counts.values())) / len(keys)
        return float(shares.max() - shares.min())

    def _fair_gain(self, y, left_idx, right_idx, protected):
        """
        Computes the quality gain of a split, reduced by lambda_penalty times the disparity of its children.
        :param y: outputs of the samples in the node to split.
        :param left_idx: boolean mask of the samples going to the left child.
        :param right_idx: boolean mask of the samples going to the right child.
        :param protected: protected values of the samples, or None if there is no protected attribute.
        """
        y_left, y_right = y[left_idx], y[right_idx]
        child = len(y_left) / len(y) * self.quality_function(y_left) + \
            len(y_right) / len(y) * self.quality_function(y_right)
        info_gain = self.quality_function(y) - child
        if protected is None:
            return info_gain
        penalty = self._disparity(protected[left_idx]) + self._disparity(protected[right_idx])
        return info_gain - self.lambda_penalty * penalty

    @staticmethod
    def _match_path(x, path):
        """
        :param x: a sample.
        :param path: list of (node, went_left) pairs describing a walk from the root.
        :return: True if sample x satisfies every split along the path.
        """
        for node, left in path:
            if left and x[node.feature] > node.threshold:
                return False
            if not left and x[node.feature] <= node.threshold:
                return False
        return True

    @staticmethod
    def candidates(node, parent=None, is_left=None, path=None):
        """
        Collects the internal nodes whose children are both leaves, i.e., the nodes that can be pruned.
        :param node: root of the subtree to inspect.
        :param parent: parent of node (None for the root).
        :param is_left: True if node is the left child of parent (None for the root).
        :param path: list of (node, went_left) pairs leading from the root to node.
        :return: a list of (node, parent, is_left, path) tuples.
        """
        path = [] if path is None else path
        if node is None or node.is_leaf_node():
            return []
        found = []
        if node.left.is_leaf_node() and node.right.is_leaf_node():
            found.append((node, parent, is_left, path))
        found += FairTree.candidates(node.left, node, True, path + [(node, True)])
        found += FairTree.candidates(node.right, node, False, path + [(node, False)])
        return found

    def prune_least_important_leaf(self, X, y):
        """
        Merges the pair of sibling leaves whose removal yields the best score on (X, y).
        Each candidate is tentatively collapsed, scored and restored; the best one is then collapsed for good.
        :return: True if a pair of leaves has been merged, False if no candidate was available.
        """
        best_score = -np.inf
        best_prune = None
        for node, _parent, _is_left, path in self.candidates(self.root):
            merged_y = y[X.apply(lambda x: self._match_path(x, path), axis=1)]
            if len(merged_y) == 0:
                continue
            new_value = self._estimate_output(merged_y)
            original_left, original_right = node.left, node.right
            node.left = node.right = None
            node.value = new_value
            try:
                score = self.score(X, y)
            finally:
                # always undo the tentative collapse, even if scoring fails
                node.left, node.right, node.value = original_left, original_right, None
            if score >= best_score:
                best_score = score
                best_prune = (node, new_value)
        if best_prune is None:
            return False
        best_prune[0].left = best_prune[0].right = None
        best_prune[0].value = best_prune[1]
        return True


class FairTreeClassifier(FairTree):
    """
    Fair decision tree for classification: leaves predict the most common class,
    splits are evaluated with entropy (default) or Gini impurity, pruning relies on accuracy.
    """

    def __init__(self, max_depth=3, max_leaves=None, criterion='entropy', min_samples_split=2, lambda_penalty=0.0,
                 protected_attr=None):
        super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr)
        self.quality_function = self._gini if self.criterion == 'gini' else self._entropy

    @staticmethod
    def _estimate_output(y):
        """
        :return: the most common class in y.
        """
        return Counter(y.values.flatten()).most_common(1)[0][0]

    def score(self, X, y):
        """
        :return: the accuracy of the predictions on X w.r.t. y.
        """
        return accuracy_score(y.values.flatten(), self.predict(X))

    @staticmethod
    def _entropy(y):
        """
        :return: the Shannon entropy (base 2) of the class distribution in y.
        """
        ps = np.unique(y, return_counts=True)[1] / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    @staticmethod
    def _gini(y):
        """
        :return: the Gini impurity of the class distribution in y, i.e., 1 - sum(p_i ** 2).
        """
        ps = np.unique(y, return_counts=True)[1] / len(y)
        # the class frequencies are squared, not the amount of samples
        return 1.0 - np.sum(ps ** 2)


class FairTreeRegressor(FairTree):
    """
    Fair decision tree for regression: leaves predict the mean output,
    splits are evaluated with the variance of the outputs, pruning relies on the R2 score.
    """

    def __init__(self, max_depth=3, max_leaves=None, criterion='mse', min_samples_split=2, lambda_penalty=0.0,
                 protected_attr=None):
        super().__init__(max_depth, max_leaves, criterion, min_samples_split, lambda_penalty, protected_attr)
        self.quality_function = self._mse

    @staticmethod
    def _estimate_output(y):
        """
        :return: the mean of the outputs in y.
        """
        return np.mean(y.values.flatten())

    def score(self, X, y):
        """
        :return: the R2 score of the predictions on X w.r.t. y.
        """
        return r2_score(y.values.flatten(), self.predict(X))

    @staticmethod
    def _mse(y):
        """
        :return: the mean squared deviation of the outputs in y from their mean.
        """
        y = y.values.flatten().astype(float)
        return np.mean((y - np.mean(y))**2)

psyke 0.8.9.dev48__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl

psyke 0.8.9.dev48__py3-none-any.whl → 1.0.4.dev10__py3-none-any.whl