PyPI - PyNomaly - Versions diffs - 0.3.2__py3-none-any.whl → 0.3.5__py3-none-any.whl - Mend

PyNomaly 0.3.2 (py3-none-any.whl) → 0.3.5 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

PyNomaly/__init__.py +18 -0
PyNomaly/loop.py +398 -335
pynomaly-0.3.5.dist-info/METADATA +503 -0
pynomaly-0.3.5.dist-info/RECORD +7 -0
{PyNomaly-0.3.2.dist-info → pynomaly-0.3.5.dist-info}/WHEEL +1 -1
PyNomaly-0.3.2.dist-info/METADATA +0 -17
PyNomaly-0.3.2.dist-info/RECORD +0 -7
/PyNomaly-0.3.2.dist-info/LICENSE.txt → /pynomaly-0.3.5.dist-info/licenses/LICENSE +0 -0
{PyNomaly-0.3.2.dist-info → pynomaly-0.3.5.dist-info}/top_level.txt +0 -0

PyNomaly/loop.py CHANGED Viewed

@@ -10,13 +10,33 @@ try:
 except ImportError:
     pass
-__author__ = 'Valentino Constantinou'
-__version__ = '0.3.2'
-__license__ = 'Apache License, Version 2.0'
+__author__ = "Valentino Constantinou"
+__version__ = "0.3.5"
+__license__ = "Apache License, Version 2.0"
-class Utils:
+# Custom Exceptions
+class PyNomalyError(Exception):
+    """Base exception for PyNomaly."""
+    pass
+class ValidationError(PyNomalyError):
+    """Raised when input validation fails."""
+    pass
+class ClusterSizeError(ValidationError):
+    """Raised when cluster size is smaller than n_neighbors."""
+    pass
+class MissingValuesError(ValidationError):
+    """Raised when data contains missing values."""
+    pass
+class Utils:
     @staticmethod
     def emit_progress_bar(progress: str, index: int, total: int) -> str:
         """
@@ -32,7 +52,10 @@ class Utils:
         w, h = get_terminal_size()
         sys.stdout.write("\r")
-        block_size = int(total / w)
+        if total < w:
+            block_size = int(w / total)
+        else:
+            block_size = int(total / w)
         if index % block_size == 0:
             progress += "="
         percent = index / total
@@ -52,7 +75,7 @@ class LocalOutlierProbability(object):
     :param cluster_labels: a numpy array of cluster assignments w.r.t. each
     sample (optional, default None)
     :return:
-    """"""
+    """ """
     Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP:
     Local Outlier Probabilities.
@@ -75,203 +98,190 @@ class LocalOutlierProbability(object):
            (2016).
     """
-    class Validate:
+    """
+    Validation methods.
+    These methods validate inputs and raise exceptions or warnings as appropriate.
+    """
+    @staticmethod
+    def _convert_to_array(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
+        """
+        Converts the input data to a numpy array if it is a Pandas DataFrame
+        or validates it is already a numpy array.
+        :param obj: user-provided input data.
+        :return: a vector of values to be used in calculating the local
+        outlier probability.
+        """
+        if obj.__class__.__name__ == "DataFrame":
+            points_vector = obj.values
+            return points_vector
+        elif obj.__class__.__name__ == "ndarray":
+            points_vector = obj
+            return points_vector
+        else:
+            warnings.warn(
+                "Provided data or distance matrix must be in ndarray "
+                "or DataFrame.",
+                UserWarning,
+            )
+            if isinstance(obj, list):
+                points_vector = np.array(obj)
+                return points_vector
+            points_vector = np.array([obj])
+            return points_vector
+    def _validate_inputs(self):
         """
-        The Validate class aids in ensuring PyNomaly receives the right set
-        of user inputs for proper execution of the Local Outlier Probability
-        (LoOP) approach. Depending on the desired behavior, either an
-        exception is raised to the user or PyNomaly continues executing
-        albeit with some form of user warning.
+        Validates the inputs provided during initialization to ensure
+        that the needed objects are provided.
+        :return: a tuple of (data, distance_matrix, neighbor_matrix) or
+        raises a warning for invalid inputs.
         """
+        if all(v is None for v in [self.data, self.distance_matrix]):
+            warnings.warn(
+                "Data or a distance matrix must be provided.", UserWarning
+            )
+            return False
+        elif all(v is not None for v in [self.data, self.distance_matrix]):
+            warnings.warn(
+                "Only one of the following may be provided: data or a "
+                "distance matrix (not both).",
+                UserWarning,
+            )
+            return False
+        if self.data is not None:
+            points_vector = self._convert_to_array(self.data)
+            return points_vector, self.distance_matrix, self.neighbor_matrix
+        if all(
+            matrix is not None
+            for matrix in [self.neighbor_matrix, self.distance_matrix]
+        ):
+            dist_vector = self._convert_to_array(self.distance_matrix)
+            neigh_vector = self._convert_to_array(self.neighbor_matrix)
+        else:
+            warnings.warn(
+                "A neighbor index matrix and distance matrix must both be "
+                "provided when not using raw input data.",
+                UserWarning,
+            )
+            return False
+        if self.distance_matrix.shape != self.neighbor_matrix.shape:
+            warnings.warn(
+                "The shape of the distance and neighbor "
+                "index matrices must match.",
+                UserWarning,
+            )
+            return False
+        elif (self.distance_matrix.shape[1] != self.n_neighbors) or (
+            self.neighbor_matrix.shape[1] != self.n_neighbors
+        ):
+            warnings.warn(
+                "The shape of the distance or "
+                "neighbor index matrix does not "
+                "match the number of neighbors "
+                "specified.",
+                UserWarning,
+            )
+            return False
+        return self.data, dist_vector, neigh_vector
+    def _check_cluster_size(self) -> None:
+        """
+        Validates the cluster labels to ensure that the smallest cluster
+        size (number of observations in the cluster) is larger than the
+        specified number of neighbors.
+        :raises ClusterSizeError: if any cluster is too small.
+        """
+        c_labels = self._cluster_labels()
+        for cluster_id in set(c_labels):
+            c_size = np.where(c_labels == cluster_id)[0].shape[0]
+            if c_size <= self.n_neighbors:
+                raise ClusterSizeError(
+                    "Number of neighbors specified larger than smallest "
+                    "cluster. Specify a number of neighbors smaller than "
+                    "the smallest cluster size (observations in smallest "
+                    "cluster minus one)."
+                )
+    def _check_n_neighbors(self) -> bool:
         """
-        Private methods.
+        Validates the specified number of neighbors to ensure that it is
+        greater than 0 and that the specified value is less than the total
+        number of observations.
+        :return: a boolean indicating whether validation has passed without
+        adjustment.
         """
+        if not self.n_neighbors > 0:
+            self.n_neighbors = 10
+            warnings.warn(
+                "n_neighbors must be greater than 0."
+                " Fit with " + str(self.n_neighbors) + " instead.",
+                UserWarning,
+            )
+            return False
+        elif self.n_neighbors >= self._n_observations():
+            self.n_neighbors = self._n_observations() - 1
+            warnings.warn(
+                "n_neighbors must be less than the number of observations."
+                " Fit with " + str(self.n_neighbors) + " instead.",
+                UserWarning,
+            )
+        return True
-        @staticmethod
-        def _data(obj: Union['pd.DataFrame', np.ndarray]) -> np.ndarray:
-            """
-            Validates the input data to ensure it is either a Pandas DataFrame
-            or Numpy array.
-            :param obj: user-provided input data.
-            :return: a vector of values to be used in calculating the local
-            outlier probability.
-            """
-            if obj.__class__.__name__ == 'DataFrame':
-                points_vector = obj.values
-                return points_vector
-            elif obj.__class__.__name__ == 'ndarray':
-                points_vector = obj
-                return points_vector
-            else:
-                warnings.warn(
-                    "Provided data or distance matrix must be in ndarray "
-                    "or DataFrame.",
-                    UserWarning)
-                if isinstance(obj, list):
-                    points_vector = np.array(obj)
-                    return points_vector
-                points_vector = np.array([obj])
-                return points_vector
+    def _check_extent(self) -> bool:
+        """
+        Validates the specified extent parameter to ensure it is either 1,
+        2, or 3.
+        :return: a boolean indicating whether validation has passed.
+        """
+        if self.extent not in [1, 2, 3]:
+            warnings.warn(
+                "extent parameter (lambda) must be 1, 2, or 3.", UserWarning
+            )
+            return False
+        return True
-        def _inputs(self, obj: 'LocalOutlierProbability'):
-            """
-            Validates the inputs provided during initialization to ensure
-            that the needed objects are provided.
-            :param obj: a PyNomaly object.
-            :return: a boolean indicating whether validation has failed or
-            the data, distance matrix, and neighbor matrix.
-            """
-            if all(v is None for v in [obj.data, obj.distance_matrix]):
-                warnings.warn(
-                    "Data or a distance matrix must be provided.", UserWarning
-                )
-                return False
-            elif all(v is not None for v in [obj.data, obj.distance_matrix]):
-                warnings.warn(
-                    "Only one of the following may be provided: data or a "
-                    "distance matrix (not both).", UserWarning
-                )
-                return False
-            if obj.data is not None:
-                points_vector = self._data(obj.data)
-                return points_vector, obj.distance_matrix, obj.neighbor_matrix
-            if all(matrix is not None for matrix in [obj.neighbor_matrix,
-                                                     obj.distance_matrix]):
-                dist_vector = self._data(obj.distance_matrix)
-                neigh_vector = self._data(obj.neighbor_matrix)
-            else:
-                warnings.warn(
-                    "A neighbor index matrix and distance matrix must both be "
-                    "provided when not using raw input data.", UserWarning
-                )
-                return False
-            if obj.distance_matrix.shape != obj.neighbor_matrix.shape:
-                warnings.warn(
-                    "The shape of the distance and neighbor "
-                    "index matrices must match.", UserWarning
-                )
-                return False
-            elif (obj.distance_matrix.shape[1] != obj.n_neighbors) \
-                    or (obj.neighbor_matrix.shape[1] !=
-                        obj.n_neighbors):
-                warnings.warn("The shape of the distance or "
-                              "neighbor index matrix does not "
-                              "match the number of neighbors "
-                              "specified.", UserWarning)
-                return False
-            return obj.data, dist_vector, neigh_vector
-        @staticmethod
-        def _cluster_size(obj) -> bool:
-            """
-            Validates the cluster labels to ensure that the smallest cluster
-            size (number of observations in the cluster) is larger than the
-            specified number of neighbors.
-            :param obj: a PyNomaly object.
-            :return: a boolean indicating whether validation has passed.
-            """
-            c_labels = obj._cluster_labels()
-            for cluster_id in set(c_labels):
-                c_size = np.where(c_labels == cluster_id)[0].shape[0]
-                if c_size <= obj.n_neighbors:
-                    warnings.warn(
-                        "Number of neighbors specified larger than smallest "
-                        "cluster. Specify a number of neighbors smaller than "
-                        "the smallest cluster size (observations in smallest "
-                        "cluster minus one).",
-                        UserWarning)
-                    return False
-            return True
-        @staticmethod
-        def _n_neighbors(obj) -> bool:
-            """
-            Validates the specified number of neighbors to ensure that it is
-            greater than 0 and that the specified value is less than the total
-            number of observations.
-            :param obj: a PyNomaly object.
-            :return: a boolean indicating whether validation has passed.
-            """
-            if not obj.n_neighbors > 0:
-                obj.n_neighbors = 10
-                warnings.warn("n_neighbors must be greater than 0."
-                              " Fit with " + str(obj.n_neighbors) +
-                              " instead.",
-                              UserWarning)
-                return False
-            elif obj.n_neighbors >= obj._n_observations():
-                obj.n_neighbors = obj._n_observations() - 1
-                warnings.warn(
-                    "n_neighbors must be less than the number of observations."
-                    " Fit with " + str(obj.n_neighbors) + " instead.",
-                    UserWarning)
-            return True
-        @staticmethod
-        def _extent(obj) -> bool:
-            """
-            Validates the specified extent parameter to ensure it is either 1,
-            2, or 3.
-            :param obj: a PyNomaly object.
-            :return: a boolean indicating whether validation has passed.
-            """
-            if obj.extent not in [1, 2, 3]:
-                warnings.warn(
-                    "extent parameter (lambda) must be 1, 2, or 3.",
-                    UserWarning)
-                return False
-            return True
-        @staticmethod
-        def _missing_values(obj) -> bool:
-            """
-            Validates the provided data to ensure that it contains no
-            missing values.
-            :param obj: a PyNomaly object.
-            :return: a boolean indicating whether validation has passed.
-            """
-            if np.any(np.isnan(obj.data)):
-                warnings.warn(
-                    "Method does not support missing values in input data.",
-                    UserWarning)
-                return False
-            return True
-        @staticmethod
-        def _fit(obj) -> bool:
-            """
-            Validates that the model was fit prior to calling the stream()
-            method.
-            :param obj: a PyNomaly object.
-            :return: a boolean indicating whether validation has passed.
-            """
-            if obj.is_fit is False:
-                warnings.warn(
-                    "Must fit on historical data by calling fit() prior to "
-                    "calling stream(x).",
-                    UserWarning)
-                return False
-            return True
-        @staticmethod
-        def _no_cluster_labels(obj) -> bool:
-            """
-            Checks to see if cluster labels are attempting to be used in
-            stream() and, if so, calls fit() once again but without cluster
-            labels. As PyNomaly does not accept clustering algorithms as input,
-            the stream approach does not support clustering.
-            :param obj: a PyNomaly object.
-            :return: a boolean indicating whether validation has passed.
-            """
-            if len(set(obj._cluster_labels())) > 1:
-                warnings.warn(
-                    "Stream approach does not support clustered data. "
-                    "Automatically refit using single cluster of points.",
-                    UserWarning)
-                return False
-            return True
+    def _check_missing_values(self) -> None:
+        """
+        Validates the provided data to ensure that it contains no
+        missing values.
+        :raises MissingValuesError: if data contains NaN values.
+        """
+        if np.any(np.isnan(self.data)):
+            raise MissingValuesError(
+                "Method does not support missing values in input data."
+            )
+    def _check_is_fit(self) -> bool:
+        """
+        Checks that the model was fit prior to calling the stream() method.
+        :return: a boolean indicating whether the model has been fit.
+        """
+        if self.is_fit is False:
+            warnings.warn(
+                "Must fit on historical data by calling fit() prior to "
+                "calling stream(x).",
+                UserWarning,
+            )
+            return False
+        return True
+    def _check_no_cluster_labels(self) -> bool:
+        """
+        Checks to see if cluster labels are attempting to be used in
+        stream() and, if so, returns False. As PyNomaly does not accept
+        clustering algorithms as input, the stream approach does not
+        support clustering.
+        :return: a boolean indicating whether single cluster (no labels).
+        """
+        if len(set(self._cluster_labels())) > 1:
+            warnings.warn(
+                "Stream approach does not support clustered data. "
+                "Automatically refit using single cluster of points.",
+                UserWarning,
+            )
+            return False
+        return True
     """
     Decorators.
@@ -291,43 +301,35 @@ class LocalOutlierProbability(object):
             assert len(types) == f.__code__.co_argcount
             def new_f(*args, **kwds):
-                for (a, t) in zip(args, types):
-                    if type(a).__name__ == 'DataFrame':
+                for a, t in zip(args, types):
+                    if type(a).__name__ == "DataFrame":
                         a = np.array(a)
                     if isinstance(a, t) is False:
-                        warnings.warn("Argument %r is not of type %s" % (a, t),
-                                      UserWarning)
+                        warnings.warn(
+                            "Argument %r is not of type %s" % (a, t), UserWarning
+                        )
                 opt_types = {
-                    'distance_matrix': {
-                        'type': types[2]
-                    },
-                    'neighbor_matrix': {
-                        'type': types[3]
-                    },
-                    'extent': {
-                        'type': types[4]
-                    },
-                    'n_neighbors': {
-                        'type': types[5]
-                    },
-                    'cluster_labels': {
-                        'type': types[6]
-                    },
-                    'use_numba': {
-                        'type': types[7]
-                    },
-                    'progress_bar': {
-                        'type': types[8]
-                    }
+                    "distance_matrix": {"type": types[2]},
+                    "neighbor_matrix": {"type": types[3]},
+                    "extent": {"type": types[4]},
+                    "n_neighbors": {"type": types[5]},
+                    "cluster_labels": {"type": types[6]},
+                    "use_numba": {"type": types[7]},
+                    "progress_bar": {"type": types[8]},
                 }
                 for x in kwds:
-                    opt_types[x]['value'] = kwds[x]
+                    opt_types[x]["value"] = kwds[x]
                 for k in opt_types:
                     try:
-                        if isinstance(opt_types[k]['value'],
-                                      opt_types[k]['type']) is False:
-                            warnings.warn("Argument %r is not of type %s." % (
-                                k, opt_types[k]['type']), UserWarning)
+                        if (
+                            isinstance(opt_types[k]["value"], opt_types[k]["type"])
+                            is False
+                        ):
+                            warnings.warn(
+                                "Argument %r is not of type %s."
+                                % (k, opt_types[k]["type"]),
+                                UserWarning,
+                            )
                     except KeyError:
                         pass
                 return f(*args, **kwds)
@@ -337,11 +339,28 @@ class LocalOutlierProbability(object):
         return decorator
-    @accepts(object, np.ndarray, np.ndarray, np.ndarray, (int, np.integer),
-             (int, np.integer), list, bool, bool)
-    def __init__(self, data=None, distance_matrix=None, neighbor_matrix=None,
-                 extent=3, n_neighbors=10, cluster_labels=None,
-                 use_numba=False, progress_bar=False) -> None:
+    @accepts(
+        object,
+        np.ndarray,
+        np.ndarray,
+        np.ndarray,
+        (int, np.integer),
+        (int, np.integer),
+        list,
+        bool,
+        bool,
+    )
+    def __init__(
+        self,
+        data=None,
+        distance_matrix=None,
+        neighbor_matrix=None,
+        extent=3,
+        n_neighbors=10,
+        cluster_labels=None,
+        use_numba=False,
+        progress_bar=False,
+    ) -> None:
         self.data = data
         self.distance_matrix = distance_matrix
         self.neighbor_matrix = neighbor_matrix
@@ -358,29 +377,28 @@ class LocalOutlierProbability(object):
         self.progress_bar = progress_bar
         self.is_fit = False
-        if self.use_numba is True and 'numba' not in sys.modules:
+        if self.use_numba is True and "numba" not in sys.modules:
             self.use_numba = False
             warnings.warn(
-                "Numba is not available, falling back to pure python mode.",
-                UserWarning)
+                "Numba is not available, falling back to pure python mode.", UserWarning
+            )
-        self.Validate()._inputs(self)
-        self.Validate._extent(self)
+        self._validate_inputs()
+        self._check_extent()
     """
     Private methods.
     """
     @staticmethod
-    def _standard_distance(cardinality: float, sum_squared_distance: float) \
-            -> float:
+    def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:
         """
         Calculates the standard distance of an observation.
         :param cardinality: the cardinality of the input observation.
         :param sum_squared_distance: the sum squared distance between all
         neighbors of the input observation.
         :return: the standard distance.
-        # """
+        #"""
         division_result = sum_squared_distance / cardinality
         st_dist = sqrt(division_result)
         return st_dist
@@ -397,8 +415,9 @@ class LocalOutlierProbability(object):
         return extent * standard_distance
     @staticmethod
-    def _prob_outlier_factor(probabilistic_distance: np.ndarray, ev_prob_dist:
-    np.ndarray) -> np.ndarray:
+    def _prob_outlier_factor(
+        probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray
+    ) -> np.ndarray:
         """
         Calculates the probabilistic outlier factor of an observation.
         :param probabilistic_distance: the probabilistic distance of the
@@ -409,14 +428,14 @@ class LocalOutlierProbability(object):
         if np.all(probabilistic_distance == ev_prob_dist):
             return np.zeros(probabilistic_distance.shape)
         else:
-            ev_prob_dist[ev_prob_dist == 0.] = 1.e-8
-            result = np.divide(probabilistic_distance, ev_prob_dist) - 1.
+            ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8
+            result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0
             return result
     @staticmethod
-    def _norm_prob_outlier_factor(extent: float,
-                                  ev_probabilistic_outlier_factor: list) \
-            -> list:
+    def _norm_prob_outlier_factor(
+        extent: float, ev_probabilistic_outlier_factor: list
+    ) -> list:
         """
         Calculates the normalized probabilistic outlier factor of an
         observation.
@@ -431,8 +450,9 @@ class LocalOutlierProbability(object):
         return npofs
     @staticmethod
-    def _local_outlier_probability(plof_val: np.ndarray, nplof_val: np.ndarray) \
-            -> np.ndarray:
+    def _local_outlier_probability(
+        plof_val: np.ndarray, nplof_val: np.ndarray
+    ) -> np.ndarray:
         """
         Calculates the local outlier probability of an observation.
         :param plof_val: the probabilistic outlier factor of the input
@@ -445,7 +465,7 @@ class LocalOutlierProbability(object):
         if np.all(plof_val == nplof_val):
             return np.zeros(plof_val.shape)
         else:
-            return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.))))
+            return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))
     def _n_observations(self) -> int:
         """
@@ -499,8 +519,9 @@ class LocalOutlierProbability(object):
         :return: the updated storage matrix that collects information on
         each observation.
         """
-        for vec, cluster_id in zip(range(self.distance_matrix.shape[0]),
-                                   self._cluster_labels()):
+        for vec, cluster_id in zip(
+            range(self.distance_matrix.shape[0]), self._cluster_labels()
+        ):
             data_store[vec][0] = cluster_id
             data_store[vec][1] = self.distance_matrix[vec]
             data_store[vec][2] = self.neighbor_matrix[vec]
@@ -508,10 +529,10 @@ class LocalOutlierProbability(object):
     @staticmethod
     def _compute_distance_and_neighbor_matrix(
-            clust_points_vector: np.ndarray,
-            indices: np.ndarray,
-            distances: np.ndarray,
-            indexes: np.ndarray
+        clust_points_vector: np.ndarray,
+        indices: np.ndarray,
+        distances: np.ndarray,
+        indexes: np.ndarray,
     ) -> Tuple[np.ndarray, np.ndarray, int]:
         """
         This helper method provides the heavy lifting for the _distances
@@ -519,27 +540,27 @@ class LocalOutlierProbability(object):
         written so that it can make full use of Numba's jit capabilities if
         desired.
         """
         for i in range(clust_points_vector.shape[0]):
             for j in range(i + 1, clust_points_vector.shape[0]):
-                p = ((i,), (j,))
+                # Global index of the points
+                global_i = indices[0][i]
+                global_j = indices[0][j]
-                diff = clust_points_vector[p[0]] - clust_points_vector[p[1]]
+                # Compute Euclidean distance
+                diff = clust_points_vector[i] - clust_points_vector[j]
                 d = np.dot(diff, diff) ** 0.5
-                idx = indices[0][p[0]]
-                idx_max = distances[idx].argmax()
+                # Update distance and neighbor index for global_i
+                idx_max = distances[global_i].argmax()
+                if d < distances[global_i][idx_max]:
+                    distances[global_i][idx_max] = d
+                    indexes[global_i][idx_max] = global_j
-                if d < distances[idx][idx_max]:
-                    distances[idx][idx_max] = d
-                    indexes[idx][idx_max] = p[1][0]
-                idx = indices[0][p[1]]
-                idx_max = distances[idx].argmax()
-                if d < distances[idx][idx_max]:
-                    distances[idx][idx_max] = d
-                    indexes[idx][idx_max] = p[0][0]
+                # Update distance and neighbor index for global_j
+                idx_max = distances[global_j].argmax()
+                if d < distances[global_j][idx_max]:
+                    distances[global_j][idx_max] = d
+                    indexes[global_j][idx_max] = global_i
             yield distances, indexes, i
@@ -552,20 +573,21 @@ class LocalOutlierProbability(object):
         :return: the updated storage matrix that collects information on
         each observation.
         """
-        distances = np.full([self._n_observations(), self.n_neighbors], 9e10,
-                            dtype=float)
-        indexes = np.full([self._n_observations(), self.n_neighbors], 9e10,
-                          dtype=float)
-        self.points_vector = self.Validate._data(self.data)
-        compute = numba.jit(self._compute_distance_and_neighbor_matrix,
-                            cache=True) if self.use_numba else \
-            self._compute_distance_and_neighbor_matrix
+        distances = np.full(
+            [self._n_observations(), self.n_neighbors], 9e10, dtype=float
+        )
+        indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
+        self.points_vector = self._convert_to_array(self.data)
+        compute = (
+            numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
+            if self.use_numba
+            else self._compute_distance_and_neighbor_matrix
+        )
         progress = "="
         for cluster_id in set(self._cluster_labels()):
             indices = np.where(self._cluster_labels() == cluster_id)
             clust_points_vector = np.array(
-                self.points_vector.take(indices, axis=0)[0],
-                dtype=np.float64
+                self.points_vector.take(indices, axis=0)[0], dtype=np.float64
             )
             # a generator that yields an updated distance matrix on each loop
             for c in compute(clust_points_vector, indices, distances, indexes):
@@ -573,7 +595,8 @@ class LocalOutlierProbability(object):
                 # update the progress bar
                 if progress_bar is True:
                     progress = Utils.emit_progress_bar(
-                        progress, i+1, clust_points_vector.shape[0])
+                        progress, i + 1, clust_points_vector.shape[0]
+                    )
         self.distance_matrix = distances
         self.neighbor_matrix = indexes
@@ -627,11 +650,10 @@ class LocalOutlierProbability(object):
         """
         prob_distances = []
         for i in range(data_store[:, 4].shape[0]):
-            prob_distances.append(
-                self._prob_distance(self.extent, data_store[:, 4][i]))
+            prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))
         return np.hstack((data_store, np.array([prob_distances]).T))
-    def _prob_distances_ev(self, data_store: np.ndarray) -> np.ndarray:
+    def _prob_distances_ev(self, data_store) -> np.ndarray:
         """
         Calculates the expected value of the probabilistic distance for
         each observation in the input data with respect to the cluster the
@@ -645,19 +667,20 @@ class LocalOutlierProbability(object):
         for cluster_id in self.cluster_labels_u:
             indices = np.where(data_store[:, 0] == cluster_id)[0]
             for index in indices:
-                nbrhood = data_store[index][2].astype(int)
-                nbrhood_prob_distances = np.take(data_store[:, 5],
-                                                 nbrhood).astype(float)
+                # Global neighbor indices for the current point
+                nbrhood = data_store[index][2].astype(int)  # Ensure global indices
+                nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(
+                    float
+                )
                 nbrhood_prob_distances_nonan = nbrhood_prob_distances[
-                    np.logical_not(np.isnan(nbrhood_prob_distances))]
-                prob_set_distance_ev[index] = \
-                    nbrhood_prob_distances_nonan.mean()
+                    np.logical_not(np.isnan(nbrhood_prob_distances))
+                ]
+                prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()
         self.prob_distances_ev = prob_set_distance_ev
-        data_store = np.hstack((data_store, prob_set_distance_ev))
-        return data_store
+        return np.hstack((data_store, prob_set_distance_ev))
-    def _prob_local_outlier_factors(self,
-                                    data_store: np.ndarray) -> np.ndarray:
+    def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
         """
         Calculates the probabilistic local outlier factor for each
         observation in the input data.
@@ -667,13 +690,22 @@ class LocalOutlierProbability(object):
         each observation.
         """
         return np.hstack(
-            (data_store,
-             np.array([np.apply_along_axis(self._prob_outlier_factor, 0,
-                                           data_store[:, 5],
-                                           data_store[:, 6])]).T))
+            (
+                data_store,
+                np.array(
+                    [
+                        np.apply_along_axis(
+                            self._prob_outlier_factor,
+                            0,
+                            data_store[:, 5],
+                            data_store[:, 6],
+                        )
+                    ]
+                ).T,
+            )
+        )
-    def _prob_local_outlier_factors_ev(self,
-                                       data_store: np.ndarray) -> np.ndarray:
+    def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:
         """
         Calculates the expected value of the probabilistic local outlier factor
         for each observation in the input data with respect to the cluster the
@@ -686,21 +718,31 @@ class LocalOutlierProbability(object):
         prob_local_outlier_factor_ev_dict = {}
         for cluster_id in self.cluster_labels_u:
             indices = np.where(data_store[:, 0] == cluster_id)
-            prob_local_outlier_factors = np.take(data_store[:, 7],
-                                                 indices).astype(float)
-            prob_local_outlier_factors_nonan = prob_local_outlier_factors[
-                np.logical_not(np.isnan(prob_local_outlier_factors))]
-            prob_local_outlier_factor_ev_dict[cluster_id] = (
-                    np.power(prob_local_outlier_factors_nonan, 2).sum() /
-                    float(prob_local_outlier_factors_nonan.size)
+            prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(
+                float
             )
+            prob_local_outlier_factors_nonan = prob_local_outlier_factors[
+                np.logical_not(np.isnan(prob_local_outlier_factors))
+            ]
+            prob_local_outlier_factor_ev_dict[cluster_id] = np.power(
+                prob_local_outlier_factors_nonan, 2
+            ).sum() / float(prob_local_outlier_factors_nonan.size)
         data_store = np.hstack(
-            (data_store, np.array([[prob_local_outlier_factor_ev_dict[x] for x
-                                    in data_store[:, 0].tolist()]]).T))
+            (
+                data_store,
+                np.array(
+                    [
+                        [
+                            prob_local_outlier_factor_ev_dict[x]
+                            for x in data_store[:, 0].tolist()
+                        ]
+                    ]
+                ).T,
+            )
+        )
         return data_store
-    def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) \
-            -> np.ndarray:
+    def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
         """
         Calculates the normalized probabilistic local outlier factor for each
         observation in the input data.
@@ -709,11 +751,20 @@ class LocalOutlierProbability(object):
         :return: the updated storage matrix that collects information on
         each observation.
         """
-        return np.hstack((data_store, np.array([self._norm_prob_outlier_factor(
-            self.extent, data_store[:, 8].tolist())]).T))
+        return np.hstack(
+            (
+                data_store,
+                np.array(
+                    [
+                        self._norm_prob_outlier_factor(
+                            self.extent, data_store[:, 8].tolist()
+                        )
+                    ]
+                ).T,
+            )
+        )
-    def _local_outlier_probabilities(self,
-                                     data_store: np.ndarray) -> np.ndarray:
+    def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:
         """
         Calculates the local outlier probability for each observation in the
         input data.
@@ -723,31 +774,40 @@ class LocalOutlierProbability(object):
         each observation.
         """
         return np.hstack(
-            (data_store,
-             np.array([np.apply_along_axis(self._local_outlier_probability, 0,
-                                           data_store[:, 7],
-                                           data_store[:, 9])]).T))
+            (
+                data_store,
+                np.array(
+                    [
+                        np.apply_along_axis(
+                            self._local_outlier_probability,
+                            0,
+                            data_store[:, 7],
+                            data_store[:, 9],
+                        )
+                    ]
+                ).T,
+            )
+        )
     """
     Public methods
     """
-    def fit(self) -> 'LocalOutlierProbability':
+    def fit(self) -> "LocalOutlierProbability":
         """
         Calculates the local outlier probability for each observation in the
         input data according to the input parameters extent, n_neighbors, and
         cluster_labels.
         :return: self, which contains the local outlier probabilities as
         self.local_outlier_probabilities.
+        :raises ClusterSizeError: if any cluster is smaller than n_neighbors.
+        :raises MissingValuesError: if data contains missing values.
         """
-        self.Validate._n_neighbors(self)
-        if self.Validate._cluster_size(self) is False:
-            sys.exit()
-        if self.data is not None and self.Validate._missing_values(
-                self) is False:
-            sys.exit()
+        self._check_n_neighbors()
+        self._check_cluster_size()
+        if self.data is not None:
+            self._check_missing_values()
         store = self._store()
         if self.data is not None:
@@ -770,7 +830,6 @@ class LocalOutlierProbability(object):
         return self
     def stream(self, x: np.ndarray) -> np.ndarray:
         """
         Calculates the local outlier probability for an individual sample
         according to the input parameters extent, n_neighbors, and
@@ -784,19 +843,23 @@ class LocalOutlierProbability(object):
         """
         orig_cluster_labels = None
-        if self.Validate._no_cluster_labels(self) is False:
+        if self._check_no_cluster_labels() is False:
             orig_cluster_labels = self.cluster_labels
             self.cluster_labels = np.array([0] * len(self.data))
-        if self.Validate._fit(self) is False:
+        if self._check_is_fit() is False:
             self.fit()
-        point_vector = self.Validate._data(x)
+        point_vector = self._convert_to_array(x)
         distances = np.full([1, self.n_neighbors], 9e10, dtype=float)
         if self.data is not None:
             matrix = self.points_vector
         else:
             matrix = self.distance_matrix
+            # When using distance matrix mode, x is a scalar distance value.
+            # Extract scalar from array to avoid NumPy assignment errors.
+            if point_vector.size == 1:
+                point_vector = float(point_vector.flat[0])
         for p in range(0, matrix.shape[0]):
             if self.data is not None:
                 d = self._euclidean(matrix[p, :], point_vector)
@@ -809,12 +872,12 @@ class LocalOutlierProbability(object):
         ssd = np.power(distances, 2).sum()
         std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
         prob_dist = self._prob_distance(self.extent, std_dist)
-        plof = self._prob_outlier_factor(np.array(prob_dist),
-                                         np.array(
-                                             self.prob_distances_ev.mean())
-                                         )
+        plof = self._prob_outlier_factor(
+            np.array(prob_dist), np.array(self.prob_distances_ev.mean())
+        )
         loop = self._local_outlier_probability(
-            plof, self.norm_prob_local_outlier_factor)
+            plof, self.norm_prob_local_outlier_factor
+        )
         if orig_cluster_labels is not None:
             self.cluster_labels = orig_cluster_labels

PyNomaly 0.3.2__py3-none-any.whl → 0.3.5__py3-none-any.whl

PyNomaly 0.3.2__py3-none-any.whl → 0.3.5__py3-none-any.whl