PyNomaly 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyNomaly/__init__.py +18 -0
- PyNomaly/loop.py +214 -215
- pynomaly-0.3.5.dist-info/METADATA +503 -0
- pynomaly-0.3.5.dist-info/RECORD +7 -0
- {PyNomaly-0.3.4.dist-info → pynomaly-0.3.5.dist-info}/WHEEL +1 -1
- PyNomaly-0.3.4.dist-info/METADATA +0 -17
- PyNomaly-0.3.4.dist-info/RECORD +0 -7
- /PyNomaly-0.3.4.dist-info/LICENSE.txt → /pynomaly-0.3.5.dist-info/licenses/LICENSE +0 -0
- {PyNomaly-0.3.4.dist-info → pynomaly-0.3.5.dist-info}/top_level.txt +0 -0
PyNomaly/__init__.py
CHANGED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Authors: Valentino Constantinou <vc@valentino.io>
|
|
2
|
+
# License: Apache 2.0
|
|
3
|
+
|
|
4
|
+
from PyNomaly.loop import (
|
|
5
|
+
LocalOutlierProbability,
|
|
6
|
+
PyNomalyError,
|
|
7
|
+
ValidationError,
|
|
8
|
+
ClusterSizeError,
|
|
9
|
+
MissingValuesError,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"LocalOutlierProbability",
|
|
14
|
+
"PyNomalyError",
|
|
15
|
+
"ValidationError",
|
|
16
|
+
"ClusterSizeError",
|
|
17
|
+
"MissingValuesError",
|
|
18
|
+
]
|
PyNomaly/loop.py
CHANGED
|
@@ -11,10 +11,31 @@ except ImportError:
|
|
|
11
11
|
pass
|
|
12
12
|
|
|
13
13
|
__author__ = "Valentino Constantinou"
|
|
14
|
-
__version__ = "0.3.
|
|
14
|
+
__version__ = "0.3.5"
|
|
15
15
|
__license__ = "Apache License, Version 2.0"
|
|
16
16
|
|
|
17
17
|
|
|
18
|
+
# Custom Exceptions
|
|
19
|
+
class PyNomalyError(Exception):
|
|
20
|
+
"""Base exception for PyNomaly."""
|
|
21
|
+
pass
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ValidationError(PyNomalyError):
|
|
25
|
+
"""Raised when input validation fails."""
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ClusterSizeError(ValidationError):
|
|
30
|
+
"""Raised when cluster size is smaller than n_neighbors."""
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MissingValuesError(ValidationError):
|
|
35
|
+
"""Raised when data contains missing values."""
|
|
36
|
+
pass
|
|
37
|
+
|
|
38
|
+
|
|
18
39
|
class Utils:
|
|
19
40
|
@staticmethod
|
|
20
41
|
def emit_progress_bar(progress: str, index: int, total: int) -> str:
|
|
@@ -77,217 +98,190 @@ class LocalOutlierProbability(object):
|
|
|
77
98
|
(2016).
|
|
78
99
|
"""
|
|
79
100
|
|
|
80
|
-
|
|
101
|
+
"""
|
|
102
|
+
Validation methods.
|
|
103
|
+
These methods validate inputs and raise exceptions or warnings as appropriate.
|
|
104
|
+
"""
|
|
105
|
+
|
|
106
|
+
@staticmethod
|
|
107
|
+
def _convert_to_array(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
|
|
108
|
+
"""
|
|
109
|
+
Converts the input data to a numpy array if it is a Pandas DataFrame
|
|
110
|
+
or validates it is already a numpy array.
|
|
111
|
+
:param obj: user-provided input data.
|
|
112
|
+
:return: a vector of values to be used in calculating the local
|
|
113
|
+
outlier probability.
|
|
114
|
+
"""
|
|
115
|
+
if obj.__class__.__name__ == "DataFrame":
|
|
116
|
+
points_vector = obj.values
|
|
117
|
+
return points_vector
|
|
118
|
+
elif obj.__class__.__name__ == "ndarray":
|
|
119
|
+
points_vector = obj
|
|
120
|
+
return points_vector
|
|
121
|
+
else:
|
|
122
|
+
warnings.warn(
|
|
123
|
+
"Provided data or distance matrix must be in ndarray "
|
|
124
|
+
"or DataFrame.",
|
|
125
|
+
UserWarning,
|
|
126
|
+
)
|
|
127
|
+
if isinstance(obj, list):
|
|
128
|
+
points_vector = np.array(obj)
|
|
129
|
+
return points_vector
|
|
130
|
+
points_vector = np.array([obj])
|
|
131
|
+
return points_vector
|
|
81
132
|
|
|
133
|
+
def _validate_inputs(self):
|
|
82
134
|
"""
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
albeit with some form of user warning.
|
|
135
|
+
Validates the inputs provided during initialization to ensure
|
|
136
|
+
that the needed objects are provided.
|
|
137
|
+
:return: a tuple of (data, distance_matrix, neighbor_matrix) or
|
|
138
|
+
raises a warning for invalid inputs.
|
|
88
139
|
"""
|
|
140
|
+
if all(v is None for v in [self.data, self.distance_matrix]):
|
|
141
|
+
warnings.warn(
|
|
142
|
+
"Data or a distance matrix must be provided.", UserWarning
|
|
143
|
+
)
|
|
144
|
+
return False
|
|
145
|
+
elif all(v is not None for v in [self.data, self.distance_matrix]):
|
|
146
|
+
warnings.warn(
|
|
147
|
+
"Only one of the following may be provided: data or a "
|
|
148
|
+
"distance matrix (not both).",
|
|
149
|
+
UserWarning,
|
|
150
|
+
)
|
|
151
|
+
return False
|
|
152
|
+
if self.data is not None:
|
|
153
|
+
points_vector = self._convert_to_array(self.data)
|
|
154
|
+
return points_vector, self.distance_matrix, self.neighbor_matrix
|
|
155
|
+
if all(
|
|
156
|
+
matrix is not None
|
|
157
|
+
for matrix in [self.neighbor_matrix, self.distance_matrix]
|
|
158
|
+
):
|
|
159
|
+
dist_vector = self._convert_to_array(self.distance_matrix)
|
|
160
|
+
neigh_vector = self._convert_to_array(self.neighbor_matrix)
|
|
161
|
+
else:
|
|
162
|
+
warnings.warn(
|
|
163
|
+
"A neighbor index matrix and distance matrix must both be "
|
|
164
|
+
"provided when not using raw input data.",
|
|
165
|
+
UserWarning,
|
|
166
|
+
)
|
|
167
|
+
return False
|
|
168
|
+
if self.distance_matrix.shape != self.neighbor_matrix.shape:
|
|
169
|
+
warnings.warn(
|
|
170
|
+
"The shape of the distance and neighbor "
|
|
171
|
+
"index matrices must match.",
|
|
172
|
+
UserWarning,
|
|
173
|
+
)
|
|
174
|
+
return False
|
|
175
|
+
elif (self.distance_matrix.shape[1] != self.n_neighbors) or (
|
|
176
|
+
self.neighbor_matrix.shape[1] != self.n_neighbors
|
|
177
|
+
):
|
|
178
|
+
warnings.warn(
|
|
179
|
+
"The shape of the distance or "
|
|
180
|
+
"neighbor index matrix does not "
|
|
181
|
+
"match the number of neighbors "
|
|
182
|
+
"specified.",
|
|
183
|
+
UserWarning,
|
|
184
|
+
)
|
|
185
|
+
return False
|
|
186
|
+
return self.data, dist_vector, neigh_vector
|
|
187
|
+
|
|
188
|
+
def _check_cluster_size(self) -> None:
|
|
189
|
+
"""
|
|
190
|
+
Validates the cluster labels to ensure that the smallest cluster
|
|
191
|
+
size (number of observations in the cluster) is larger than the
|
|
192
|
+
specified number of neighbors.
|
|
193
|
+
:raises ClusterSizeError: if any cluster is too small.
|
|
194
|
+
"""
|
|
195
|
+
c_labels = self._cluster_labels()
|
|
196
|
+
for cluster_id in set(c_labels):
|
|
197
|
+
c_size = np.where(c_labels == cluster_id)[0].shape[0]
|
|
198
|
+
if c_size <= self.n_neighbors:
|
|
199
|
+
raise ClusterSizeError(
|
|
200
|
+
"Number of neighbors specified larger than smallest "
|
|
201
|
+
"cluster. Specify a number of neighbors smaller than "
|
|
202
|
+
"the smallest cluster size (observations in smallest "
|
|
203
|
+
"cluster minus one)."
|
|
204
|
+
)
|
|
89
205
|
|
|
206
|
+
def _check_n_neighbors(self) -> bool:
|
|
90
207
|
"""
|
|
91
|
-
|
|
208
|
+
Validates the specified number of neighbors to ensure that it is
|
|
209
|
+
greater than 0 and that the specified value is less than the total
|
|
210
|
+
number of observations.
|
|
211
|
+
:return: a boolean indicating whether validation has passed without
|
|
212
|
+
adjustment.
|
|
92
213
|
"""
|
|
214
|
+
if not self.n_neighbors > 0:
|
|
215
|
+
self.n_neighbors = 10
|
|
216
|
+
warnings.warn(
|
|
217
|
+
"n_neighbors must be greater than 0."
|
|
218
|
+
" Fit with " + str(self.n_neighbors) + " instead.",
|
|
219
|
+
UserWarning,
|
|
220
|
+
)
|
|
221
|
+
return False
|
|
222
|
+
elif self.n_neighbors >= self._n_observations():
|
|
223
|
+
self.n_neighbors = self._n_observations() - 1
|
|
224
|
+
warnings.warn(
|
|
225
|
+
"n_neighbors must be less than the number of observations."
|
|
226
|
+
" Fit with " + str(self.n_neighbors) + " instead.",
|
|
227
|
+
UserWarning,
|
|
228
|
+
)
|
|
229
|
+
return True
|
|
93
230
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
elif obj.__class__.__name__ == "ndarray":
|
|
107
|
-
points_vector = obj
|
|
108
|
-
return points_vector
|
|
109
|
-
else:
|
|
110
|
-
warnings.warn(
|
|
111
|
-
"Provided data or distance matrix must be in ndarray "
|
|
112
|
-
"or DataFrame.",
|
|
113
|
-
UserWarning,
|
|
114
|
-
)
|
|
115
|
-
if isinstance(obj, list):
|
|
116
|
-
points_vector = np.array(obj)
|
|
117
|
-
return points_vector
|
|
118
|
-
points_vector = np.array([obj])
|
|
119
|
-
return points_vector
|
|
231
|
+
def _check_extent(self) -> bool:
|
|
232
|
+
"""
|
|
233
|
+
Validates the specified extent parameter to ensure it is either 1,
|
|
234
|
+
2, or 3.
|
|
235
|
+
:return: a boolean indicating whether validation has passed.
|
|
236
|
+
"""
|
|
237
|
+
if self.extent not in [1, 2, 3]:
|
|
238
|
+
warnings.warn(
|
|
239
|
+
"extent parameter (lambda) must be 1, 2, or 3.", UserWarning
|
|
240
|
+
)
|
|
241
|
+
return False
|
|
242
|
+
return True
|
|
120
243
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
)
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
)
|
|
163
|
-
return False
|
|
164
|
-
elif (obj.distance_matrix.shape[1] != obj.n_neighbors) or (
|
|
165
|
-
obj.neighbor_matrix.shape[1] != obj.n_neighbors
|
|
166
|
-
):
|
|
167
|
-
warnings.warn(
|
|
168
|
-
"The shape of the distance or "
|
|
169
|
-
"neighbor index matrix does not "
|
|
170
|
-
"match the number of neighbors "
|
|
171
|
-
"specified.",
|
|
172
|
-
UserWarning,
|
|
173
|
-
)
|
|
174
|
-
return False
|
|
175
|
-
return obj.data, dist_vector, neigh_vector
|
|
176
|
-
|
|
177
|
-
@staticmethod
|
|
178
|
-
def _cluster_size(obj) -> bool:
|
|
179
|
-
"""
|
|
180
|
-
Validates the cluster labels to ensure that the smallest cluster
|
|
181
|
-
size (number of observations in the cluster) is larger than the
|
|
182
|
-
specified number of neighbors.
|
|
183
|
-
:param obj: a PyNomaly object.
|
|
184
|
-
:return: a boolean indicating whether validation has passed.
|
|
185
|
-
"""
|
|
186
|
-
c_labels = obj._cluster_labels()
|
|
187
|
-
for cluster_id in set(c_labels):
|
|
188
|
-
c_size = np.where(c_labels == cluster_id)[0].shape[0]
|
|
189
|
-
if c_size <= obj.n_neighbors:
|
|
190
|
-
warnings.warn(
|
|
191
|
-
"Number of neighbors specified larger than smallest "
|
|
192
|
-
"cluster. Specify a number of neighbors smaller than "
|
|
193
|
-
"the smallest cluster size (observations in smallest "
|
|
194
|
-
"cluster minus one).",
|
|
195
|
-
UserWarning,
|
|
196
|
-
)
|
|
197
|
-
return False
|
|
198
|
-
return True
|
|
199
|
-
|
|
200
|
-
@staticmethod
|
|
201
|
-
def _n_neighbors(obj) -> bool:
|
|
202
|
-
"""
|
|
203
|
-
Validates the specified number of neighbors to ensure that it is
|
|
204
|
-
greater than 0 and that the specified value is less than the total
|
|
205
|
-
number of observations.
|
|
206
|
-
:param obj: a PyNomaly object.
|
|
207
|
-
:return: a boolean indicating whether validation has passed.
|
|
208
|
-
"""
|
|
209
|
-
if not obj.n_neighbors > 0:
|
|
210
|
-
obj.n_neighbors = 10
|
|
211
|
-
warnings.warn(
|
|
212
|
-
"n_neighbors must be greater than 0."
|
|
213
|
-
" Fit with " + str(obj.n_neighbors) + " instead.",
|
|
214
|
-
UserWarning,
|
|
215
|
-
)
|
|
216
|
-
return False
|
|
217
|
-
elif obj.n_neighbors >= obj._n_observations():
|
|
218
|
-
obj.n_neighbors = obj._n_observations() - 1
|
|
219
|
-
warnings.warn(
|
|
220
|
-
"n_neighbors must be less than the number of observations."
|
|
221
|
-
" Fit with " + str(obj.n_neighbors) + " instead.",
|
|
222
|
-
UserWarning,
|
|
223
|
-
)
|
|
224
|
-
return True
|
|
225
|
-
|
|
226
|
-
@staticmethod
|
|
227
|
-
def _extent(obj) -> bool:
|
|
228
|
-
"""
|
|
229
|
-
Validates the specified extent parameter to ensure it is either 1,
|
|
230
|
-
2, or 3.
|
|
231
|
-
:param obj: a PyNomaly object.
|
|
232
|
-
:return: a boolean indicating whether validation has passed.
|
|
233
|
-
"""
|
|
234
|
-
if obj.extent not in [1, 2, 3]:
|
|
235
|
-
warnings.warn(
|
|
236
|
-
"extent parameter (lambda) must be 1, 2, or 3.", UserWarning
|
|
237
|
-
)
|
|
238
|
-
return False
|
|
239
|
-
return True
|
|
240
|
-
|
|
241
|
-
@staticmethod
|
|
242
|
-
def _missing_values(obj) -> bool:
|
|
243
|
-
"""
|
|
244
|
-
Validates the provided data to ensure that it contains no
|
|
245
|
-
missing values.
|
|
246
|
-
:param obj: a PyNomaly object.
|
|
247
|
-
:return: a boolean indicating whether validation has passed.
|
|
248
|
-
"""
|
|
249
|
-
if np.any(np.isnan(obj.data)):
|
|
250
|
-
warnings.warn(
|
|
251
|
-
"Method does not support missing values in input data.", UserWarning
|
|
252
|
-
)
|
|
253
|
-
return False
|
|
254
|
-
return True
|
|
255
|
-
|
|
256
|
-
@staticmethod
|
|
257
|
-
def _fit(obj) -> bool:
|
|
258
|
-
"""
|
|
259
|
-
Validates that the model was fit prior to calling the stream()
|
|
260
|
-
method.
|
|
261
|
-
:param obj: a PyNomaly object.
|
|
262
|
-
:return: a boolean indicating whether validation has passed.
|
|
263
|
-
"""
|
|
264
|
-
if obj.is_fit is False:
|
|
265
|
-
warnings.warn(
|
|
266
|
-
"Must fit on historical data by calling fit() prior to "
|
|
267
|
-
"calling stream(x).",
|
|
268
|
-
UserWarning,
|
|
269
|
-
)
|
|
270
|
-
return False
|
|
271
|
-
return True
|
|
272
|
-
|
|
273
|
-
@staticmethod
|
|
274
|
-
def _no_cluster_labels(obj) -> bool:
|
|
275
|
-
"""
|
|
276
|
-
Checks to see if cluster labels are attempting to be used in
|
|
277
|
-
stream() and, if so, calls fit() once again but without cluster
|
|
278
|
-
labels. As PyNomaly does not accept clustering algorithms as input,
|
|
279
|
-
the stream approach does not support clustering.
|
|
280
|
-
:param obj: a PyNomaly object.
|
|
281
|
-
:return: a boolean indicating whether validation has passed.
|
|
282
|
-
"""
|
|
283
|
-
if len(set(obj._cluster_labels())) > 1:
|
|
284
|
-
warnings.warn(
|
|
285
|
-
"Stream approach does not support clustered data. "
|
|
286
|
-
"Automatically refit using single cluster of points.",
|
|
287
|
-
UserWarning,
|
|
288
|
-
)
|
|
289
|
-
return False
|
|
290
|
-
return True
|
|
244
|
+
def _check_missing_values(self) -> None:
|
|
245
|
+
"""
|
|
246
|
+
Validates the provided data to ensure that it contains no
|
|
247
|
+
missing values.
|
|
248
|
+
:raises MissingValuesError: if data contains NaN values.
|
|
249
|
+
"""
|
|
250
|
+
if np.any(np.isnan(self.data)):
|
|
251
|
+
raise MissingValuesError(
|
|
252
|
+
"Method does not support missing values in input data."
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
def _check_is_fit(self) -> bool:
|
|
256
|
+
"""
|
|
257
|
+
Checks that the model was fit prior to calling the stream() method.
|
|
258
|
+
:return: a boolean indicating whether the model has been fit.
|
|
259
|
+
"""
|
|
260
|
+
if self.is_fit is False:
|
|
261
|
+
warnings.warn(
|
|
262
|
+
"Must fit on historical data by calling fit() prior to "
|
|
263
|
+
"calling stream(x).",
|
|
264
|
+
UserWarning,
|
|
265
|
+
)
|
|
266
|
+
return False
|
|
267
|
+
return True
|
|
268
|
+
|
|
269
|
+
def _check_no_cluster_labels(self) -> bool:
|
|
270
|
+
"""
|
|
271
|
+
Checks to see if cluster labels are attempting to be used in
|
|
272
|
+
stream() and, if so, returns False. As PyNomaly does not accept
|
|
273
|
+
clustering algorithms as input, the stream approach does not
|
|
274
|
+
support clustering.
|
|
275
|
+
:return: a boolean indicating whether single cluster (no labels).
|
|
276
|
+
"""
|
|
277
|
+
if len(set(self._cluster_labels())) > 1:
|
|
278
|
+
warnings.warn(
|
|
279
|
+
"Stream approach does not support clustered data. "
|
|
280
|
+
"Automatically refit using single cluster of points.",
|
|
281
|
+
UserWarning,
|
|
282
|
+
)
|
|
283
|
+
return False
|
|
284
|
+
return True
|
|
291
285
|
|
|
292
286
|
"""
|
|
293
287
|
Decorators.
|
|
@@ -389,8 +383,8 @@ class LocalOutlierProbability(object):
|
|
|
389
383
|
"Numba is not available, falling back to pure python mode.", UserWarning
|
|
390
384
|
)
|
|
391
385
|
|
|
392
|
-
self.
|
|
393
|
-
self.
|
|
386
|
+
self._validate_inputs()
|
|
387
|
+
self._check_extent()
|
|
394
388
|
|
|
395
389
|
"""
|
|
396
390
|
Private methods.
|
|
@@ -583,7 +577,7 @@ class LocalOutlierProbability(object):
|
|
|
583
577
|
[self._n_observations(), self.n_neighbors], 9e10, dtype=float
|
|
584
578
|
)
|
|
585
579
|
indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
|
|
586
|
-
self.points_vector = self.
|
|
580
|
+
self.points_vector = self._convert_to_array(self.data)
|
|
587
581
|
compute = (
|
|
588
582
|
numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
|
|
589
583
|
if self.use_numba
|
|
@@ -806,13 +800,14 @@ class LocalOutlierProbability(object):
|
|
|
806
800
|
cluster_labels.
|
|
807
801
|
:return: self, which contains the local outlier probabilities as
|
|
808
802
|
self.local_outlier_probabilities.
|
|
803
|
+
:raises ClusterSizeError: if any cluster is smaller than n_neighbors.
|
|
804
|
+
:raises MissingValuesError: if data contains missing values.
|
|
809
805
|
"""
|
|
810
806
|
|
|
811
|
-
self.
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
sys.exit()
|
|
807
|
+
self._check_n_neighbors()
|
|
808
|
+
self._check_cluster_size()
|
|
809
|
+
if self.data is not None:
|
|
810
|
+
self._check_missing_values()
|
|
816
811
|
|
|
817
812
|
store = self._store()
|
|
818
813
|
if self.data is not None:
|
|
@@ -848,19 +843,23 @@ class LocalOutlierProbability(object):
|
|
|
848
843
|
"""
|
|
849
844
|
|
|
850
845
|
orig_cluster_labels = None
|
|
851
|
-
if self.
|
|
846
|
+
if self._check_no_cluster_labels() is False:
|
|
852
847
|
orig_cluster_labels = self.cluster_labels
|
|
853
848
|
self.cluster_labels = np.array([0] * len(self.data))
|
|
854
849
|
|
|
855
|
-
if self.
|
|
850
|
+
if self._check_is_fit() is False:
|
|
856
851
|
self.fit()
|
|
857
852
|
|
|
858
|
-
point_vector = self.
|
|
853
|
+
point_vector = self._convert_to_array(x)
|
|
859
854
|
distances = np.full([1, self.n_neighbors], 9e10, dtype=float)
|
|
860
855
|
if self.data is not None:
|
|
861
856
|
matrix = self.points_vector
|
|
862
857
|
else:
|
|
863
858
|
matrix = self.distance_matrix
|
|
859
|
+
# When using distance matrix mode, x is a scalar distance value.
|
|
860
|
+
# Extract scalar from array to avoid NumPy assignment errors.
|
|
861
|
+
if point_vector.size == 1:
|
|
862
|
+
point_vector = float(point_vector.flat[0])
|
|
864
863
|
for p in range(0, matrix.shape[0]):
|
|
865
864
|
if self.data is not None:
|
|
866
865
|
d = self._euclidean(matrix[p, :], point_vector)
|
|
@@ -0,0 +1,503 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: PyNomaly
|
|
3
|
+
Version: 0.3.5
|
|
4
|
+
Summary: A Python 3 implementation of LoOP: Local Outlier Probabilities, a local density based outlier detection method providing an outlier score in the range of [0,1].
|
|
5
|
+
Home-page: https://github.com/vc1492a/PyNomaly
|
|
6
|
+
Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.5.tar.gz
|
|
7
|
+
Author: Valentino Constantinou
|
|
8
|
+
Author-email: vc@valentino.io
|
|
9
|
+
License: Apache License, Version 2.0
|
|
10
|
+
Keywords: outlier,anomaly,detection,machine,learning,probability
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy
|
|
14
|
+
Requires-Dist: python-utils
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: author-email
|
|
17
|
+
Dynamic: description
|
|
18
|
+
Dynamic: description-content-type
|
|
19
|
+
Dynamic: download-url
|
|
20
|
+
Dynamic: home-page
|
|
21
|
+
Dynamic: keywords
|
|
22
|
+
Dynamic: license
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: summary
|
|
26
|
+
|
|
27
|
+
# PyNomaly
|
|
28
|
+
|
|
29
|
+
PyNomaly is a Python 3 implementation of LoOP (Local Outlier Probabilities).
|
|
30
|
+
LoOP is a local density based outlier detection method by Kriegel, Kröger, Schubert, and Zimek which provides outlier
|
|
31
|
+
scores in the range of [0,1] that are directly interpretable as the probability of a sample being an outlier.
|
|
32
|
+
|
|
33
|
+
PyNomaly is a core library of [deepchecks](https://github.com/deepchecks/deepchecks), [OmniDocBench](https://github.com/opendatalab/OmniDocBench) and [pysad](https://github.com/selimfirat/pysad).
|
|
34
|
+
|
|
35
|
+
[](https://opensource.org/licenses/Apache-2.0)
|
|
36
|
+
[](https://pypi.python.org/pypi/PyNomaly/0.3.5)
|
|
37
|
+
[](https://pepy.tech/projects/pynomaly)
|
|
38
|
+
[](https://pepy.tech/projects/pynomaly)
|
|
39
|
+

|
|
40
|
+
[](https://coveralls.io/github/vc1492a/PyNomaly?branch=main)
|
|
41
|
+
[](http://joss.theoj.org/papers/f4d2cfe680768526da7c1f6a2c103266)
|
|
42
|
+
|
|
43
|
+
The outlier score of each sample is called the Local Outlier Probability.
|
|
44
|
+
It measures the local deviation of density of a given sample with
|
|
45
|
+
respect to its neighbors as Local Outlier Factor (LOF), but provides normalized
|
|
46
|
+
outlier scores in the range [0,1]. These outlier scores are directly interpretable
|
|
47
|
+
as a probability of an object being an outlier. Since Local Outlier Probabilities provides scores in the
|
|
48
|
+
range [0,1], practitioners are free to interpret the results according to the application.
|
|
49
|
+
|
|
50
|
+
Like LOF, it is local in that the anomaly score depends on how isolated the sample is
|
|
51
|
+
with respect to the surrounding neighborhood. Locality is given by k-nearest neighbors,
|
|
52
|
+
whose distance is used to estimate the local density. By comparing the local density of a sample to the
|
|
53
|
+
local densities of its neighbors, one can identify samples that lie in regions of lower
|
|
54
|
+
density compared to their neighbors and thus identify samples that may be outliers according to their Local
|
|
55
|
+
Outlier Probability.
|
|
56
|
+
|
|
57
|
+
The authors' 2009 paper detailing LoOP's theory, formulation, and application is provided by
|
|
58
|
+
Ludwig-Maximilians University Munich - Institute for Informatics;
|
|
59
|
+
[LoOP: Local Outlier Probabilities](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf).
|
|
60
|
+
|
|
61
|
+
## Implementation
|
|
62
|
+
|
|
63
|
+
This Python 3 implementation uses Numpy and the formulas outlined in
|
|
64
|
+
[LoOP: Local Outlier Probabilities](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf)
|
|
65
|
+
to calculate the Local Outlier Probability of each sample.
|
|
66
|
+
|
|
67
|
+
## Dependencies
|
|
68
|
+
- Python 3.8 - 3.13
|
|
69
|
+
- numpy >= 1.16.3
|
|
70
|
+
- python-utils >= 2.3.0
|
|
71
|
+
- (optional) numba >= 0.45.1
|
|
72
|
+
|
|
73
|
+
Numba just-in-time (JIT) compiles the function with calculates the Euclidean
|
|
74
|
+
distance between observations, providing a reduction in computation time
|
|
75
|
+
(significantly when a large number of observations are scored). Numba is not a
|
|
76
|
+
requirement and PyNomaly may still be used solely with numpy if desired
|
|
77
|
+
(details below).
|
|
78
|
+
|
|
79
|
+
## Quick Start
|
|
80
|
+
|
|
81
|
+
First install the package from the Python Package Index:
|
|
82
|
+
|
|
83
|
+
```shell
|
|
84
|
+
pip install PyNomaly # or pip3 install ... if you're using both Python 3 and 2.
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Alternatively, you can use conda to install the package from conda-forge:
|
|
88
|
+
|
|
89
|
+
```shell
|
|
90
|
+
conda install conda-forge::pynomaly
|
|
91
|
+
```
|
|
92
|
+
Then you can do something like this:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from PyNomaly import loop
|
|
96
|
+
m = loop.LocalOutlierProbability(data).fit()
|
|
97
|
+
scores = m.local_outlier_probabilities
|
|
98
|
+
print(scores)
|
|
99
|
+
```
|
|
100
|
+
where *data* is a NxM (N rows, M columns; 2-dimensional) set of data as either a Pandas DataFrame or Numpy array.
|
|
101
|
+
|
|
102
|
+
LocalOutlierProbability sets the *extent* (in integer in value of 1, 2, or 3) and *n_neighbors* (must be greater than 0) parameters with the default
|
|
103
|
+
values of 3 and 10, respectively. You're free to set these parameters on your own as below:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from PyNomaly import loop
|
|
107
|
+
m = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20).fit()
|
|
108
|
+
scores = m.local_outlier_probabilities
|
|
109
|
+
print(scores)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
This implementation of LoOP also includes an optional *cluster_labels* parameter. This is useful in cases where regions
|
|
113
|
+
of varying density occur within the same set of data. When using *cluster_labels*, the Local Outlier Probability of a
|
|
114
|
+
sample is calculated with respect to its cluster assignment.
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from PyNomaly import loop
|
|
118
|
+
from sklearn.cluster import DBSCAN
|
|
119
|
+
db = DBSCAN(eps=0.6, min_samples=50).fit(data)
|
|
120
|
+
m = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20, cluster_labels=list(db.labels_)).fit()
|
|
121
|
+
scores = m.local_outlier_probabilities
|
|
122
|
+
print(scores)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**NOTE**: Unless your data is all the same scale, it may be a good idea to normalize your data with z-scores or another
|
|
126
|
+
normalization scheme prior to using LoOP, especially when working with multiple dimensions of varying scale.
|
|
127
|
+
Users must also appropriately handle missing values prior to using LoOP, as LoOP does not support Pandas
|
|
128
|
+
DataFrames or Numpy arrays with missing values.
|
|
129
|
+
|
|
130
|
+
### Utilizing Numba and Progress Bars
|
|
131
|
+
|
|
132
|
+
It may be helpful to use just-in-time (JIT) compilation in the cases where a lot of
|
|
133
|
+
observations are scored. Numba, a JIT compiler for Python, may be used
|
|
134
|
+
with PyNomaly by setting `use_numba=True`:
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from PyNomaly import loop
|
|
138
|
+
m = loop.LocalOutlierProbability(data, extent=2, n_neighbors=20, use_numba=True, progress_bar=True).fit()
|
|
139
|
+
scores = m.local_outlier_probabilities
|
|
140
|
+
print(scores)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Numba must be installed if the above to use JIT compilation and improve the
|
|
144
|
+
speed of multiple calls to `LocalOutlierProbability()`, and PyNomaly has been
|
|
145
|
+
tested with Numba version 0.45.1. An example of the speed difference that can
|
|
146
|
+
be realized with using Numba is avaialble in `examples/numba_speed_diff.py`.
|
|
147
|
+
|
|
148
|
+
You may also choose to print progress bars _with our without_ the use of numba
|
|
149
|
+
by passing `progress_bar=True` to the `LocalOutlierProbability()` method as above.
|
|
150
|
+
|
|
151
|
+
### Choosing Parameters
|
|
152
|
+
|
|
153
|
+
The *extent* parameter controls the sensitivity of the scoring in practice. The parameter corresponds to
|
|
154
|
+
the statistical notion of an outlier defined as an object deviating more than a given lambda (*extent*)
|
|
155
|
+
times the standard deviation from the mean. A value of 2 implies outliers deviating more than 2 standard deviations
|
|
156
|
+
from the mean, and corresponds to 95.0% in the empirical "three-sigma" rule. The appropriate parameter should be selected
|
|
157
|
+
according to the level of sensitivity needed for the input data and application. The question to ask is whether it is
|
|
158
|
+
more reasonable to assume outliers in your data are 1, 2, or 3 standard deviations from the mean, and select the value
|
|
159
|
+
likely most appropriate to your data and application.
|
|
160
|
+
|
|
161
|
+
The *n_neighbors* parameter defines the number of neighbors to consider about
|
|
162
|
+
each sample (neighborhood size) when determining its Local Outlier Probability with respect to the density
|
|
163
|
+
of the sample's defined neighborhood. The idea number of neighbors to consider is dependent on the
|
|
164
|
+
input data. However, the notion of an outlier implies it would be considered as such regardless of the number
|
|
165
|
+
of neighbors considered. One potential approach is to use a number of different neighborhood sizes and average
|
|
166
|
+
the results for reach observation. Those observations which rank highly with varying neighborhood sizes are
|
|
167
|
+
more than likely outliers. This is one potential approach of selecting the neighborhood size. Another is to
|
|
168
|
+
select a value proportional to the number of observations, such an odd-valued integer close to the square root
|
|
169
|
+
of the number of observations in your data (*sqrt(n_observations*).
|
|
170
|
+
|
|
171
|
+
## Iris Data Example
|
|
172
|
+
|
|
173
|
+
We'll be using the well-known Iris dataset to show LoOP's capabilities. There's a few things you'll need for this
|
|
174
|
+
example beyond the standard prerequisites listed above:
|
|
175
|
+
- matplotlib 2.0.0 or greater
|
|
176
|
+
- PyDataset 0.2.0 or greater
|
|
177
|
+
- scikit-learn 0.18.1 or greater
|
|
178
|
+
|
|
179
|
+
First, let's import the packages and libraries we will need for this example.
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
from PyNomaly import loop
|
|
183
|
+
import pandas as pd
|
|
184
|
+
from pydataset import data
|
|
185
|
+
import numpy as np
|
|
186
|
+
from sklearn.cluster import DBSCAN
|
|
187
|
+
import matplotlib.pyplot as plt
|
|
188
|
+
from mpl_toolkits.mplot3d import Axes3D
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
Now let's create two sets of Iris data for scoring; one with clustering and the other without.
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
# import the data and remove any non-numeric columns
|
|
195
|
+
iris = pd.DataFrame(data('iris').drop(columns=['Species']))
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
Next, let's cluster the data using DBSCAN and generate two sets of scores. In both cases, we will use the default
|
|
199
|
+
values for both *extent* (0.997) and *n_neighbors* (10).
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
db = DBSCAN(eps=0.9, min_samples=10).fit(iris)
|
|
203
|
+
m = loop.LocalOutlierProbability(iris).fit()
|
|
204
|
+
scores_noclust = m.local_outlier_probabilities
|
|
205
|
+
m_clust = loop.LocalOutlierProbability(iris, cluster_labels=list(db.labels_)).fit()
|
|
206
|
+
scores_clust = m_clust.local_outlier_probabilities
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
Organize the data into two separate Pandas DataFrames.
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
iris_clust = pd.DataFrame(iris.copy())
|
|
213
|
+
iris_clust['scores'] = scores_clust
|
|
214
|
+
iris_clust['labels'] = db.labels_
|
|
215
|
+
iris['scores'] = scores_noclust
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
And finally, let's visualize the scores provided by LoOP in both cases (with and without clustering).
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
fig = plt.figure(figsize=(7, 7))
|
|
222
|
+
ax = fig.add_subplot(111, projection='3d')
|
|
223
|
+
ax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],
|
|
224
|
+
c=iris['scores'], cmap='seismic', s=50)
|
|
225
|
+
ax.set_xlabel('Sepal.Width')
|
|
226
|
+
ax.set_ylabel('Petal.Width')
|
|
227
|
+
ax.set_zlabel('Sepal.Length')
|
|
228
|
+
plt.show()
|
|
229
|
+
plt.clf()
|
|
230
|
+
plt.cla()
|
|
231
|
+
plt.close()
|
|
232
|
+
|
|
233
|
+
fig = plt.figure(figsize=(7, 7))
|
|
234
|
+
ax = fig.add_subplot(111, projection='3d')
|
|
235
|
+
ax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],
|
|
236
|
+
c=iris_clust['scores'], cmap='seismic', s=50)
|
|
237
|
+
ax.set_xlabel('Sepal.Width')
|
|
238
|
+
ax.set_ylabel('Petal.Width')
|
|
239
|
+
ax.set_zlabel('Sepal.Length')
|
|
240
|
+
plt.show()
|
|
241
|
+
plt.clf()
|
|
242
|
+
plt.cla()
|
|
243
|
+
plt.close()
|
|
244
|
+
|
|
245
|
+
fig = plt.figure(figsize=(7, 7))
|
|
246
|
+
ax = fig.add_subplot(111, projection='3d')
|
|
247
|
+
ax.scatter(iris_clust['Sepal.Width'], iris_clust['Petal.Width'], iris_clust['Sepal.Length'],
|
|
248
|
+
c=iris_clust['labels'], cmap='Set1', s=50)
|
|
249
|
+
ax.set_xlabel('Sepal.Width')
|
|
250
|
+
ax.set_ylabel('Petal.Width')
|
|
251
|
+
ax.set_zlabel('Sepal.Length')
|
|
252
|
+
plt.show()
|
|
253
|
+
plt.clf()
|
|
254
|
+
plt.cla()
|
|
255
|
+
plt.close()
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
Your results should look like the following:
|
|
259
|
+
|
|
260
|
+
**LoOP Scores without Clustering**
|
|
261
|
+

|
|
262
|
+
|
|
263
|
+
**LoOP Scores with Clustering**
|
|
264
|
+

|
|
265
|
+
|
|
266
|
+
**DBSCAN Cluster Assignments**
|
|
267
|
+

|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
Note the differences between using LocalOutlierProbability with and without clustering. In the example without clustering, samples are
|
|
271
|
+
scored according to the distribution of the entire data set. In the example with clustering, each sample is scored
|
|
272
|
+
according to the distribution of each cluster. Which approach is suitable depends on the use case.
|
|
273
|
+
|
|
274
|
+
**NOTE**: Data was not normalized in this example, but it's probably a good idea to do so in practice.
|
|
275
|
+
|
|
276
|
+
## Using Numpy
|
|
277
|
+
|
|
278
|
+
When using numpy, make sure to use 2-dimensional arrays in tabular format:
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
data = np.array([
|
|
282
|
+
[43.3, 30.2, 90.2],
|
|
283
|
+
[62.9, 58.3, 49.3],
|
|
284
|
+
[55.2, 56.2, 134.2],
|
|
285
|
+
[48.6, 80.3, 50.3],
|
|
286
|
+
[67.1, 60.0, 55.9],
|
|
287
|
+
[421.5, 90.3, 50.0]
|
|
288
|
+
])
|
|
289
|
+
|
|
290
|
+
scores = loop.LocalOutlierProbability(data, n_neighbors=3).fit().local_outlier_probabilities
|
|
291
|
+
print(scores)
|
|
292
|
+
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
The shape of the input array corresponds to the rows (observations) and columns (features) in the data:
|
|
296
|
+
|
|
297
|
+
```python
|
|
298
|
+
print(data.shape)
|
|
299
|
+
# (6,3), which matches number of observations and features in the above example
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
Similar to the above:
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
data = np.random.rand(100, 5)
|
|
306
|
+
scores = loop.LocalOutlierProbability(data).fit().local_outlier_probabilities
|
|
307
|
+
print(scores)
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
## Specifying a Distance Matrix
|
|
311
|
+
|
|
312
|
+
PyNomaly provides the ability to specify a distance matrix so that any
|
|
313
|
+
distance metric can be used (a neighbor index matrix must also be provided).
|
|
314
|
+
This can be useful when wanting to use a distance other than the euclidean.
|
|
315
|
+
|
|
316
|
+
Note that in order to maintain alignment with the LoOP definition of closest neighbors,
|
|
317
|
+
an additional neighbor is added when using [scikit-learn's NearestNeighbors](https://scikit-learn.org/1.5/modules/neighbors.html) since `NearestNeighbors`
|
|
318
|
+
includes the point itself when calculating the closest neighbors (whereas the LoOP method does not include distances to the point itself).
|
|
319
|
+
|
|
320
|
+
```python
|
|
321
|
+
import numpy as np
|
|
322
|
+
from sklearn.neighbors import NearestNeighbors
|
|
323
|
+
|
|
324
|
+
data = np.array([
|
|
325
|
+
[43.3, 30.2, 90.2],
|
|
326
|
+
[62.9, 58.3, 49.3],
|
|
327
|
+
[55.2, 56.2, 134.2],
|
|
328
|
+
[48.6, 80.3, 50.3],
|
|
329
|
+
[67.1, 60.0, 55.9],
|
|
330
|
+
[421.5, 90.3, 50.0]
|
|
331
|
+
])
|
|
332
|
+
|
|
333
|
+
# Generate distance and neighbor matrices
|
|
334
|
+
n_neighbors = 3 # the number of neighbors according to the LoOP definition
|
|
335
|
+
neigh = NearestNeighbors(n_neighbors=n_neighbors+1, metric='hamming')
|
|
336
|
+
neigh.fit(data)
|
|
337
|
+
d, idx = neigh.kneighbors(data, return_distance=True)
|
|
338
|
+
|
|
339
|
+
# Remove self-distances - you MUST do this to preserve the same results as intended by the definition of LoOP
|
|
340
|
+
idx = np.delete(idx, 0, 1)
|
|
341
|
+
d = np.delete(d, 0, 1)
|
|
342
|
+
|
|
343
|
+
# Fit and return scores
|
|
344
|
+
m = loop.LocalOutlierProbability(distance_matrix=d, neighbor_matrix=idx, n_neighbors=n_neighbors+1).fit()
|
|
345
|
+
scores = m.local_outlier_probabilities
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
The below visualization shows the results by a few known distance metrics:
|
|
349
|
+
|
|
350
|
+
**LoOP Scores by Distance Metric**
|
|
351
|
+

|
|
352
|
+
|
|
353
|
+
## Streaming Data
|
|
354
|
+
|
|
355
|
+
PyNomaly also contains an implementation of Hamlet et al.'s modifications
|
|
356
|
+
to the original LoOP approach [[4](http://www.tandfonline.com/doi/abs/10.1080/23742917.2016.1226651?journalCode=tsec20)],
|
|
357
|
+
which may be used for applications involving streaming data or where rapid calculations may be necessary.
|
|
358
|
+
First, the standard LoOP algorithm is used on "training" data, with certain attributes of the fitted data
|
|
359
|
+
stored from the original LoOP approach. Then, as new points are considered, these fitted attributes are
|
|
360
|
+
called when calculating the score of the incoming streaming data due to the use of averages from the initial
|
|
361
|
+
fit, such as the use of a global value for the expected value of the probabilistic distance. Despite the potential
|
|
362
|
+
for increased error when compared to the standard approach, it may be effective in streaming applications where
|
|
363
|
+
refitting the standard approach over all points could be computationally expensive.
|
|
364
|
+
|
|
365
|
+
While the iris dataset is not streaming data, we'll use it in this example by taking the first 120 observations
|
|
366
|
+
as training data and take the remaining 30 observations as a stream, scoring each observation
|
|
367
|
+
individually.
|
|
368
|
+
|
|
369
|
+
Split the data.
|
|
370
|
+
```python
|
|
371
|
+
iris = iris.sample(frac=1) # shuffle data
|
|
372
|
+
iris_train = iris.iloc[:, 0:4].head(120)
|
|
373
|
+
iris_test = iris.iloc[:, 0:4].tail(30)
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
Fit to each set.
|
|
377
|
+
```python
|
|
378
|
+
m = loop.LocalOutlierProbability(iris).fit()
|
|
379
|
+
scores_noclust = m.local_outlier_probabilities
|
|
380
|
+
iris['scores'] = scores_noclust
|
|
381
|
+
|
|
382
|
+
m_train = loop.LocalOutlierProbability(iris_train, n_neighbors=10)
|
|
383
|
+
m_train.fit()
|
|
384
|
+
iris_train_scores = m_train.local_outlier_probabilities
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
```python
|
|
388
|
+
iris_test_scores = []
|
|
389
|
+
for index, row in iris_test.iterrows():
|
|
390
|
+
array = np.array([row['Sepal.Length'], row['Sepal.Width'], row['Petal.Length'], row['Petal.Width']])
|
|
391
|
+
iris_test_scores.append(m_train.stream(array))
|
|
392
|
+
iris_test_scores = np.array(iris_test_scores)
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
Concatenate the scores and assess.
|
|
396
|
+
|
|
397
|
+
```python
|
|
398
|
+
iris['stream_scores'] = np.hstack((iris_train_scores, iris_test_scores))
|
|
399
|
+
# iris['scores'] from earlier example
|
|
400
|
+
rmse = np.sqrt(((iris['scores'] - iris['stream_scores']) ** 2).mean(axis=None))
|
|
401
|
+
print(rmse)
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
The root mean squared error (RMSE) between the two approaches is approximately 0.199 (your scores will vary depending on the data and specification).
|
|
405
|
+
The plot below shows the scores from the stream approach.
|
|
406
|
+
|
|
407
|
+
```python
|
|
408
|
+
fig = plt.figure(figsize=(7, 7))
|
|
409
|
+
ax = fig.add_subplot(111, projection='3d')
|
|
410
|
+
ax.scatter(iris['Sepal.Width'], iris['Petal.Width'], iris['Sepal.Length'],
|
|
411
|
+
c=iris['stream_scores'], cmap='seismic', s=50)
|
|
412
|
+
ax.set_xlabel('Sepal.Width')
|
|
413
|
+
ax.set_ylabel('Petal.Width')
|
|
414
|
+
ax.set_zlabel('Sepal.Length')
|
|
415
|
+
plt.show()
|
|
416
|
+
plt.clf()
|
|
417
|
+
plt.cla()
|
|
418
|
+
plt.close()
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
**LoOP Scores using Stream Approach with n=10**
|
|
422
|
+

|
|
423
|
+
|
|
424
|
+
### Notes
|
|
425
|
+
When calculating the LoOP score of incoming data, the original fitted scores are not updated.
|
|
426
|
+
In some applications, it may be beneficial to refit the data periodically. The stream functionality
|
|
427
|
+
also assumes that either data or a distance matrix (or value) will be used in both fitting
|
|
428
|
+
and streaming, with no changes in specification between steps.
|
|
429
|
+
|
|
430
|
+
## Contributing
|
|
431
|
+
|
|
432
|
+
Please use the issue tracker to report any erroneous behavior or desired
|
|
433
|
+
feature requests.
|
|
434
|
+
|
|
435
|
+
If you would like to contribute to development, please fork the repository and make
|
|
436
|
+
any changes to a branch which corresponds to an open issue. Hot fixes
|
|
437
|
+
and bug fixes can be represented by branches with the prefix `fix/` versus
|
|
438
|
+
`feature/` for new capabilities or code improvements. Pull requests will
|
|
439
|
+
then be made from these branches into the repository's `dev` branch
|
|
440
|
+
prior to being pulled into `main`.
|
|
441
|
+
|
|
442
|
+
### Commit Messages and Releases
|
|
443
|
+
|
|
444
|
+
**Your commit messages are important** - here's why.
|
|
445
|
+
|
|
446
|
+
PyNomaly leverages [release-please](https://github.com/googleapis/release-please-action) to help automate the release process using the [Conventional Commits](https://www.conventionalcommits.org/) specification. When pull requests are opened to the `main` branch, release-please will collate the git commit messages and prepare an organized changelog and release notes. This process can be completed because of the Conventional Commits specification.
|
|
447
|
+
|
|
448
|
+
Conventional Commits provides an easy set of rules for creating an explicit commit history; which makes it easier to write automated tools on top of. This convention dovetails with SemVer, by describing the features, fixes, and breaking changes made in commit messages. You can check out examples [here](https://www.conventionalcommits.org/en/v1.0.0/#examples). Make a best effort to use the specification when contributing to PyNomaly code as it dramatically eases the documentation around releases and their features, breaking changes, bug fixes and documentation updates.
|
|
449
|
+
|
|
450
|
+
### Tests
|
|
451
|
+
When contributing, please ensure to run unit tests and add additional tests as
|
|
452
|
+
necessary if adding new functionality. To run the unit tests, use `pytest`:
|
|
453
|
+
|
|
454
|
+
```
|
|
455
|
+
python3 -m pytest --cov=PyNomaly -s -v
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
To run the tests with Numba enabled, simply set the flag `NUMBA` in `test_loop.py`
|
|
459
|
+
to `True`. Note that a drop in coverage is expected due to portions of the code
|
|
460
|
+
being compiled upon code execution.
|
|
461
|
+
|
|
462
|
+
## Versioning
|
|
463
|
+
[Semantic versioning](http://semver.org/) is used for this project. If contributing, please conform to semantic
|
|
464
|
+
versioning guidelines when submitting a pull request.
|
|
465
|
+
|
|
466
|
+
## License
|
|
467
|
+
This project is licensed under the Apache 2.0 license.
|
|
468
|
+
|
|
469
|
+
## Research
|
|
470
|
+
If citing PyNomaly, use the following:
|
|
471
|
+
|
|
472
|
+
```
|
|
473
|
+
@article{Constantinou2018,
|
|
474
|
+
doi = {10.21105/joss.00845},
|
|
475
|
+
url = {https://doi.org/10.21105/joss.00845},
|
|
476
|
+
year = {2018},
|
|
477
|
+
month = {oct},
|
|
478
|
+
publisher = {The Open Journal},
|
|
479
|
+
volume = {3},
|
|
480
|
+
number = {30},
|
|
481
|
+
pages = {845},
|
|
482
|
+
author = {Valentino Constantinou},
|
|
483
|
+
title = {{PyNomaly}: Anomaly detection using Local Outlier Probabilities ({LoOP}).},
|
|
484
|
+
journal = {Journal of Open Source Software}
|
|
485
|
+
}
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
## References
|
|
490
|
+
1. Breunig M., Kriegel H.-P., Ng R., Sander, J. LOF: Identifying Density-based Local Outliers. ACM SIGMOD International Conference on Management of Data (2000). [PDF](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LOF.pdf).
|
|
491
|
+
2. Kriegel H., Kröger P., Schubert E., Zimek A. LoOP: Local Outlier Probabilities. 18th ACM conference on Information and knowledge management, CIKM (2009). [PDF](http://www.dbs.ifi.lmu.de/Publikationen/Papers/LoOP1649.pdf).
|
|
492
|
+
3. Goldstein M., Uchida S. A Comparative Evaluation of Unsupervised Anomaly Detection Algorithms for Multivariate Data. PLoS ONE 11(4): e0152173 (2016).
|
|
493
|
+
4. Hamlet C., Straub J., Russell M., Kerlin S. An incremental and approximate local outlier probability algorithm for intrusion detection and its evaluation. Journal of Cyber Security Technology (2016). [DOI](http://www.tandfonline.com/doi/abs/10.1080/23742917.2016.1226651?journalCode=tsec20).
|
|
494
|
+
|
|
495
|
+
## Acknowledgements
|
|
496
|
+
- The authors of LoOP (Local Outlier Probabilities)
|
|
497
|
+
- Hans-Peter Kriegel
|
|
498
|
+
- Peer Kröger
|
|
499
|
+
- Erich Schubert
|
|
500
|
+
- Arthur Zimek
|
|
501
|
+
- [NASA Jet Propulsion Laboratory](https://jpl.nasa.gov/)
|
|
502
|
+
- [Kyle Hundman](https://github.com/khundman)
|
|
503
|
+
- [Ian Colwell](https://github.com/iancolwell)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
PyNomaly/__init__.py,sha256=TAc8Gsi_DPvsYdWmUTphvkCrw_PDKoRLJ7QMfuNqMpM,360
|
|
2
|
+
PyNomaly/loop.py,sha256=aMG9bo5GNwczenkY6RKjlh3CXsEtRyi8cBIMMN7-NiE,33808
|
|
3
|
+
pynomaly-0.3.5.dist-info/licenses/LICENSE,sha256=xZYfuJFfM57xOlBLbkJmsCwEvw1P6K2t3jI8faTdOMs,563
|
|
4
|
+
pynomaly-0.3.5.dist-info/METADATA,sha256=YoOW9w5rDUoZNozkWUCf8C0UF2NiDve1LKQGn5LsJxc,21891
|
|
5
|
+
pynomaly-0.3.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
6
|
+
pynomaly-0.3.5.dist-info/top_level.txt,sha256=el-HX4RLyBjkh2CW3TK9yXAA54zQOIYVmcJjRbBYKX4,9
|
|
7
|
+
pynomaly-0.3.5.dist-info/RECORD,,
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: PyNomaly
|
|
3
|
-
Version: 0.3.4
|
|
4
|
-
Summary: A Python 3 implementation of LoOP: Local Outlier Probabilities, a local density based outlier detection method providing an outlier score in the range of [0,1].
|
|
5
|
-
Home-page: https://github.com/vc1492a/PyNomaly
|
|
6
|
-
Author: Valentino Constantinou
|
|
7
|
-
Author-email: vc@valentino.io
|
|
8
|
-
License: Apache License, Version 2.0
|
|
9
|
-
Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz
|
|
10
|
-
Keywords: outlier,anomaly,detection,machine,learning,probability
|
|
11
|
-
Platform: UNKNOWN
|
|
12
|
-
Requires-Dist: numpy
|
|
13
|
-
Requires-Dist: python-utils
|
|
14
|
-
|
|
15
|
-
UNKNOWN
|
|
16
|
-
|
|
17
|
-
|
PyNomaly-0.3.4.dist-info/RECORD
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
PyNomaly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
PyNomaly/loop.py,sha256=VLllAa5pOIHZjlI0XuLSpjLzY3tJ_ZTzDCbbIh3VM44,34571
|
|
3
|
-
PyNomaly-0.3.4.dist-info/LICENSE.txt,sha256=xZYfuJFfM57xOlBLbkJmsCwEvw1P6K2t3jI8faTdOMs,563
|
|
4
|
-
PyNomaly-0.3.4.dist-info/METADATA,sha256=xkHaSUSpOnZynE_KfVQAwoBXNOzTpE-IymwuiRdIeos,581
|
|
5
|
-
PyNomaly-0.3.4.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
|
|
6
|
-
PyNomaly-0.3.4.dist-info/top_level.txt,sha256=el-HX4RLyBjkh2CW3TK9yXAA54zQOIYVmcJjRbBYKX4,9
|
|
7
|
-
PyNomaly-0.3.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|