PyNomaly 0.3.2__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyNomaly/__init__.py +18 -0
- PyNomaly/loop.py +398 -335
- pynomaly-0.3.5.dist-info/METADATA +503 -0
- pynomaly-0.3.5.dist-info/RECORD +7 -0
- {PyNomaly-0.3.2.dist-info → pynomaly-0.3.5.dist-info}/WHEEL +1 -1
- PyNomaly-0.3.2.dist-info/METADATA +0 -17
- PyNomaly-0.3.2.dist-info/RECORD +0 -7
- /PyNomaly-0.3.2.dist-info/LICENSE.txt → /pynomaly-0.3.5.dist-info/licenses/LICENSE +0 -0
- {PyNomaly-0.3.2.dist-info → pynomaly-0.3.5.dist-info}/top_level.txt +0 -0
PyNomaly/loop.py
CHANGED
|
@@ -10,13 +10,33 @@ try:
|
|
|
10
10
|
except ImportError:
|
|
11
11
|
pass
|
|
12
12
|
|
|
13
|
-
__author__ =
|
|
14
|
-
__version__ =
|
|
15
|
-
__license__ =
|
|
13
|
+
__author__ = "Valentino Constantinou"
|
|
14
|
+
__version__ = "0.3.5"
|
|
15
|
+
__license__ = "Apache License, Version 2.0"
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
# Custom Exceptions
|
|
19
|
+
class PyNomalyError(Exception):
|
|
20
|
+
"""Base exception for PyNomaly."""
|
|
21
|
+
pass
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ValidationError(PyNomalyError):
|
|
25
|
+
"""Raised when input validation fails."""
|
|
26
|
+
pass
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ClusterSizeError(ValidationError):
|
|
30
|
+
"""Raised when cluster size is smaller than n_neighbors."""
|
|
31
|
+
pass
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MissingValuesError(ValidationError):
|
|
35
|
+
"""Raised when data contains missing values."""
|
|
36
|
+
pass
|
|
19
37
|
|
|
38
|
+
|
|
39
|
+
class Utils:
|
|
20
40
|
@staticmethod
|
|
21
41
|
def emit_progress_bar(progress: str, index: int, total: int) -> str:
|
|
22
42
|
"""
|
|
@@ -32,7 +52,10 @@ class Utils:
|
|
|
32
52
|
|
|
33
53
|
w, h = get_terminal_size()
|
|
34
54
|
sys.stdout.write("\r")
|
|
35
|
-
|
|
55
|
+
if total < w:
|
|
56
|
+
block_size = int(w / total)
|
|
57
|
+
else:
|
|
58
|
+
block_size = int(total / w)
|
|
36
59
|
if index % block_size == 0:
|
|
37
60
|
progress += "="
|
|
38
61
|
percent = index / total
|
|
@@ -52,7 +75,7 @@ class LocalOutlierProbability(object):
|
|
|
52
75
|
:param cluster_labels: a numpy array of cluster assignments w.r.t. each
|
|
53
76
|
sample (optional, default None)
|
|
54
77
|
:return:
|
|
55
|
-
""""""
|
|
78
|
+
""" """
|
|
56
79
|
|
|
57
80
|
Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP:
|
|
58
81
|
Local Outlier Probabilities.
|
|
@@ -75,203 +98,190 @@ class LocalOutlierProbability(object):
|
|
|
75
98
|
(2016).
|
|
76
99
|
"""
|
|
77
100
|
|
|
78
|
-
|
|
101
|
+
"""
|
|
102
|
+
Validation methods.
|
|
103
|
+
These methods validate inputs and raise exceptions or warnings as appropriate.
|
|
104
|
+
"""
|
|
79
105
|
|
|
106
|
+
@staticmethod
|
|
107
|
+
def _convert_to_array(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
|
|
108
|
+
"""
|
|
109
|
+
Converts the input data to a numpy array if it is a Pandas DataFrame
|
|
110
|
+
or validates it is already a numpy array.
|
|
111
|
+
:param obj: user-provided input data.
|
|
112
|
+
:return: a vector of values to be used in calculating the local
|
|
113
|
+
outlier probability.
|
|
114
|
+
"""
|
|
115
|
+
if obj.__class__.__name__ == "DataFrame":
|
|
116
|
+
points_vector = obj.values
|
|
117
|
+
return points_vector
|
|
118
|
+
elif obj.__class__.__name__ == "ndarray":
|
|
119
|
+
points_vector = obj
|
|
120
|
+
return points_vector
|
|
121
|
+
else:
|
|
122
|
+
warnings.warn(
|
|
123
|
+
"Provided data or distance matrix must be in ndarray "
|
|
124
|
+
"or DataFrame.",
|
|
125
|
+
UserWarning,
|
|
126
|
+
)
|
|
127
|
+
if isinstance(obj, list):
|
|
128
|
+
points_vector = np.array(obj)
|
|
129
|
+
return points_vector
|
|
130
|
+
points_vector = np.array([obj])
|
|
131
|
+
return points_vector
|
|
132
|
+
|
|
133
|
+
def _validate_inputs(self):
|
|
80
134
|
"""
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
albeit with some form of user warning.
|
|
135
|
+
Validates the inputs provided during initialization to ensure
|
|
136
|
+
that the needed objects are provided.
|
|
137
|
+
:return: a tuple of (data, distance_matrix, neighbor_matrix) or
|
|
138
|
+
raises a warning for invalid inputs.
|
|
86
139
|
"""
|
|
140
|
+
if all(v is None for v in [self.data, self.distance_matrix]):
|
|
141
|
+
warnings.warn(
|
|
142
|
+
"Data or a distance matrix must be provided.", UserWarning
|
|
143
|
+
)
|
|
144
|
+
return False
|
|
145
|
+
elif all(v is not None for v in [self.data, self.distance_matrix]):
|
|
146
|
+
warnings.warn(
|
|
147
|
+
"Only one of the following may be provided: data or a "
|
|
148
|
+
"distance matrix (not both).",
|
|
149
|
+
UserWarning,
|
|
150
|
+
)
|
|
151
|
+
return False
|
|
152
|
+
if self.data is not None:
|
|
153
|
+
points_vector = self._convert_to_array(self.data)
|
|
154
|
+
return points_vector, self.distance_matrix, self.neighbor_matrix
|
|
155
|
+
if all(
|
|
156
|
+
matrix is not None
|
|
157
|
+
for matrix in [self.neighbor_matrix, self.distance_matrix]
|
|
158
|
+
):
|
|
159
|
+
dist_vector = self._convert_to_array(self.distance_matrix)
|
|
160
|
+
neigh_vector = self._convert_to_array(self.neighbor_matrix)
|
|
161
|
+
else:
|
|
162
|
+
warnings.warn(
|
|
163
|
+
"A neighbor index matrix and distance matrix must both be "
|
|
164
|
+
"provided when not using raw input data.",
|
|
165
|
+
UserWarning,
|
|
166
|
+
)
|
|
167
|
+
return False
|
|
168
|
+
if self.distance_matrix.shape != self.neighbor_matrix.shape:
|
|
169
|
+
warnings.warn(
|
|
170
|
+
"The shape of the distance and neighbor "
|
|
171
|
+
"index matrices must match.",
|
|
172
|
+
UserWarning,
|
|
173
|
+
)
|
|
174
|
+
return False
|
|
175
|
+
elif (self.distance_matrix.shape[1] != self.n_neighbors) or (
|
|
176
|
+
self.neighbor_matrix.shape[1] != self.n_neighbors
|
|
177
|
+
):
|
|
178
|
+
warnings.warn(
|
|
179
|
+
"The shape of the distance or "
|
|
180
|
+
"neighbor index matrix does not "
|
|
181
|
+
"match the number of neighbors "
|
|
182
|
+
"specified.",
|
|
183
|
+
UserWarning,
|
|
184
|
+
)
|
|
185
|
+
return False
|
|
186
|
+
return self.data, dist_vector, neigh_vector
|
|
187
|
+
|
|
188
|
+
def _check_cluster_size(self) -> None:
|
|
189
|
+
"""
|
|
190
|
+
Validates the cluster labels to ensure that the smallest cluster
|
|
191
|
+
size (number of observations in the cluster) is larger than the
|
|
192
|
+
specified number of neighbors.
|
|
193
|
+
:raises ClusterSizeError: if any cluster is too small.
|
|
194
|
+
"""
|
|
195
|
+
c_labels = self._cluster_labels()
|
|
196
|
+
for cluster_id in set(c_labels):
|
|
197
|
+
c_size = np.where(c_labels == cluster_id)[0].shape[0]
|
|
198
|
+
if c_size <= self.n_neighbors:
|
|
199
|
+
raise ClusterSizeError(
|
|
200
|
+
"Number of neighbors specified larger than smallest "
|
|
201
|
+
"cluster. Specify a number of neighbors smaller than "
|
|
202
|
+
"the smallest cluster size (observations in smallest "
|
|
203
|
+
"cluster minus one)."
|
|
204
|
+
)
|
|
87
205
|
|
|
206
|
+
def _check_n_neighbors(self) -> bool:
|
|
88
207
|
"""
|
|
89
|
-
|
|
208
|
+
Validates the specified number of neighbors to ensure that it is
|
|
209
|
+
greater than 0 and that the specified value is less than the total
|
|
210
|
+
number of observations.
|
|
211
|
+
:return: a boolean indicating whether validation has passed without
|
|
212
|
+
adjustment.
|
|
90
213
|
"""
|
|
214
|
+
if not self.n_neighbors > 0:
|
|
215
|
+
self.n_neighbors = 10
|
|
216
|
+
warnings.warn(
|
|
217
|
+
"n_neighbors must be greater than 0."
|
|
218
|
+
" Fit with " + str(self.n_neighbors) + " instead.",
|
|
219
|
+
UserWarning,
|
|
220
|
+
)
|
|
221
|
+
return False
|
|
222
|
+
elif self.n_neighbors >= self._n_observations():
|
|
223
|
+
self.n_neighbors = self._n_observations() - 1
|
|
224
|
+
warnings.warn(
|
|
225
|
+
"n_neighbors must be less than the number of observations."
|
|
226
|
+
" Fit with " + str(self.n_neighbors) + " instead.",
|
|
227
|
+
UserWarning,
|
|
228
|
+
)
|
|
229
|
+
return True
|
|
91
230
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
elif obj.__class__.__name__ == 'ndarray':
|
|
105
|
-
points_vector = obj
|
|
106
|
-
return points_vector
|
|
107
|
-
else:
|
|
108
|
-
warnings.warn(
|
|
109
|
-
"Provided data or distance matrix must be in ndarray "
|
|
110
|
-
"or DataFrame.",
|
|
111
|
-
UserWarning)
|
|
112
|
-
if isinstance(obj, list):
|
|
113
|
-
points_vector = np.array(obj)
|
|
114
|
-
return points_vector
|
|
115
|
-
points_vector = np.array([obj])
|
|
116
|
-
return points_vector
|
|
231
|
+
def _check_extent(self) -> bool:
|
|
232
|
+
"""
|
|
233
|
+
Validates the specified extent parameter to ensure it is either 1,
|
|
234
|
+
2, or 3.
|
|
235
|
+
:return: a boolean indicating whether validation has passed.
|
|
236
|
+
"""
|
|
237
|
+
if self.extent not in [1, 2, 3]:
|
|
238
|
+
warnings.warn(
|
|
239
|
+
"extent parameter (lambda) must be 1, 2, or 3.", UserWarning
|
|
240
|
+
)
|
|
241
|
+
return False
|
|
242
|
+
return True
|
|
117
243
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
warnings.warn("The shape of the distance or "
|
|
160
|
-
"neighbor index matrix does not "
|
|
161
|
-
"match the number of neighbors "
|
|
162
|
-
"specified.", UserWarning)
|
|
163
|
-
return False
|
|
164
|
-
return obj.data, dist_vector, neigh_vector
|
|
165
|
-
|
|
166
|
-
@staticmethod
|
|
167
|
-
def _cluster_size(obj) -> bool:
|
|
168
|
-
"""
|
|
169
|
-
Validates the cluster labels to ensure that the smallest cluster
|
|
170
|
-
size (number of observations in the cluster) is larger than the
|
|
171
|
-
specified number of neighbors.
|
|
172
|
-
:param obj: a PyNomaly object.
|
|
173
|
-
:return: a boolean indicating whether validation has passed.
|
|
174
|
-
"""
|
|
175
|
-
c_labels = obj._cluster_labels()
|
|
176
|
-
for cluster_id in set(c_labels):
|
|
177
|
-
c_size = np.where(c_labels == cluster_id)[0].shape[0]
|
|
178
|
-
if c_size <= obj.n_neighbors:
|
|
179
|
-
warnings.warn(
|
|
180
|
-
"Number of neighbors specified larger than smallest "
|
|
181
|
-
"cluster. Specify a number of neighbors smaller than "
|
|
182
|
-
"the smallest cluster size (observations in smallest "
|
|
183
|
-
"cluster minus one).",
|
|
184
|
-
UserWarning)
|
|
185
|
-
return False
|
|
186
|
-
return True
|
|
187
|
-
|
|
188
|
-
@staticmethod
|
|
189
|
-
def _n_neighbors(obj) -> bool:
|
|
190
|
-
"""
|
|
191
|
-
Validates the specified number of neighbors to ensure that it is
|
|
192
|
-
greater than 0 and that the specified value is less than the total
|
|
193
|
-
number of observations.
|
|
194
|
-
:param obj: a PyNomaly object.
|
|
195
|
-
:return: a boolean indicating whether validation has passed.
|
|
196
|
-
"""
|
|
197
|
-
if not obj.n_neighbors > 0:
|
|
198
|
-
obj.n_neighbors = 10
|
|
199
|
-
warnings.warn("n_neighbors must be greater than 0."
|
|
200
|
-
" Fit with " + str(obj.n_neighbors) +
|
|
201
|
-
" instead.",
|
|
202
|
-
UserWarning)
|
|
203
|
-
return False
|
|
204
|
-
elif obj.n_neighbors >= obj._n_observations():
|
|
205
|
-
obj.n_neighbors = obj._n_observations() - 1
|
|
206
|
-
warnings.warn(
|
|
207
|
-
"n_neighbors must be less than the number of observations."
|
|
208
|
-
" Fit with " + str(obj.n_neighbors) + " instead.",
|
|
209
|
-
UserWarning)
|
|
210
|
-
return True
|
|
211
|
-
|
|
212
|
-
@staticmethod
|
|
213
|
-
def _extent(obj) -> bool:
|
|
214
|
-
"""
|
|
215
|
-
Validates the specified extent parameter to ensure it is either 1,
|
|
216
|
-
2, or 3.
|
|
217
|
-
:param obj: a PyNomaly object.
|
|
218
|
-
:return: a boolean indicating whether validation has passed.
|
|
219
|
-
"""
|
|
220
|
-
if obj.extent not in [1, 2, 3]:
|
|
221
|
-
warnings.warn(
|
|
222
|
-
"extent parameter (lambda) must be 1, 2, or 3.",
|
|
223
|
-
UserWarning)
|
|
224
|
-
return False
|
|
225
|
-
return True
|
|
226
|
-
|
|
227
|
-
@staticmethod
|
|
228
|
-
def _missing_values(obj) -> bool:
|
|
229
|
-
"""
|
|
230
|
-
Validates the provided data to ensure that it contains no
|
|
231
|
-
missing values.
|
|
232
|
-
:param obj: a PyNomaly object.
|
|
233
|
-
:return: a boolean indicating whether validation has passed.
|
|
234
|
-
"""
|
|
235
|
-
if np.any(np.isnan(obj.data)):
|
|
236
|
-
warnings.warn(
|
|
237
|
-
"Method does not support missing values in input data.",
|
|
238
|
-
UserWarning)
|
|
239
|
-
return False
|
|
240
|
-
return True
|
|
241
|
-
|
|
242
|
-
@staticmethod
|
|
243
|
-
def _fit(obj) -> bool:
|
|
244
|
-
"""
|
|
245
|
-
Validates that the model was fit prior to calling the stream()
|
|
246
|
-
method.
|
|
247
|
-
:param obj: a PyNomaly object.
|
|
248
|
-
:return: a boolean indicating whether validation has passed.
|
|
249
|
-
"""
|
|
250
|
-
if obj.is_fit is False:
|
|
251
|
-
warnings.warn(
|
|
252
|
-
"Must fit on historical data by calling fit() prior to "
|
|
253
|
-
"calling stream(x).",
|
|
254
|
-
UserWarning)
|
|
255
|
-
return False
|
|
256
|
-
return True
|
|
257
|
-
|
|
258
|
-
@staticmethod
|
|
259
|
-
def _no_cluster_labels(obj) -> bool:
|
|
260
|
-
"""
|
|
261
|
-
Checks to see if cluster labels are attempting to be used in
|
|
262
|
-
stream() and, if so, calls fit() once again but without cluster
|
|
263
|
-
labels. As PyNomaly does not accept clustering algorithms as input,
|
|
264
|
-
the stream approach does not support clustering.
|
|
265
|
-
:param obj: a PyNomaly object.
|
|
266
|
-
:return: a boolean indicating whether validation has passed.
|
|
267
|
-
"""
|
|
268
|
-
if len(set(obj._cluster_labels())) > 1:
|
|
269
|
-
warnings.warn(
|
|
270
|
-
"Stream approach does not support clustered data. "
|
|
271
|
-
"Automatically refit using single cluster of points.",
|
|
272
|
-
UserWarning)
|
|
273
|
-
return False
|
|
274
|
-
return True
|
|
244
|
+
def _check_missing_values(self) -> None:
|
|
245
|
+
"""
|
|
246
|
+
Validates the provided data to ensure that it contains no
|
|
247
|
+
missing values.
|
|
248
|
+
:raises MissingValuesError: if data contains NaN values.
|
|
249
|
+
"""
|
|
250
|
+
if np.any(np.isnan(self.data)):
|
|
251
|
+
raise MissingValuesError(
|
|
252
|
+
"Method does not support missing values in input data."
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
def _check_is_fit(self) -> bool:
|
|
256
|
+
"""
|
|
257
|
+
Checks that the model was fit prior to calling the stream() method.
|
|
258
|
+
:return: a boolean indicating whether the model has been fit.
|
|
259
|
+
"""
|
|
260
|
+
if self.is_fit is False:
|
|
261
|
+
warnings.warn(
|
|
262
|
+
"Must fit on historical data by calling fit() prior to "
|
|
263
|
+
"calling stream(x).",
|
|
264
|
+
UserWarning,
|
|
265
|
+
)
|
|
266
|
+
return False
|
|
267
|
+
return True
|
|
268
|
+
|
|
269
|
+
def _check_no_cluster_labels(self) -> bool:
|
|
270
|
+
"""
|
|
271
|
+
Checks to see if cluster labels are attempting to be used in
|
|
272
|
+
stream() and, if so, returns False. As PyNomaly does not accept
|
|
273
|
+
clustering algorithms as input, the stream approach does not
|
|
274
|
+
support clustering.
|
|
275
|
+
:return: a boolean indicating whether single cluster (no labels).
|
|
276
|
+
"""
|
|
277
|
+
if len(set(self._cluster_labels())) > 1:
|
|
278
|
+
warnings.warn(
|
|
279
|
+
"Stream approach does not support clustered data. "
|
|
280
|
+
"Automatically refit using single cluster of points.",
|
|
281
|
+
UserWarning,
|
|
282
|
+
)
|
|
283
|
+
return False
|
|
284
|
+
return True
|
|
275
285
|
|
|
276
286
|
"""
|
|
277
287
|
Decorators.
|
|
@@ -291,43 +301,35 @@ class LocalOutlierProbability(object):
|
|
|
291
301
|
assert len(types) == f.__code__.co_argcount
|
|
292
302
|
|
|
293
303
|
def new_f(*args, **kwds):
|
|
294
|
-
for
|
|
295
|
-
if type(a).__name__ ==
|
|
304
|
+
for a, t in zip(args, types):
|
|
305
|
+
if type(a).__name__ == "DataFrame":
|
|
296
306
|
a = np.array(a)
|
|
297
307
|
if isinstance(a, t) is False:
|
|
298
|
-
warnings.warn(
|
|
299
|
-
|
|
308
|
+
warnings.warn(
|
|
309
|
+
"Argument %r is not of type %s" % (a, t), UserWarning
|
|
310
|
+
)
|
|
300
311
|
opt_types = {
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
},
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
},
|
|
307
|
-
|
|
308
|
-
'type': types[4]
|
|
309
|
-
},
|
|
310
|
-
'n_neighbors': {
|
|
311
|
-
'type': types[5]
|
|
312
|
-
},
|
|
313
|
-
'cluster_labels': {
|
|
314
|
-
'type': types[6]
|
|
315
|
-
},
|
|
316
|
-
'use_numba': {
|
|
317
|
-
'type': types[7]
|
|
318
|
-
},
|
|
319
|
-
'progress_bar': {
|
|
320
|
-
'type': types[8]
|
|
321
|
-
}
|
|
312
|
+
"distance_matrix": {"type": types[2]},
|
|
313
|
+
"neighbor_matrix": {"type": types[3]},
|
|
314
|
+
"extent": {"type": types[4]},
|
|
315
|
+
"n_neighbors": {"type": types[5]},
|
|
316
|
+
"cluster_labels": {"type": types[6]},
|
|
317
|
+
"use_numba": {"type": types[7]},
|
|
318
|
+
"progress_bar": {"type": types[8]},
|
|
322
319
|
}
|
|
323
320
|
for x in kwds:
|
|
324
|
-
opt_types[x][
|
|
321
|
+
opt_types[x]["value"] = kwds[x]
|
|
325
322
|
for k in opt_types:
|
|
326
323
|
try:
|
|
327
|
-
if
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
324
|
+
if (
|
|
325
|
+
isinstance(opt_types[k]["value"], opt_types[k]["type"])
|
|
326
|
+
is False
|
|
327
|
+
):
|
|
328
|
+
warnings.warn(
|
|
329
|
+
"Argument %r is not of type %s."
|
|
330
|
+
% (k, opt_types[k]["type"]),
|
|
331
|
+
UserWarning,
|
|
332
|
+
)
|
|
331
333
|
except KeyError:
|
|
332
334
|
pass
|
|
333
335
|
return f(*args, **kwds)
|
|
@@ -337,11 +339,28 @@ class LocalOutlierProbability(object):
|
|
|
337
339
|
|
|
338
340
|
return decorator
|
|
339
341
|
|
|
340
|
-
@accepts(
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
342
|
+
@accepts(
|
|
343
|
+
object,
|
|
344
|
+
np.ndarray,
|
|
345
|
+
np.ndarray,
|
|
346
|
+
np.ndarray,
|
|
347
|
+
(int, np.integer),
|
|
348
|
+
(int, np.integer),
|
|
349
|
+
list,
|
|
350
|
+
bool,
|
|
351
|
+
bool,
|
|
352
|
+
)
|
|
353
|
+
def __init__(
|
|
354
|
+
self,
|
|
355
|
+
data=None,
|
|
356
|
+
distance_matrix=None,
|
|
357
|
+
neighbor_matrix=None,
|
|
358
|
+
extent=3,
|
|
359
|
+
n_neighbors=10,
|
|
360
|
+
cluster_labels=None,
|
|
361
|
+
use_numba=False,
|
|
362
|
+
progress_bar=False,
|
|
363
|
+
) -> None:
|
|
345
364
|
self.data = data
|
|
346
365
|
self.distance_matrix = distance_matrix
|
|
347
366
|
self.neighbor_matrix = neighbor_matrix
|
|
@@ -358,29 +377,28 @@ class LocalOutlierProbability(object):
|
|
|
358
377
|
self.progress_bar = progress_bar
|
|
359
378
|
self.is_fit = False
|
|
360
379
|
|
|
361
|
-
if self.use_numba is True and
|
|
380
|
+
if self.use_numba is True and "numba" not in sys.modules:
|
|
362
381
|
self.use_numba = False
|
|
363
382
|
warnings.warn(
|
|
364
|
-
"Numba is not available, falling back to pure python mode.",
|
|
365
|
-
|
|
383
|
+
"Numba is not available, falling back to pure python mode.", UserWarning
|
|
384
|
+
)
|
|
366
385
|
|
|
367
|
-
self.
|
|
368
|
-
self.
|
|
386
|
+
self._validate_inputs()
|
|
387
|
+
self._check_extent()
|
|
369
388
|
|
|
370
389
|
"""
|
|
371
390
|
Private methods.
|
|
372
391
|
"""
|
|
373
392
|
|
|
374
393
|
@staticmethod
|
|
375
|
-
def _standard_distance(cardinality: float, sum_squared_distance: float)
|
|
376
|
-
-> float:
|
|
394
|
+
def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:
|
|
377
395
|
"""
|
|
378
396
|
Calculates the standard distance of an observation.
|
|
379
397
|
:param cardinality: the cardinality of the input observation.
|
|
380
398
|
:param sum_squared_distance: the sum squared distance between all
|
|
381
399
|
neighbors of the input observation.
|
|
382
400
|
:return: the standard distance.
|
|
383
|
-
#
|
|
401
|
+
#"""
|
|
384
402
|
division_result = sum_squared_distance / cardinality
|
|
385
403
|
st_dist = sqrt(division_result)
|
|
386
404
|
return st_dist
|
|
@@ -397,8 +415,9 @@ class LocalOutlierProbability(object):
|
|
|
397
415
|
return extent * standard_distance
|
|
398
416
|
|
|
399
417
|
@staticmethod
|
|
400
|
-
def _prob_outlier_factor(
|
|
401
|
-
|
|
418
|
+
def _prob_outlier_factor(
|
|
419
|
+
probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray
|
|
420
|
+
) -> np.ndarray:
|
|
402
421
|
"""
|
|
403
422
|
Calculates the probabilistic outlier factor of an observation.
|
|
404
423
|
:param probabilistic_distance: the probabilistic distance of the
|
|
@@ -409,14 +428,14 @@ class LocalOutlierProbability(object):
|
|
|
409
428
|
if np.all(probabilistic_distance == ev_prob_dist):
|
|
410
429
|
return np.zeros(probabilistic_distance.shape)
|
|
411
430
|
else:
|
|
412
|
-
ev_prob_dist[ev_prob_dist == 0.] = 1.
|
|
413
|
-
result = np.divide(probabilistic_distance, ev_prob_dist) - 1.
|
|
431
|
+
ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8
|
|
432
|
+
result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0
|
|
414
433
|
return result
|
|
415
434
|
|
|
416
435
|
@staticmethod
|
|
417
|
-
def _norm_prob_outlier_factor(
|
|
418
|
-
|
|
419
|
-
|
|
436
|
+
def _norm_prob_outlier_factor(
|
|
437
|
+
extent: float, ev_probabilistic_outlier_factor: list
|
|
438
|
+
) -> list:
|
|
420
439
|
"""
|
|
421
440
|
Calculates the normalized probabilistic outlier factor of an
|
|
422
441
|
observation.
|
|
@@ -431,8 +450,9 @@ class LocalOutlierProbability(object):
|
|
|
431
450
|
return npofs
|
|
432
451
|
|
|
433
452
|
@staticmethod
|
|
434
|
-
def _local_outlier_probability(
|
|
435
|
-
|
|
453
|
+
def _local_outlier_probability(
|
|
454
|
+
plof_val: np.ndarray, nplof_val: np.ndarray
|
|
455
|
+
) -> np.ndarray:
|
|
436
456
|
"""
|
|
437
457
|
Calculates the local outlier probability of an observation.
|
|
438
458
|
:param plof_val: the probabilistic outlier factor of the input
|
|
@@ -445,7 +465,7 @@ class LocalOutlierProbability(object):
|
|
|
445
465
|
if np.all(plof_val == nplof_val):
|
|
446
466
|
return np.zeros(plof_val.shape)
|
|
447
467
|
else:
|
|
448
|
-
return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.))))
|
|
468
|
+
return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))
|
|
449
469
|
|
|
450
470
|
def _n_observations(self) -> int:
|
|
451
471
|
"""
|
|
@@ -499,8 +519,9 @@ class LocalOutlierProbability(object):
|
|
|
499
519
|
:return: the updated storage matrix that collects information on
|
|
500
520
|
each observation.
|
|
501
521
|
"""
|
|
502
|
-
for vec, cluster_id in zip(
|
|
503
|
-
|
|
522
|
+
for vec, cluster_id in zip(
|
|
523
|
+
range(self.distance_matrix.shape[0]), self._cluster_labels()
|
|
524
|
+
):
|
|
504
525
|
data_store[vec][0] = cluster_id
|
|
505
526
|
data_store[vec][1] = self.distance_matrix[vec]
|
|
506
527
|
data_store[vec][2] = self.neighbor_matrix[vec]
|
|
@@ -508,10 +529,10 @@ class LocalOutlierProbability(object):
|
|
|
508
529
|
|
|
509
530
|
@staticmethod
|
|
510
531
|
def _compute_distance_and_neighbor_matrix(
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
532
|
+
clust_points_vector: np.ndarray,
|
|
533
|
+
indices: np.ndarray,
|
|
534
|
+
distances: np.ndarray,
|
|
535
|
+
indexes: np.ndarray,
|
|
515
536
|
) -> Tuple[np.ndarray, np.ndarray, int]:
|
|
516
537
|
"""
|
|
517
538
|
This helper method provides the heavy lifting for the _distances
|
|
@@ -519,27 +540,27 @@ class LocalOutlierProbability(object):
|
|
|
519
540
|
written so that it can make full use of Numba's jit capabilities if
|
|
520
541
|
desired.
|
|
521
542
|
"""
|
|
522
|
-
|
|
523
543
|
for i in range(clust_points_vector.shape[0]):
|
|
524
544
|
for j in range(i + 1, clust_points_vector.shape[0]):
|
|
525
|
-
|
|
545
|
+
# Global index of the points
|
|
546
|
+
global_i = indices[0][i]
|
|
547
|
+
global_j = indices[0][j]
|
|
526
548
|
|
|
527
|
-
|
|
549
|
+
# Compute Euclidean distance
|
|
550
|
+
diff = clust_points_vector[i] - clust_points_vector[j]
|
|
528
551
|
d = np.dot(diff, diff) ** 0.5
|
|
529
552
|
|
|
530
|
-
|
|
531
|
-
idx_max = distances[
|
|
553
|
+
# Update distance and neighbor index for global_i
|
|
554
|
+
idx_max = distances[global_i].argmax()
|
|
555
|
+
if d < distances[global_i][idx_max]:
|
|
556
|
+
distances[global_i][idx_max] = d
|
|
557
|
+
indexes[global_i][idx_max] = global_j
|
|
532
558
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
idx_max = distances[idx].argmax()
|
|
539
|
-
|
|
540
|
-
if d < distances[idx][idx_max]:
|
|
541
|
-
distances[idx][idx_max] = d
|
|
542
|
-
indexes[idx][idx_max] = p[0][0]
|
|
559
|
+
# Update distance and neighbor index for global_j
|
|
560
|
+
idx_max = distances[global_j].argmax()
|
|
561
|
+
if d < distances[global_j][idx_max]:
|
|
562
|
+
distances[global_j][idx_max] = d
|
|
563
|
+
indexes[global_j][idx_max] = global_i
|
|
543
564
|
|
|
544
565
|
yield distances, indexes, i
|
|
545
566
|
|
|
@@ -552,20 +573,21 @@ class LocalOutlierProbability(object):
|
|
|
552
573
|
:return: the updated storage matrix that collects information on
|
|
553
574
|
each observation.
|
|
554
575
|
"""
|
|
555
|
-
distances = np.full(
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
self.points_vector = self.
|
|
560
|
-
compute =
|
|
561
|
-
|
|
562
|
-
self.
|
|
576
|
+
distances = np.full(
|
|
577
|
+
[self._n_observations(), self.n_neighbors], 9e10, dtype=float
|
|
578
|
+
)
|
|
579
|
+
indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
|
|
580
|
+
self.points_vector = self._convert_to_array(self.data)
|
|
581
|
+
compute = (
|
|
582
|
+
numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
|
|
583
|
+
if self.use_numba
|
|
584
|
+
else self._compute_distance_and_neighbor_matrix
|
|
585
|
+
)
|
|
563
586
|
progress = "="
|
|
564
587
|
for cluster_id in set(self._cluster_labels()):
|
|
565
588
|
indices = np.where(self._cluster_labels() == cluster_id)
|
|
566
589
|
clust_points_vector = np.array(
|
|
567
|
-
self.points_vector.take(indices, axis=0)[0],
|
|
568
|
-
dtype=np.float64
|
|
590
|
+
self.points_vector.take(indices, axis=0)[0], dtype=np.float64
|
|
569
591
|
)
|
|
570
592
|
# a generator that yields an updated distance matrix on each loop
|
|
571
593
|
for c in compute(clust_points_vector, indices, distances, indexes):
|
|
@@ -573,7 +595,8 @@ class LocalOutlierProbability(object):
|
|
|
573
595
|
# update the progress bar
|
|
574
596
|
if progress_bar is True:
|
|
575
597
|
progress = Utils.emit_progress_bar(
|
|
576
|
-
progress, i+1, clust_points_vector.shape[0]
|
|
598
|
+
progress, i + 1, clust_points_vector.shape[0]
|
|
599
|
+
)
|
|
577
600
|
|
|
578
601
|
self.distance_matrix = distances
|
|
579
602
|
self.neighbor_matrix = indexes
|
|
@@ -627,11 +650,10 @@ class LocalOutlierProbability(object):
|
|
|
627
650
|
"""
|
|
628
651
|
prob_distances = []
|
|
629
652
|
for i in range(data_store[:, 4].shape[0]):
|
|
630
|
-
prob_distances.append(
|
|
631
|
-
self._prob_distance(self.extent, data_store[:, 4][i]))
|
|
653
|
+
prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))
|
|
632
654
|
return np.hstack((data_store, np.array([prob_distances]).T))
|
|
633
655
|
|
|
634
|
-
def _prob_distances_ev(self, data_store
|
|
656
|
+
def _prob_distances_ev(self, data_store) -> np.ndarray:
|
|
635
657
|
"""
|
|
636
658
|
Calculates the expected value of the probabilistic distance for
|
|
637
659
|
each observation in the input data with respect to the cluster the
|
|
@@ -645,19 +667,20 @@ class LocalOutlierProbability(object):
|
|
|
645
667
|
for cluster_id in self.cluster_labels_u:
|
|
646
668
|
indices = np.where(data_store[:, 0] == cluster_id)[0]
|
|
647
669
|
for index in indices:
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
670
|
+
# Global neighbor indices for the current point
|
|
671
|
+
nbrhood = data_store[index][2].astype(int) # Ensure global indices
|
|
672
|
+
nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(
|
|
673
|
+
float
|
|
674
|
+
)
|
|
651
675
|
nbrhood_prob_distances_nonan = nbrhood_prob_distances[
|
|
652
|
-
np.logical_not(np.isnan(nbrhood_prob_distances))
|
|
653
|
-
|
|
654
|
-
|
|
676
|
+
np.logical_not(np.isnan(nbrhood_prob_distances))
|
|
677
|
+
]
|
|
678
|
+
prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()
|
|
679
|
+
|
|
655
680
|
self.prob_distances_ev = prob_set_distance_ev
|
|
656
|
-
|
|
657
|
-
return data_store
|
|
681
|
+
return np.hstack((data_store, prob_set_distance_ev))
|
|
658
682
|
|
|
659
|
-
def _prob_local_outlier_factors(self,
|
|
660
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
683
|
+
def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
|
|
661
684
|
"""
|
|
662
685
|
Calculates the probabilistic local outlier factor for each
|
|
663
686
|
observation in the input data.
|
|
@@ -667,13 +690,22 @@ class LocalOutlierProbability(object):
|
|
|
667
690
|
each observation.
|
|
668
691
|
"""
|
|
669
692
|
return np.hstack(
|
|
670
|
-
(
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
693
|
+
(
|
|
694
|
+
data_store,
|
|
695
|
+
np.array(
|
|
696
|
+
[
|
|
697
|
+
np.apply_along_axis(
|
|
698
|
+
self._prob_outlier_factor,
|
|
699
|
+
0,
|
|
700
|
+
data_store[:, 5],
|
|
701
|
+
data_store[:, 6],
|
|
702
|
+
)
|
|
703
|
+
]
|
|
704
|
+
).T,
|
|
705
|
+
)
|
|
706
|
+
)
|
|
674
707
|
|
|
675
|
-
def _prob_local_outlier_factors_ev(self,
|
|
676
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
708
|
+
def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:
|
|
677
709
|
"""
|
|
678
710
|
Calculates the expected value of the probabilistic local outlier factor
|
|
679
711
|
for each observation in the input data with respect to the cluster the
|
|
@@ -686,21 +718,31 @@ class LocalOutlierProbability(object):
|
|
|
686
718
|
prob_local_outlier_factor_ev_dict = {}
|
|
687
719
|
for cluster_id in self.cluster_labels_u:
|
|
688
720
|
indices = np.where(data_store[:, 0] == cluster_id)
|
|
689
|
-
prob_local_outlier_factors = np.take(data_store[:, 7],
|
|
690
|
-
|
|
691
|
-
prob_local_outlier_factors_nonan = prob_local_outlier_factors[
|
|
692
|
-
np.logical_not(np.isnan(prob_local_outlier_factors))]
|
|
693
|
-
prob_local_outlier_factor_ev_dict[cluster_id] = (
|
|
694
|
-
np.power(prob_local_outlier_factors_nonan, 2).sum() /
|
|
695
|
-
float(prob_local_outlier_factors_nonan.size)
|
|
721
|
+
prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(
|
|
722
|
+
float
|
|
696
723
|
)
|
|
724
|
+
prob_local_outlier_factors_nonan = prob_local_outlier_factors[
|
|
725
|
+
np.logical_not(np.isnan(prob_local_outlier_factors))
|
|
726
|
+
]
|
|
727
|
+
prob_local_outlier_factor_ev_dict[cluster_id] = np.power(
|
|
728
|
+
prob_local_outlier_factors_nonan, 2
|
|
729
|
+
).sum() / float(prob_local_outlier_factors_nonan.size)
|
|
697
730
|
data_store = np.hstack(
|
|
698
|
-
(
|
|
699
|
-
|
|
731
|
+
(
|
|
732
|
+
data_store,
|
|
733
|
+
np.array(
|
|
734
|
+
[
|
|
735
|
+
[
|
|
736
|
+
prob_local_outlier_factor_ev_dict[x]
|
|
737
|
+
for x in data_store[:, 0].tolist()
|
|
738
|
+
]
|
|
739
|
+
]
|
|
740
|
+
).T,
|
|
741
|
+
)
|
|
742
|
+
)
|
|
700
743
|
return data_store
|
|
701
744
|
|
|
702
|
-
def _norm_prob_local_outlier_factors(self, data_store: np.ndarray)
|
|
703
|
-
-> np.ndarray:
|
|
745
|
+
def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
|
|
704
746
|
"""
|
|
705
747
|
Calculates the normalized probabilistic local outlier factor for each
|
|
706
748
|
observation in the input data.
|
|
@@ -709,11 +751,20 @@ class LocalOutlierProbability(object):
|
|
|
709
751
|
:return: the updated storage matrix that collects information on
|
|
710
752
|
each observation.
|
|
711
753
|
"""
|
|
712
|
-
return np.hstack(
|
|
713
|
-
|
|
754
|
+
return np.hstack(
|
|
755
|
+
(
|
|
756
|
+
data_store,
|
|
757
|
+
np.array(
|
|
758
|
+
[
|
|
759
|
+
self._norm_prob_outlier_factor(
|
|
760
|
+
self.extent, data_store[:, 8].tolist()
|
|
761
|
+
)
|
|
762
|
+
]
|
|
763
|
+
).T,
|
|
764
|
+
)
|
|
765
|
+
)
|
|
714
766
|
|
|
715
|
-
def _local_outlier_probabilities(self,
|
|
716
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
767
|
+
def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:
|
|
717
768
|
"""
|
|
718
769
|
Calculates the local outlier probability for each observation in the
|
|
719
770
|
input data.
|
|
@@ -723,31 +774,40 @@ class LocalOutlierProbability(object):
|
|
|
723
774
|
each observation.
|
|
724
775
|
"""
|
|
725
776
|
return np.hstack(
|
|
726
|
-
(
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
777
|
+
(
|
|
778
|
+
data_store,
|
|
779
|
+
np.array(
|
|
780
|
+
[
|
|
781
|
+
np.apply_along_axis(
|
|
782
|
+
self._local_outlier_probability,
|
|
783
|
+
0,
|
|
784
|
+
data_store[:, 7],
|
|
785
|
+
data_store[:, 9],
|
|
786
|
+
)
|
|
787
|
+
]
|
|
788
|
+
).T,
|
|
789
|
+
)
|
|
790
|
+
)
|
|
730
791
|
|
|
731
792
|
"""
|
|
732
793
|
Public methods
|
|
733
794
|
"""
|
|
734
795
|
|
|
735
|
-
def fit(self) ->
|
|
736
|
-
|
|
796
|
+
def fit(self) -> "LocalOutlierProbability":
|
|
737
797
|
"""
|
|
738
798
|
Calculates the local outlier probability for each observation in the
|
|
739
799
|
input data according to the input parameters extent, n_neighbors, and
|
|
740
800
|
cluster_labels.
|
|
741
801
|
:return: self, which contains the local outlier probabilities as
|
|
742
802
|
self.local_outlier_probabilities.
|
|
803
|
+
:raises ClusterSizeError: if any cluster is smaller than n_neighbors.
|
|
804
|
+
:raises MissingValuesError: if data contains missing values.
|
|
743
805
|
"""
|
|
744
806
|
|
|
745
|
-
self.
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
self) is False:
|
|
750
|
-
sys.exit()
|
|
807
|
+
self._check_n_neighbors()
|
|
808
|
+
self._check_cluster_size()
|
|
809
|
+
if self.data is not None:
|
|
810
|
+
self._check_missing_values()
|
|
751
811
|
|
|
752
812
|
store = self._store()
|
|
753
813
|
if self.data is not None:
|
|
@@ -770,7 +830,6 @@ class LocalOutlierProbability(object):
|
|
|
770
830
|
return self
|
|
771
831
|
|
|
772
832
|
def stream(self, x: np.ndarray) -> np.ndarray:
|
|
773
|
-
|
|
774
833
|
"""
|
|
775
834
|
Calculates the local outlier probability for an individual sample
|
|
776
835
|
according to the input parameters extent, n_neighbors, and
|
|
@@ -784,19 +843,23 @@ class LocalOutlierProbability(object):
|
|
|
784
843
|
"""
|
|
785
844
|
|
|
786
845
|
orig_cluster_labels = None
|
|
787
|
-
if self.
|
|
846
|
+
if self._check_no_cluster_labels() is False:
|
|
788
847
|
orig_cluster_labels = self.cluster_labels
|
|
789
848
|
self.cluster_labels = np.array([0] * len(self.data))
|
|
790
849
|
|
|
791
|
-
if self.
|
|
850
|
+
if self._check_is_fit() is False:
|
|
792
851
|
self.fit()
|
|
793
852
|
|
|
794
|
-
point_vector = self.
|
|
853
|
+
point_vector = self._convert_to_array(x)
|
|
795
854
|
distances = np.full([1, self.n_neighbors], 9e10, dtype=float)
|
|
796
855
|
if self.data is not None:
|
|
797
856
|
matrix = self.points_vector
|
|
798
857
|
else:
|
|
799
858
|
matrix = self.distance_matrix
|
|
859
|
+
# When using distance matrix mode, x is a scalar distance value.
|
|
860
|
+
# Extract scalar from array to avoid NumPy assignment errors.
|
|
861
|
+
if point_vector.size == 1:
|
|
862
|
+
point_vector = float(point_vector.flat[0])
|
|
800
863
|
for p in range(0, matrix.shape[0]):
|
|
801
864
|
if self.data is not None:
|
|
802
865
|
d = self._euclidean(matrix[p, :], point_vector)
|
|
@@ -809,12 +872,12 @@ class LocalOutlierProbability(object):
|
|
|
809
872
|
ssd = np.power(distances, 2).sum()
|
|
810
873
|
std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
|
|
811
874
|
prob_dist = self._prob_distance(self.extent, std_dist)
|
|
812
|
-
plof = self._prob_outlier_factor(
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
)
|
|
875
|
+
plof = self._prob_outlier_factor(
|
|
876
|
+
np.array(prob_dist), np.array(self.prob_distances_ev.mean())
|
|
877
|
+
)
|
|
816
878
|
loop = self._local_outlier_probability(
|
|
817
|
-
plof, self.norm_prob_local_outlier_factor
|
|
879
|
+
plof, self.norm_prob_local_outlier_factor
|
|
880
|
+
)
|
|
818
881
|
|
|
819
882
|
if orig_cluster_labels is not None:
|
|
820
883
|
self.cluster_labels = orig_cluster_labels
|