PyNomaly 0.3.1__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyNomaly/loop.py +274 -171
- {PyNomaly-0.3.1.dist-info → PyNomaly-0.3.4.dist-info}/METADATA +3 -3
- PyNomaly-0.3.4.dist-info/RECORD +7 -0
- {PyNomaly-0.3.1.dist-info → PyNomaly-0.3.4.dist-info}/WHEEL +1 -1
- PyNomaly-0.3.1.dist-info/RECORD +0 -7
- {PyNomaly-0.3.1.dist-info → PyNomaly-0.3.4.dist-info}/LICENSE.txt +0 -0
- {PyNomaly-0.3.1.dist-info → PyNomaly-0.3.4.dist-info}/top_level.txt +0 -0
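The headline user-facing change in loop.py (diffed below) is a new progress_bar keyword on LocalOutlierProbability, together with the python-utils dependency it pulls in. A minimal usage sketch, assuming the rest of the public API behaves as in earlier releases; the input array here is made up:

import numpy as np
from PyNomaly import loop

data = np.random.rand(500, 3)   # hypothetical input; a DataFrame would also be accepted

# progress_bar is the keyword argument introduced in 0.3.4
m = loop.LocalOutlierProbability(data, n_neighbors=10, progress_bar=True).fit()
scores = m.local_outlier_probabilities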
PyNomaly/loop.py
CHANGED
@@ -1,6 +1,8 @@
 from math import erf, sqrt
 import numpy as np
+from python_utils.terminal import get_terminal_size
 import sys
+from typing import Tuple, Union
 import warnings

 try:
@@ -8,9 +10,37 @@ try:
 except ImportError:
     pass

-__author__ =
-__version__ =
-__license__ =
+__author__ = "Valentino Constantinou"
+__version__ = "0.3.4"
+__license__ = "Apache License, Version 2.0"
+
+
+class Utils:
+    @staticmethod
+    def emit_progress_bar(progress: str, index: int, total: int) -> str:
+        """
+        A progress bar that is continuously updated in Python's standard
+        out.
+        :param progress: a string printed to stdout that is updated and later
+        returned.
+        :param index: the current index of the iteration within the tracked
+        process.
+        :param total: the total length of the tracked process.
+        :return: progress string.
+        """
+
+        w, h = get_terminal_size()
+        sys.stdout.write("\r")
+        if total < w:
+            block_size = int(w / total)
+        else:
+            block_size = int(total / w)
+        if index % block_size == 0:
+            progress += "="
+        percent = index / total
+        sys.stdout.write("[ %s ] %.2f%%" % (progress, percent * 100))
+        sys.stdout.flush()
+        return progress


 class LocalOutlierProbability(object):
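The Utils.emit_progress_bar helper added above is driven entirely by the caller's loop, so it can be exercised on its own. A minimal sketch, assuming the python-utils dependency declared in the new METADATA is installed; the work loop and iteration count are hypothetical:

import time
from PyNomaly.loop import Utils

progress = "="          # seed string, mirroring how _distances() initializes it
total = 200             # hypothetical number of iterations to track
for i in range(total):
    time.sleep(0.01)    # stand-in for one unit of real work
    progress = Utils.emit_progress_bar(progress, i + 1, total)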
@@ -24,7 +54,7 @@ class LocalOutlierProbability(object):
     :param cluster_labels: a numpy array of cluster assignments w.r.t. each
     sample (optional, default None)
     :return:
-    """"""
+    """ """

     Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP:
     Local Outlier Probabilities.
@@ -62,7 +92,7 @@ class LocalOutlierProbability(object):
     """

     @staticmethod
-    def _data(obj):
+    def _data(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
         """
         Validates the input data to ensure it is either a Pandas DataFrame
         or Numpy array.
@@ -70,24 +100,25 @@ class LocalOutlierProbability(object):
         :return: a vector of values to be used in calculating the local
         outlier probability.
         """
-        if obj.__class__.__name__ ==
+        if obj.__class__.__name__ == "DataFrame":
             points_vector = obj.values
             return points_vector
-        elif obj.__class__.__name__ ==
+        elif obj.__class__.__name__ == "ndarray":
             points_vector = obj
             return points_vector
         else:
             warnings.warn(
                 "Provided data or distance matrix must be in ndarray "
                 "or DataFrame.",
-                UserWarning
+                UserWarning,
+            )
             if isinstance(obj, list):
                 points_vector = np.array(obj)
                 return points_vector
             points_vector = np.array([obj])
             return points_vector

-    def _inputs(self, obj):
+    def _inputs(self, obj: "LocalOutlierProbability"):
         """
         Validates the inputs provided during initialization to ensure
         that the needed objects are provided.
@@ -103,40 +134,48 @@ class LocalOutlierProbability(object):
         elif all(v is not None for v in [obj.data, obj.distance_matrix]):
             warnings.warn(
                 "Only one of the following may be provided: data or a "
-                "distance matrix (not both).",
+                "distance matrix (not both).",
+                UserWarning,
             )
             return False
         if obj.data is not None:
             points_vector = self._data(obj.data)
             return points_vector, obj.distance_matrix, obj.neighbor_matrix
-        if all(
-
+        if all(
+            matrix is not None
+            for matrix in [obj.neighbor_matrix, obj.distance_matrix]
+        ):
             dist_vector = self._data(obj.distance_matrix)
             neigh_vector = self._data(obj.neighbor_matrix)
         else:
             warnings.warn(
                 "A neighbor index matrix and distance matrix must both be "
-                "provided when not using raw input data.",
+                "provided when not using raw input data.",
+                UserWarning,
             )
             return False
         if obj.distance_matrix.shape != obj.neighbor_matrix.shape:
             warnings.warn(
                 "The shape of the distance and neighbor "
-                "index matrices must match.",
+                "index matrices must match.",
+                UserWarning,
             )
             return False
-        elif (obj.distance_matrix.shape[1] != obj.n_neighbors)
-
-
-            warnings.warn(
-
-
-
+        elif (obj.distance_matrix.shape[1] != obj.n_neighbors) or (
+            obj.neighbor_matrix.shape[1] != obj.n_neighbors
+        ):
+            warnings.warn(
+                "The shape of the distance or "
+                "neighbor index matrix does not "
+                "match the number of neighbors "
+                "specified.",
+                UserWarning,
+            )
             return False
         return obj.data, dist_vector, neigh_vector

     @staticmethod
-    def _cluster_size(obj):
+    def _cluster_size(obj) -> bool:
         """
         Validates the cluster labels to ensure that the smallest cluster
         size (number of observations in the cluster) is larger than the
@@ -153,12 +192,13 @@ class LocalOutlierProbability(object):
                 "cluster. Specify a number of neighbors smaller than "
                 "the smallest cluster size (observations in smallest "
                 "cluster minus one).",
-                UserWarning
+                UserWarning,
+            )
             return False
         return True

     @staticmethod
-    def _n_neighbors(obj):
+    def _n_neighbors(obj) -> bool:
         """
         Validates the specified number of neighbors to ensure that it is
         greater than 0 and that the specified value is less than the total
@@ -168,21 +208,23 @@ class LocalOutlierProbability(object):
         """
         if not obj.n_neighbors > 0:
             obj.n_neighbors = 10
-            warnings.warn(
-
-
-
+            warnings.warn(
+                "n_neighbors must be greater than 0."
+                " Fit with " + str(obj.n_neighbors) + " instead.",
+                UserWarning,
+            )
             return False
         elif obj.n_neighbors >= obj._n_observations():
             obj.n_neighbors = obj._n_observations() - 1
             warnings.warn(
                 "n_neighbors must be less than the number of observations."
                 " Fit with " + str(obj.n_neighbors) + " instead.",
-                UserWarning
+                UserWarning,
+            )
         return True

     @staticmethod
-    def _extent(obj):
+    def _extent(obj) -> bool:
         """
         Validates the specified extent parameter to ensure it is either 1,
         2, or 3.
@@ -191,13 +233,13 @@ class LocalOutlierProbability(object):
         """
         if obj.extent not in [1, 2, 3]:
             warnings.warn(
-                "extent parameter (lambda) must be 1, 2, or 3.",
-
+                "extent parameter (lambda) must be 1, 2, or 3.", UserWarning
+            )
             return False
         return True

     @staticmethod
-    def _missing_values(obj):
+    def _missing_values(obj) -> bool:
         """
         Validates the provided data to ensure that it contains no
         missing values.
@@ -206,13 +248,13 @@ class LocalOutlierProbability(object):
         """
         if np.any(np.isnan(obj.data)):
             warnings.warn(
-                "Method does not support missing values in input data.",
-
+                "Method does not support missing values in input data.", UserWarning
+            )
             return False
         return True

     @staticmethod
-    def _fit(obj):
+    def _fit(obj) -> bool:
         """
         Validates that the model was fit prior to calling the stream()
         method.
@@ -223,12 +265,13 @@ class LocalOutlierProbability(object):
             warnings.warn(
                 "Must fit on historical data by calling fit() prior to "
                 "calling stream(x).",
-                UserWarning
+                UserWarning,
+            )
             return False
         return True

     @staticmethod
-    def _no_cluster_labels(obj):
+    def _no_cluster_labels(obj) -> bool:
         """
         Checks to see if cluster labels are attempting to be used in
         stream() and, if so, calls fit() once again but without cluster
@@ -241,7 +284,8 @@ class LocalOutlierProbability(object):
             warnings.warn(
                 "Stream approach does not support clustered data. "
                 "Automatically refit using single cluster of points.",
-                UserWarning
+                UserWarning,
+            )
             return False
         return True

@@ -263,40 +307,35 @@ class LocalOutlierProbability(object):
        assert len(types) == f.__code__.co_argcount

        def new_f(*args, **kwds):
-            for
-                if type(a).__name__ ==
+            for a, t in zip(args, types):
+                if type(a).__name__ == "DataFrame":
                    a = np.array(a)
                if isinstance(a, t) is False:
-                    warnings.warn(
-
+                    warnings.warn(
+                        "Argument %r is not of type %s" % (a, t), UserWarning
+                    )
            opt_types = {
-
-
-                },
-
-
-                },
-
-                    'type': types[4]
-                },
-                'n_neighbors': {
-                    'type': types[5]
-                },
-                'cluster_labels': {
-                    'type': types[6]
-                },
-                'use_numba': {
-                    'type': types[7]
-                }
+                "distance_matrix": {"type": types[2]},
+                "neighbor_matrix": {"type": types[3]},
+                "extent": {"type": types[4]},
+                "n_neighbors": {"type": types[5]},
+                "cluster_labels": {"type": types[6]},
+                "use_numba": {"type": types[7]},
+                "progress_bar": {"type": types[8]},
            }
            for x in kwds:
-                opt_types[x][
+                opt_types[x]["value"] = kwds[x]
            for k in opt_types:
                try:
-                    if
-
-
-
+                    if (
+                        isinstance(opt_types[k]["value"], opt_types[k]["type"])
+                        is False
+                    ):
+                        warnings.warn(
+                            "Argument %r is not of type %s."
+                            % (k, opt_types[k]["type"]),
+                            UserWarning,
+                        )
                except KeyError:
                    pass
            return f(*args, **kwds)
@@ -306,11 +345,28 @@ class LocalOutlierProbability(object):

        return decorator

-    @accepts(
-
-
-
-
+    @accepts(
+        object,
+        np.ndarray,
+        np.ndarray,
+        np.ndarray,
+        (int, np.integer),
+        (int, np.integer),
+        list,
+        bool,
+        bool,
+    )
+    def __init__(
+        self,
+        data=None,
+        distance_matrix=None,
+        neighbor_matrix=None,
+        extent=3,
+        n_neighbors=10,
+        cluster_labels=None,
+        use_numba=False,
+        progress_bar=False,
+    ) -> None:
        self.data = data
        self.distance_matrix = distance_matrix
        self.neighbor_matrix = neighbor_matrix
@@ -324,13 +380,14 @@ class LocalOutlierProbability(object):
        self.norm_prob_local_outlier_factor = None
        self.local_outlier_probabilities = None
        self._objects = {}
+        self.progress_bar = progress_bar
        self.is_fit = False

-        if self.use_numba and
+        if self.use_numba is True and "numba" not in sys.modules:
            self.use_numba = False
            warnings.warn(
-                "Numba is not available, falling back to pure python mode.",
-
+                "Numba is not available, falling back to pure python mode.", UserWarning
+            )

        self.Validate()._inputs(self)
        self.Validate._extent(self)
@@ -340,15 +397,14 @@ class LocalOutlierProbability(object):
    """

    @staticmethod
-    def _standard_distance(cardinality: float, sum_squared_distance: float)
-            -> float:
+    def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:
        """
        Calculates the standard distance of an observation.
        :param cardinality: the cardinality of the input observation.
        :param sum_squared_distance: the sum squared distance between all
        neighbors of the input observation.
        :return: the standard distance.
-        #
+        #"""
        division_result = sum_squared_distance / cardinality
        st_dist = sqrt(division_result)
        return st_dist
@@ -365,8 +421,9 @@ class LocalOutlierProbability(object):
        return extent * standard_distance

    @staticmethod
-    def _prob_outlier_factor(
-
+    def _prob_outlier_factor(
+        probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray
+    ) -> np.ndarray:
        """
        Calculates the probabilistic outlier factor of an observation.
        :param probabilistic_distance: the probabilistic distance of the
@@ -377,14 +434,14 @@ class LocalOutlierProbability(object):
        if np.all(probabilistic_distance == ev_prob_dist):
            return np.zeros(probabilistic_distance.shape)
        else:
-            ev_prob_dist[ev_prob_dist == 0.] = 1.
-            result = np.divide(probabilistic_distance, ev_prob_dist) - 1.
+            ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8
+            result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0
            return result

    @staticmethod
-    def _norm_prob_outlier_factor(
-
-
+    def _norm_prob_outlier_factor(
+        extent: float, ev_probabilistic_outlier_factor: list
+    ) -> list:
        """
        Calculates the normalized probabilistic outlier factor of an
        observation.
@@ -399,8 +456,9 @@ class LocalOutlierProbability(object):
        return npofs

    @staticmethod
-    def _local_outlier_probability(
-
+    def _local_outlier_probability(
+        plof_val: np.ndarray, nplof_val: np.ndarray
+    ) -> np.ndarray:
        """
        Calculates the local outlier probability of an observation.
        :param plof_val: the probabilistic outlier factor of the input
@@ -413,7 +471,7 @@ class LocalOutlierProbability(object):
        if np.all(plof_val == nplof_val):
            return np.zeros(plof_val.shape)
        else:
-            return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.))))
+            return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))

    def _n_observations(self) -> int:
        """
@@ -467,8 +525,9 @@ class LocalOutlierProbability(object):
        :return: the updated storage matrix that collects information on
        each observation.
        """
-        for vec, cluster_id in zip(
-
+        for vec, cluster_id in zip(
+            range(self.distance_matrix.shape[0]), self._cluster_labels()
+        ):
            data_store[vec][0] = cluster_id
            data_store[vec][1] = self.distance_matrix[vec]
            data_store[vec][2] = self.neighbor_matrix[vec]
@@ -476,41 +535,42 @@ class LocalOutlierProbability(object):

    @staticmethod
    def _compute_distance_and_neighbor_matrix(
-
-
-
-
-    ):
+        clust_points_vector: np.ndarray,
+        indices: np.ndarray,
+        distances: np.ndarray,
+        indexes: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray, int]:
        """
        This helper method provides the heavy lifting for the _distances
        method and is only intended for use therein. The code has been
-        written so that it can make full use of
+        written so that it can make full use of Numba's jit capabilities if
        desired.
        """
        for i in range(clust_points_vector.shape[0]):
            for j in range(i + 1, clust_points_vector.shape[0]):
-
+                # Global index of the points
+                global_i = indices[0][i]
+                global_j = indices[0][j]

-
+                # Compute Euclidean distance
+                diff = clust_points_vector[i] - clust_points_vector[j]
                d = np.dot(diff, diff) ** 0.5

-
-                idx_max = distances[
-
-
-
-                    indexes[idx][idx_max] = p[1][0]
+                # Update distance and neighbor index for global_i
+                idx_max = distances[global_i].argmax()
+                if d < distances[global_i][idx_max]:
+                    distances[global_i][idx_max] = d
+                    indexes[global_i][idx_max] = global_j

-
-                idx_max = distances[
+                # Update distance and neighbor index for global_j
+                idx_max = distances[global_j].argmax()
+                if d < distances[global_j][idx_max]:
+                    distances[global_j][idx_max] = d
+                    indexes[global_j][idx_max] = global_i

-
-                    distances[idx][idx_max] = d
-                    indexes[idx][idx_max] = p[0][0]
+            yield distances, indexes, i

-
-
-    def _distances(self) -> None:
+    def _distances(self, progress_bar: bool = False) -> None:
        """
        Provides the distances between each observation and it's closest
        neighbors. When input data is provided, calculates the euclidean
@@ -519,22 +579,30 @@ class LocalOutlierProbability(object):
        :return: the updated storage matrix that collects information on
        each observation.
        """
-        distances = np.full(
-
-
-
+        distances = np.full(
+            [self._n_observations(), self.n_neighbors], 9e10, dtype=float
+        )
+        indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
        self.points_vector = self.Validate._data(self.data)
-        compute =
-
-            self.
+        compute = (
+            numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
+            if self.use_numba
+            else self._compute_distance_and_neighbor_matrix
+        )
+        progress = "="
        for cluster_id in set(self._cluster_labels()):
            indices = np.where(self._cluster_labels() == cluster_id)
            clust_points_vector = np.array(
-                self.points_vector.take(indices, axis=0)[0],
-                dtype=np.float64
+                self.points_vector.take(indices, axis=0)[0], dtype=np.float64
            )
-
-
+            # a generator that yields an updated distance matrix on each loop
+            for c in compute(clust_points_vector, indices, distances, indexes):
+                distances, indexes, i = c
+                # update the progress bar
+                if progress_bar is True:
+                    progress = Utils.emit_progress_bar(
+                        progress, i + 1, clust_points_vector.shape[0]
+                    )

        self.distance_matrix = distances
        self.neighbor_matrix = indexes
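The _distances() change above picks between the plain generator and a numba.jit-wrapped version at call time. A standalone sketch of that wrap-or-fall-back pattern, independent of PyNomaly; pairwise_sum is a made-up kernel used only for illustration:

import numpy as np

try:
    import numba
    HAVE_NUMBA = True
except ImportError:
    HAVE_NUMBA = False


def pairwise_sum(values: np.ndarray) -> float:
    # made-up numeric kernel standing in for the real distance computation
    total = 0.0
    for i in range(values.shape[0]):
        for j in range(i + 1, values.shape[0]):
            total += values[i] + values[j]
    return total


# wrap with numba.jit only when numba is importable, otherwise use pure Python
compute = numba.jit(pairwise_sum, cache=True) if HAVE_NUMBA else pairwise_sum
print(compute(np.arange(10.0)))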
@@ -588,11 +656,10 @@ class LocalOutlierProbability(object):
        """
        prob_distances = []
        for i in range(data_store[:, 4].shape[0]):
-            prob_distances.append(
-                self._prob_distance(self.extent, data_store[:, 4][i]))
+            prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))
        return np.hstack((data_store, np.array([prob_distances]).T))

-    def _prob_distances_ev(self, data_store
+    def _prob_distances_ev(self, data_store) -> np.ndarray:
        """
        Calculates the expected value of the probabilistic distance for
        each observation in the input data with respect to the cluster the
@@ -606,19 +673,20 @@ class LocalOutlierProbability(object):
        for cluster_id in self.cluster_labels_u:
            indices = np.where(data_store[:, 0] == cluster_id)[0]
            for index in indices:
-
-
-
+                # Global neighbor indices for the current point
+                nbrhood = data_store[index][2].astype(int)  # Ensure global indices
+                nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(
+                    float
+                )
                nbrhood_prob_distances_nonan = nbrhood_prob_distances[
-                    np.logical_not(np.isnan(nbrhood_prob_distances))
-
-
+                    np.logical_not(np.isnan(nbrhood_prob_distances))
+                ]
+                prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()
+
        self.prob_distances_ev = prob_set_distance_ev
-
-        return data_store
+        return np.hstack((data_store, prob_set_distance_ev))

-    def _prob_local_outlier_factors(self,
-                                    data_store: np.ndarray) -> np.ndarray:
+    def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
        """
        Calculates the probabilistic local outlier factor for each
        observation in the input data.
@@ -628,13 +696,22 @@ class LocalOutlierProbability(object):
        each observation.
        """
        return np.hstack(
-            (
-
-
-
+            (
+                data_store,
+                np.array(
+                    [
+                        np.apply_along_axis(
+                            self._prob_outlier_factor,
+                            0,
+                            data_store[:, 5],
+                            data_store[:, 6],
+                        )
+                    ]
+                ).T,
+            )
+        )

-    def _prob_local_outlier_factors_ev(self,
-                                       data_store: np.ndarray) -> np.ndarray:
+    def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:
        """
        Calculates the expected value of the probabilistic local outlier factor
        for each observation in the input data with respect to the cluster the
@@ -647,21 +724,31 @@ class LocalOutlierProbability(object):
        prob_local_outlier_factor_ev_dict = {}
        for cluster_id in self.cluster_labels_u:
            indices = np.where(data_store[:, 0] == cluster_id)
-            prob_local_outlier_factors = np.take(data_store[:, 7],
-
-            prob_local_outlier_factors_nonan = prob_local_outlier_factors[
-                np.logical_not(np.isnan(prob_local_outlier_factors))]
-            prob_local_outlier_factor_ev_dict[cluster_id] = (
-                np.power(prob_local_outlier_factors_nonan, 2).sum() /
-                float(prob_local_outlier_factors_nonan.size)
+            prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(
+                float
            )
+            prob_local_outlier_factors_nonan = prob_local_outlier_factors[
+                np.logical_not(np.isnan(prob_local_outlier_factors))
+            ]
+            prob_local_outlier_factor_ev_dict[cluster_id] = np.power(
+                prob_local_outlier_factors_nonan, 2
+            ).sum() / float(prob_local_outlier_factors_nonan.size)
        data_store = np.hstack(
-            (
-
+            (
+                data_store,
+                np.array(
+                    [
+                        [
+                            prob_local_outlier_factor_ev_dict[x]
+                            for x in data_store[:, 0].tolist()
+                        ]
+                    ]
+                ).T,
+            )
+        )
        return data_store

-    def _norm_prob_local_outlier_factors(self, data_store: np.ndarray)
-            -> np.ndarray:
+    def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
        """
        Calculates the normalized probabilistic local outlier factor for each
        observation in the input data.
@@ -670,11 +757,20 @@ class LocalOutlierProbability(object):
        :return: the updated storage matrix that collects information on
        each observation.
        """
-        return np.hstack(
-
+        return np.hstack(
+            (
+                data_store,
+                np.array(
+                    [
+                        self._norm_prob_outlier_factor(
+                            self.extent, data_store[:, 8].tolist()
+                        )
+                    ]
+                ).T,
+            )
+        )

-    def _local_outlier_probabilities(self,
-                                     data_store: np.ndarray) -> np.ndarray:
+    def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:
        """
        Calculates the local outlier probability for each observation in the
        input data.
@@ -684,17 +780,26 @@ class LocalOutlierProbability(object):
        each observation.
        """
        return np.hstack(
-            (
-
-
-
+            (
+                data_store,
+                np.array(
+                    [
+                        np.apply_along_axis(
+                            self._local_outlier_probability,
+                            0,
+                            data_store[:, 7],
+                            data_store[:, 9],
+                        )
+                    ]
+                ).T,
+            )
+        )

    """
    Public methods
    """

-    def fit(self) ->
-
+    def fit(self) -> "LocalOutlierProbability":
        """
        Calculates the local outlier probability for each observation in the
        input data according to the input parameters extent, n_neighbors, and
@@ -706,13 +811,12 @@ class LocalOutlierProbability(object):
        self.Validate._n_neighbors(self)
        if self.Validate._cluster_size(self) is False:
            sys.exit()
-        if self.data is not None and self.Validate._missing_values(
-                self) is False:
+        if self.data is not None and self.Validate._missing_values(self) is False:
            sys.exit()

        store = self._store()
        if self.data is not None:
-            self._distances()
+            self._distances(progress_bar=self.progress_bar)
        store = self._assign_distances(store)
        store = self._ssd(store)
        store = self._standard_distances(store)
@@ -731,7 +835,6 @@ class LocalOutlierProbability(object):
        return self

    def stream(self, x: np.ndarray) -> np.ndarray:
-
        """
        Calculates the local outlier probability for an individual sample
        according to the input parameters extent, n_neighbors, and
@@ -770,12 +873,12 @@ class LocalOutlierProbability(object):
        ssd = np.power(distances, 2).sum()
        std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
        prob_dist = self._prob_distance(self.extent, std_dist)
-        plof = self._prob_outlier_factor(
-
-
-        )
+        plof = self._prob_outlier_factor(
+            np.array(prob_dist), np.array(self.prob_distances_ev.mean())
+        )
        loop = self._local_outlier_probability(
-            plof, self.norm_prob_local_outlier_factor
+            plof, self.norm_prob_local_outlier_factor
+        )

        if orig_cluster_labels is not None:
            self.cluster_labels = orig_cluster_labels
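The tail of the loop.py diff also touches the stream() path (the _prob_outlier_factor call is reformatted and now receives np.array-wrapped scalars). A minimal sketch of scoring one new observation after fitting; the data and the three-feature shape are made up:

import numpy as np
from PyNomaly import loop

train = np.random.rand(300, 3)           # hypothetical historical observations
m = loop.LocalOutlierProbability(train, n_neighbors=10).fit()

new_point = np.array([0.9, 0.1, 0.4])    # a single incoming observation
score = m.stream(new_point)              # its local outlier probability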
{PyNomaly-0.3.1.dist-info → PyNomaly-0.3.4.dist-info}/METADATA
CHANGED
@@ -1,16 +1,16 @@
 Metadata-Version: 2.1
 Name: PyNomaly
-Version: 0.3.1
+Version: 0.3.4
 Summary: A Python 3 implementation of LoOP: Local Outlier Probabilities, a local density based outlier detection method providing an outlier score in the range of [0,1].
 Home-page: https://github.com/vc1492a/PyNomaly
 Author: Valentino Constantinou
 Author-email: vc@valentino.io
 License: Apache License, Version 2.0
-Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.1.tar.gz
+Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz
 Keywords: outlier,anomaly,detection,machine,learning,probability
 Platform: UNKNOWN
 Requires-Dist: numpy
-Requires-Dist:
+Requires-Dist: python-utils

 UNKNOWN

PyNomaly-0.3.4.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+PyNomaly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+PyNomaly/loop.py,sha256=VLllAa5pOIHZjlI0XuLSpjLzY3tJ_ZTzDCbbIh3VM44,34571
+PyNomaly-0.3.4.dist-info/LICENSE.txt,sha256=xZYfuJFfM57xOlBLbkJmsCwEvw1P6K2t3jI8faTdOMs,563
+PyNomaly-0.3.4.dist-info/METADATA,sha256=xkHaSUSpOnZynE_KfVQAwoBXNOzTpE-IymwuiRdIeos,581
+PyNomaly-0.3.4.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
+PyNomaly-0.3.4.dist-info/top_level.txt,sha256=el-HX4RLyBjkh2CW3TK9yXAA54zQOIYVmcJjRbBYKX4,9
+PyNomaly-0.3.4.dist-info/RECORD,,
PyNomaly-0.3.1.dist-info/RECORD
DELETED
@@ -1,7 +0,0 @@
-PyNomaly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-PyNomaly/loop.py,sha256=zK7I946YNha8VjxIrAJPgF5wjs6anYgXneJA-kH9RdA,32115
-PyNomaly-0.3.1.dist-info/LICENSE.txt,sha256=xZYfuJFfM57xOlBLbkJmsCwEvw1P6K2t3jI8faTdOMs,563
-PyNomaly-0.3.1.dist-info/METADATA,sha256=FQfdmmyCgb_cE1LJYpj4chFs40ME95Ms1G4wq_3AFAE,574
-PyNomaly-0.3.1.dist-info/WHEEL,sha256=S8S5VL-stOTSZDYxHyf0KP7eds0J72qrK0Evu3TfyAY,92
-PyNomaly-0.3.1.dist-info/top_level.txt,sha256=el-HX4RLyBjkh2CW3TK9yXAA54zQOIYVmcJjRbBYKX4,9
-PyNomaly-0.3.1.dist-info/RECORD,,
{PyNomaly-0.3.1.dist-info → PyNomaly-0.3.4.dist-info}/LICENSE.txt
File without changes
{PyNomaly-0.3.1.dist-info → PyNomaly-0.3.4.dist-info}/top_level.txt
File without changes