PyNomaly 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyNomaly/loop.py +229 -165
- {PyNomaly-0.3.2.dist-info → PyNomaly-0.3.4.dist-info}/METADATA +2 -2
- PyNomaly-0.3.4.dist-info/RECORD +7 -0
- {PyNomaly-0.3.2.dist-info → PyNomaly-0.3.4.dist-info}/WHEEL +1 -1
- PyNomaly-0.3.2.dist-info/RECORD +0 -7
- {PyNomaly-0.3.2.dist-info → PyNomaly-0.3.4.dist-info}/LICENSE.txt +0 -0
- {PyNomaly-0.3.2.dist-info → PyNomaly-0.3.4.dist-info}/top_level.txt +0 -0
PyNomaly/loop.py
CHANGED
|
@@ -10,13 +10,12 @@ try:
|
|
|
10
10
|
except ImportError:
|
|
11
11
|
pass
|
|
12
12
|
|
|
13
|
-
__author__ =
|
|
14
|
-
__version__ =
|
|
15
|
-
__license__ =
|
|
13
|
+
__author__ = "Valentino Constantinou"
|
|
14
|
+
__version__ = "0.3.4"
|
|
15
|
+
__license__ = "Apache License, Version 2.0"
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class Utils:
|
|
19
|
-
|
|
20
19
|
@staticmethod
|
|
21
20
|
def emit_progress_bar(progress: str, index: int, total: int) -> str:
|
|
22
21
|
"""
|
|
@@ -32,7 +31,10 @@ class Utils:
|
|
|
32
31
|
|
|
33
32
|
w, h = get_terminal_size()
|
|
34
33
|
sys.stdout.write("\r")
|
|
35
|
-
|
|
34
|
+
if total < w:
|
|
35
|
+
block_size = int(w / total)
|
|
36
|
+
else:
|
|
37
|
+
block_size = int(total / w)
|
|
36
38
|
if index % block_size == 0:
|
|
37
39
|
progress += "="
|
|
38
40
|
percent = index / total
|
|
@@ -52,7 +54,7 @@ class LocalOutlierProbability(object):
|
|
|
52
54
|
:param cluster_labels: a numpy array of cluster assignments w.r.t. each
|
|
53
55
|
sample (optional, default None)
|
|
54
56
|
:return:
|
|
55
|
-
""""""
|
|
57
|
+
""" """
|
|
56
58
|
|
|
57
59
|
Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP:
|
|
58
60
|
Local Outlier Probabilities.
|
|
@@ -90,7 +92,7 @@ class LocalOutlierProbability(object):
|
|
|
90
92
|
"""
|
|
91
93
|
|
|
92
94
|
@staticmethod
|
|
93
|
-
def _data(obj: Union[
|
|
95
|
+
def _data(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
|
|
94
96
|
"""
|
|
95
97
|
Validates the input data to ensure it is either a Pandas DataFrame
|
|
96
98
|
or Numpy array.
|
|
@@ -98,24 +100,25 @@ class LocalOutlierProbability(object):
|
|
|
98
100
|
:return: a vector of values to be used in calculating the local
|
|
99
101
|
outlier probability.
|
|
100
102
|
"""
|
|
101
|
-
if obj.__class__.__name__ ==
|
|
103
|
+
if obj.__class__.__name__ == "DataFrame":
|
|
102
104
|
points_vector = obj.values
|
|
103
105
|
return points_vector
|
|
104
|
-
elif obj.__class__.__name__ ==
|
|
106
|
+
elif obj.__class__.__name__ == "ndarray":
|
|
105
107
|
points_vector = obj
|
|
106
108
|
return points_vector
|
|
107
109
|
else:
|
|
108
110
|
warnings.warn(
|
|
109
111
|
"Provided data or distance matrix must be in ndarray "
|
|
110
112
|
"or DataFrame.",
|
|
111
|
-
UserWarning
|
|
113
|
+
UserWarning,
|
|
114
|
+
)
|
|
112
115
|
if isinstance(obj, list):
|
|
113
116
|
points_vector = np.array(obj)
|
|
114
117
|
return points_vector
|
|
115
118
|
points_vector = np.array([obj])
|
|
116
119
|
return points_vector
|
|
117
120
|
|
|
118
|
-
def _inputs(self, obj:
|
|
121
|
+
def _inputs(self, obj: "LocalOutlierProbability"):
|
|
119
122
|
"""
|
|
120
123
|
Validates the inputs provided during initialization to ensure
|
|
121
124
|
that the needed objects are provided.
|
|
@@ -131,35 +134,43 @@ class LocalOutlierProbability(object):
|
|
|
131
134
|
elif all(v is not None for v in [obj.data, obj.distance_matrix]):
|
|
132
135
|
warnings.warn(
|
|
133
136
|
"Only one of the following may be provided: data or a "
|
|
134
|
-
"distance matrix (not both).",
|
|
137
|
+
"distance matrix (not both).",
|
|
138
|
+
UserWarning,
|
|
135
139
|
)
|
|
136
140
|
return False
|
|
137
141
|
if obj.data is not None:
|
|
138
142
|
points_vector = self._data(obj.data)
|
|
139
143
|
return points_vector, obj.distance_matrix, obj.neighbor_matrix
|
|
140
|
-
if all(
|
|
141
|
-
|
|
144
|
+
if all(
|
|
145
|
+
matrix is not None
|
|
146
|
+
for matrix in [obj.neighbor_matrix, obj.distance_matrix]
|
|
147
|
+
):
|
|
142
148
|
dist_vector = self._data(obj.distance_matrix)
|
|
143
149
|
neigh_vector = self._data(obj.neighbor_matrix)
|
|
144
150
|
else:
|
|
145
151
|
warnings.warn(
|
|
146
152
|
"A neighbor index matrix and distance matrix must both be "
|
|
147
|
-
"provided when not using raw input data.",
|
|
153
|
+
"provided when not using raw input data.",
|
|
154
|
+
UserWarning,
|
|
148
155
|
)
|
|
149
156
|
return False
|
|
150
157
|
if obj.distance_matrix.shape != obj.neighbor_matrix.shape:
|
|
151
158
|
warnings.warn(
|
|
152
159
|
"The shape of the distance and neighbor "
|
|
153
|
-
"index matrices must match.",
|
|
160
|
+
"index matrices must match.",
|
|
161
|
+
UserWarning,
|
|
154
162
|
)
|
|
155
163
|
return False
|
|
156
|
-
elif (obj.distance_matrix.shape[1] != obj.n_neighbors)
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
warnings.warn(
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
164
|
+
elif (obj.distance_matrix.shape[1] != obj.n_neighbors) or (
|
|
165
|
+
obj.neighbor_matrix.shape[1] != obj.n_neighbors
|
|
166
|
+
):
|
|
167
|
+
warnings.warn(
|
|
168
|
+
"The shape of the distance or "
|
|
169
|
+
"neighbor index matrix does not "
|
|
170
|
+
"match the number of neighbors "
|
|
171
|
+
"specified.",
|
|
172
|
+
UserWarning,
|
|
173
|
+
)
|
|
163
174
|
return False
|
|
164
175
|
return obj.data, dist_vector, neigh_vector
|
|
165
176
|
|
|
@@ -181,7 +192,8 @@ class LocalOutlierProbability(object):
|
|
|
181
192
|
"cluster. Specify a number of neighbors smaller than "
|
|
182
193
|
"the smallest cluster size (observations in smallest "
|
|
183
194
|
"cluster minus one).",
|
|
184
|
-
UserWarning
|
|
195
|
+
UserWarning,
|
|
196
|
+
)
|
|
185
197
|
return False
|
|
186
198
|
return True
|
|
187
199
|
|
|
@@ -196,17 +208,19 @@ class LocalOutlierProbability(object):
|
|
|
196
208
|
"""
|
|
197
209
|
if not obj.n_neighbors > 0:
|
|
198
210
|
obj.n_neighbors = 10
|
|
199
|
-
warnings.warn(
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
211
|
+
warnings.warn(
|
|
212
|
+
"n_neighbors must be greater than 0."
|
|
213
|
+
" Fit with " + str(obj.n_neighbors) + " instead.",
|
|
214
|
+
UserWarning,
|
|
215
|
+
)
|
|
203
216
|
return False
|
|
204
217
|
elif obj.n_neighbors >= obj._n_observations():
|
|
205
218
|
obj.n_neighbors = obj._n_observations() - 1
|
|
206
219
|
warnings.warn(
|
|
207
220
|
"n_neighbors must be less than the number of observations."
|
|
208
221
|
" Fit with " + str(obj.n_neighbors) + " instead.",
|
|
209
|
-
UserWarning
|
|
222
|
+
UserWarning,
|
|
223
|
+
)
|
|
210
224
|
return True
|
|
211
225
|
|
|
212
226
|
@staticmethod
|
|
@@ -219,8 +233,8 @@ class LocalOutlierProbability(object):
|
|
|
219
233
|
"""
|
|
220
234
|
if obj.extent not in [1, 2, 3]:
|
|
221
235
|
warnings.warn(
|
|
222
|
-
"extent parameter (lambda) must be 1, 2, or 3.",
|
|
223
|
-
|
|
236
|
+
"extent parameter (lambda) must be 1, 2, or 3.", UserWarning
|
|
237
|
+
)
|
|
224
238
|
return False
|
|
225
239
|
return True
|
|
226
240
|
|
|
@@ -234,8 +248,8 @@ class LocalOutlierProbability(object):
|
|
|
234
248
|
"""
|
|
235
249
|
if np.any(np.isnan(obj.data)):
|
|
236
250
|
warnings.warn(
|
|
237
|
-
"Method does not support missing values in input data.",
|
|
238
|
-
|
|
251
|
+
"Method does not support missing values in input data.", UserWarning
|
|
252
|
+
)
|
|
239
253
|
return False
|
|
240
254
|
return True
|
|
241
255
|
|
|
@@ -251,7 +265,8 @@ class LocalOutlierProbability(object):
|
|
|
251
265
|
warnings.warn(
|
|
252
266
|
"Must fit on historical data by calling fit() prior to "
|
|
253
267
|
"calling stream(x).",
|
|
254
|
-
UserWarning
|
|
268
|
+
UserWarning,
|
|
269
|
+
)
|
|
255
270
|
return False
|
|
256
271
|
return True
|
|
257
272
|
|
|
@@ -269,7 +284,8 @@ class LocalOutlierProbability(object):
|
|
|
269
284
|
warnings.warn(
|
|
270
285
|
"Stream approach does not support clustered data. "
|
|
271
286
|
"Automatically refit using single cluster of points.",
|
|
272
|
-
UserWarning
|
|
287
|
+
UserWarning,
|
|
288
|
+
)
|
|
273
289
|
return False
|
|
274
290
|
return True
|
|
275
291
|
|
|
@@ -291,43 +307,35 @@ class LocalOutlierProbability(object):
|
|
|
291
307
|
assert len(types) == f.__code__.co_argcount
|
|
292
308
|
|
|
293
309
|
def new_f(*args, **kwds):
|
|
294
|
-
for
|
|
295
|
-
if type(a).__name__ ==
|
|
310
|
+
for a, t in zip(args, types):
|
|
311
|
+
if type(a).__name__ == "DataFrame":
|
|
296
312
|
a = np.array(a)
|
|
297
313
|
if isinstance(a, t) is False:
|
|
298
|
-
warnings.warn(
|
|
299
|
-
|
|
314
|
+
warnings.warn(
|
|
315
|
+
"Argument %r is not of type %s" % (a, t), UserWarning
|
|
316
|
+
)
|
|
300
317
|
opt_types = {
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
},
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
},
|
|
307
|
-
|
|
308
|
-
'type': types[4]
|
|
309
|
-
},
|
|
310
|
-
'n_neighbors': {
|
|
311
|
-
'type': types[5]
|
|
312
|
-
},
|
|
313
|
-
'cluster_labels': {
|
|
314
|
-
'type': types[6]
|
|
315
|
-
},
|
|
316
|
-
'use_numba': {
|
|
317
|
-
'type': types[7]
|
|
318
|
-
},
|
|
319
|
-
'progress_bar': {
|
|
320
|
-
'type': types[8]
|
|
321
|
-
}
|
|
318
|
+
"distance_matrix": {"type": types[2]},
|
|
319
|
+
"neighbor_matrix": {"type": types[3]},
|
|
320
|
+
"extent": {"type": types[4]},
|
|
321
|
+
"n_neighbors": {"type": types[5]},
|
|
322
|
+
"cluster_labels": {"type": types[6]},
|
|
323
|
+
"use_numba": {"type": types[7]},
|
|
324
|
+
"progress_bar": {"type": types[8]},
|
|
322
325
|
}
|
|
323
326
|
for x in kwds:
|
|
324
|
-
opt_types[x][
|
|
327
|
+
opt_types[x]["value"] = kwds[x]
|
|
325
328
|
for k in opt_types:
|
|
326
329
|
try:
|
|
327
|
-
if
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
330
|
+
if (
|
|
331
|
+
isinstance(opt_types[k]["value"], opt_types[k]["type"])
|
|
332
|
+
is False
|
|
333
|
+
):
|
|
334
|
+
warnings.warn(
|
|
335
|
+
"Argument %r is not of type %s."
|
|
336
|
+
% (k, opt_types[k]["type"]),
|
|
337
|
+
UserWarning,
|
|
338
|
+
)
|
|
331
339
|
except KeyError:
|
|
332
340
|
pass
|
|
333
341
|
return f(*args, **kwds)
|
|
@@ -337,11 +345,28 @@ class LocalOutlierProbability(object):
|
|
|
337
345
|
|
|
338
346
|
return decorator
|
|
339
347
|
|
|
340
|
-
@accepts(
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
348
|
+
@accepts(
|
|
349
|
+
object,
|
|
350
|
+
np.ndarray,
|
|
351
|
+
np.ndarray,
|
|
352
|
+
np.ndarray,
|
|
353
|
+
(int, np.integer),
|
|
354
|
+
(int, np.integer),
|
|
355
|
+
list,
|
|
356
|
+
bool,
|
|
357
|
+
bool,
|
|
358
|
+
)
|
|
359
|
+
def __init__(
|
|
360
|
+
self,
|
|
361
|
+
data=None,
|
|
362
|
+
distance_matrix=None,
|
|
363
|
+
neighbor_matrix=None,
|
|
364
|
+
extent=3,
|
|
365
|
+
n_neighbors=10,
|
|
366
|
+
cluster_labels=None,
|
|
367
|
+
use_numba=False,
|
|
368
|
+
progress_bar=False,
|
|
369
|
+
) -> None:
|
|
345
370
|
self.data = data
|
|
346
371
|
self.distance_matrix = distance_matrix
|
|
347
372
|
self.neighbor_matrix = neighbor_matrix
|
|
@@ -358,11 +383,11 @@ class LocalOutlierProbability(object):
|
|
|
358
383
|
self.progress_bar = progress_bar
|
|
359
384
|
self.is_fit = False
|
|
360
385
|
|
|
361
|
-
if self.use_numba is True and
|
|
386
|
+
if self.use_numba is True and "numba" not in sys.modules:
|
|
362
387
|
self.use_numba = False
|
|
363
388
|
warnings.warn(
|
|
364
|
-
"Numba is not available, falling back to pure python mode.",
|
|
365
|
-
|
|
389
|
+
"Numba is not available, falling back to pure python mode.", UserWarning
|
|
390
|
+
)
|
|
366
391
|
|
|
367
392
|
self.Validate()._inputs(self)
|
|
368
393
|
self.Validate._extent(self)
|
|
@@ -372,15 +397,14 @@ class LocalOutlierProbability(object):
|
|
|
372
397
|
"""
|
|
373
398
|
|
|
374
399
|
@staticmethod
|
|
375
|
-
def _standard_distance(cardinality: float, sum_squared_distance: float)
|
|
376
|
-
-> float:
|
|
400
|
+
def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:
|
|
377
401
|
"""
|
|
378
402
|
Calculates the standard distance of an observation.
|
|
379
403
|
:param cardinality: the cardinality of the input observation.
|
|
380
404
|
:param sum_squared_distance: the sum squared distance between all
|
|
381
405
|
neighbors of the input observation.
|
|
382
406
|
:return: the standard distance.
|
|
383
|
-
#
|
|
407
|
+
#"""
|
|
384
408
|
division_result = sum_squared_distance / cardinality
|
|
385
409
|
st_dist = sqrt(division_result)
|
|
386
410
|
return st_dist
|
|
@@ -397,8 +421,9 @@ class LocalOutlierProbability(object):
|
|
|
397
421
|
return extent * standard_distance
|
|
398
422
|
|
|
399
423
|
@staticmethod
|
|
400
|
-
def _prob_outlier_factor(
|
|
401
|
-
|
|
424
|
+
def _prob_outlier_factor(
|
|
425
|
+
probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray
|
|
426
|
+
) -> np.ndarray:
|
|
402
427
|
"""
|
|
403
428
|
Calculates the probabilistic outlier factor of an observation.
|
|
404
429
|
:param probabilistic_distance: the probabilistic distance of the
|
|
@@ -409,14 +434,14 @@ class LocalOutlierProbability(object):
|
|
|
409
434
|
if np.all(probabilistic_distance == ev_prob_dist):
|
|
410
435
|
return np.zeros(probabilistic_distance.shape)
|
|
411
436
|
else:
|
|
412
|
-
ev_prob_dist[ev_prob_dist == 0.] = 1.
|
|
413
|
-
result = np.divide(probabilistic_distance, ev_prob_dist) - 1.
|
|
437
|
+
ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8
|
|
438
|
+
result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0
|
|
414
439
|
return result
|
|
415
440
|
|
|
416
441
|
@staticmethod
|
|
417
|
-
def _norm_prob_outlier_factor(
|
|
418
|
-
|
|
419
|
-
|
|
442
|
+
def _norm_prob_outlier_factor(
|
|
443
|
+
extent: float, ev_probabilistic_outlier_factor: list
|
|
444
|
+
) -> list:
|
|
420
445
|
"""
|
|
421
446
|
Calculates the normalized probabilistic outlier factor of an
|
|
422
447
|
observation.
|
|
@@ -431,8 +456,9 @@ class LocalOutlierProbability(object):
|
|
|
431
456
|
return npofs
|
|
432
457
|
|
|
433
458
|
@staticmethod
|
|
434
|
-
def _local_outlier_probability(
|
|
435
|
-
|
|
459
|
+
def _local_outlier_probability(
|
|
460
|
+
plof_val: np.ndarray, nplof_val: np.ndarray
|
|
461
|
+
) -> np.ndarray:
|
|
436
462
|
"""
|
|
437
463
|
Calculates the local outlier probability of an observation.
|
|
438
464
|
:param plof_val: the probabilistic outlier factor of the input
|
|
@@ -445,7 +471,7 @@ class LocalOutlierProbability(object):
|
|
|
445
471
|
if np.all(plof_val == nplof_val):
|
|
446
472
|
return np.zeros(plof_val.shape)
|
|
447
473
|
else:
|
|
448
|
-
return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.))))
|
|
474
|
+
return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))
|
|
449
475
|
|
|
450
476
|
def _n_observations(self) -> int:
|
|
451
477
|
"""
|
|
@@ -499,8 +525,9 @@ class LocalOutlierProbability(object):
|
|
|
499
525
|
:return: the updated storage matrix that collects information on
|
|
500
526
|
each observation.
|
|
501
527
|
"""
|
|
502
|
-
for vec, cluster_id in zip(
|
|
503
|
-
|
|
528
|
+
for vec, cluster_id in zip(
|
|
529
|
+
range(self.distance_matrix.shape[0]), self._cluster_labels()
|
|
530
|
+
):
|
|
504
531
|
data_store[vec][0] = cluster_id
|
|
505
532
|
data_store[vec][1] = self.distance_matrix[vec]
|
|
506
533
|
data_store[vec][2] = self.neighbor_matrix[vec]
|
|
@@ -508,10 +535,10 @@ class LocalOutlierProbability(object):
|
|
|
508
535
|
|
|
509
536
|
@staticmethod
|
|
510
537
|
def _compute_distance_and_neighbor_matrix(
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
538
|
+
clust_points_vector: np.ndarray,
|
|
539
|
+
indices: np.ndarray,
|
|
540
|
+
distances: np.ndarray,
|
|
541
|
+
indexes: np.ndarray,
|
|
515
542
|
) -> Tuple[np.ndarray, np.ndarray, int]:
|
|
516
543
|
"""
|
|
517
544
|
This helper method provides the heavy lifting for the _distances
|
|
@@ -519,27 +546,27 @@ class LocalOutlierProbability(object):
|
|
|
519
546
|
written so that it can make full use of Numba's jit capabilities if
|
|
520
547
|
desired.
|
|
521
548
|
"""
|
|
522
|
-
|
|
523
549
|
for i in range(clust_points_vector.shape[0]):
|
|
524
550
|
for j in range(i + 1, clust_points_vector.shape[0]):
|
|
525
|
-
|
|
551
|
+
# Global index of the points
|
|
552
|
+
global_i = indices[0][i]
|
|
553
|
+
global_j = indices[0][j]
|
|
526
554
|
|
|
527
|
-
|
|
555
|
+
# Compute Euclidean distance
|
|
556
|
+
diff = clust_points_vector[i] - clust_points_vector[j]
|
|
528
557
|
d = np.dot(diff, diff) ** 0.5
|
|
529
558
|
|
|
530
|
-
|
|
531
|
-
idx_max = distances[
|
|
559
|
+
# Update distance and neighbor index for global_i
|
|
560
|
+
idx_max = distances[global_i].argmax()
|
|
561
|
+
if d < distances[global_i][idx_max]:
|
|
562
|
+
distances[global_i][idx_max] = d
|
|
563
|
+
indexes[global_i][idx_max] = global_j
|
|
532
564
|
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
idx_max = distances[idx].argmax()
|
|
539
|
-
|
|
540
|
-
if d < distances[idx][idx_max]:
|
|
541
|
-
distances[idx][idx_max] = d
|
|
542
|
-
indexes[idx][idx_max] = p[0][0]
|
|
565
|
+
# Update distance and neighbor index for global_j
|
|
566
|
+
idx_max = distances[global_j].argmax()
|
|
567
|
+
if d < distances[global_j][idx_max]:
|
|
568
|
+
distances[global_j][idx_max] = d
|
|
569
|
+
indexes[global_j][idx_max] = global_i
|
|
543
570
|
|
|
544
571
|
yield distances, indexes, i
|
|
545
572
|
|
|
@@ -552,20 +579,21 @@ class LocalOutlierProbability(object):
|
|
|
552
579
|
:return: the updated storage matrix that collects information on
|
|
553
580
|
each observation.
|
|
554
581
|
"""
|
|
555
|
-
distances = np.full(
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
582
|
+
distances = np.full(
|
|
583
|
+
[self._n_observations(), self.n_neighbors], 9e10, dtype=float
|
|
584
|
+
)
|
|
585
|
+
indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
|
|
559
586
|
self.points_vector = self.Validate._data(self.data)
|
|
560
|
-
compute =
|
|
561
|
-
|
|
562
|
-
self.
|
|
587
|
+
compute = (
|
|
588
|
+
numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
|
|
589
|
+
if self.use_numba
|
|
590
|
+
else self._compute_distance_and_neighbor_matrix
|
|
591
|
+
)
|
|
563
592
|
progress = "="
|
|
564
593
|
for cluster_id in set(self._cluster_labels()):
|
|
565
594
|
indices = np.where(self._cluster_labels() == cluster_id)
|
|
566
595
|
clust_points_vector = np.array(
|
|
567
|
-
self.points_vector.take(indices, axis=0)[0],
|
|
568
|
-
dtype=np.float64
|
|
596
|
+
self.points_vector.take(indices, axis=0)[0], dtype=np.float64
|
|
569
597
|
)
|
|
570
598
|
# a generator that yields an updated distance matrix on each loop
|
|
571
599
|
for c in compute(clust_points_vector, indices, distances, indexes):
|
|
@@ -573,7 +601,8 @@ class LocalOutlierProbability(object):
|
|
|
573
601
|
# update the progress bar
|
|
574
602
|
if progress_bar is True:
|
|
575
603
|
progress = Utils.emit_progress_bar(
|
|
576
|
-
progress, i+1, clust_points_vector.shape[0]
|
|
604
|
+
progress, i + 1, clust_points_vector.shape[0]
|
|
605
|
+
)
|
|
577
606
|
|
|
578
607
|
self.distance_matrix = distances
|
|
579
608
|
self.neighbor_matrix = indexes
|
|
@@ -627,11 +656,10 @@ class LocalOutlierProbability(object):
|
|
|
627
656
|
"""
|
|
628
657
|
prob_distances = []
|
|
629
658
|
for i in range(data_store[:, 4].shape[0]):
|
|
630
|
-
prob_distances.append(
|
|
631
|
-
self._prob_distance(self.extent, data_store[:, 4][i]))
|
|
659
|
+
prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))
|
|
632
660
|
return np.hstack((data_store, np.array([prob_distances]).T))
|
|
633
661
|
|
|
634
|
-
def _prob_distances_ev(self, data_store
|
|
662
|
+
def _prob_distances_ev(self, data_store) -> np.ndarray:
|
|
635
663
|
"""
|
|
636
664
|
Calculates the expected value of the probabilistic distance for
|
|
637
665
|
each observation in the input data with respect to the cluster the
|
|
@@ -645,19 +673,20 @@ class LocalOutlierProbability(object):
|
|
|
645
673
|
for cluster_id in self.cluster_labels_u:
|
|
646
674
|
indices = np.where(data_store[:, 0] == cluster_id)[0]
|
|
647
675
|
for index in indices:
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
676
|
+
# Global neighbor indices for the current point
|
|
677
|
+
nbrhood = data_store[index][2].astype(int) # Ensure global indices
|
|
678
|
+
nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(
|
|
679
|
+
float
|
|
680
|
+
)
|
|
651
681
|
nbrhood_prob_distances_nonan = nbrhood_prob_distances[
|
|
652
|
-
np.logical_not(np.isnan(nbrhood_prob_distances))
|
|
653
|
-
|
|
654
|
-
|
|
682
|
+
np.logical_not(np.isnan(nbrhood_prob_distances))
|
|
683
|
+
]
|
|
684
|
+
prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()
|
|
685
|
+
|
|
655
686
|
self.prob_distances_ev = prob_set_distance_ev
|
|
656
|
-
|
|
657
|
-
return data_store
|
|
687
|
+
return np.hstack((data_store, prob_set_distance_ev))
|
|
658
688
|
|
|
659
|
-
def _prob_local_outlier_factors(self,
|
|
660
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
689
|
+
def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
|
|
661
690
|
"""
|
|
662
691
|
Calculates the probabilistic local outlier factor for each
|
|
663
692
|
observation in the input data.
|
|
@@ -667,13 +696,22 @@ class LocalOutlierProbability(object):
|
|
|
667
696
|
each observation.
|
|
668
697
|
"""
|
|
669
698
|
return np.hstack(
|
|
670
|
-
(
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
699
|
+
(
|
|
700
|
+
data_store,
|
|
701
|
+
np.array(
|
|
702
|
+
[
|
|
703
|
+
np.apply_along_axis(
|
|
704
|
+
self._prob_outlier_factor,
|
|
705
|
+
0,
|
|
706
|
+
data_store[:, 5],
|
|
707
|
+
data_store[:, 6],
|
|
708
|
+
)
|
|
709
|
+
]
|
|
710
|
+
).T,
|
|
711
|
+
)
|
|
712
|
+
)
|
|
674
713
|
|
|
675
|
-
def _prob_local_outlier_factors_ev(self,
|
|
676
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
714
|
+
def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:
|
|
677
715
|
"""
|
|
678
716
|
Calculates the expected value of the probabilistic local outlier factor
|
|
679
717
|
for each observation in the input data with respect to the cluster the
|
|
@@ -686,21 +724,31 @@ class LocalOutlierProbability(object):
|
|
|
686
724
|
prob_local_outlier_factor_ev_dict = {}
|
|
687
725
|
for cluster_id in self.cluster_labels_u:
|
|
688
726
|
indices = np.where(data_store[:, 0] == cluster_id)
|
|
689
|
-
prob_local_outlier_factors = np.take(data_store[:, 7],
|
|
690
|
-
|
|
691
|
-
prob_local_outlier_factors_nonan = prob_local_outlier_factors[
|
|
692
|
-
np.logical_not(np.isnan(prob_local_outlier_factors))]
|
|
693
|
-
prob_local_outlier_factor_ev_dict[cluster_id] = (
|
|
694
|
-
np.power(prob_local_outlier_factors_nonan, 2).sum() /
|
|
695
|
-
float(prob_local_outlier_factors_nonan.size)
|
|
727
|
+
prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(
|
|
728
|
+
float
|
|
696
729
|
)
|
|
730
|
+
prob_local_outlier_factors_nonan = prob_local_outlier_factors[
|
|
731
|
+
np.logical_not(np.isnan(prob_local_outlier_factors))
|
|
732
|
+
]
|
|
733
|
+
prob_local_outlier_factor_ev_dict[cluster_id] = np.power(
|
|
734
|
+
prob_local_outlier_factors_nonan, 2
|
|
735
|
+
).sum() / float(prob_local_outlier_factors_nonan.size)
|
|
697
736
|
data_store = np.hstack(
|
|
698
|
-
(
|
|
699
|
-
|
|
737
|
+
(
|
|
738
|
+
data_store,
|
|
739
|
+
np.array(
|
|
740
|
+
[
|
|
741
|
+
[
|
|
742
|
+
prob_local_outlier_factor_ev_dict[x]
|
|
743
|
+
for x in data_store[:, 0].tolist()
|
|
744
|
+
]
|
|
745
|
+
]
|
|
746
|
+
).T,
|
|
747
|
+
)
|
|
748
|
+
)
|
|
700
749
|
return data_store
|
|
701
750
|
|
|
702
|
-
def _norm_prob_local_outlier_factors(self, data_store: np.ndarray)
|
|
703
|
-
-> np.ndarray:
|
|
751
|
+
def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
|
|
704
752
|
"""
|
|
705
753
|
Calculates the normalized probabilistic local outlier factor for each
|
|
706
754
|
observation in the input data.
|
|
@@ -709,11 +757,20 @@ class LocalOutlierProbability(object):
|
|
|
709
757
|
:return: the updated storage matrix that collects information on
|
|
710
758
|
each observation.
|
|
711
759
|
"""
|
|
712
|
-
return np.hstack(
|
|
713
|
-
|
|
760
|
+
return np.hstack(
|
|
761
|
+
(
|
|
762
|
+
data_store,
|
|
763
|
+
np.array(
|
|
764
|
+
[
|
|
765
|
+
self._norm_prob_outlier_factor(
|
|
766
|
+
self.extent, data_store[:, 8].tolist()
|
|
767
|
+
)
|
|
768
|
+
]
|
|
769
|
+
).T,
|
|
770
|
+
)
|
|
771
|
+
)
|
|
714
772
|
|
|
715
|
-
def _local_outlier_probabilities(self,
|
|
716
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
773
|
+
def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:
|
|
717
774
|
"""
|
|
718
775
|
Calculates the local outlier probability for each observation in the
|
|
719
776
|
input data.
|
|
@@ -723,17 +780,26 @@ class LocalOutlierProbability(object):
|
|
|
723
780
|
each observation.
|
|
724
781
|
"""
|
|
725
782
|
return np.hstack(
|
|
726
|
-
(
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
783
|
+
(
|
|
784
|
+
data_store,
|
|
785
|
+
np.array(
|
|
786
|
+
[
|
|
787
|
+
np.apply_along_axis(
|
|
788
|
+
self._local_outlier_probability,
|
|
789
|
+
0,
|
|
790
|
+
data_store[:, 7],
|
|
791
|
+
data_store[:, 9],
|
|
792
|
+
)
|
|
793
|
+
]
|
|
794
|
+
).T,
|
|
795
|
+
)
|
|
796
|
+
)
|
|
730
797
|
|
|
731
798
|
"""
|
|
732
799
|
Public methods
|
|
733
800
|
"""
|
|
734
801
|
|
|
735
|
-
def fit(self) ->
|
|
736
|
-
|
|
802
|
+
def fit(self) -> "LocalOutlierProbability":
|
|
737
803
|
"""
|
|
738
804
|
Calculates the local outlier probability for each observation in the
|
|
739
805
|
input data according to the input parameters extent, n_neighbors, and
|
|
@@ -745,8 +811,7 @@ class LocalOutlierProbability(object):
|
|
|
745
811
|
self.Validate._n_neighbors(self)
|
|
746
812
|
if self.Validate._cluster_size(self) is False:
|
|
747
813
|
sys.exit()
|
|
748
|
-
if self.data is not None and self.Validate._missing_values(
|
|
749
|
-
self) is False:
|
|
814
|
+
if self.data is not None and self.Validate._missing_values(self) is False:
|
|
750
815
|
sys.exit()
|
|
751
816
|
|
|
752
817
|
store = self._store()
|
|
@@ -770,7 +835,6 @@ class LocalOutlierProbability(object):
|
|
|
770
835
|
return self
|
|
771
836
|
|
|
772
837
|
def stream(self, x: np.ndarray) -> np.ndarray:
|
|
773
|
-
|
|
774
838
|
"""
|
|
775
839
|
Calculates the local outlier probability for an individual sample
|
|
776
840
|
according to the input parameters extent, n_neighbors, and
|
|
@@ -809,12 +873,12 @@ class LocalOutlierProbability(object):
|
|
|
809
873
|
ssd = np.power(distances, 2).sum()
|
|
810
874
|
std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
|
|
811
875
|
prob_dist = self._prob_distance(self.extent, std_dist)
|
|
812
|
-
plof = self._prob_outlier_factor(
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
)
|
|
876
|
+
plof = self._prob_outlier_factor(
|
|
877
|
+
np.array(prob_dist), np.array(self.prob_distances_ev.mean())
|
|
878
|
+
)
|
|
816
879
|
loop = self._local_outlier_probability(
|
|
817
|
-
plof, self.norm_prob_local_outlier_factor
|
|
880
|
+
plof, self.norm_prob_local_outlier_factor
|
|
881
|
+
)
|
|
818
882
|
|
|
819
883
|
if orig_cluster_labels is not None:
|
|
820
884
|
self.cluster_labels = orig_cluster_labels
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: PyNomaly
|
|
3
|
-
Version: 0.3.
|
|
3
|
+
Version: 0.3.4
|
|
4
4
|
Summary: A Python 3 implementation of LoOP: Local Outlier Probabilities, a local density based outlier detection method providing an outlier score in the range of [0,1].
|
|
5
5
|
Home-page: https://github.com/vc1492a/PyNomaly
|
|
6
6
|
Author: Valentino Constantinou
|
|
7
7
|
Author-email: vc@valentino.io
|
|
8
8
|
License: Apache License, Version 2.0
|
|
9
|
-
Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.
|
|
9
|
+
Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz
|
|
10
10
|
Keywords: outlier,anomaly,detection,machine,learning,probability
|
|
11
11
|
Platform: UNKNOWN
|
|
12
12
|
Requires-Dist: numpy
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
PyNomaly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
PyNomaly/loop.py,sha256=VLllAa5pOIHZjlI0XuLSpjLzY3tJ_ZTzDCbbIh3VM44,34571
|
|
3
|
+
PyNomaly-0.3.4.dist-info/LICENSE.txt,sha256=xZYfuJFfM57xOlBLbkJmsCwEvw1P6K2t3jI8faTdOMs,563
|
|
4
|
+
PyNomaly-0.3.4.dist-info/METADATA,sha256=xkHaSUSpOnZynE_KfVQAwoBXNOzTpE-IymwuiRdIeos,581
|
|
5
|
+
PyNomaly-0.3.4.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
|
|
6
|
+
PyNomaly-0.3.4.dist-info/top_level.txt,sha256=el-HX4RLyBjkh2CW3TK9yXAA54zQOIYVmcJjRbBYKX4,9
|
|
7
|
+
PyNomaly-0.3.4.dist-info/RECORD,,
|
PyNomaly-0.3.2.dist-info/RECORD
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
PyNomaly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
PyNomaly/loop.py,sha256=TN_uRdunxW-9T_uqwBEMe3hFQ1KBPEBaaILdx60j7dY,33782
|
|
3
|
-
PyNomaly-0.3.2.dist-info/LICENSE.txt,sha256=xZYfuJFfM57xOlBLbkJmsCwEvw1P6K2t3jI8faTdOMs,563
|
|
4
|
-
PyNomaly-0.3.2.dist-info/METADATA,sha256=aZyIcqIYNKlRvPfGer9jppgw5lA3n9kA-q6OWXcuYYg,581
|
|
5
|
-
PyNomaly-0.3.2.dist-info/WHEEL,sha256=S8S5VL-stOTSZDYxHyf0KP7eds0J72qrK0Evu3TfyAY,92
|
|
6
|
-
PyNomaly-0.3.2.dist-info/top_level.txt,sha256=el-HX4RLyBjkh2CW3TK9yXAA54zQOIYVmcJjRbBYKX4,9
|
|
7
|
-
PyNomaly-0.3.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|