PyNomaly 0.3.3__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/PKG-INFO +2 -2
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/PyNomaly/loop.py +225 -164
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/PyNomaly.egg-info/PKG-INFO +2 -2
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/PyNomaly.egg-info/SOURCES.txt +0 -1
- PyNomaly-0.3.4/setup.cfg +4 -0
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/setup.py +2 -2
- PyNomaly-0.3.3/setup.cfg +0 -7
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/PyNomaly/__init__.py +0 -0
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/PyNomaly.egg-info/dependency_links.txt +0 -0
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/PyNomaly.egg-info/requires.txt +0 -0
- {PyNomaly-0.3.3 → PyNomaly-0.3.4}/PyNomaly.egg-info/top_level.txt +0 -0
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 1.1
|
|
2
2
|
Name: PyNomaly
|
|
3
|
-
Version: 0.3.3
|
|
3
|
+
Version: 0.3.4
|
|
4
4
|
Summary: A Python 3 implementation of LoOP: Local Outlier Probabilities, a local density based outlier detection method providing an outlier score in the range of [0,1].
|
|
5
5
|
Home-page: https://github.com/vc1492a/PyNomaly
|
|
6
6
|
Author: Valentino Constantinou
|
|
7
7
|
Author-email: vc@valentino.io
|
|
8
8
|
License: Apache License, Version 2.0
|
|
9
|
-
Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.3.tar.gz
|
|
9
|
+
Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz
|
|
10
10
|
Description: UNKNOWN
|
|
11
11
|
Keywords: outlier,anomaly,detection,machine,learning,probability
|
|
12
12
|
Platform: UNKNOWN
|
|
@@ -10,13 +10,12 @@ try:
|
|
|
10
10
|
except ImportError:
|
|
11
11
|
pass
|
|
12
12
|
|
|
13
|
-
__author__ = 'Valentino Constantinou'
|
|
14
|
-
__version__ = '0.3.3'
|
|
15
|
-
__license__ = 'Apache License, Version 2.0'
|
|
13
|
+
__author__ = "Valentino Constantinou"
|
|
14
|
+
__version__ = "0.3.4"
|
|
15
|
+
__license__ = "Apache License, Version 2.0"
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class Utils:
|
|
19
|
-
|
|
20
19
|
@staticmethod
|
|
21
20
|
def emit_progress_bar(progress: str, index: int, total: int) -> str:
|
|
22
21
|
"""
|
|
@@ -55,7 +54,7 @@ class LocalOutlierProbability(object):
|
|
|
55
54
|
:param cluster_labels: a numpy array of cluster assignments w.r.t. each
|
|
56
55
|
sample (optional, default None)
|
|
57
56
|
:return:
|
|
58
|
-
""""""
|
|
57
|
+
""" """
|
|
59
58
|
|
|
60
59
|
Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP:
|
|
61
60
|
Local Outlier Probabilities.
|
|
@@ -93,7 +92,7 @@ class LocalOutlierProbability(object):
|
|
|
93
92
|
"""
|
|
94
93
|
|
|
95
94
|
@staticmethod
|
|
96
|
-
def _data(obj: Union[
|
|
95
|
+
def _data(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
|
|
97
96
|
"""
|
|
98
97
|
Validates the input data to ensure it is either a Pandas DataFrame
|
|
99
98
|
or Numpy array.
|
|
@@ -101,24 +100,25 @@ class LocalOutlierProbability(object):
|
|
|
101
100
|
:return: a vector of values to be used in calculating the local
|
|
102
101
|
outlier probability.
|
|
103
102
|
"""
|
|
104
|
-
if obj.__class__.__name__ ==
|
|
103
|
+
if obj.__class__.__name__ == "DataFrame":
|
|
105
104
|
points_vector = obj.values
|
|
106
105
|
return points_vector
|
|
107
|
-
elif obj.__class__.__name__ ==
|
|
106
|
+
elif obj.__class__.__name__ == "ndarray":
|
|
108
107
|
points_vector = obj
|
|
109
108
|
return points_vector
|
|
110
109
|
else:
|
|
111
110
|
warnings.warn(
|
|
112
111
|
"Provided data or distance matrix must be in ndarray "
|
|
113
112
|
"or DataFrame.",
|
|
114
|
-
UserWarning
|
|
113
|
+
UserWarning,
|
|
114
|
+
)
|
|
115
115
|
if isinstance(obj, list):
|
|
116
116
|
points_vector = np.array(obj)
|
|
117
117
|
return points_vector
|
|
118
118
|
points_vector = np.array([obj])
|
|
119
119
|
return points_vector
|
|
120
120
|
|
|
121
|
-
def _inputs(self, obj:
|
|
121
|
+
def _inputs(self, obj: "LocalOutlierProbability"):
|
|
122
122
|
"""
|
|
123
123
|
Validates the inputs provided during initialization to ensure
|
|
124
124
|
that the needed objects are provided.
|
|
@@ -134,35 +134,43 @@ class LocalOutlierProbability(object):
|
|
|
134
134
|
elif all(v is not None for v in [obj.data, obj.distance_matrix]):
|
|
135
135
|
warnings.warn(
|
|
136
136
|
"Only one of the following may be provided: data or a "
|
|
137
|
-
"distance matrix (not both).",
|
|
137
|
+
"distance matrix (not both).",
|
|
138
|
+
UserWarning,
|
|
138
139
|
)
|
|
139
140
|
return False
|
|
140
141
|
if obj.data is not None:
|
|
141
142
|
points_vector = self._data(obj.data)
|
|
142
143
|
return points_vector, obj.distance_matrix, obj.neighbor_matrix
|
|
143
|
-
if all(
|
|
144
|
-
|
|
144
|
+
if all(
|
|
145
|
+
matrix is not None
|
|
146
|
+
for matrix in [obj.neighbor_matrix, obj.distance_matrix]
|
|
147
|
+
):
|
|
145
148
|
dist_vector = self._data(obj.distance_matrix)
|
|
146
149
|
neigh_vector = self._data(obj.neighbor_matrix)
|
|
147
150
|
else:
|
|
148
151
|
warnings.warn(
|
|
149
152
|
"A neighbor index matrix and distance matrix must both be "
|
|
150
|
-
"provided when not using raw input data.",
|
|
153
|
+
"provided when not using raw input data.",
|
|
154
|
+
UserWarning,
|
|
151
155
|
)
|
|
152
156
|
return False
|
|
153
157
|
if obj.distance_matrix.shape != obj.neighbor_matrix.shape:
|
|
154
158
|
warnings.warn(
|
|
155
159
|
"The shape of the distance and neighbor "
|
|
156
|
-
"index matrices must match.",
|
|
160
|
+
"index matrices must match.",
|
|
161
|
+
UserWarning,
|
|
157
162
|
)
|
|
158
163
|
return False
|
|
159
|
-
elif (obj.distance_matrix.shape[1] != obj.n_neighbors)
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
warnings.warn(
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
164
|
+
elif (obj.distance_matrix.shape[1] != obj.n_neighbors) or (
|
|
165
|
+
obj.neighbor_matrix.shape[1] != obj.n_neighbors
|
|
166
|
+
):
|
|
167
|
+
warnings.warn(
|
|
168
|
+
"The shape of the distance or "
|
|
169
|
+
"neighbor index matrix does not "
|
|
170
|
+
"match the number of neighbors "
|
|
171
|
+
"specified.",
|
|
172
|
+
UserWarning,
|
|
173
|
+
)
|
|
166
174
|
return False
|
|
167
175
|
return obj.data, dist_vector, neigh_vector
|
|
168
176
|
|
|
@@ -184,7 +192,8 @@ class LocalOutlierProbability(object):
|
|
|
184
192
|
"cluster. Specify a number of neighbors smaller than "
|
|
185
193
|
"the smallest cluster size (observations in smallest "
|
|
186
194
|
"cluster minus one).",
|
|
187
|
-
UserWarning
|
|
195
|
+
UserWarning,
|
|
196
|
+
)
|
|
188
197
|
return False
|
|
189
198
|
return True
|
|
190
199
|
|
|
@@ -199,17 +208,19 @@ class LocalOutlierProbability(object):
|
|
|
199
208
|
"""
|
|
200
209
|
if not obj.n_neighbors > 0:
|
|
201
210
|
obj.n_neighbors = 10
|
|
202
|
-
warnings.warn(
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
211
|
+
warnings.warn(
|
|
212
|
+
"n_neighbors must be greater than 0."
|
|
213
|
+
" Fit with " + str(obj.n_neighbors) + " instead.",
|
|
214
|
+
UserWarning,
|
|
215
|
+
)
|
|
206
216
|
return False
|
|
207
217
|
elif obj.n_neighbors >= obj._n_observations():
|
|
208
218
|
obj.n_neighbors = obj._n_observations() - 1
|
|
209
219
|
warnings.warn(
|
|
210
220
|
"n_neighbors must be less than the number of observations."
|
|
211
221
|
" Fit with " + str(obj.n_neighbors) + " instead.",
|
|
212
|
-
UserWarning
|
|
222
|
+
UserWarning,
|
|
223
|
+
)
|
|
213
224
|
return True
|
|
214
225
|
|
|
215
226
|
@staticmethod
|
|
@@ -222,8 +233,8 @@ class LocalOutlierProbability(object):
|
|
|
222
233
|
"""
|
|
223
234
|
if obj.extent not in [1, 2, 3]:
|
|
224
235
|
warnings.warn(
|
|
225
|
-
"extent parameter (lambda) must be 1, 2, or 3.",
|
|
226
|
-
|
|
236
|
+
"extent parameter (lambda) must be 1, 2, or 3.", UserWarning
|
|
237
|
+
)
|
|
227
238
|
return False
|
|
228
239
|
return True
|
|
229
240
|
|
|
@@ -237,8 +248,8 @@ class LocalOutlierProbability(object):
|
|
|
237
248
|
"""
|
|
238
249
|
if np.any(np.isnan(obj.data)):
|
|
239
250
|
warnings.warn(
|
|
240
|
-
"Method does not support missing values in input data.",
|
|
241
|
-
|
|
251
|
+
"Method does not support missing values in input data.", UserWarning
|
|
252
|
+
)
|
|
242
253
|
return False
|
|
243
254
|
return True
|
|
244
255
|
|
|
@@ -254,7 +265,8 @@ class LocalOutlierProbability(object):
|
|
|
254
265
|
warnings.warn(
|
|
255
266
|
"Must fit on historical data by calling fit() prior to "
|
|
256
267
|
"calling stream(x).",
|
|
257
|
-
UserWarning
|
|
268
|
+
UserWarning,
|
|
269
|
+
)
|
|
258
270
|
return False
|
|
259
271
|
return True
|
|
260
272
|
|
|
@@ -272,7 +284,8 @@ class LocalOutlierProbability(object):
|
|
|
272
284
|
warnings.warn(
|
|
273
285
|
"Stream approach does not support clustered data. "
|
|
274
286
|
"Automatically refit using single cluster of points.",
|
|
275
|
-
UserWarning
|
|
287
|
+
UserWarning,
|
|
288
|
+
)
|
|
276
289
|
return False
|
|
277
290
|
return True
|
|
278
291
|
|
|
@@ -294,43 +307,35 @@ class LocalOutlierProbability(object):
|
|
|
294
307
|
assert len(types) == f.__code__.co_argcount
|
|
295
308
|
|
|
296
309
|
def new_f(*args, **kwds):
|
|
297
|
-
for
|
|
298
|
-
if type(a).__name__ ==
|
|
310
|
+
for a, t in zip(args, types):
|
|
311
|
+
if type(a).__name__ == "DataFrame":
|
|
299
312
|
a = np.array(a)
|
|
300
313
|
if isinstance(a, t) is False:
|
|
301
|
-
warnings.warn(
|
|
302
|
-
|
|
314
|
+
warnings.warn(
|
|
315
|
+
"Argument %r is not of type %s" % (a, t), UserWarning
|
|
316
|
+
)
|
|
303
317
|
opt_types = {
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
},
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
},
|
|
310
|
-
|
|
311
|
-
'type': types[4]
|
|
312
|
-
},
|
|
313
|
-
'n_neighbors': {
|
|
314
|
-
'type': types[5]
|
|
315
|
-
},
|
|
316
|
-
'cluster_labels': {
|
|
317
|
-
'type': types[6]
|
|
318
|
-
},
|
|
319
|
-
'use_numba': {
|
|
320
|
-
'type': types[7]
|
|
321
|
-
},
|
|
322
|
-
'progress_bar': {
|
|
323
|
-
'type': types[8]
|
|
324
|
-
}
|
|
318
|
+
"distance_matrix": {"type": types[2]},
|
|
319
|
+
"neighbor_matrix": {"type": types[3]},
|
|
320
|
+
"extent": {"type": types[4]},
|
|
321
|
+
"n_neighbors": {"type": types[5]},
|
|
322
|
+
"cluster_labels": {"type": types[6]},
|
|
323
|
+
"use_numba": {"type": types[7]},
|
|
324
|
+
"progress_bar": {"type": types[8]},
|
|
325
325
|
}
|
|
326
326
|
for x in kwds:
|
|
327
|
-
opt_types[x][
|
|
327
|
+
opt_types[x]["value"] = kwds[x]
|
|
328
328
|
for k in opt_types:
|
|
329
329
|
try:
|
|
330
|
-
if
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
330
|
+
if (
|
|
331
|
+
isinstance(opt_types[k]["value"], opt_types[k]["type"])
|
|
332
|
+
is False
|
|
333
|
+
):
|
|
334
|
+
warnings.warn(
|
|
335
|
+
"Argument %r is not of type %s."
|
|
336
|
+
% (k, opt_types[k]["type"]),
|
|
337
|
+
UserWarning,
|
|
338
|
+
)
|
|
334
339
|
except KeyError:
|
|
335
340
|
pass
|
|
336
341
|
return f(*args, **kwds)
|
|
@@ -340,11 +345,28 @@ class LocalOutlierProbability(object):
|
|
|
340
345
|
|
|
341
346
|
return decorator
|
|
342
347
|
|
|
343
|
-
@accepts(
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
+
@accepts(
|
|
349
|
+
object,
|
|
350
|
+
np.ndarray,
|
|
351
|
+
np.ndarray,
|
|
352
|
+
np.ndarray,
|
|
353
|
+
(int, np.integer),
|
|
354
|
+
(int, np.integer),
|
|
355
|
+
list,
|
|
356
|
+
bool,
|
|
357
|
+
bool,
|
|
358
|
+
)
|
|
359
|
+
def __init__(
|
|
360
|
+
self,
|
|
361
|
+
data=None,
|
|
362
|
+
distance_matrix=None,
|
|
363
|
+
neighbor_matrix=None,
|
|
364
|
+
extent=3,
|
|
365
|
+
n_neighbors=10,
|
|
366
|
+
cluster_labels=None,
|
|
367
|
+
use_numba=False,
|
|
368
|
+
progress_bar=False,
|
|
369
|
+
) -> None:
|
|
348
370
|
self.data = data
|
|
349
371
|
self.distance_matrix = distance_matrix
|
|
350
372
|
self.neighbor_matrix = neighbor_matrix
|
|
@@ -361,11 +383,11 @@ class LocalOutlierProbability(object):
|
|
|
361
383
|
self.progress_bar = progress_bar
|
|
362
384
|
self.is_fit = False
|
|
363
385
|
|
|
364
|
-
if self.use_numba is True and
|
|
386
|
+
if self.use_numba is True and "numba" not in sys.modules:
|
|
365
387
|
self.use_numba = False
|
|
366
388
|
warnings.warn(
|
|
367
|
-
"Numba is not available, falling back to pure python mode.",
|
|
368
|
-
|
|
389
|
+
"Numba is not available, falling back to pure python mode.", UserWarning
|
|
390
|
+
)
|
|
369
391
|
|
|
370
392
|
self.Validate()._inputs(self)
|
|
371
393
|
self.Validate._extent(self)
|
|
@@ -375,15 +397,14 @@ class LocalOutlierProbability(object):
|
|
|
375
397
|
"""
|
|
376
398
|
|
|
377
399
|
@staticmethod
|
|
378
|
-
def _standard_distance(cardinality: float, sum_squared_distance: float)
|
|
379
|
-
-> float:
|
|
400
|
+
def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:
|
|
380
401
|
"""
|
|
381
402
|
Calculates the standard distance of an observation.
|
|
382
403
|
:param cardinality: the cardinality of the input observation.
|
|
383
404
|
:param sum_squared_distance: the sum squared distance between all
|
|
384
405
|
neighbors of the input observation.
|
|
385
406
|
:return: the standard distance.
|
|
386
|
-
#
|
|
407
|
+
#"""
|
|
387
408
|
division_result = sum_squared_distance / cardinality
|
|
388
409
|
st_dist = sqrt(division_result)
|
|
389
410
|
return st_dist
|
|
@@ -400,8 +421,9 @@ class LocalOutlierProbability(object):
|
|
|
400
421
|
return extent * standard_distance
|
|
401
422
|
|
|
402
423
|
@staticmethod
|
|
403
|
-
def _prob_outlier_factor(
|
|
404
|
-
|
|
424
|
+
def _prob_outlier_factor(
|
|
425
|
+
probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray
|
|
426
|
+
) -> np.ndarray:
|
|
405
427
|
"""
|
|
406
428
|
Calculates the probabilistic outlier factor of an observation.
|
|
407
429
|
:param probabilistic_distance: the probabilistic distance of the
|
|
@@ -412,14 +434,14 @@ class LocalOutlierProbability(object):
|
|
|
412
434
|
if np.all(probabilistic_distance == ev_prob_dist):
|
|
413
435
|
return np.zeros(probabilistic_distance.shape)
|
|
414
436
|
else:
|
|
415
|
-
ev_prob_dist[ev_prob_dist == 0.] = 1.
|
|
416
|
-
result = np.divide(probabilistic_distance, ev_prob_dist) - 1.
|
|
437
|
+
ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8
|
|
438
|
+
result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0
|
|
417
439
|
return result
|
|
418
440
|
|
|
419
441
|
@staticmethod
|
|
420
|
-
def _norm_prob_outlier_factor(
|
|
421
|
-
|
|
422
|
-
|
|
442
|
+
def _norm_prob_outlier_factor(
|
|
443
|
+
extent: float, ev_probabilistic_outlier_factor: list
|
|
444
|
+
) -> list:
|
|
423
445
|
"""
|
|
424
446
|
Calculates the normalized probabilistic outlier factor of an
|
|
425
447
|
observation.
|
|
@@ -434,8 +456,9 @@ class LocalOutlierProbability(object):
|
|
|
434
456
|
return npofs
|
|
435
457
|
|
|
436
458
|
@staticmethod
|
|
437
|
-
def _local_outlier_probability(
|
|
438
|
-
|
|
459
|
+
def _local_outlier_probability(
|
|
460
|
+
plof_val: np.ndarray, nplof_val: np.ndarray
|
|
461
|
+
) -> np.ndarray:
|
|
439
462
|
"""
|
|
440
463
|
Calculates the local outlier probability of an observation.
|
|
441
464
|
:param plof_val: the probabilistic outlier factor of the input
|
|
@@ -448,7 +471,7 @@ class LocalOutlierProbability(object):
|
|
|
448
471
|
if np.all(plof_val == nplof_val):
|
|
449
472
|
return np.zeros(plof_val.shape)
|
|
450
473
|
else:
|
|
451
|
-
return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.))))
|
|
474
|
+
return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))
|
|
452
475
|
|
|
453
476
|
def _n_observations(self) -> int:
|
|
454
477
|
"""
|
|
@@ -502,8 +525,9 @@ class LocalOutlierProbability(object):
|
|
|
502
525
|
:return: the updated storage matrix that collects information on
|
|
503
526
|
each observation.
|
|
504
527
|
"""
|
|
505
|
-
for vec, cluster_id in zip(
|
|
506
|
-
|
|
528
|
+
for vec, cluster_id in zip(
|
|
529
|
+
range(self.distance_matrix.shape[0]), self._cluster_labels()
|
|
530
|
+
):
|
|
507
531
|
data_store[vec][0] = cluster_id
|
|
508
532
|
data_store[vec][1] = self.distance_matrix[vec]
|
|
509
533
|
data_store[vec][2] = self.neighbor_matrix[vec]
|
|
@@ -511,10 +535,10 @@ class LocalOutlierProbability(object):
|
|
|
511
535
|
|
|
512
536
|
@staticmethod
|
|
513
537
|
def _compute_distance_and_neighbor_matrix(
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
538
|
+
clust_points_vector: np.ndarray,
|
|
539
|
+
indices: np.ndarray,
|
|
540
|
+
distances: np.ndarray,
|
|
541
|
+
indexes: np.ndarray,
|
|
518
542
|
) -> Tuple[np.ndarray, np.ndarray, int]:
|
|
519
543
|
"""
|
|
520
544
|
This helper method provides the heavy lifting for the _distances
|
|
@@ -522,27 +546,27 @@ class LocalOutlierProbability(object):
|
|
|
522
546
|
written so that it can make full use of Numba's jit capabilities if
|
|
523
547
|
desired.
|
|
524
548
|
"""
|
|
525
|
-
|
|
526
549
|
for i in range(clust_points_vector.shape[0]):
|
|
527
550
|
for j in range(i + 1, clust_points_vector.shape[0]):
|
|
528
|
-
|
|
551
|
+
# Global index of the points
|
|
552
|
+
global_i = indices[0][i]
|
|
553
|
+
global_j = indices[0][j]
|
|
529
554
|
|
|
530
|
-
|
|
555
|
+
# Compute Euclidean distance
|
|
556
|
+
diff = clust_points_vector[i] - clust_points_vector[j]
|
|
531
557
|
d = np.dot(diff, diff) ** 0.5
|
|
532
558
|
|
|
533
|
-
|
|
534
|
-
idx_max = distances[
|
|
559
|
+
# Update distance and neighbor index for global_i
|
|
560
|
+
idx_max = distances[global_i].argmax()
|
|
561
|
+
if d < distances[global_i][idx_max]:
|
|
562
|
+
distances[global_i][idx_max] = d
|
|
563
|
+
indexes[global_i][idx_max] = global_j
|
|
535
564
|
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
idx_max = distances[idx].argmax()
|
|
542
|
-
|
|
543
|
-
if d < distances[idx][idx_max]:
|
|
544
|
-
distances[idx][idx_max] = d
|
|
545
|
-
indexes[idx][idx_max] = p[0][0]
|
|
565
|
+
# Update distance and neighbor index for global_j
|
|
566
|
+
idx_max = distances[global_j].argmax()
|
|
567
|
+
if d < distances[global_j][idx_max]:
|
|
568
|
+
distances[global_j][idx_max] = d
|
|
569
|
+
indexes[global_j][idx_max] = global_i
|
|
546
570
|
|
|
547
571
|
yield distances, indexes, i
|
|
548
572
|
|
|
@@ -555,20 +579,21 @@ class LocalOutlierProbability(object):
|
|
|
555
579
|
:return: the updated storage matrix that collects information on
|
|
556
580
|
each observation.
|
|
557
581
|
"""
|
|
558
|
-
distances = np.full(
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
582
|
+
distances = np.full(
|
|
583
|
+
[self._n_observations(), self.n_neighbors], 9e10, dtype=float
|
|
584
|
+
)
|
|
585
|
+
indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
|
|
562
586
|
self.points_vector = self.Validate._data(self.data)
|
|
563
|
-
compute =
|
|
564
|
-
|
|
565
|
-
self.
|
|
587
|
+
compute = (
|
|
588
|
+
numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
|
|
589
|
+
if self.use_numba
|
|
590
|
+
else self._compute_distance_and_neighbor_matrix
|
|
591
|
+
)
|
|
566
592
|
progress = "="
|
|
567
593
|
for cluster_id in set(self._cluster_labels()):
|
|
568
594
|
indices = np.where(self._cluster_labels() == cluster_id)
|
|
569
595
|
clust_points_vector = np.array(
|
|
570
|
-
self.points_vector.take(indices, axis=0)[0],
|
|
571
|
-
dtype=np.float64
|
|
596
|
+
self.points_vector.take(indices, axis=0)[0], dtype=np.float64
|
|
572
597
|
)
|
|
573
598
|
# a generator that yields an updated distance matrix on each loop
|
|
574
599
|
for c in compute(clust_points_vector, indices, distances, indexes):
|
|
@@ -576,7 +601,8 @@ class LocalOutlierProbability(object):
|
|
|
576
601
|
# update the progress bar
|
|
577
602
|
if progress_bar is True:
|
|
578
603
|
progress = Utils.emit_progress_bar(
|
|
579
|
-
progress, i+1, clust_points_vector.shape[0]
|
|
604
|
+
progress, i + 1, clust_points_vector.shape[0]
|
|
605
|
+
)
|
|
580
606
|
|
|
581
607
|
self.distance_matrix = distances
|
|
582
608
|
self.neighbor_matrix = indexes
|
|
@@ -630,11 +656,10 @@ class LocalOutlierProbability(object):
|
|
|
630
656
|
"""
|
|
631
657
|
prob_distances = []
|
|
632
658
|
for i in range(data_store[:, 4].shape[0]):
|
|
633
|
-
prob_distances.append(
|
|
634
|
-
self._prob_distance(self.extent, data_store[:, 4][i]))
|
|
659
|
+
prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))
|
|
635
660
|
return np.hstack((data_store, np.array([prob_distances]).T))
|
|
636
661
|
|
|
637
|
-
def _prob_distances_ev(self, data_store
|
|
662
|
+
def _prob_distances_ev(self, data_store) -> np.ndarray:
|
|
638
663
|
"""
|
|
639
664
|
Calculates the expected value of the probabilistic distance for
|
|
640
665
|
each observation in the input data with respect to the cluster the
|
|
@@ -648,19 +673,20 @@ class LocalOutlierProbability(object):
|
|
|
648
673
|
for cluster_id in self.cluster_labels_u:
|
|
649
674
|
indices = np.where(data_store[:, 0] == cluster_id)[0]
|
|
650
675
|
for index in indices:
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
676
|
+
# Global neighbor indices for the current point
|
|
677
|
+
nbrhood = data_store[index][2].astype(int) # Ensure global indices
|
|
678
|
+
nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(
|
|
679
|
+
float
|
|
680
|
+
)
|
|
654
681
|
nbrhood_prob_distances_nonan = nbrhood_prob_distances[
|
|
655
|
-
np.logical_not(np.isnan(nbrhood_prob_distances))
|
|
656
|
-
|
|
657
|
-
|
|
682
|
+
np.logical_not(np.isnan(nbrhood_prob_distances))
|
|
683
|
+
]
|
|
684
|
+
prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()
|
|
685
|
+
|
|
658
686
|
self.prob_distances_ev = prob_set_distance_ev
|
|
659
|
-
|
|
660
|
-
return data_store
|
|
687
|
+
return np.hstack((data_store, prob_set_distance_ev))
|
|
661
688
|
|
|
662
|
-
def _prob_local_outlier_factors(self,
|
|
663
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
689
|
+
def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
|
|
664
690
|
"""
|
|
665
691
|
Calculates the probabilistic local outlier factor for each
|
|
666
692
|
observation in the input data.
|
|
@@ -670,13 +696,22 @@ class LocalOutlierProbability(object):
|
|
|
670
696
|
each observation.
|
|
671
697
|
"""
|
|
672
698
|
return np.hstack(
|
|
673
|
-
(
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
699
|
+
(
|
|
700
|
+
data_store,
|
|
701
|
+
np.array(
|
|
702
|
+
[
|
|
703
|
+
np.apply_along_axis(
|
|
704
|
+
self._prob_outlier_factor,
|
|
705
|
+
0,
|
|
706
|
+
data_store[:, 5],
|
|
707
|
+
data_store[:, 6],
|
|
708
|
+
)
|
|
709
|
+
]
|
|
710
|
+
).T,
|
|
711
|
+
)
|
|
712
|
+
)
|
|
677
713
|
|
|
678
|
-
def _prob_local_outlier_factors_ev(self,
|
|
679
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
714
|
+
def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:
|
|
680
715
|
"""
|
|
681
716
|
Calculates the expected value of the probabilistic local outlier factor
|
|
682
717
|
for each observation in the input data with respect to the cluster the
|
|
@@ -689,21 +724,31 @@ class LocalOutlierProbability(object):
|
|
|
689
724
|
prob_local_outlier_factor_ev_dict = {}
|
|
690
725
|
for cluster_id in self.cluster_labels_u:
|
|
691
726
|
indices = np.where(data_store[:, 0] == cluster_id)
|
|
692
|
-
prob_local_outlier_factors = np.take(data_store[:, 7],
|
|
693
|
-
|
|
694
|
-
prob_local_outlier_factors_nonan = prob_local_outlier_factors[
|
|
695
|
-
np.logical_not(np.isnan(prob_local_outlier_factors))]
|
|
696
|
-
prob_local_outlier_factor_ev_dict[cluster_id] = (
|
|
697
|
-
np.power(prob_local_outlier_factors_nonan, 2).sum() /
|
|
698
|
-
float(prob_local_outlier_factors_nonan.size)
|
|
727
|
+
prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(
|
|
728
|
+
float
|
|
699
729
|
)
|
|
730
|
+
prob_local_outlier_factors_nonan = prob_local_outlier_factors[
|
|
731
|
+
np.logical_not(np.isnan(prob_local_outlier_factors))
|
|
732
|
+
]
|
|
733
|
+
prob_local_outlier_factor_ev_dict[cluster_id] = np.power(
|
|
734
|
+
prob_local_outlier_factors_nonan, 2
|
|
735
|
+
).sum() / float(prob_local_outlier_factors_nonan.size)
|
|
700
736
|
data_store = np.hstack(
|
|
701
|
-
(
|
|
702
|
-
|
|
737
|
+
(
|
|
738
|
+
data_store,
|
|
739
|
+
np.array(
|
|
740
|
+
[
|
|
741
|
+
[
|
|
742
|
+
prob_local_outlier_factor_ev_dict[x]
|
|
743
|
+
for x in data_store[:, 0].tolist()
|
|
744
|
+
]
|
|
745
|
+
]
|
|
746
|
+
).T,
|
|
747
|
+
)
|
|
748
|
+
)
|
|
703
749
|
return data_store
|
|
704
750
|
|
|
705
|
-
def _norm_prob_local_outlier_factors(self, data_store: np.ndarray)
|
|
706
|
-
-> np.ndarray:
|
|
751
|
+
def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
|
|
707
752
|
"""
|
|
708
753
|
Calculates the normalized probabilistic local outlier factor for each
|
|
709
754
|
observation in the input data.
|
|
@@ -712,11 +757,20 @@ class LocalOutlierProbability(object):
|
|
|
712
757
|
:return: the updated storage matrix that collects information on
|
|
713
758
|
each observation.
|
|
714
759
|
"""
|
|
715
|
-
return np.hstack(
|
|
716
|
-
|
|
760
|
+
return np.hstack(
|
|
761
|
+
(
|
|
762
|
+
data_store,
|
|
763
|
+
np.array(
|
|
764
|
+
[
|
|
765
|
+
self._norm_prob_outlier_factor(
|
|
766
|
+
self.extent, data_store[:, 8].tolist()
|
|
767
|
+
)
|
|
768
|
+
]
|
|
769
|
+
).T,
|
|
770
|
+
)
|
|
771
|
+
)
|
|
717
772
|
|
|
718
|
-
def _local_outlier_probabilities(self,
|
|
719
|
-
data_store: np.ndarray) -> np.ndarray:
|
|
773
|
+
def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:
|
|
720
774
|
"""
|
|
721
775
|
Calculates the local outlier probability for each observation in the
|
|
722
776
|
input data.
|
|
@@ -726,17 +780,26 @@ class LocalOutlierProbability(object):
|
|
|
726
780
|
each observation.
|
|
727
781
|
"""
|
|
728
782
|
return np.hstack(
|
|
729
|
-
(
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
783
|
+
(
|
|
784
|
+
data_store,
|
|
785
|
+
np.array(
|
|
786
|
+
[
|
|
787
|
+
np.apply_along_axis(
|
|
788
|
+
self._local_outlier_probability,
|
|
789
|
+
0,
|
|
790
|
+
data_store[:, 7],
|
|
791
|
+
data_store[:, 9],
|
|
792
|
+
)
|
|
793
|
+
]
|
|
794
|
+
).T,
|
|
795
|
+
)
|
|
796
|
+
)
|
|
733
797
|
|
|
734
798
|
"""
|
|
735
799
|
Public methods
|
|
736
800
|
"""
|
|
737
801
|
|
|
738
|
-
def fit(self) ->
|
|
739
|
-
|
|
802
|
+
def fit(self) -> "LocalOutlierProbability":
|
|
740
803
|
"""
|
|
741
804
|
Calculates the local outlier probability for each observation in the
|
|
742
805
|
input data according to the input parameters extent, n_neighbors, and
|
|
@@ -748,8 +811,7 @@ class LocalOutlierProbability(object):
|
|
|
748
811
|
self.Validate._n_neighbors(self)
|
|
749
812
|
if self.Validate._cluster_size(self) is False:
|
|
750
813
|
sys.exit()
|
|
751
|
-
if self.data is not None and self.Validate._missing_values(
|
|
752
|
-
self) is False:
|
|
814
|
+
if self.data is not None and self.Validate._missing_values(self) is False:
|
|
753
815
|
sys.exit()
|
|
754
816
|
|
|
755
817
|
store = self._store()
|
|
@@ -773,7 +835,6 @@ class LocalOutlierProbability(object):
|
|
|
773
835
|
return self
|
|
774
836
|
|
|
775
837
|
def stream(self, x: np.ndarray) -> np.ndarray:
|
|
776
|
-
|
|
777
838
|
"""
|
|
778
839
|
Calculates the local outlier probability for an individual sample
|
|
779
840
|
according to the input parameters extent, n_neighbors, and
|
|
@@ -812,12 +873,12 @@ class LocalOutlierProbability(object):
|
|
|
812
873
|
ssd = np.power(distances, 2).sum()
|
|
813
874
|
std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
|
|
814
875
|
prob_dist = self._prob_distance(self.extent, std_dist)
|
|
815
|
-
plof = self._prob_outlier_factor(
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
)
|
|
876
|
+
plof = self._prob_outlier_factor(
|
|
877
|
+
np.array(prob_dist), np.array(self.prob_distances_ev.mean())
|
|
878
|
+
)
|
|
819
879
|
loop = self._local_outlier_probability(
|
|
820
|
-
plof, self.norm_prob_local_outlier_factor
|
|
880
|
+
plof, self.norm_prob_local_outlier_factor
|
|
881
|
+
)
|
|
821
882
|
|
|
822
883
|
if orig_cluster_labels is not None:
|
|
823
884
|
self.cluster_labels = orig_cluster_labels
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 1.1
|
|
2
2
|
Name: PyNomaly
|
|
3
|
-
Version: 0.3.3
|
|
3
|
+
Version: 0.3.4
|
|
4
4
|
Summary: A Python 3 implementation of LoOP: Local Outlier Probabilities, a local density based outlier detection method providing an outlier score in the range of [0,1].
|
|
5
5
|
Home-page: https://github.com/vc1492a/PyNomaly
|
|
6
6
|
Author: Valentino Constantinou
|
|
7
7
|
Author-email: vc@valentino.io
|
|
8
8
|
License: Apache License, Version 2.0
|
|
9
|
-
Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.3.tar.gz
|
|
9
|
+
Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz
|
|
10
10
|
Description: UNKNOWN
|
|
11
11
|
Keywords: outlier,anomaly,detection,machine,learning,probability
|
|
12
12
|
Platform: UNKNOWN
|
PyNomaly-0.3.4/setup.cfg
ADDED
|
@@ -3,14 +3,14 @@ from setuptools import setup
|
|
|
3
3
|
setup(
|
|
4
4
|
name='PyNomaly',
|
|
5
5
|
packages=['PyNomaly'],
|
|
6
|
-
version='0.3.3',
|
|
6
|
+
version='0.3.4',
|
|
7
7
|
description='A Python 3 implementation of LoOP: Local Outlier '
|
|
8
8
|
'Probabilities, a local density based outlier detection '
|
|
9
9
|
'method providing an outlier score in the range of [0,1].',
|
|
10
10
|
author='Valentino Constantinou',
|
|
11
11
|
author_email='vc@valentino.io',
|
|
12
12
|
url='https://github.com/vc1492a/PyNomaly',
|
|
13
|
-
download_url='https://github.com/vc1492a/PyNomaly/archive/0.3.3.tar.gz',
|
|
13
|
+
download_url='https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz',
|
|
14
14
|
keywords=['outlier', 'anomaly', 'detection', 'machine', 'learning',
|
|
15
15
|
'probability'],
|
|
16
16
|
classifiers=[],
|
PyNomaly-0.3.3/setup.cfg
DELETED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|