PyNomaly 0.3.1__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
PyNomaly/loop.py CHANGED
@@ -1,6 +1,8 @@
  from math import erf, sqrt
  import numpy as np
+ from python_utils.terminal import get_terminal_size
  import sys
+ from typing import Tuple, Union
  import warnings

  try:
@@ -8,9 +10,37 @@ try:
  except ImportError:
  pass

- __author__ = 'Valentino Constantinou'
- __version__ = '0.3.1'
- __license__ = 'Apache License, Version 2.0'
+ __author__ = "Valentino Constantinou"
+ __version__ = "0.3.4"
+ __license__ = "Apache License, Version 2.0"
+
+
+ class Utils:
+ @staticmethod
+ def emit_progress_bar(progress: str, index: int, total: int) -> str:
+ """
+ A progress bar that is continuously updated in Python's standard
+ out.
+ :param progress: a string printed to stdout that is updated and later
+ returned.
+ :param index: the current index of the iteration within the tracked
+ process.
+ :param total: the total length of the tracked process.
+ :return: progress string.
+ """
+
+ w, h = get_terminal_size()
+ sys.stdout.write("\r")
+ if total < w:
+ block_size = int(w / total)
+ else:
+ block_size = int(total / w)
+ if index % block_size == 0:
+ progress += "="
+ percent = index / total
+ sys.stdout.write("[ %s ] %.2f%%" % (progress, percent * 100))
+ sys.stdout.flush()
+ return progress


  class LocalOutlierProbability(object):
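For reference, a minimal sketch (not part of the package) of driving the new Utils.emit_progress_bar helper shown above from a plain Python loop; the workload size and sleep are illustrative stand-ins for real work:

    import time
    from PyNomaly.loop import Utils

    progress = "="
    total = 250  # illustrative workload size
    for i in range(total):
        time.sleep(0.01)  # stand-in for real work
        progress = Utils.emit_progress_bar(progress, i + 1, total)

The helper returns the accumulated progress string so the caller can pass it back in on the next iteration, which is how the bar grows across calls.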
@@ -24,7 +54,7 @@ class LocalOutlierProbability(object):
  :param cluster_labels: a numpy array of cluster assignments w.r.t. each
  sample (optional, default None)
  :return:
- """"""
+ """ """

  Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP:
  Local Outlier Probabilities.
@@ -62,7 +92,7 @@ class LocalOutlierProbability(object):
  """

  @staticmethod
- def _data(obj):
+ def _data(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
  """
  Validates the input data to ensure it is either a Pandas DataFrame
  or Numpy array.
@@ -70,24 +100,25 @@ class LocalOutlierProbability(object):
  :return: a vector of values to be used in calculating the local
  outlier probability.
  """
- if obj.__class__.__name__ == 'DataFrame':
+ if obj.__class__.__name__ == "DataFrame":
  points_vector = obj.values
  return points_vector
- elif obj.__class__.__name__ == 'ndarray':
+ elif obj.__class__.__name__ == "ndarray":
  points_vector = obj
  return points_vector
  else:
  warnings.warn(
  "Provided data or distance matrix must be in ndarray "
  "or DataFrame.",
- UserWarning)
+ UserWarning,
+ )
  if isinstance(obj, list):
  points_vector = np.array(obj)
  return points_vector
  points_vector = np.array([obj])
  return points_vector

- def _inputs(self, obj):
+ def _inputs(self, obj: "LocalOutlierProbability"):
  """
  Validates the inputs provided during initialization to ensure
  that the needed objects are provided.
@@ -103,40 +134,48 @@ class LocalOutlierProbability(object):
  elif all(v is not None for v in [obj.data, obj.distance_matrix]):
  warnings.warn(
  "Only one of the following may be provided: data or a "
- "distance matrix (not both).", UserWarning
+ "distance matrix (not both).",
+ UserWarning,
  )
  return False
  if obj.data is not None:
  points_vector = self._data(obj.data)
  return points_vector, obj.distance_matrix, obj.neighbor_matrix
- if all(matrix is not None for matrix in [obj.neighbor_matrix,
- obj.distance_matrix]):
+ if all(
+ matrix is not None
+ for matrix in [obj.neighbor_matrix, obj.distance_matrix]
+ ):
  dist_vector = self._data(obj.distance_matrix)
  neigh_vector = self._data(obj.neighbor_matrix)
  else:
  warnings.warn(
  "A neighbor index matrix and distance matrix must both be "
- "provided when not using raw input data.", UserWarning
+ "provided when not using raw input data.",
+ UserWarning,
  )
  return False
  if obj.distance_matrix.shape != obj.neighbor_matrix.shape:
  warnings.warn(
  "The shape of the distance and neighbor "
- "index matrices must match.", UserWarning
+ "index matrices must match.",
+ UserWarning,
  )
  return False
- elif (obj.distance_matrix.shape[1] != obj.n_neighbors) \
- or (obj.neighbor_matrix.shape[1] !=
- obj.n_neighbors):
- warnings.warn("The shape of the distance or "
- "neighbor index matrix does not "
- "match the number of neighbors "
- "specified.", UserWarning)
+ elif (obj.distance_matrix.shape[1] != obj.n_neighbors) or (
+ obj.neighbor_matrix.shape[1] != obj.n_neighbors
+ ):
+ warnings.warn(
+ "The shape of the distance or "
+ "neighbor index matrix does not "
+ "match the number of neighbors "
+ "specified.",
+ UserWarning,
+ )
  return False
  return obj.data, dist_vector, neigh_vector

  @staticmethod
- def _cluster_size(obj):
+ def _cluster_size(obj) -> bool:
  """
  Validates the cluster labels to ensure that the smallest cluster
  size (number of observations in the cluster) is larger than the
@@ -153,12 +192,13 @@ class LocalOutlierProbability(object):
  "cluster. Specify a number of neighbors smaller than "
  "the smallest cluster size (observations in smallest "
  "cluster minus one).",
- UserWarning)
+ UserWarning,
+ )
  return False
  return True

  @staticmethod
- def _n_neighbors(obj):
+ def _n_neighbors(obj) -> bool:
  """
  Validates the specified number of neighbors to ensure that it is
  greater than 0 and that the specified value is less than the total
@@ -168,21 +208,23 @@ class LocalOutlierProbability(object):
  """
  if not obj.n_neighbors > 0:
  obj.n_neighbors = 10
- warnings.warn("n_neighbors must be greater than 0."
- " Fit with " + str(obj.n_neighbors) +
- " instead.",
- UserWarning)
+ warnings.warn(
+ "n_neighbors must be greater than 0."
+ " Fit with " + str(obj.n_neighbors) + " instead.",
+ UserWarning,
+ )
  return False
  elif obj.n_neighbors >= obj._n_observations():
  obj.n_neighbors = obj._n_observations() - 1
  warnings.warn(
  "n_neighbors must be less than the number of observations."
  " Fit with " + str(obj.n_neighbors) + " instead.",
- UserWarning)
+ UserWarning,
+ )
  return True

  @staticmethod
- def _extent(obj):
+ def _extent(obj) -> bool:
  """
  Validates the specified extent parameter to ensure it is either 1,
  2, or 3.
@@ -191,13 +233,13 @@ class LocalOutlierProbability(object):
  """
  if obj.extent not in [1, 2, 3]:
  warnings.warn(
- "extent parameter (lambda) must be 1, 2, or 3.",
- UserWarning)
+ "extent parameter (lambda) must be 1, 2, or 3.", UserWarning
+ )
  return False
  return True

  @staticmethod
- def _missing_values(obj):
+ def _missing_values(obj) -> bool:
  """
  Validates the provided data to ensure that it contains no
  missing values.
@@ -206,13 +248,13 @@ class LocalOutlierProbability(object):
  """
  if np.any(np.isnan(obj.data)):
  warnings.warn(
- "Method does not support missing values in input data.",
- UserWarning)
+ "Method does not support missing values in input data.", UserWarning
+ )
  return False
  return True

  @staticmethod
- def _fit(obj):
+ def _fit(obj) -> bool:
  """
  Validates that the model was fit prior to calling the stream()
  method.
@@ -223,12 +265,13 @@ class LocalOutlierProbability(object):
  warnings.warn(
  "Must fit on historical data by calling fit() prior to "
  "calling stream(x).",
- UserWarning)
+ UserWarning,
+ )
  return False
  return True

  @staticmethod
- def _no_cluster_labels(obj):
+ def _no_cluster_labels(obj) -> bool:
  """
  Checks to see if cluster labels are attempting to be used in
  stream() and, if so, calls fit() once again but without cluster
@@ -241,7 +284,8 @@ class LocalOutlierProbability(object):
  warnings.warn(
  "Stream approach does not support clustered data. "
  "Automatically refit using single cluster of points.",
- UserWarning)
+ UserWarning,
+ )
  return False
  return True

@@ -263,40 +307,35 @@ class LocalOutlierProbability(object):
  assert len(types) == f.__code__.co_argcount

  def new_f(*args, **kwds):
- for (a, t) in zip(args, types):
- if type(a).__name__ == 'DataFrame':
+ for a, t in zip(args, types):
+ if type(a).__name__ == "DataFrame":
  a = np.array(a)
  if isinstance(a, t) is False:
- warnings.warn("Argument %r is not of type %s" % (a, t),
- UserWarning)
+ warnings.warn(
+ "Argument %r is not of type %s" % (a, t), UserWarning
+ )
  opt_types = {
- 'distance_matrix': {
- 'type': types[2]
- },
- 'neighbor_matrix': {
- 'type': types[3]
- },
- 'extent': {
- 'type': types[4]
- },
- 'n_neighbors': {
- 'type': types[5]
- },
- 'cluster_labels': {
- 'type': types[6]
- },
- 'use_numba': {
- 'type': types[7]
- }
+ "distance_matrix": {"type": types[2]},
+ "neighbor_matrix": {"type": types[3]},
+ "extent": {"type": types[4]},
+ "n_neighbors": {"type": types[5]},
+ "cluster_labels": {"type": types[6]},
+ "use_numba": {"type": types[7]},
+ "progress_bar": {"type": types[8]},
  }
  for x in kwds:
- opt_types[x]['value'] = kwds[x]
+ opt_types[x]["value"] = kwds[x]
  for k in opt_types:
  try:
- if isinstance(opt_types[k]['value'],
- opt_types[k]['type']) is False:
- warnings.warn("Argument %r is not of type %s." % (
- k, opt_types[k]['type']), UserWarning)
+ if (
+ isinstance(opt_types[k]["value"], opt_types[k]["type"])
+ is False
+ ):
+ warnings.warn(
+ "Argument %r is not of type %s."
+ % (k, opt_types[k]["type"]),
+ UserWarning,
+ )
  except KeyError:
  pass
  return f(*args, **kwds)
@@ -306,11 +345,28 @@ class LocalOutlierProbability(object):

  return decorator

- @accepts(object, np.ndarray, np.ndarray, np.ndarray, (int, np.integer),
- (int, np.integer), list, bool)
- def __init__(self, data=None, distance_matrix=None, neighbor_matrix=None,
- extent=3, n_neighbors=10, cluster_labels=None,
- use_numba=True):
+ @accepts(
+ object,
+ np.ndarray,
+ np.ndarray,
+ np.ndarray,
+ (int, np.integer),
+ (int, np.integer),
+ list,
+ bool,
+ bool,
+ )
+ def __init__(
+ self,
+ data=None,
+ distance_matrix=None,
+ neighbor_matrix=None,
+ extent=3,
+ n_neighbors=10,
+ cluster_labels=None,
+ use_numba=False,
+ progress_bar=False,
+ ) -> None:
  self.data = data
  self.distance_matrix = distance_matrix
  self.neighbor_matrix = neighbor_matrix
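To show the revised constructor in context, a minimal usage sketch assuming the usual `from PyNomaly import loop` import; the sample data and parameter values below are illustrative, not taken from the package:

    import numpy as np
    from PyNomaly import loop

    data = np.random.rand(500, 3)  # illustrative sample data
    m = loop.LocalOutlierProbability(
        data,
        n_neighbors=10,
        extent=3,
        use_numba=False,    # the default is now False; opt in only if numba is installed
        progress_bar=True,  # added between 0.3.1 and 0.3.4: prints a progress bar during fit()
    ).fit()
    scores = m.local_outlier_probabilities

Note that the `@accepts` decorator above only warns (rather than raises) when a keyword argument has an unexpected type.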
@@ -324,13 +380,14 @@ class LocalOutlierProbability(object):
  self.norm_prob_local_outlier_factor = None
  self.local_outlier_probabilities = None
  self._objects = {}
+ self.progress_bar = progress_bar
  self.is_fit = False

- if self.use_numba and 'numba' not in sys.modules:
+ if self.use_numba is True and "numba" not in sys.modules:
  self.use_numba = False
  warnings.warn(
- "Numba is not available, falling back to pure python mode.",
- UserWarning)
+ "Numba is not available, falling back to pure python mode.", UserWarning
+ )

  self.Validate()._inputs(self)
  self.Validate._extent(self)
@@ -340,15 +397,14 @@ class LocalOutlierProbability(object):
  """

  @staticmethod
- def _standard_distance(cardinality: float, sum_squared_distance: float) \
- -> float:
+ def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:
  """
  Calculates the standard distance of an observation.
  :param cardinality: the cardinality of the input observation.
  :param sum_squared_distance: the sum squared distance between all
  neighbors of the input observation.
  :return: the standard distance.
- # """
+ #"""
  division_result = sum_squared_distance / cardinality
  st_dist = sqrt(division_result)
  return st_dist
@@ -365,8 +421,9 @@ class LocalOutlierProbability(object):
  return extent * standard_distance

  @staticmethod
- def _prob_outlier_factor(probabilistic_distance: np.ndarray, ev_prob_dist:
- np.ndarray) -> np.ndarray:
+ def _prob_outlier_factor(
+ probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray
+ ) -> np.ndarray:
  """
  Calculates the probabilistic outlier factor of an observation.
  :param probabilistic_distance: the probabilistic distance of the
@@ -377,14 +434,14 @@ class LocalOutlierProbability(object):
  if np.all(probabilistic_distance == ev_prob_dist):
  return np.zeros(probabilistic_distance.shape)
  else:
- ev_prob_dist[ev_prob_dist == 0.] = 1.e-8
- result = np.divide(probabilistic_distance, ev_prob_dist) - 1.
+ ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8
+ result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0
  return result

  @staticmethod
- def _norm_prob_outlier_factor(extent: float,
- ev_probabilistic_outlier_factor: list) \
- -> list:
+ def _norm_prob_outlier_factor(
+ extent: float, ev_probabilistic_outlier_factor: list
+ ) -> list:
  """
  Calculates the normalized probabilistic outlier factor of an
  observation.
@@ -399,8 +456,9 @@ class LocalOutlierProbability(object):
  return npofs

  @staticmethod
- def _local_outlier_probability(plof_val: np.ndarray, nplof_val: np.ndarray) \
- -> np.ndarray:
+ def _local_outlier_probability(
+ plof_val: np.ndarray, nplof_val: np.ndarray
+ ) -> np.ndarray:
  """
  Calculates the local outlier probability of an observation.
  :param plof_val: the probabilistic outlier factor of the input
@@ -413,7 +471,7 @@ class LocalOutlierProbability(object):
  if np.all(plof_val == nplof_val):
  return np.zeros(plof_val.shape)
  else:
- return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.))))
+ return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))

  def _n_observations(self) -> int:
  """
@@ -467,8 +525,9 @@ class LocalOutlierProbability(object):
  :return: the updated storage matrix that collects information on
  each observation.
  """
- for vec, cluster_id in zip(range(self.distance_matrix.shape[0]),
- self._cluster_labels()):
+ for vec, cluster_id in zip(
+ range(self.distance_matrix.shape[0]), self._cluster_labels()
+ ):
  data_store[vec][0] = cluster_id
  data_store[vec][1] = self.distance_matrix[vec]
  data_store[vec][2] = self.neighbor_matrix[vec]
@@ -476,41 +535,42 @@ class LocalOutlierProbability(object):

  @staticmethod
  def _compute_distance_and_neighbor_matrix(
- clust_points_vector,
- indices,
- distances,
- indexes
- ):
+ clust_points_vector: np.ndarray,
+ indices: np.ndarray,
+ distances: np.ndarray,
+ indexes: np.ndarray,
+ ) -> Tuple[np.ndarray, np.ndarray, int]:
  """
  This helper method provides the heavy lifting for the _distances
  method and is only intended for use therein. The code has been
- written so that it can make full use of numba's jit capabilities if
+ written so that it can make full use of Numba's jit capabilities if
  desired.
  """
  for i in range(clust_points_vector.shape[0]):
  for j in range(i + 1, clust_points_vector.shape[0]):
- p = ((i,), (j,))
+ # Global index of the points
+ global_i = indices[0][i]
+ global_j = indices[0][j]

- diff = clust_points_vector[p[0]] - clust_points_vector[p[1]]
+ # Compute Euclidean distance
+ diff = clust_points_vector[i] - clust_points_vector[j]
  d = np.dot(diff, diff) ** 0.5

- idx = indices[0][p[0]]
- idx_max = distances[idx].argmax()
-
- if d < distances[idx][idx_max]:
- distances[idx][idx_max] = d
- indexes[idx][idx_max] = p[1][0]
+ # Update distance and neighbor index for global_i
+ idx_max = distances[global_i].argmax()
+ if d < distances[global_i][idx_max]:
+ distances[global_i][idx_max] = d
+ indexes[global_i][idx_max] = global_j

- idx = indices[0][p[1]]
- idx_max = distances[idx].argmax()
+ # Update distance and neighbor index for global_j
+ idx_max = distances[global_j].argmax()
+ if d < distances[global_j][idx_max]:
+ distances[global_j][idx_max] = d
+ indexes[global_j][idx_max] = global_i

- if d < distances[idx][idx_max]:
- distances[idx][idx_max] = d
- indexes[idx][idx_max] = p[0][0]
+ yield distances, indexes, i

- return distances, indexes
-
- def _distances(self) -> None:
+ def _distances(self, progress_bar: bool = False) -> None:
  """
  Provides the distances between each observation and it's closest
  neighbors. When input data is provided, calculates the euclidean
@@ -519,22 +579,30 @@ class LocalOutlierProbability(object):
  :return: the updated storage matrix that collects information on
  each observation.
  """
- distances = np.full([self._n_observations(), self.n_neighbors], 9e10,
- dtype=float)
- indexes = np.full([self._n_observations(), self.n_neighbors], 9e10,
- dtype=float)
+ distances = np.full(
+ [self._n_observations(), self.n_neighbors], 9e10, dtype=float
+ )
+ indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
  self.points_vector = self.Validate._data(self.data)
- compute = numba.jit(self._compute_distance_and_neighbor_matrix,
- cache=True) if self.use_numba else \
- self._compute_distance_and_neighbor_matrix
+ compute = (
+ numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
+ if self.use_numba
+ else self._compute_distance_and_neighbor_matrix
+ )
+ progress = "="
  for cluster_id in set(self._cluster_labels()):
  indices = np.where(self._cluster_labels() == cluster_id)
  clust_points_vector = np.array(
- self.points_vector.take(indices, axis=0)[0],
- dtype=np.float64
+ self.points_vector.take(indices, axis=0)[0], dtype=np.float64
  )
- distances, indexes = compute(clust_points_vector, indices,
- distances, indexes)
+ # a generator that yields an updated distance matrix on each loop
+ for c in compute(clust_points_vector, indices, distances, indexes):
+ distances, indexes, i = c
+ # update the progress bar
+ if progress_bar is True:
+ progress = Utils.emit_progress_bar(
+ progress, i + 1, clust_points_vector.shape[0]
+ )

  self.distance_matrix = distances
  self.neighbor_matrix = indexes
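In the hunks above, the distance helper now yields after each pass of its outer loop instead of returning once, and _distances consumes it as a generator so the progress bar can be refreshed as work proceeds. A stripped-down, PyNomaly-independent sketch of the same generator-plus-progress pattern (all names here are illustrative):

    import sys

    def compute_in_steps(items):
        total = 0
        for i, value in enumerate(items):
            total += value      # incremental work happens here
            yield i, total      # hand control back to the caller after each step

    items = list(range(1, 101))
    for i, running_total in compute_in_steps(items):
        sys.stdout.write("\rprocessed %d/%d" % (i + 1, len(items)))
        sys.stdout.flush()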
@@ -588,11 +656,10 @@ class LocalOutlierProbability(object):
  """
  prob_distances = []
  for i in range(data_store[:, 4].shape[0]):
- prob_distances.append(
- self._prob_distance(self.extent, data_store[:, 4][i]))
+ prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))
  return np.hstack((data_store, np.array([prob_distances]).T))

- def _prob_distances_ev(self, data_store: np.ndarray) -> np.ndarray:
+ def _prob_distances_ev(self, data_store) -> np.ndarray:
  """
  Calculates the expected value of the probabilistic distance for
  each observation in the input data with respect to the cluster the
@@ -606,19 +673,20 @@ class LocalOutlierProbability(object):
  for cluster_id in self.cluster_labels_u:
  indices = np.where(data_store[:, 0] == cluster_id)[0]
  for index in indices:
- nbrhood = data_store[index][2].astype(int)
- nbrhood_prob_distances = np.take(data_store[:, 5],
- nbrhood).astype(float)
+ # Global neighbor indices for the current point
+ nbrhood = data_store[index][2].astype(int) # Ensure global indices
+ nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(
+ float
+ )
  nbrhood_prob_distances_nonan = nbrhood_prob_distances[
- np.logical_not(np.isnan(nbrhood_prob_distances))]
- prob_set_distance_ev[index] = \
- nbrhood_prob_distances_nonan.mean()
+ np.logical_not(np.isnan(nbrhood_prob_distances))
+ ]
+ prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()
+
  self.prob_distances_ev = prob_set_distance_ev
- data_store = np.hstack((data_store, prob_set_distance_ev))
- return data_store
+ return np.hstack((data_store, prob_set_distance_ev))

- def _prob_local_outlier_factors(self,
- data_store: np.ndarray) -> np.ndarray:
+ def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
  """
  Calculates the probabilistic local outlier factor for each
  observation in the input data.
@@ -628,13 +696,22 @@ class LocalOutlierProbability(object):
  each observation.
  """
  return np.hstack(
- (data_store,
- np.array([np.apply_along_axis(self._prob_outlier_factor, 0,
- data_store[:, 5],
- data_store[:, 6])]).T))
+ (
+ data_store,
+ np.array(
+ [
+ np.apply_along_axis(
+ self._prob_outlier_factor,
+ 0,
+ data_store[:, 5],
+ data_store[:, 6],
+ )
+ ]
+ ).T,
+ )
+ )

- def _prob_local_outlier_factors_ev(self,
- data_store: np.ndarray) -> np.ndarray:
+ def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:
  """
  Calculates the expected value of the probabilistic local outlier factor
  for each observation in the input data with respect to the cluster the
@@ -647,21 +724,31 @@ class LocalOutlierProbability(object):
  prob_local_outlier_factor_ev_dict = {}
  for cluster_id in self.cluster_labels_u:
  indices = np.where(data_store[:, 0] == cluster_id)
- prob_local_outlier_factors = np.take(data_store[:, 7],
- indices).astype(float)
- prob_local_outlier_factors_nonan = prob_local_outlier_factors[
- np.logical_not(np.isnan(prob_local_outlier_factors))]
- prob_local_outlier_factor_ev_dict[cluster_id] = (
- np.power(prob_local_outlier_factors_nonan, 2).sum() /
- float(prob_local_outlier_factors_nonan.size)
+ prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(
+ float
  )
+ prob_local_outlier_factors_nonan = prob_local_outlier_factors[
+ np.logical_not(np.isnan(prob_local_outlier_factors))
+ ]
+ prob_local_outlier_factor_ev_dict[cluster_id] = np.power(
+ prob_local_outlier_factors_nonan, 2
+ ).sum() / float(prob_local_outlier_factors_nonan.size)
  data_store = np.hstack(
- (data_store, np.array([[prob_local_outlier_factor_ev_dict[x] for x
- in data_store[:, 0].tolist()]]).T))
+ (
+ data_store,
+ np.array(
+ [
+ [
+ prob_local_outlier_factor_ev_dict[x]
+ for x in data_store[:, 0].tolist()
+ ]
+ ]
+ ).T,
+ )
+ )
  return data_store

- def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) \
- -> np.ndarray:
+ def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
  """
  Calculates the normalized probabilistic local outlier factor for each
  observation in the input data.
@@ -670,11 +757,20 @@ class LocalOutlierProbability(object):
  :return: the updated storage matrix that collects information on
  each observation.
  """
- return np.hstack((data_store, np.array([self._norm_prob_outlier_factor(
- self.extent, data_store[:, 8].tolist())]).T))
+ return np.hstack(
+ (
+ data_store,
+ np.array(
+ [
+ self._norm_prob_outlier_factor(
+ self.extent, data_store[:, 8].tolist()
+ )
+ ]
+ ).T,
+ )
+ )

- def _local_outlier_probabilities(self,
- data_store: np.ndarray) -> np.ndarray:
+ def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:
  """
  Calculates the local outlier probability for each observation in the
  input data.
@@ -684,17 +780,26 @@ class LocalOutlierProbability(object):
  each observation.
  """
  return np.hstack(
- (data_store,
- np.array([np.apply_along_axis(self._local_outlier_probability, 0,
- data_store[:, 7],
- data_store[:, 9])]).T))
+ (
+ data_store,
+ np.array(
+ [
+ np.apply_along_axis(
+ self._local_outlier_probability,
+ 0,
+ data_store[:, 7],
+ data_store[:, 9],
+ )
+ ]
+ ).T,
+ )
+ )

  """
  Public methods
  """

- def fit(self) -> 'LocalOutlierProbability':
-
+ def fit(self) -> "LocalOutlierProbability":
  """
  Calculates the local outlier probability for each observation in the
  input data according to the input parameters extent, n_neighbors, and
@@ -706,13 +811,12 @@ class LocalOutlierProbability(object):
  self.Validate._n_neighbors(self)
  if self.Validate._cluster_size(self) is False:
  sys.exit()
- if self.data is not None and self.Validate._missing_values(
- self) is False:
+ if self.data is not None and self.Validate._missing_values(self) is False:
  sys.exit()

  store = self._store()
  if self.data is not None:
- self._distances()
+ self._distances(progress_bar=self.progress_bar)
  store = self._assign_distances(store)
  store = self._ssd(store)
  store = self._standard_distances(store)
@@ -731,7 +835,6 @@ class LocalOutlierProbability(object):
  return self

  def stream(self, x: np.ndarray) -> np.ndarray:
-
  """
  Calculates the local outlier probability for an individual sample
  according to the input parameters extent, n_neighbors, and
@@ -770,12 +873,12 @@ class LocalOutlierProbability(object):
  ssd = np.power(distances, 2).sum()
  std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
  prob_dist = self._prob_distance(self.extent, std_dist)
- plof = self._prob_outlier_factor(np.array(prob_dist),
- np.array(
- self.prob_distances_ev.mean())
- )
+ plof = self._prob_outlier_factor(
+ np.array(prob_dist), np.array(self.prob_distances_ev.mean())
+ )
  loop = self._local_outlier_probability(
- plof, self.norm_prob_local_outlier_factor)
+ plof, self.norm_prob_local_outlier_factor
+ )

  if orig_cluster_labels is not None:
  self.cluster_labels = orig_cluster_labels
@@ -1,16 +1,16 @@
  Metadata-Version: 2.1
  Name: PyNomaly
- Version: 0.3.1
+ Version: 0.3.4
  Summary: A Python 3 implementation of LoOP: Local Outlier Probabilities, a local density based outlier detection method providing an outlier score in the range of [0,1].
  Home-page: https://github.com/vc1492a/PyNomaly
  Author: Valentino Constantinou
  Author-email: vc@valentino.io
  License: Apache License, Version 2.0
- Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.1.tar.gz
+ Download-URL: https://github.com/vc1492a/PyNomaly/archive/0.3.4.tar.gz
  Keywords: outlier,anomaly,detection,machine,learning,probability
  Platform: UNKNOWN
  Requires-Dist: numpy
- Requires-Dist: numba
+ Requires-Dist: python-utils

  UNKNOWN

@@ -0,0 +1,7 @@
+ PyNomaly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ PyNomaly/loop.py,sha256=VLllAa5pOIHZjlI0XuLSpjLzY3tJ_ZTzDCbbIh3VM44,34571
+ PyNomaly-0.3.4.dist-info/LICENSE.txt,sha256=xZYfuJFfM57xOlBLbkJmsCwEvw1P6K2t3jI8faTdOMs,563
+ PyNomaly-0.3.4.dist-info/METADATA,sha256=xkHaSUSpOnZynE_KfVQAwoBXNOzTpE-IymwuiRdIeos,581
+ PyNomaly-0.3.4.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92
+ PyNomaly-0.3.4.dist-info/top_level.txt,sha256=el-HX4RLyBjkh2CW3TK9yXAA54zQOIYVmcJjRbBYKX4,9
+ PyNomaly-0.3.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.33.4)
+ Generator: bdist_wheel (0.34.2)
  Root-Is-Purelib: true
  Tag: py3-none-any

@@ -1,7 +0,0 @@
- PyNomaly/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- PyNomaly/loop.py,sha256=zK7I946YNha8VjxIrAJPgF5wjs6anYgXneJA-kH9RdA,32115
- PyNomaly-0.3.1.dist-info/LICENSE.txt,sha256=xZYfuJFfM57xOlBLbkJmsCwEvw1P6K2t3jI8faTdOMs,563
- PyNomaly-0.3.1.dist-info/METADATA,sha256=FQfdmmyCgb_cE1LJYpj4chFs40ME95Ms1G4wq_3AFAE,574
- PyNomaly-0.3.1.dist-info/WHEEL,sha256=S8S5VL-stOTSZDYxHyf0KP7eds0J72qrK0Evu3TfyAY,92
- PyNomaly-0.3.1.dist-info/top_level.txt,sha256=el-HX4RLyBjkh2CW3TK9yXAA54zQOIYVmcJjRbBYKX4,9
- PyNomaly-0.3.1.dist-info/RECORD,,