PyNomaly 0.3.2__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
PyNomaly/loop.py CHANGED
@@ -10,13 +10,33 @@ try:
10
10
  except ImportError:
11
11
  pass
12
12
 
13
- __author__ = 'Valentino Constantinou'
14
- __version__ = '0.3.2'
15
- __license__ = 'Apache License, Version 2.0'
13
+ __author__ = "Valentino Constantinou"
14
+ __version__ = "0.3.5"
15
+ __license__ = "Apache License, Version 2.0"
16
16
 
17
17
 
18
- class Utils:
18
+ # Custom Exceptions
19
+ class PyNomalyError(Exception):
20
+ """Base exception for PyNomaly."""
21
+ pass
22
+
23
+
24
+ class ValidationError(PyNomalyError):
25
+ """Raised when input validation fails."""
26
+ pass
27
+
28
+
29
+ class ClusterSizeError(ValidationError):
30
+ """Raised when cluster size is smaller than n_neighbors."""
31
+ pass
32
+
33
+
34
+ class MissingValuesError(ValidationError):
35
+ """Raised when data contains missing values."""
36
+ pass
19
37
 
38
+
39
+ class Utils:
20
40
  @staticmethod
21
41
  def emit_progress_bar(progress: str, index: int, total: int) -> str:
22
42
  """
@@ -32,7 +52,10 @@ class Utils:
32
52
 
33
53
  w, h = get_terminal_size()
34
54
  sys.stdout.write("\r")
35
- block_size = int(total / w)
55
+ if total < w:
56
+ block_size = int(w / total)
57
+ else:
58
+ block_size = int(total / w)
36
59
  if index % block_size == 0:
37
60
  progress += "="
38
61
  percent = index / total
@@ -52,7 +75,7 @@ class LocalOutlierProbability(object):
52
75
  :param cluster_labels: a numpy array of cluster assignments w.r.t. each
53
76
  sample (optional, default None)
54
77
  :return:
55
- """"""
78
+ """ """
56
79
 
57
80
  Based on the work of Kriegel, Kröger, Schubert, and Zimek (2009) in LoOP:
58
81
  Local Outlier Probabilities.
@@ -75,203 +98,190 @@ class LocalOutlierProbability(object):
75
98
  (2016).
76
99
  """
77
100
 
78
- class Validate:
101
+ """
102
+ Validation methods.
103
+ These methods validate inputs and raise exceptions or warnings as appropriate.
104
+ """
79
105
 
106
+ @staticmethod
107
+ def _convert_to_array(obj: Union["pd.DataFrame", np.ndarray]) -> np.ndarray:
108
+ """
109
+ Converts the input data to a numpy array if it is a Pandas DataFrame
110
+ or validates it is already a numpy array.
111
+ :param obj: user-provided input data.
112
+ :return: a vector of values to be used in calculating the local
113
+ outlier probability.
114
+ """
115
+ if obj.__class__.__name__ == "DataFrame":
116
+ points_vector = obj.values
117
+ return points_vector
118
+ elif obj.__class__.__name__ == "ndarray":
119
+ points_vector = obj
120
+ return points_vector
121
+ else:
122
+ warnings.warn(
123
+ "Provided data or distance matrix must be in ndarray "
124
+ "or DataFrame.",
125
+ UserWarning,
126
+ )
127
+ if isinstance(obj, list):
128
+ points_vector = np.array(obj)
129
+ return points_vector
130
+ points_vector = np.array([obj])
131
+ return points_vector
132
+
133
+ def _validate_inputs(self):
80
134
  """
81
- The Validate class aids in ensuring PyNomaly receives the right set
82
- of user inputs for proper execution of the Local Outlier Probability
83
- (LoOP) approach. Depending on the desired behavior, either an
84
- exception is raised to the user or PyNomaly continues executing
85
- albeit with some form of user warning.
135
+ Validates the inputs provided during initialization to ensure
136
+ that the needed objects are provided.
137
+ :return: a tuple of (data, distance_matrix, neighbor_matrix) or
138
+ raises a warning for invalid inputs.
86
139
  """
140
+ if all(v is None for v in [self.data, self.distance_matrix]):
141
+ warnings.warn(
142
+ "Data or a distance matrix must be provided.", UserWarning
143
+ )
144
+ return False
145
+ elif all(v is not None for v in [self.data, self.distance_matrix]):
146
+ warnings.warn(
147
+ "Only one of the following may be provided: data or a "
148
+ "distance matrix (not both).",
149
+ UserWarning,
150
+ )
151
+ return False
152
+ if self.data is not None:
153
+ points_vector = self._convert_to_array(self.data)
154
+ return points_vector, self.distance_matrix, self.neighbor_matrix
155
+ if all(
156
+ matrix is not None
157
+ for matrix in [self.neighbor_matrix, self.distance_matrix]
158
+ ):
159
+ dist_vector = self._convert_to_array(self.distance_matrix)
160
+ neigh_vector = self._convert_to_array(self.neighbor_matrix)
161
+ else:
162
+ warnings.warn(
163
+ "A neighbor index matrix and distance matrix must both be "
164
+ "provided when not using raw input data.",
165
+ UserWarning,
166
+ )
167
+ return False
168
+ if self.distance_matrix.shape != self.neighbor_matrix.shape:
169
+ warnings.warn(
170
+ "The shape of the distance and neighbor "
171
+ "index matrices must match.",
172
+ UserWarning,
173
+ )
174
+ return False
175
+ elif (self.distance_matrix.shape[1] != self.n_neighbors) or (
176
+ self.neighbor_matrix.shape[1] != self.n_neighbors
177
+ ):
178
+ warnings.warn(
179
+ "The shape of the distance or "
180
+ "neighbor index matrix does not "
181
+ "match the number of neighbors "
182
+ "specified.",
183
+ UserWarning,
184
+ )
185
+ return False
186
+ return self.data, dist_vector, neigh_vector
187
+
188
+ def _check_cluster_size(self) -> None:
189
+ """
190
+ Validates the cluster labels to ensure that the smallest cluster
191
+ size (number of observations in the cluster) is larger than the
192
+ specified number of neighbors.
193
+ :raises ClusterSizeError: if any cluster is too small.
194
+ """
195
+ c_labels = self._cluster_labels()
196
+ for cluster_id in set(c_labels):
197
+ c_size = np.where(c_labels == cluster_id)[0].shape[0]
198
+ if c_size <= self.n_neighbors:
199
+ raise ClusterSizeError(
200
+ "Number of neighbors specified larger than smallest "
201
+ "cluster. Specify a number of neighbors smaller than "
202
+ "the smallest cluster size (observations in smallest "
203
+ "cluster minus one)."
204
+ )
87
205
 
206
+ def _check_n_neighbors(self) -> bool:
88
207
  """
89
- Private methods.
208
+ Validates the specified number of neighbors to ensure that it is
209
+ greater than 0 and that the specified value is less than the total
210
+ number of observations.
211
+ :return: a boolean indicating whether validation has passed without
212
+ adjustment.
90
213
  """
214
+ if not self.n_neighbors > 0:
215
+ self.n_neighbors = 10
216
+ warnings.warn(
217
+ "n_neighbors must be greater than 0."
218
+ " Fit with " + str(self.n_neighbors) + " instead.",
219
+ UserWarning,
220
+ )
221
+ return False
222
+ elif self.n_neighbors >= self._n_observations():
223
+ self.n_neighbors = self._n_observations() - 1
224
+ warnings.warn(
225
+ "n_neighbors must be less than the number of observations."
226
+ " Fit with " + str(self.n_neighbors) + " instead.",
227
+ UserWarning,
228
+ )
229
+ return True
91
230
 
92
- @staticmethod
93
- def _data(obj: Union['pd.DataFrame', np.ndarray]) -> np.ndarray:
94
- """
95
- Validates the input data to ensure it is either a Pandas DataFrame
96
- or Numpy array.
97
- :param obj: user-provided input data.
98
- :return: a vector of values to be used in calculating the local
99
- outlier probability.
100
- """
101
- if obj.__class__.__name__ == 'DataFrame':
102
- points_vector = obj.values
103
- return points_vector
104
- elif obj.__class__.__name__ == 'ndarray':
105
- points_vector = obj
106
- return points_vector
107
- else:
108
- warnings.warn(
109
- "Provided data or distance matrix must be in ndarray "
110
- "or DataFrame.",
111
- UserWarning)
112
- if isinstance(obj, list):
113
- points_vector = np.array(obj)
114
- return points_vector
115
- points_vector = np.array([obj])
116
- return points_vector
231
+ def _check_extent(self) -> bool:
232
+ """
233
+ Validates the specified extent parameter to ensure it is either 1,
234
+ 2, or 3.
235
+ :return: a boolean indicating whether validation has passed.
236
+ """
237
+ if self.extent not in [1, 2, 3]:
238
+ warnings.warn(
239
+ "extent parameter (lambda) must be 1, 2, or 3.", UserWarning
240
+ )
241
+ return False
242
+ return True
117
243
 
118
- def _inputs(self, obj: 'LocalOutlierProbability'):
119
- """
120
- Validates the inputs provided during initialization to ensure
121
- that the needed objects are provided.
122
- :param obj: a PyNomaly object.
123
- :return: a boolean indicating whether validation has failed or
124
- the data, distance matrix, and neighbor matrix.
125
- """
126
- if all(v is None for v in [obj.data, obj.distance_matrix]):
127
- warnings.warn(
128
- "Data or a distance matrix must be provided.", UserWarning
129
- )
130
- return False
131
- elif all(v is not None for v in [obj.data, obj.distance_matrix]):
132
- warnings.warn(
133
- "Only one of the following may be provided: data or a "
134
- "distance matrix (not both).", UserWarning
135
- )
136
- return False
137
- if obj.data is not None:
138
- points_vector = self._data(obj.data)
139
- return points_vector, obj.distance_matrix, obj.neighbor_matrix
140
- if all(matrix is not None for matrix in [obj.neighbor_matrix,
141
- obj.distance_matrix]):
142
- dist_vector = self._data(obj.distance_matrix)
143
- neigh_vector = self._data(obj.neighbor_matrix)
144
- else:
145
- warnings.warn(
146
- "A neighbor index matrix and distance matrix must both be "
147
- "provided when not using raw input data.", UserWarning
148
- )
149
- return False
150
- if obj.distance_matrix.shape != obj.neighbor_matrix.shape:
151
- warnings.warn(
152
- "The shape of the distance and neighbor "
153
- "index matrices must match.", UserWarning
154
- )
155
- return False
156
- elif (obj.distance_matrix.shape[1] != obj.n_neighbors) \
157
- or (obj.neighbor_matrix.shape[1] !=
158
- obj.n_neighbors):
159
- warnings.warn("The shape of the distance or "
160
- "neighbor index matrix does not "
161
- "match the number of neighbors "
162
- "specified.", UserWarning)
163
- return False
164
- return obj.data, dist_vector, neigh_vector
165
-
166
- @staticmethod
167
- def _cluster_size(obj) -> bool:
168
- """
169
- Validates the cluster labels to ensure that the smallest cluster
170
- size (number of observations in the cluster) is larger than the
171
- specified number of neighbors.
172
- :param obj: a PyNomaly object.
173
- :return: a boolean indicating whether validation has passed.
174
- """
175
- c_labels = obj._cluster_labels()
176
- for cluster_id in set(c_labels):
177
- c_size = np.where(c_labels == cluster_id)[0].shape[0]
178
- if c_size <= obj.n_neighbors:
179
- warnings.warn(
180
- "Number of neighbors specified larger than smallest "
181
- "cluster. Specify a number of neighbors smaller than "
182
- "the smallest cluster size (observations in smallest "
183
- "cluster minus one).",
184
- UserWarning)
185
- return False
186
- return True
187
-
188
- @staticmethod
189
- def _n_neighbors(obj) -> bool:
190
- """
191
- Validates the specified number of neighbors to ensure that it is
192
- greater than 0 and that the specified value is less than the total
193
- number of observations.
194
- :param obj: a PyNomaly object.
195
- :return: a boolean indicating whether validation has passed.
196
- """
197
- if not obj.n_neighbors > 0:
198
- obj.n_neighbors = 10
199
- warnings.warn("n_neighbors must be greater than 0."
200
- " Fit with " + str(obj.n_neighbors) +
201
- " instead.",
202
- UserWarning)
203
- return False
204
- elif obj.n_neighbors >= obj._n_observations():
205
- obj.n_neighbors = obj._n_observations() - 1
206
- warnings.warn(
207
- "n_neighbors must be less than the number of observations."
208
- " Fit with " + str(obj.n_neighbors) + " instead.",
209
- UserWarning)
210
- return True
211
-
212
- @staticmethod
213
- def _extent(obj) -> bool:
214
- """
215
- Validates the specified extent parameter to ensure it is either 1,
216
- 2, or 3.
217
- :param obj: a PyNomaly object.
218
- :return: a boolean indicating whether validation has passed.
219
- """
220
- if obj.extent not in [1, 2, 3]:
221
- warnings.warn(
222
- "extent parameter (lambda) must be 1, 2, or 3.",
223
- UserWarning)
224
- return False
225
- return True
226
-
227
- @staticmethod
228
- def _missing_values(obj) -> bool:
229
- """
230
- Validates the provided data to ensure that it contains no
231
- missing values.
232
- :param obj: a PyNomaly object.
233
- :return: a boolean indicating whether validation has passed.
234
- """
235
- if np.any(np.isnan(obj.data)):
236
- warnings.warn(
237
- "Method does not support missing values in input data.",
238
- UserWarning)
239
- return False
240
- return True
241
-
242
- @staticmethod
243
- def _fit(obj) -> bool:
244
- """
245
- Validates that the model was fit prior to calling the stream()
246
- method.
247
- :param obj: a PyNomaly object.
248
- :return: a boolean indicating whether validation has passed.
249
- """
250
- if obj.is_fit is False:
251
- warnings.warn(
252
- "Must fit on historical data by calling fit() prior to "
253
- "calling stream(x).",
254
- UserWarning)
255
- return False
256
- return True
257
-
258
- @staticmethod
259
- def _no_cluster_labels(obj) -> bool:
260
- """
261
- Checks to see if cluster labels are attempting to be used in
262
- stream() and, if so, calls fit() once again but without cluster
263
- labels. As PyNomaly does not accept clustering algorithms as input,
264
- the stream approach does not support clustering.
265
- :param obj: a PyNomaly object.
266
- :return: a boolean indicating whether validation has passed.
267
- """
268
- if len(set(obj._cluster_labels())) > 1:
269
- warnings.warn(
270
- "Stream approach does not support clustered data. "
271
- "Automatically refit using single cluster of points.",
272
- UserWarning)
273
- return False
274
- return True
244
+ def _check_missing_values(self) -> None:
245
+ """
246
+ Validates the provided data to ensure that it contains no
247
+ missing values.
248
+ :raises MissingValuesError: if data contains NaN values.
249
+ """
250
+ if np.any(np.isnan(self.data)):
251
+ raise MissingValuesError(
252
+ "Method does not support missing values in input data."
253
+ )
254
+
255
+ def _check_is_fit(self) -> bool:
256
+ """
257
+ Checks that the model was fit prior to calling the stream() method.
258
+ :return: a boolean indicating whether the model has been fit.
259
+ """
260
+ if self.is_fit is False:
261
+ warnings.warn(
262
+ "Must fit on historical data by calling fit() prior to "
263
+ "calling stream(x).",
264
+ UserWarning,
265
+ )
266
+ return False
267
+ return True
268
+
269
+ def _check_no_cluster_labels(self) -> bool:
270
+ """
271
+ Checks to see if cluster labels are attempting to be used in
272
+ stream() and, if so, returns False. As PyNomaly does not accept
273
+ clustering algorithms as input, the stream approach does not
274
+ support clustering.
275
+ :return: a boolean indicating whether single cluster (no labels).
276
+ """
277
+ if len(set(self._cluster_labels())) > 1:
278
+ warnings.warn(
279
+ "Stream approach does not support clustered data. "
280
+ "Automatically refit using single cluster of points.",
281
+ UserWarning,
282
+ )
283
+ return False
284
+ return True
275
285
 
276
286
  """
277
287
  Decorators.
@@ -291,43 +301,35 @@ class LocalOutlierProbability(object):
291
301
  assert len(types) == f.__code__.co_argcount
292
302
 
293
303
  def new_f(*args, **kwds):
294
- for (a, t) in zip(args, types):
295
- if type(a).__name__ == 'DataFrame':
304
+ for a, t in zip(args, types):
305
+ if type(a).__name__ == "DataFrame":
296
306
  a = np.array(a)
297
307
  if isinstance(a, t) is False:
298
- warnings.warn("Argument %r is not of type %s" % (a, t),
299
- UserWarning)
308
+ warnings.warn(
309
+ "Argument %r is not of type %s" % (a, t), UserWarning
310
+ )
300
311
  opt_types = {
301
- 'distance_matrix': {
302
- 'type': types[2]
303
- },
304
- 'neighbor_matrix': {
305
- 'type': types[3]
306
- },
307
- 'extent': {
308
- 'type': types[4]
309
- },
310
- 'n_neighbors': {
311
- 'type': types[5]
312
- },
313
- 'cluster_labels': {
314
- 'type': types[6]
315
- },
316
- 'use_numba': {
317
- 'type': types[7]
318
- },
319
- 'progress_bar': {
320
- 'type': types[8]
321
- }
312
+ "distance_matrix": {"type": types[2]},
313
+ "neighbor_matrix": {"type": types[3]},
314
+ "extent": {"type": types[4]},
315
+ "n_neighbors": {"type": types[5]},
316
+ "cluster_labels": {"type": types[6]},
317
+ "use_numba": {"type": types[7]},
318
+ "progress_bar": {"type": types[8]},
322
319
  }
323
320
  for x in kwds:
324
- opt_types[x]['value'] = kwds[x]
321
+ opt_types[x]["value"] = kwds[x]
325
322
  for k in opt_types:
326
323
  try:
327
- if isinstance(opt_types[k]['value'],
328
- opt_types[k]['type']) is False:
329
- warnings.warn("Argument %r is not of type %s." % (
330
- k, opt_types[k]['type']), UserWarning)
324
+ if (
325
+ isinstance(opt_types[k]["value"], opt_types[k]["type"])
326
+ is False
327
+ ):
328
+ warnings.warn(
329
+ "Argument %r is not of type %s."
330
+ % (k, opt_types[k]["type"]),
331
+ UserWarning,
332
+ )
331
333
  except KeyError:
332
334
  pass
333
335
  return f(*args, **kwds)
@@ -337,11 +339,28 @@ class LocalOutlierProbability(object):
337
339
 
338
340
  return decorator
339
341
 
340
- @accepts(object, np.ndarray, np.ndarray, np.ndarray, (int, np.integer),
341
- (int, np.integer), list, bool, bool)
342
- def __init__(self, data=None, distance_matrix=None, neighbor_matrix=None,
343
- extent=3, n_neighbors=10, cluster_labels=None,
344
- use_numba=False, progress_bar=False) -> None:
342
+ @accepts(
343
+ object,
344
+ np.ndarray,
345
+ np.ndarray,
346
+ np.ndarray,
347
+ (int, np.integer),
348
+ (int, np.integer),
349
+ list,
350
+ bool,
351
+ bool,
352
+ )
353
+ def __init__(
354
+ self,
355
+ data=None,
356
+ distance_matrix=None,
357
+ neighbor_matrix=None,
358
+ extent=3,
359
+ n_neighbors=10,
360
+ cluster_labels=None,
361
+ use_numba=False,
362
+ progress_bar=False,
363
+ ) -> None:
345
364
  self.data = data
346
365
  self.distance_matrix = distance_matrix
347
366
  self.neighbor_matrix = neighbor_matrix
@@ -358,29 +377,28 @@ class LocalOutlierProbability(object):
358
377
  self.progress_bar = progress_bar
359
378
  self.is_fit = False
360
379
 
361
- if self.use_numba is True and 'numba' not in sys.modules:
380
+ if self.use_numba is True and "numba" not in sys.modules:
362
381
  self.use_numba = False
363
382
  warnings.warn(
364
- "Numba is not available, falling back to pure python mode.",
365
- UserWarning)
383
+ "Numba is not available, falling back to pure python mode.", UserWarning
384
+ )
366
385
 
367
- self.Validate()._inputs(self)
368
- self.Validate._extent(self)
386
+ self._validate_inputs()
387
+ self._check_extent()
369
388
 
370
389
  """
371
390
  Private methods.
372
391
  """
373
392
 
374
393
  @staticmethod
375
- def _standard_distance(cardinality: float, sum_squared_distance: float) \
376
- -> float:
394
+ def _standard_distance(cardinality: float, sum_squared_distance: float) -> float:
377
395
  """
378
396
  Calculates the standard distance of an observation.
379
397
  :param cardinality: the cardinality of the input observation.
380
398
  :param sum_squared_distance: the sum squared distance between all
381
399
  neighbors of the input observation.
382
400
  :return: the standard distance.
383
- # """
401
+ #"""
384
402
  division_result = sum_squared_distance / cardinality
385
403
  st_dist = sqrt(division_result)
386
404
  return st_dist
@@ -397,8 +415,9 @@ class LocalOutlierProbability(object):
397
415
  return extent * standard_distance
398
416
 
399
417
  @staticmethod
400
- def _prob_outlier_factor(probabilistic_distance: np.ndarray, ev_prob_dist:
401
- np.ndarray) -> np.ndarray:
418
+ def _prob_outlier_factor(
419
+ probabilistic_distance: np.ndarray, ev_prob_dist: np.ndarray
420
+ ) -> np.ndarray:
402
421
  """
403
422
  Calculates the probabilistic outlier factor of an observation.
404
423
  :param probabilistic_distance: the probabilistic distance of the
@@ -409,14 +428,14 @@ class LocalOutlierProbability(object):
409
428
  if np.all(probabilistic_distance == ev_prob_dist):
410
429
  return np.zeros(probabilistic_distance.shape)
411
430
  else:
412
- ev_prob_dist[ev_prob_dist == 0.] = 1.e-8
413
- result = np.divide(probabilistic_distance, ev_prob_dist) - 1.
431
+ ev_prob_dist[ev_prob_dist == 0.0] = 1.0e-8
432
+ result = np.divide(probabilistic_distance, ev_prob_dist) - 1.0
414
433
  return result
415
434
 
416
435
  @staticmethod
417
- def _norm_prob_outlier_factor(extent: float,
418
- ev_probabilistic_outlier_factor: list) \
419
- -> list:
436
+ def _norm_prob_outlier_factor(
437
+ extent: float, ev_probabilistic_outlier_factor: list
438
+ ) -> list:
420
439
  """
421
440
  Calculates the normalized probabilistic outlier factor of an
422
441
  observation.
@@ -431,8 +450,9 @@ class LocalOutlierProbability(object):
431
450
  return npofs
432
451
 
433
452
  @staticmethod
434
- def _local_outlier_probability(plof_val: np.ndarray, nplof_val: np.ndarray) \
435
- -> np.ndarray:
453
+ def _local_outlier_probability(
454
+ plof_val: np.ndarray, nplof_val: np.ndarray
455
+ ) -> np.ndarray:
436
456
  """
437
457
  Calculates the local outlier probability of an observation.
438
458
  :param plof_val: the probabilistic outlier factor of the input
@@ -445,7 +465,7 @@ class LocalOutlierProbability(object):
445
465
  if np.all(plof_val == nplof_val):
446
466
  return np.zeros(plof_val.shape)
447
467
  else:
448
- return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.))))
468
+ return np.maximum(0, erf_vec(plof_val / (nplof_val * np.sqrt(2.0))))
449
469
 
450
470
  def _n_observations(self) -> int:
451
471
  """
@@ -499,8 +519,9 @@ class LocalOutlierProbability(object):
499
519
  :return: the updated storage matrix that collects information on
500
520
  each observation.
501
521
  """
502
- for vec, cluster_id in zip(range(self.distance_matrix.shape[0]),
503
- self._cluster_labels()):
522
+ for vec, cluster_id in zip(
523
+ range(self.distance_matrix.shape[0]), self._cluster_labels()
524
+ ):
504
525
  data_store[vec][0] = cluster_id
505
526
  data_store[vec][1] = self.distance_matrix[vec]
506
527
  data_store[vec][2] = self.neighbor_matrix[vec]
@@ -508,10 +529,10 @@ class LocalOutlierProbability(object):
508
529
 
509
530
  @staticmethod
510
531
  def _compute_distance_and_neighbor_matrix(
511
- clust_points_vector: np.ndarray,
512
- indices: np.ndarray,
513
- distances: np.ndarray,
514
- indexes: np.ndarray
532
+ clust_points_vector: np.ndarray,
533
+ indices: np.ndarray,
534
+ distances: np.ndarray,
535
+ indexes: np.ndarray,
515
536
  ) -> Tuple[np.ndarray, np.ndarray, int]:
516
537
  """
517
538
  This helper method provides the heavy lifting for the _distances
@@ -519,27 +540,27 @@ class LocalOutlierProbability(object):
519
540
  written so that it can make full use of Numba's jit capabilities if
520
541
  desired.
521
542
  """
522
-
523
543
  for i in range(clust_points_vector.shape[0]):
524
544
  for j in range(i + 1, clust_points_vector.shape[0]):
525
- p = ((i,), (j,))
545
+ # Global index of the points
546
+ global_i = indices[0][i]
547
+ global_j = indices[0][j]
526
548
 
527
- diff = clust_points_vector[p[0]] - clust_points_vector[p[1]]
549
+ # Compute Euclidean distance
550
+ diff = clust_points_vector[i] - clust_points_vector[j]
528
551
  d = np.dot(diff, diff) ** 0.5
529
552
 
530
- idx = indices[0][p[0]]
531
- idx_max = distances[idx].argmax()
553
+ # Update distance and neighbor index for global_i
554
+ idx_max = distances[global_i].argmax()
555
+ if d < distances[global_i][idx_max]:
556
+ distances[global_i][idx_max] = d
557
+ indexes[global_i][idx_max] = global_j
532
558
 
533
- if d < distances[idx][idx_max]:
534
- distances[idx][idx_max] = d
535
- indexes[idx][idx_max] = p[1][0]
536
-
537
- idx = indices[0][p[1]]
538
- idx_max = distances[idx].argmax()
539
-
540
- if d < distances[idx][idx_max]:
541
- distances[idx][idx_max] = d
542
- indexes[idx][idx_max] = p[0][0]
559
+ # Update distance and neighbor index for global_j
560
+ idx_max = distances[global_j].argmax()
561
+ if d < distances[global_j][idx_max]:
562
+ distances[global_j][idx_max] = d
563
+ indexes[global_j][idx_max] = global_i
543
564
 
544
565
  yield distances, indexes, i
545
566
 
@@ -552,20 +573,21 @@ class LocalOutlierProbability(object):
552
573
  :return: the updated storage matrix that collects information on
553
574
  each observation.
554
575
  """
555
- distances = np.full([self._n_observations(), self.n_neighbors], 9e10,
556
- dtype=float)
557
- indexes = np.full([self._n_observations(), self.n_neighbors], 9e10,
558
- dtype=float)
559
- self.points_vector = self.Validate._data(self.data)
560
- compute = numba.jit(self._compute_distance_and_neighbor_matrix,
561
- cache=True) if self.use_numba else \
562
- self._compute_distance_and_neighbor_matrix
576
+ distances = np.full(
577
+ [self._n_observations(), self.n_neighbors], 9e10, dtype=float
578
+ )
579
+ indexes = np.full([self._n_observations(), self.n_neighbors], 9e10, dtype=float)
580
+ self.points_vector = self._convert_to_array(self.data)
581
+ compute = (
582
+ numba.jit(self._compute_distance_and_neighbor_matrix, cache=True)
583
+ if self.use_numba
584
+ else self._compute_distance_and_neighbor_matrix
585
+ )
563
586
  progress = "="
564
587
  for cluster_id in set(self._cluster_labels()):
565
588
  indices = np.where(self._cluster_labels() == cluster_id)
566
589
  clust_points_vector = np.array(
567
- self.points_vector.take(indices, axis=0)[0],
568
- dtype=np.float64
590
+ self.points_vector.take(indices, axis=0)[0], dtype=np.float64
569
591
  )
570
592
  # a generator that yields an updated distance matrix on each loop
571
593
  for c in compute(clust_points_vector, indices, distances, indexes):
@@ -573,7 +595,8 @@ class LocalOutlierProbability(object):
573
595
  # update the progress bar
574
596
  if progress_bar is True:
575
597
  progress = Utils.emit_progress_bar(
576
- progress, i+1, clust_points_vector.shape[0])
598
+ progress, i + 1, clust_points_vector.shape[0]
599
+ )
577
600
 
578
601
  self.distance_matrix = distances
579
602
  self.neighbor_matrix = indexes
@@ -627,11 +650,10 @@ class LocalOutlierProbability(object):
627
650
  """
628
651
  prob_distances = []
629
652
  for i in range(data_store[:, 4].shape[0]):
630
- prob_distances.append(
631
- self._prob_distance(self.extent, data_store[:, 4][i]))
653
+ prob_distances.append(self._prob_distance(self.extent, data_store[:, 4][i]))
632
654
  return np.hstack((data_store, np.array([prob_distances]).T))
633
655
 
634
- def _prob_distances_ev(self, data_store: np.ndarray) -> np.ndarray:
656
+ def _prob_distances_ev(self, data_store) -> np.ndarray:
635
657
  """
636
658
  Calculates the expected value of the probabilistic distance for
637
659
  each observation in the input data with respect to the cluster the
@@ -645,19 +667,20 @@ class LocalOutlierProbability(object):
645
667
  for cluster_id in self.cluster_labels_u:
646
668
  indices = np.where(data_store[:, 0] == cluster_id)[0]
647
669
  for index in indices:
648
- nbrhood = data_store[index][2].astype(int)
649
- nbrhood_prob_distances = np.take(data_store[:, 5],
650
- nbrhood).astype(float)
670
+ # Global neighbor indices for the current point
671
+ nbrhood = data_store[index][2].astype(int) # Ensure global indices
672
+ nbrhood_prob_distances = np.take(data_store[:, 5], nbrhood).astype(
673
+ float
674
+ )
651
675
  nbrhood_prob_distances_nonan = nbrhood_prob_distances[
652
- np.logical_not(np.isnan(nbrhood_prob_distances))]
653
- prob_set_distance_ev[index] = \
654
- nbrhood_prob_distances_nonan.mean()
676
+ np.logical_not(np.isnan(nbrhood_prob_distances))
677
+ ]
678
+ prob_set_distance_ev[index] = nbrhood_prob_distances_nonan.mean()
679
+
655
680
  self.prob_distances_ev = prob_set_distance_ev
656
- data_store = np.hstack((data_store, prob_set_distance_ev))
657
- return data_store
681
+ return np.hstack((data_store, prob_set_distance_ev))
658
682
 
659
- def _prob_local_outlier_factors(self,
660
- data_store: np.ndarray) -> np.ndarray:
683
+ def _prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
661
684
  """
662
685
  Calculates the probabilistic local outlier factor for each
663
686
  observation in the input data.
@@ -667,13 +690,22 @@ class LocalOutlierProbability(object):
667
690
  each observation.
668
691
  """
669
692
  return np.hstack(
670
- (data_store,
671
- np.array([np.apply_along_axis(self._prob_outlier_factor, 0,
672
- data_store[:, 5],
673
- data_store[:, 6])]).T))
693
+ (
694
+ data_store,
695
+ np.array(
696
+ [
697
+ np.apply_along_axis(
698
+ self._prob_outlier_factor,
699
+ 0,
700
+ data_store[:, 5],
701
+ data_store[:, 6],
702
+ )
703
+ ]
704
+ ).T,
705
+ )
706
+ )
674
707
 
675
- def _prob_local_outlier_factors_ev(self,
676
- data_store: np.ndarray) -> np.ndarray:
708
+ def _prob_local_outlier_factors_ev(self, data_store: np.ndarray) -> np.ndarray:
677
709
  """
678
710
  Calculates the expected value of the probabilistic local outlier factor
679
711
  for each observation in the input data with respect to the cluster the
@@ -686,21 +718,31 @@ class LocalOutlierProbability(object):
686
718
  prob_local_outlier_factor_ev_dict = {}
687
719
  for cluster_id in self.cluster_labels_u:
688
720
  indices = np.where(data_store[:, 0] == cluster_id)
689
- prob_local_outlier_factors = np.take(data_store[:, 7],
690
- indices).astype(float)
691
- prob_local_outlier_factors_nonan = prob_local_outlier_factors[
692
- np.logical_not(np.isnan(prob_local_outlier_factors))]
693
- prob_local_outlier_factor_ev_dict[cluster_id] = (
694
- np.power(prob_local_outlier_factors_nonan, 2).sum() /
695
- float(prob_local_outlier_factors_nonan.size)
721
+ prob_local_outlier_factors = np.take(data_store[:, 7], indices).astype(
722
+ float
696
723
  )
724
+ prob_local_outlier_factors_nonan = prob_local_outlier_factors[
725
+ np.logical_not(np.isnan(prob_local_outlier_factors))
726
+ ]
727
+ prob_local_outlier_factor_ev_dict[cluster_id] = np.power(
728
+ prob_local_outlier_factors_nonan, 2
729
+ ).sum() / float(prob_local_outlier_factors_nonan.size)
697
730
  data_store = np.hstack(
698
- (data_store, np.array([[prob_local_outlier_factor_ev_dict[x] for x
699
- in data_store[:, 0].tolist()]]).T))
731
+ (
732
+ data_store,
733
+ np.array(
734
+ [
735
+ [
736
+ prob_local_outlier_factor_ev_dict[x]
737
+ for x in data_store[:, 0].tolist()
738
+ ]
739
+ ]
740
+ ).T,
741
+ )
742
+ )
700
743
  return data_store
701
744
 
702
- def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) \
703
- -> np.ndarray:
745
+ def _norm_prob_local_outlier_factors(self, data_store: np.ndarray) -> np.ndarray:
704
746
  """
705
747
  Calculates the normalized probabilistic local outlier factor for each
706
748
  observation in the input data.
@@ -709,11 +751,20 @@ class LocalOutlierProbability(object):
709
751
  :return: the updated storage matrix that collects information on
710
752
  each observation.
711
753
  """
712
- return np.hstack((data_store, np.array([self._norm_prob_outlier_factor(
713
- self.extent, data_store[:, 8].tolist())]).T))
754
+ return np.hstack(
755
+ (
756
+ data_store,
757
+ np.array(
758
+ [
759
+ self._norm_prob_outlier_factor(
760
+ self.extent, data_store[:, 8].tolist()
761
+ )
762
+ ]
763
+ ).T,
764
+ )
765
+ )
714
766
 
715
- def _local_outlier_probabilities(self,
716
- data_store: np.ndarray) -> np.ndarray:
767
+ def _local_outlier_probabilities(self, data_store: np.ndarray) -> np.ndarray:
717
768
  """
718
769
  Calculates the local outlier probability for each observation in the
719
770
  input data.
@@ -723,31 +774,40 @@ class LocalOutlierProbability(object):
723
774
  each observation.
724
775
  """
725
776
  return np.hstack(
726
- (data_store,
727
- np.array([np.apply_along_axis(self._local_outlier_probability, 0,
728
- data_store[:, 7],
729
- data_store[:, 9])]).T))
777
+ (
778
+ data_store,
779
+ np.array(
780
+ [
781
+ np.apply_along_axis(
782
+ self._local_outlier_probability,
783
+ 0,
784
+ data_store[:, 7],
785
+ data_store[:, 9],
786
+ )
787
+ ]
788
+ ).T,
789
+ )
790
+ )
730
791
 
731
792
  """
732
793
  Public methods
733
794
  """
734
795
 
735
- def fit(self) -> 'LocalOutlierProbability':
736
-
796
+ def fit(self) -> "LocalOutlierProbability":
737
797
  """
738
798
  Calculates the local outlier probability for each observation in the
739
799
  input data according to the input parameters extent, n_neighbors, and
740
800
  cluster_labels.
741
801
  :return: self, which contains the local outlier probabilities as
742
802
  self.local_outlier_probabilities.
803
+ :raises ClusterSizeError: if any cluster is smaller than n_neighbors.
804
+ :raises MissingValuesError: if data contains missing values.
743
805
  """
744
806
 
745
- self.Validate._n_neighbors(self)
746
- if self.Validate._cluster_size(self) is False:
747
- sys.exit()
748
- if self.data is not None and self.Validate._missing_values(
749
- self) is False:
750
- sys.exit()
807
+ self._check_n_neighbors()
808
+ self._check_cluster_size()
809
+ if self.data is not None:
810
+ self._check_missing_values()
751
811
 
752
812
  store = self._store()
753
813
  if self.data is not None:
@@ -770,7 +830,6 @@ class LocalOutlierProbability(object):
770
830
  return self
771
831
 
772
832
  def stream(self, x: np.ndarray) -> np.ndarray:
773
-
774
833
  """
775
834
  Calculates the local outlier probability for an individual sample
776
835
  according to the input parameters extent, n_neighbors, and
@@ -784,19 +843,23 @@ class LocalOutlierProbability(object):
784
843
  """
785
844
 
786
845
  orig_cluster_labels = None
787
- if self.Validate._no_cluster_labels(self) is False:
846
+ if self._check_no_cluster_labels() is False:
788
847
  orig_cluster_labels = self.cluster_labels
789
848
  self.cluster_labels = np.array([0] * len(self.data))
790
849
 
791
- if self.Validate._fit(self) is False:
850
+ if self._check_is_fit() is False:
792
851
  self.fit()
793
852
 
794
- point_vector = self.Validate._data(x)
853
+ point_vector = self._convert_to_array(x)
795
854
  distances = np.full([1, self.n_neighbors], 9e10, dtype=float)
796
855
  if self.data is not None:
797
856
  matrix = self.points_vector
798
857
  else:
799
858
  matrix = self.distance_matrix
859
+ # When using distance matrix mode, x is a scalar distance value.
860
+ # Extract scalar from array to avoid NumPy assignment errors.
861
+ if point_vector.size == 1:
862
+ point_vector = float(point_vector.flat[0])
800
863
  for p in range(0, matrix.shape[0]):
801
864
  if self.data is not None:
802
865
  d = self._euclidean(matrix[p, :], point_vector)
@@ -809,12 +872,12 @@ class LocalOutlierProbability(object):
809
872
  ssd = np.power(distances, 2).sum()
810
873
  std_dist = np.sqrt(np.divide(ssd, self.n_neighbors))
811
874
  prob_dist = self._prob_distance(self.extent, std_dist)
812
- plof = self._prob_outlier_factor(np.array(prob_dist),
813
- np.array(
814
- self.prob_distances_ev.mean())
815
- )
875
+ plof = self._prob_outlier_factor(
876
+ np.array(prob_dist), np.array(self.prob_distances_ev.mean())
877
+ )
816
878
  loop = self._local_outlier_probability(
817
- plof, self.norm_prob_local_outlier_factor)
879
+ plof, self.norm_prob_local_outlier_factor
880
+ )
818
881
 
819
882
  if orig_cluster_labels is not None:
820
883
  self.cluster_labels = orig_cluster_labels