aisp 0.1.34__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aisp/__init__.py +4 -0
- aisp/exceptions.py +42 -0
- aisp/nsa/__init__.py +11 -0
- aisp/nsa/_base.py +212 -0
- aisp/nsa/_negative_selection.py +752 -0
- aisp/utils/__init__.py +2 -1
- aisp/utils/_multiclass.py +15 -30
- aisp/utils/metrics.py +20 -41
- aisp/utils/sanitizers.py +54 -0
- {aisp-0.1.34.dist-info → aisp-0.1.35.dist-info}/METADATA +2 -106
- aisp-0.1.35.dist-info/RECORD +14 -0
- aisp/NSA/__init__.py +0 -18
- aisp/NSA/_base.py +0 -281
- aisp/NSA/_negative_selection.py +0 -1115
- aisp-0.1.34.dist-info/RECORD +0 -11
- {aisp-0.1.34.dist-info → aisp-0.1.35.dist-info}/WHEEL +0 -0
- {aisp-0.1.34.dist-info → aisp-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {aisp-0.1.34.dist-info → aisp-0.1.35.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,752 @@
|
|
1
|
+
"""Negative Selection Algorithm."""
|
2
|
+
|
3
|
+
from collections import namedtuple
|
4
|
+
from typing import Dict, Literal, Optional, Union
|
5
|
+
from scipy.spatial.distance import cdist
|
6
|
+
from tqdm import tqdm
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
import numpy.typing as npt
|
10
|
+
|
11
|
+
from ..exceptions import MaxDiscardsReachedError
|
12
|
+
from ..utils import slice_index_list_by_class
|
13
|
+
from ..utils.sanitizers import sanitize_seed, sanitize_choice, sanitize_param
|
14
|
+
from ._base import Base
|
15
|
+
|
16
|
+
|
17
|
+
class RNSA(Base):
    """
    Real-Valued Negative Selection Algorithm (``RNSA``).

    Classifier / anomaly identifier based on the self and non-self discrimination method:
    for every class a set of detectors is generated at random in the ``[0, 1]`` feature
    space and only the detectors that do **not** cover the samples of that class are kept.

    Parameters
    ----------
    * N (``int``): Number of detectors generated per class. Defaults to ``100``.
    * r (``float``): Radius of the detector. Defaults to ``0.05``.
    * r_s (``float``): rₛ, radius of the ``X`` self samples. Defaults to ``0.0001``.
    * k (``int``): Number of neighbors near the randomly generated detectors used to
        compute the average distance. Defaults to ``1``.
    * metric (``str``): Way to calculate the distance between the detector and the sample:
        + ``'euclidean'`` ➜ √( (X₁ – Y₁)² + (X₂ – Y₂)² + ... + (Xn – Yn)² ).
        + ``'minkowski'`` ➜ ( |X₁ – Y₁|ᵖ + |X₂ – Y₂|ᵖ + ... + |Xn – Yn|ᵖ )¹/ₚ.
        + ``'manhattan'`` ➜ |X₁ – Y₁| + |X₂ – Y₂| + ... + |Xn – Yn|.

        Any other value falls back to ``'euclidean'``. Defaults to ``'euclidean'``.
    * max_discards (``int``): Maximum number of consecutive detector discards, aimed at
        preventing a possible infinite loop when the chosen radius makes it impossible to
        generate non-self detectors. Defaults to ``1000``.
    * seed (``int``): Seed for the random generation of values in the detectors. Note that
        it seeds NumPy's global random state. Defaults to ``None``.
    * algorithm (``str``): Algorithm version:
        * ``'default-NSA'``: Default algorithm with fixed radius.
        * ``'V-detector'``: Based on the article
            [Real-Valued Negative Selection Algorithm with Variable-Sized Detectors][2],
            by Ji, Z., Dasgupta, D. (2004); uses a variable radius per detector.

        Any other value falls back to ``'default-NSA'``. Defaults to ``'default-NSA'``.

    * ``**kwargs``:
        - non_self_label (``str``): Label assigned when the training data has a single
            class and a sample is classified as not belonging to it.
            Defaults to ``'non-self'``.
        - cell_bounds (``bool``): If ``True``, detectors whose radius would exceed the
            ``[0, 1]`` space are discarded. Only used by the ``V-detector`` algorithm.
            Defaults to ``False``.
        - p (``float``): Value of ``p`` used in the Minkowski distance. The default is
            ``2``, which corresponds to the Euclidean distance. Different values of p
            lead to different variants of the [Minkowski Distance][1].

    Notes
    ----------
    [1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.minkowski_distance.html

    [2] https://doi.org/10.1007/978-3-540-24854-5_30
    """

    def __init__(
        self,
        N: int = 100,
        r: float = 0.05,
        r_s: float = 0.0001,
        k: int = 1,
        metric: Literal["manhattan", "minkowski", "euclidean"] = "euclidean",
        max_discards: int = 1000,
        seed: Optional[int] = None,
        algorithm: Literal["default-NSA", "V-detector"] = "default-NSA",
        **kwargs: Dict[str, Union[bool, str, float]]
    ):
        super().__init__(metric)

        # Invalid values are silently replaced by the documented defaults.
        self.metric = sanitize_choice(
            metric,
            ["manhattan", "minkowski"],
            "euclidean"
        )
        self.seed = sanitize_seed(seed)
        if self.seed is not None:
            np.random.seed(self.seed)
        # k == 1 does not satisfy the predicate and therefore maps to the default (1).
        self.k: int = sanitize_param(k, 1, lambda x: x > 1)
        self.N: int = sanitize_param(N, 100, lambda x: x >= 1)
        self.r: float = sanitize_param(r, 0.05, lambda x: x > 0)
        self.r_s: float = sanitize_param(r_s, 0.0001, lambda x: x > 0)

        # V-detector stores one radius per detector; the default algorithm only a position.
        if algorithm == "V-detector":
            self._detector = namedtuple("Detector", ["position", "radius"])
            self._algorithm: str = algorithm
        else:
            self._detector = namedtuple("Detector", "position")
            self._algorithm: str = "default-NSA"

        self.max_discards: int = sanitize_param(max_discards, 1000, lambda x: x > 0)

        # Retrieves the optional variables from kwargs.
        self.p: float = kwargs.get("p", 2)
        self._cell_bounds: bool = kwargs.get("cell_bounds", False)
        self.non_self_label: str = kwargs.get("non_self_label", "non-self")

        # Attributes filled in by ``fit``.
        self.detectors: Union[dict, None] = None
        self.classes: npt.NDArray = None

    def fit(self, X: npt.NDArray, y: npt.NDArray, verbose: bool = True):
        """
        Train the model on ``X`` and ``y`` using the negative selection method.

        Parameters
        ----------
        * X (``npt.NDArray``): Training array, containing the samples and their
            characteristics, [``N samples`` (rows)][``N features`` (columns)].
        * y (``npt.NDArray``): Array of target classes of ``X`` with [``N samples`` (lines)].
        * verbose (``bool``): Feedback from detector generation to the user.

        Raises
        ----------
        * ``MaxDiscardsReachedError``: when ``max_discards`` consecutive candidate
            detectors are rejected for a class.

        Returns
        ----------
        * (``self``): Returns the instance itself.
        """
        progress = None
        super()._check_and_raise_exceptions_fit(X, y)

        # Identifying the possible classes within the output array `y`.
        self.classes = np.unique(y)
        # Dictionary that will store detectors with classes as keys.
        list_detectors_by_class = {}
        # Separates the classes for training.
        sample_index = self.__slice_index_list_by_class(y)
        # Progress bar for generating all detectors.
        if verbose:
            progress = tqdm(
                total=int(self.N * (len(self.classes))),
                bar_format="{desc} ┇{bar}┇ {n}/{total} detectors",
                postfix="\n",
            )
        for _class_ in self.classes:
            # List that will contain the valid detectors of the current class.
            valid_detectors_set = []
            discard_count = 0
            # Indicating which class the algorithm is currently processing.
            if verbose:
                progress.set_description_str(
                    f"Generating the detectors for the {_class_} class:"
                )
            while len(valid_detectors_set) < self.N:
                # Generates a candidate detector vector randomly with values between 0 and 1.
                vector_x = np.random.random_sample(size=X.shape[1])
                # ``False`` when rejected; ``True`` (default-NSA) or ``(True, radius)``
                # (V-detector) when the candidate does not cover the class samples.
                valid_detector = self.__checks_valid_detector(
                    X=X, vector_x=vector_x, samples_index_class=sample_index[_class_]
                )

                if not valid_detector:
                    discard_count += 1
                    if discard_count == self.max_discards:
                        raise MaxDiscardsReachedError(_class_)
                    continue

                # Only *consecutive* discards count towards ``max_discards``.
                discard_count = 0
                if self._algorithm == "V-detector":
                    valid_detectors_set.append(
                        self._detector(vector_x, valid_detector[1])
                    )
                else:
                    valid_detectors_set.append(self._detector(vector_x))
                if verbose:
                    progress.update(1)

            # Add detectors, with classes as keys in the dictionary.
            list_detectors_by_class[_class_] = valid_detectors_set
        # Notify completion of detector generation for the classes.
        if verbose:
            progress.set_description(
                f"\033[92m✔ Non-self detectors for classes ({', '.join(map(str, self.classes))}) "
                f"successfully generated\033[0m"
            )
        # Saves the found detectors in the attribute for the non-self detectors of the model.
        self.detectors = list_detectors_by_class
        return self

    def predict(self, X: npt.NDArray) -> Optional[npt.NDArray]:
        """
        Predict the classes of ``X`` based on the detectors created during training.

        Parameters
        ----------
        * X (``npt.NDArray``): Array with input samples with [``N samples`` (Lines)] and
            [``N characteristics``(Columns)]

        Returns
        ----------
        * C (``npt.NDArray``): an ndarray of the form ``C`` [``N samples``], containing the
            predicted classes for ``X``.
        * ``None``: If there are no detectors for the prediction (model not fitted).
        """
        # If there are no detectors, returns None.
        if self.detectors is None:
            return None

        super()._check_and_raise_exceptions_predict(
            X, len(self.detectors[self.classes[0]][0].position)
        )

        # Array that will store the predictions (grown with np.append so that the
        # resulting dtype follows NumPy's promotion of the appended labels).
        c = np.empty(shape=0)
        for line in X:
            _class_ = self.__compare_sample_to_detectors(line)
            if _class_ is not None:
                c = np.append(c, [_class_])
            elif len(self.classes) == 1:
                # Single-class model and the sample was rejected: it is non-self.
                c = np.append(c, [self.non_self_label])
            else:
                # No class accepted the sample: assign the class whose detectors are,
                # on average, the farthest from it.
                average_distance: dict = {}
                for candidate in self.classes:
                    average_distance[candidate] = np.average(
                        [
                            self.__distance(detector.position, line)
                            for detector in self.detectors[candidate]
                        ]
                    )
                c = np.append(c, [max(average_distance, key=average_distance.get)])
        return c

    def __slice_index_list_by_class(self, y: npt.NDArray) -> dict:
        """
        Separate the indices of the lines according to the output class, so that the
        sample array is traversed only at the positions of the class being trained.

        Parameters
        ----------
        * y (``npt.NDArray``): ``y``[``N sample``] array with the output classes of the
            ``X`` sample array.

        Returns
        ----------
        * dict: A dictionary with the list of array positions(``y``), with the classes as key.
        """
        return slice_index_list_by_class(self.classes, y)

    def __checks_valid_detector(
        self,
        X: npt.NDArray = None,
        vector_x: npt.NDArray = None,
        samples_index_class: npt.NDArray = None
    ) -> Union[bool, tuple[bool, float]]:
        """
        Check if the candidate detector has a valid non-self ``r`` radius for the class.

        Parameters
        ----------
        * X (``npt.NDArray``): Array ``X`` with the samples.
        * vector_x (``npt.NDArray``): Randomly generated candidate detector with values
            between [0, 1].
        * samples_index_class (``npt.NDArray``): Sample positions of a class in ``X``.

        Returns
        ----------
        * ``False`` if the detector is not valid (or any input is empty).
        * ``True`` if the detector is valid (``default-NSA``).
        * ``(True, radius)`` if the detector is valid (``V-detector``).
        """
        # If any of the input arrays have zero size, the candidate is rejected.
        if (
            np.size(samples_index_class) == 0
            or np.size(X) == 0
            or np.size(vector_x) == 0
        ):
            return False

        # If self.k > 1, uses the mean distance to the k nearest samples (kNN);
        # otherwise every sample of the class is checked individually.
        if self.k > 1:
            knn_list = np.empty(shape=0)
            for i in samples_index_class:
                # Keeps only the k smallest distances seen so far.
                knn_list = self.__compare_knearest_neighbors_list(
                    knn_list, self.__distance(X[i], vector_x)
                )
            distance_mean = np.mean(knn_list)
            if self._algorithm == "V-detector":
                return self.__detector_is_valid_to_vdetector(distance_mean, vector_x)
            # Valid only if the mean distance exceeds detector radius + sample radius.
            return bool(distance_mean > (self.r + self.r_s))

        if self._algorithm == "V-detector":
            # The variable radius is derived from the closest sample of the class.
            distance = min(
                self.__distance(X[i], vector_x) for i in samples_index_class
            )
            return self.__detector_is_valid_to_vdetector(distance, vector_x)

        # Valid only if every sample lies farther than detector radius + sample radius.
        threshold: float = self.r + self.r_s
        return all(
            self.__distance(X[i], vector_x) > threshold for i in samples_index_class
        )

    def __compare_knearest_neighbors_list(
        self,
        knn: npt.NDArray,
        distance: float
    ) -> npt.NDArray:
        """
        Update the sorted list of the ``k`` smallest distances with a new distance.

        While the list holds fewer than ``k`` values the distance is simply added; after
        that it replaces the largest value (position ``k-1``) when it is smaller.

        Parameters
        ----------
        * knn (``npt.NDArray``): List of k-nearest neighbor distances.
        * distance (``float``): Distance to check.

        Returns
        ----------
        * ``npt.NDArray``: Updated nearest neighbor list, sorted in ascending order.
        """
        # If the number of distances in kNN is less than k, adds the distance.
        if len(knn) < self.k:
            knn = np.append(knn, distance)
            knn.sort()
            return knn

        # Otherwise, replaces the largest stored distance if the new one is smaller.
        if knn[self.k - 1] > distance:
            knn[self.k - 1] = distance
            knn.sort()

        return knn

    def __compare_sample_to_detectors(self, line: npt.NDArray):
        """
        Compare a sample with the detectors, verifying if the sample is self for a class.

        Parameters
        ----------
        * line (``npt.NDArray``): vector with N-features

        Returns
        ----------
        * The predicted class, or ``None`` if the sample is covered by at least one
            detector of every class.
        """
        # Stores [class, average distance between its detectors and the sample].
        possible_classes = []
        for _class_ in self.classes:
            # Indicates whether the sample escaped every detector of the class.
            class_found: bool = True
            sum_distance = 0
            for detector in self.detectors[_class_]:
                distance = self.__distance(detector.position, line)
                sum_distance += distance
                # V-detector uses each detector's own radius, default-NSA the fixed one.
                if self._algorithm == "V-detector":
                    if distance <= detector.radius:
                        class_found = False
                        break
                elif distance <= self.r:
                    class_found = False
                    break

            # If the sample passes through all the detectors of a class, adds the class
            # as a possible prediction.
            if class_found:
                possible_classes.append([_class_, sum_distance / self.N])

        # If classified as belonging to only one class, returns the class.
        if len(possible_classes) == 1:
            return possible_classes[0][0]
        # If belonging to more than one class, returns the one with the greatest
        # average distance to its detectors.
        if len(possible_classes) > 1:
            return max(possible_classes, key=lambda x: x[1])[0]

        return None

    def __distance(self, u: npt.NDArray, v: npt.NDArray) -> float:
        """
        Calculate the distance between two points using the chosen ``metric``.

        Parameters
        ----------
        * u (``npt.NDArray``): Coordinates of the first point.
        * v (``npt.NDArray``): Coordinates of the second point.

        Returns
        ----------
        * Distance (``float``): between the two points.
        """
        return super()._distance(u, v)

    def __detector_is_valid_to_vdetector(
        self,
        distance: float,
        vector_x: npt.NDArray
    ) -> Union[bool, tuple[bool, float]]:
        """
        Check if the distance between the detector and the samples, minus the radius of
        the samples, is greater than the minimum radius ``r``.

        Parameters
        ----------
        * distance (``float``): minimum distance calculated between all samples.
        * vector_x (``numpy.ndarray``): randomly generated candidate detector vector x with
            values between 0 and 1.

        Returns
        ----------
        * ``False`` if the calculated radius is not greater than the minimum radius or
            exceeds the edge of the space, if ``cell_bounds`` is enabled.
        * ``(True, radius)`` with the distance minus the radius of the samples, if the
            radius is valid.
        """
        new_detector_r = float(distance - self.r_s)
        if self.r >= new_detector_r:
            return False

        # If _cell_bounds is True, the whole detector must fit inside the [0, 1] space
        # along every dimension.
        if self._cell_bounds and (
            np.any((vector_x - new_detector_r) < 0)
            or np.any((vector_x + new_detector_r) > 1)
        ):
            return False

        return (True, new_detector_r)

    def get_params(self, deep: bool = True) -> dict:  # pylint: disable=W0613
        """
        Return a dictionary with the object's main parameters.

        ``deep`` is accepted for API compatibility and is not used.
        """
        return {
            "N": self.N,
            "r": self.r,
            "k": self.k,
            "metric": self.metric,
            "seed": self.seed,
            "algorithm": self._algorithm,
            "r_s": self.r_s,
            "max_discards": self.max_discards,
            "cell_bounds": self._cell_bounds,
            "p": self.p,
            "non_self_label": self.non_self_label,
        }
|
474
|
+
|
475
|
+
|
476
|
+
class BNSA(Base):
    """
    Binary Negative Selection Algorithm (``BNSA``).

    Classifier / anomaly identifier based on the self and non-self discrimination method,
    operating on boolean feature vectors compared with the Hamming distance.

    Parameters
    ----------
    * N (``int``): Number of detectors generated per class. Defaults to ``100``.
    * aff_thresh (``float``): Percentage of similarity between the T cell and the self
        samples. The default value is 10% (0.1), while a value of 1.0 represents 100%
        similarity. Only values strictly between 0 and 1 are accepted; any other value
        falls back to the default.
    * max_discards (``int``): Maximum number of consecutive detector discards, which aims
        to avoid a possible infinite loop when the threshold makes it impossible to
        generate non-self detectors. Defaults to ``1000``.
    * seed (``int``): Seed for the random generation of values in the detectors. Note that
        it seeds NumPy's global random state. Defaults to ``None``.
    * no_label_sample_selection (``str``): Method for selecting labels for samples
        designated as non-self by all detectors. Available method types:
        - (``max_average_difference``): Selects the class with the highest average
            difference among the detectors.
        - (``max_nearest_difference``): Selects the class with the highest difference
            to its nearest detector. The legacy spelling ``nearest_difference`` is
            accepted as an alias.

        Any other value falls back to ``max_average_difference``.
    """

    # Accepted spellings of the "nearest detector" strategy: the documented name and the
    # legacy string that earlier code compared against.
    _NEAREST_DIFFERENCE_CHOICES = ("max_nearest_difference", "nearest_difference")

    def __init__(
        self,
        N: int = 100,
        aff_thresh: float = 0.1,
        max_discards: int = 1000,
        seed: Optional[int] = None,
        no_label_sample_selection: Literal[
            "max_average_difference", "max_nearest_difference"
        ] = "max_average_difference"
    ):
        super().__init__()

        # Invalid values are silently replaced by the documented defaults.
        self.N: int = sanitize_param(N, 100, lambda x: x > 0)
        self.aff_thresh: float = sanitize_param(aff_thresh, 0.1, lambda x: 0 < x < 1)
        self.max_discards: int = sanitize_param(max_discards, 1000, lambda x: x > 0)

        self.seed = sanitize_seed(seed)

        if self.seed is not None:
            np.random.seed(self.seed)

        # Keeps the value as given when it names the nearest-difference strategy;
        # everything else maps to the default strategy.
        self.no_label_sample_selection: str = sanitize_param(
            no_label_sample_selection,
            "max_average_difference",
            lambda x: x in self._NEAREST_DIFFERENCE_CHOICES
        )

        # Attributes filled in by ``fit``.
        self.classes: npt.NDArray = None
        self.detectors: npt.NDArray = None

    def fit(self, X: npt.NDArray, y: npt.NDArray, verbose: bool = True):
        """
        Train the model on ``X`` and ``y`` using the negative selection method.

        Parameters
        ----------
        * X (``npt.NDArray``): Training array, containing the samples and their
            characteristics, [``N samples`` (rows)][``N features`` (columns)].
            Converted to boolean when it is not already.
        * y (``npt.NDArray``): Array of target classes of ``X`` with [``N samples`` (lines)].
        * verbose (``bool``): Feedback from detector generation to the user.

        Raises
        ----------
        * ``MaxDiscardsReachedError``: when ``max_discards`` consecutive candidate
            detectors are rejected for a class.

        Returns
        ----------
        * (``self``): Returns the instance itself.
        """
        progress = None
        super()._check_and_raise_exceptions_fit(X, y, "BNSA")

        # Converts the entire array X to boolean.
        if X.dtype != bool:
            X = X.astype(bool)

        # Identifying the possible classes within the output array `y`.
        self.classes = np.unique(y)
        # Dictionary that will store detectors with classes as keys.
        list_detectors_by_class = {}
        # Separates the classes for training.
        sample_index: dict = self.__slice_index_list_by_class(y)
        # Progress bar for generating all detectors.
        if verbose:
            progress = tqdm(
                total=int(self.N * (len(self.classes))),
                bar_format="{desc} ┇{bar}┇ {n}/{total} detectors",
                postfix="\n",
            )

        for _class_ in self.classes:
            # List that will contain the valid detectors of the current class.
            valid_detectors_set: list = []
            discard_count: int = 0
            # Updating the progress bar with the class the algorithm is processing.
            if verbose:
                progress.set_description_str(
                    f"Generating the detectors for the {_class_} class:"
                )
            while len(valid_detectors_set) < self.N:
                # Generates a candidate detector vector randomly with values 0 and 1.
                vector_x = np.random.choice([False, True], size=X.shape[1])
                # Hamming distance between the candidate and every sample of the class.
                distances = cdist(
                    np.expand_dims(vector_x, axis=0),
                    X[sample_index[_class_]],
                    metric="hamming",
                )
                # The candidate is rejected if it is too similar to any self sample.
                if np.any(distances <= self.aff_thresh):
                    discard_count += 1
                    if discard_count == self.max_discards:
                        raise MaxDiscardsReachedError(_class_)
                    continue

                # Only *consecutive* discards count towards ``max_discards``.
                discard_count = 0
                valid_detectors_set.append(vector_x)
                if verbose:
                    progress.update(1)

            # Add detectors to the dictionary with classes as keys.
            list_detectors_by_class[_class_] = valid_detectors_set

        # Notify the completion of detector generation for the classes.
        if verbose:
            progress.set_description(
                f"\033[92m✔ Non-self detectors for classes ({', '.join(map(str, self.classes))}) "
                f"successfully generated\033[0m"
            )
        # Saves the found detectors in the attribute for the class detectors.
        self.detectors = list_detectors_by_class
        return self

    def predict(self, X: npt.NDArray) -> Optional[npt.NDArray]:
        """
        Predict the classes of ``X`` based on the detectors created during training.

        Parameters
        ----------
        * X (``npt.NDArray``): Array with input samples with [``N samples`` (Lines)] and
            [``N characteristics``(Columns)]. Converted to boolean when it is not already.

        Returns
        ----------
        * c (``npt.NDArray``): an ndarray of the form ``C`` [``N samples``],
            containing the predicted classes for ``X``
        * ``None``: If there are no detectors for the prediction (model not fitted).
        """
        # If there are no detectors, returns None.
        if self.detectors is None:
            return None

        super()._check_and_raise_exceptions_predict(
            X, len(self.detectors[self.classes[0]][0]), "BNSA"
        )

        # Converts the entire array X to boolean.
        if X.dtype != bool:
            X = X.astype(bool)

        # Array that will store the predictions.
        c = np.empty(shape=0)
        for line in X:
            sample = np.expand_dims(line, axis=0)
            # Classes for which the sample is self, i.e. no non-self detector of the
            # class matches it, stored as [class, average distance to its detectors].
            possible_classes: list = []
            for _class_ in self.classes:
                # Hamming distance between the row and all detectors of the class.
                distances = cdist(sample, self.detectors[_class_], metric="hamming")
                # Each class is evaluated independently: a match against the detectors
                # of one class must not prevent another class from being accepted.
                if not np.any(distances <= self.aff_thresh):
                    possible_classes.append([_class_, np.sum(distances) / self.N])

            if possible_classes:
                # Belongs to one or more classes: picks the greatest average distance.
                c = np.append(c, [max(possible_classes, key=lambda x: x[1])[0]])
            elif len(self.classes) == 1:
                # Single-class model and the sample was rejected: it is non-self.
                c = np.append(c, ["non-self"])
            else:
                # Rejected by every class: falls back to the configured strategy.
                c = self.__assign_class_to_non_self_sample(line, c)

        return c

    def __assign_class_to_non_self_sample(self, line, c) -> npt.NDArray:
        """
        Determine the class of a sample when the detectors of all classes classify it
        as "non-self", using the strategy chosen in ``no_label_sample_selection``.

        Parameters
        ----------
        * line (``npt.NDArray``): Sample to be classified.
        * c (``npt.NDArray``): Predictions to be extended with the new classification.

        Returns
        ----------
        * ``npt.NDArray``: The predictions ``c`` with the class assigned to the sample
            appended.
        """
        use_nearest = self.no_label_sample_selection in self._NEAREST_DIFFERENCE_CHOICES
        sample = np.expand_dims(line, axis=0)
        class_differences: dict = {}
        for _class_ in self.classes:
            distances = cdist(sample, self.detectors[_class_], metric="hamming")
            if use_nearest:
                # Distance from the sample to the nearest detector of the class.
                class_differences[_class_] = distances.min()
            else:
                # Average distance from the sample to the detectors of the class.
                class_differences[_class_] = distances.sum() / self.N

        # The class with the greatest difference is the least "non-self" for the sample.
        return np.append(c, [max(class_differences, key=class_differences.get)])

    def __slice_index_list_by_class(self, y: npt.NDArray) -> dict:
        """
        Separate the indices of the lines according to the output class, so that the
        sample array is traversed only at the positions of the class being trained.

        Parameters
        ----------
        * y (``npt.NDArray``): ``y``[``N sample``] array with the output classes of the
            ``X`` sample array.

        Returns
        ----------
        * dict: A dictionary with the list of array positions(``y``), with the classes as key.
        """
        return slice_index_list_by_class(self.classes, y)

    def get_params(self, deep: bool = True) -> dict:  # pylint: disable=W0613
        """
        Return a dictionary with the object's main parameters.

        ``deep`` is accepted for API compatibility and is not used.
        """
        return {
            "N": self.N,
            "aff_thresh": self.aff_thresh,
            "max_discards": self.max_discards,
            "seed": self.seed,
            "no_label_sample_selection": self.no_label_sample_selection,
        }
|