aisp 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aisp/exceptions.py CHANGED
@@ -1,5 +1,7 @@
1
1
  """Custom warnings and errors."""
2
2
 
3
+ from typing import Optional
4
+
3
5
 
4
6
  class MaxDiscardsReachedError(Exception):
5
7
  """Exception thrown when the maximum number of detector discards is reached."""
@@ -27,7 +29,7 @@ class FeatureDimensionMismatch(Exception):
27
29
  self,
28
30
  expected: int,
29
31
  received: int,
30
- variable_name: str = None
32
+ variable_name: Optional[str] = None
31
33
  ):
32
34
  parts = []
33
35
  if variable_name:
@@ -41,3 +43,17 @@ class FeatureDimensionMismatch(Exception):
41
43
  "and matches the expected shape for the model."
42
44
  )
43
45
  super().__init__(message)
46
+
47
+
48
+ class UnsupportedTypeError(Exception):
49
+ """
50
+ Exception raised when the input vector type is not supported.
51
+
52
+ This exception is thrown when the vector data type does not match any of the supported types.
53
+ """
54
+
55
+ def __init__(self, message=None):
56
+ if message is None:
57
+ message = ("Type is not supported. Provide a binary, normalized, or bounded "
58
+ "continuous vector.")
59
+ super().__init__(message)
aisp/ina/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """Module (ina) Immune Network Algorithm.
2
+
3
+ This module implements algorithms based on the Immune Network Theory proposed by Jerne.
4
+
5
+ Classes
6
+ -------
7
+ AiNet
8
+ Artificial Immune Network implementation for clustering.
9
+ """
10
+
11
+ from ._ai_network import AiNet
12
+
13
+ __author__ = 'João Paulo da Silva Barros'
14
+ __all__ = ['AiNet']
@@ -0,0 +1,553 @@
1
+ """Artificial Immune Network (AiNet)."""
2
+
3
+ from collections import Counter
4
+ from heapq import nlargest
5
+ from typing import Optional
6
+
7
+ import numpy as np
8
+ import numpy.typing as npt
9
+ from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
10
+ from scipy.spatial.distance import squareform, pdist, cdist
11
+ from tqdm import tqdm
12
+
13
+ from ._base import BaseAiNet
14
+ from ..base import set_seed_numba
15
+ from ..base.mutation import clone_and_mutate_binary, clone_and_mutate_continuous, \
16
+ clone_and_mutate_ranged
17
+ from ..utils.sanitizers import sanitize_choice, sanitize_param, sanitize_seed
18
+ from ..utils.distance import hamming, compute_metric_distance, get_metric_code
19
+ from ..utils.types import FeatureType, MetricType
20
+ from ..utils.validation import detect_vector_data_type
21
+
22
+
23
+ class AiNet(BaseAiNet):
24
+ """Artificial Immune Network for Compression and Clustering.
25
+
26
+ This class implements the aiNet algorithm, an artificial immune network model designed for
27
+ clustering and data compression tasks. The aiNet algorithm uses principles from immune
28
+ network theory, clonal selection, and affinity maturation to compress high-dimensional
29
+ datasets. [1]_
30
+ For clustering, the class uses SciPy’s implementation of the **Minimum Spanning Tree**
31
+ (MST) and removes its inconsistent (overly long) edges to separate the groups. [2]_
32
+
33
+ Parameters
34
+ ----------
35
+ N : int, default=50
36
+ Number of memory cells (antibodies) in the population.
37
+ n_clone : int, default=10
38
+ Number of clones generated for each selected memory cell.
39
+ top_clonal_memory_size : Optional[int], default=5
40
+ Number of highest-affinity antibodies selected per antigen for cloning and mutation.
41
+ If set to None or 0, all antibodies are cloned, following the original aiNet algorithm.
42
+ n_diversity_injection : int, default=5
43
+ Number of new random memory cells injected to maintain diversity.
44
+ affinity_threshold : float, default=0.5
45
+ Threshold for affinity (similarity) to determine cell suppression or selection.
46
+ suppression_threshold : float, default=0.5
47
+ Threshold for suppressing similar memory cells.
48
+ mst_inconsistency_factor : float, default=2.0
49
+ Factor used to determine which edges in the **Minimum Spanning Tree (MST)**
50
+ are considered inconsistent.
51
+ max_iterations : int, default=10
52
+ Maximum number of training iterations.
53
+ k : int, default=3
54
+ The number of K nearest neighbors that will be used to choose a label in the prediction.
55
+ metric : Literal["manhattan", "minkowski", "euclidean"], default="euclidean"
56
+ Way to calculate the distance between the detector and the sample:
57
+
58
+ * ``'euclidean'`` ➜ The calculation of the distance is given by the expression:
59
+ √( (x₁ – y₁)² + (x₂ – y₂)² + ... + (xₙ – yₙ)² ).
60
+
61
+ * ``'minkowski'`` ➜ The calculation of the distance is given by the expression:
62
+ ( |x₁ – y₁|ᵖ + |x₂ – y₂|ᵖ + ... + |xₙ – yₙ|ᵖ )^(1/p).
63
+
64
+ * ``'manhattan'`` ➜ The calculation of the distance is given by the expression:
65
+ ( |x₁ – y₁| + |x₂ – y₂| + ... + |xₙ – yₙ| ).
66
+
67
+ seed : Optional[int]
68
+ Seed for the random number generators used by the algorithm. Defaults to None.
69
+ use_mst_clustering : bool, default=True
70
+ If ``True``, performs clustering with **Minimum Spanning Tree** (MST). If ``False``,
71
+ does not perform clustering and predict returns None.
72
+ **kwargs
73
+ p : float
74
+ This parameter stores the value of ``p`` used in the Minkowski distance. The default
75
+ is ``2``, which represents normalized Euclidean distance.\
76
+ Different values of p lead to different variants of the Minkowski Distance.
77
+
78
+ References
79
+ ----------
80
+ .. [1] de Castro, L. N., & Von Zuben, F. J. (2001).
81
+ *aiNet: An Artificial Immune Network for Data Analysis*.
82
+ Draft Chapter XII of the book *Data Mining: A Heuristic Approach*.
83
+ Department of Computer and Automation Engineering, University of Campinas.
84
+ Available at:
85
+ https://www.dca.fee.unicamp.br/~vonzuben/research/lnunes_dout/
86
+ artigos/DMHA.PDF
87
+ .. [2] SciPy Documentation. *Minimum Spanning Tree*.
88
+ https://docs.scipy.org/doc/scipy/reference/generated/
89
+ scipy.sparse.csgraph.minimum_spanning_tree
90
+ """
91
+
92
+ def __init__(
93
+ self,
94
+ N: int = 50,
95
+ n_clone: int = 10,
96
+ top_clonal_memory_size: int = 5,
97
+ n_diversity_injection: int = 5,
98
+ affinity_threshold: float = 0.5,
99
+ suppression_threshold: float = 0.5,
100
+ mst_inconsistency_factor: float = 2.0,
101
+ max_iterations: int = 10,
102
+ k: int = 3,
103
+ metric: MetricType = "euclidean",
104
+ seed: Optional[int] = None,
105
+ use_mst_clustering: bool = True,
106
+ **kwargs
107
+ ):
108
+ self.N: int = sanitize_param(N, 50, lambda x: x > 0)
109
+ self.n_clone: int = sanitize_param(n_clone, 10, lambda x: x > 0)
110
+ if top_clonal_memory_size is None:
111
+ self.top_clonal_memory_size: Optional[int] = None
112
+ else:
113
+ self.top_clonal_memory_size: Optional[int] = sanitize_param(
114
+ top_clonal_memory_size, 5, lambda x: x > 0
115
+ )
116
+
117
+ self.n_diversity_injection: int = sanitize_param(
118
+ n_diversity_injection, 5, lambda x: x > 0
119
+ )
120
+ self.affinity_threshold: float = sanitize_param(
121
+ affinity_threshold, 0.5, lambda x: x > 0
122
+ )
123
+ self.suppression_threshold: float = sanitize_param(
124
+ suppression_threshold, 0.5, lambda x: x > 0
125
+ )
126
+ self.mst_inconsistency_factor: float = sanitize_param(
127
+ mst_inconsistency_factor, 2, lambda x: x >= 0
128
+ )
129
+ self.max_iterations: int = sanitize_param(max_iterations, 10, lambda x: x > 0)
130
+ self.k: int = sanitize_param(k, 1, lambda x: x > 0)
131
+ self.seed: Optional[int] = sanitize_seed(seed)
132
+ self.use_mst_clustering: bool = use_mst_clustering
133
+ if self.seed is not None:
134
+ np.random.seed(self.seed)
135
+ set_seed_numba(self.seed)
136
+
137
+ self._feature_type: FeatureType = "continuous-features"
138
+ self.metric: str = sanitize_choice(
139
+ metric, ["euclidean", "manhattan", "minkowski"], "euclidean"
140
+ )
141
+ if self._feature_type == "binary-features":
142
+ self.metric = "hamming"
143
+
144
+ self.p: np.float64 = np.float64(kwargs.get("p", 2.0))
145
+ self._metric_params = {}
146
+ if self.metric == "minkowski":
147
+ self._metric_params['p'] = self.p
148
+ self.classes = []
149
+ self._memory_network: dict = {}
150
+ self._population_antibodies: Optional[npt.NDArray] = None
151
+ self._n_features: int = 0
152
+ self._bounds: Optional[npt.NDArray[np.float64]] = None
153
+ self._mst_structure: Optional[npt.NDArray] = None
154
+ self._mst_mean_distance: Optional[float] = None
155
+ self._mst_std_distance: Optional[float] = None
156
+ self._predict_cells = None
157
+ self._predict_labels = None
158
+
159
+ @property
160
+ def memory_network(self) -> dict:
161
+ """Return the immune network representing clusters or graph structure."""
162
+ return self._memory_network
163
+
164
+ @property
165
+ def population_antibodies(self) -> Optional[npt.NDArray]:
166
+ """Return the set of memory antibodies."""
167
+ return self._population_antibodies
168
+
169
+ @property
170
+ def mst(self) -> dict:
171
+ """Returns the Minimum Spanning Tree and its statistics."""
172
+ return {
173
+ 'graph': self._mst_structure,
174
+ 'mean_distance': self._mst_mean_distance,
175
+ 'std_distance': self._mst_std_distance
176
+ }
177
+
178
+ def fit(self, X: npt.NDArray, verbose: bool = True):
179
+ """
180
+ Train the AiNet model on input data.
181
+
182
+ Parameters
183
+ ----------
184
+ X : npt.NDArray
185
+ Input data used for training the model.
186
+ verbose : bool, default=True
187
+ If True, display a progress bar with details of the current training iteration.
188
+
189
+ Returns
190
+ -------
191
+ self : AiNet
192
+ Returns the instance of the class that implements this method.
193
+ """
194
+ self._feature_type = detect_vector_data_type(X)
195
+
196
+ super()._check_and_raise_exceptions_fit(X)
197
+
198
+ match self._feature_type:
199
+ case "binary-features":
200
+ X = X.astype(np.bool_)
201
+ self.metric = "hamming"
202
+ case "ranged-features":
203
+ self._bounds = np.vstack([np.min(X, axis=0), np.max(X, axis=0)])
204
+
205
+ self._n_features = X.shape[1]
206
+
207
+ progress = tqdm(
208
+ total=self.max_iterations,
209
+ postfix="\n",
210
+ disable=not verbose,
211
+ bar_format="{desc} ┇{bar}┇ {n}/{total} total training interactions",
212
+ )
213
+
214
+ population_p = self._init_population_antibodies()
215
+
216
+ t: int = 1
217
+ while t <= self.max_iterations:
218
+ pool_memory = []
219
+ permutations = np.random.permutation(X.shape[0])
220
+ for antigen in X[permutations]:
221
+ clonal_memory = self._select_and_clone_population(antigen, population_p)
222
+ pool_memory.extend(self._clonal_suppression(antigen, clonal_memory))
223
+ pool_memory = self._memory_suppression(pool_memory)
224
+
225
+ if t < self.max_iterations:
226
+ pool_memory.extend(self._diversity_introduction())
227
+ population_p = np.asarray(pool_memory)
228
+
229
+ progress.update(1)
230
+
231
+ t += 1
232
+ self._population_antibodies = population_p
233
+
234
+ if self.use_mst_clustering:
235
+ self._build_mst()
236
+ self.update_clusters()
237
+ progress.set_description(
238
+ f"\033[92m✔ Set of memory antibodies for classes "
239
+ f"({', '.join(map(str, self.classes))}) successfully generated | "
240
+ f"Clusters: {len(self.classes)} | Population of antibodies size: "
241
+ f"{len(self._population_antibodies)}\033[0m"
242
+ )
243
+ progress.close()
244
+
245
+ return self
246
+
247
+ def predict(self, X) -> Optional[npt.NDArray]:
248
+ """
249
+ Predict cluster labels for input data.
250
+
251
+ Parameters
252
+ ----------
253
+ X : npt.NDArray
254
+ Data to predict.
255
+
256
+ Returns
257
+ -------
258
+ Predictions : Optional[npt.NDArray]
259
+ Predicted cluster labels, or None if clustering is disabled.
260
+ """
261
+ if not self.use_mst_clustering or self._memory_network is None:
262
+ return None
263
+
264
+ super()._check_and_raise_exceptions_predict(
265
+ X, self._n_features, self._feature_type
266
+ )
267
+
268
+ c: list = []
269
+
270
+ all_cells_memory = [
271
+ (class_name, cell)
272
+ for class_name in self.classes
273
+ for cell in self._memory_network[class_name]
274
+ ]
275
+
276
+ for line in X:
277
+ label_stim_list = [
278
+ (class_name, self._affinity(memory, line))
279
+ for class_name, memory in all_cells_memory
280
+ ]
281
+ # Create the list with the k nearest neighbors and select the class with the most votes
282
+ k_nearest = nlargest(self.k, label_stim_list, key=lambda x: x[1])
283
+ votes = Counter(label for label, _ in k_nearest)
284
+ c.append(votes.most_common(1)[0][0])
285
+ return np.array(c)
286
+
287
+ def _init_population_antibodies(self) -> npt.NDArray:
288
+ """
289
+ Initialize the antibody set of the network population randomly.
290
+
291
+ Returns
292
+ -------
293
+ npt.NDArray
294
+ List of initialized memories.
295
+ """
296
+ return self._generate_random_antibodies(
297
+ self.N,
298
+ self._n_features,
299
+ self._feature_type,
300
+ self._bounds
301
+ )
302
+
303
+ def _select_and_clone_population(
304
+ self,
305
+ antigen: npt.NDArray,
306
+ population: npt.NDArray
307
+ ) -> list:
308
+ """
309
+ Select top antibodies by affinity and generate mutated clones.
310
+
311
+ Parameters
312
+ ----------
313
+ antigen : npt.NDArray
314
+ The antigen for which affinities will be calculated.
315
+ population : npt.NDArray
316
+ The list of antibodies (solutions) to be evaluated and cloned.
317
+
318
+ Returns
319
+ -------
320
+ list[npt.NDArray]
321
+ List of mutated clones.
322
+ """
323
+ affinities = self._calculate_affinities(antigen, population)
324
+
325
+ if self.top_clonal_memory_size is not None and self.top_clonal_memory_size > 0:
326
+ selected_idxs = np.argsort(-affinities)[:self.top_clonal_memory_size]
327
+ else:
328
+ selected_idxs = np.arange(affinities.shape[0])
329
+
330
+ clonal_m = []
331
+ for i in selected_idxs:
332
+ clones = self._clone_and_mutate(
333
+ population[i],
334
+ int(self.n_clone * affinities[i])
335
+ )
336
+ clonal_m.extend(clones)
337
+
338
+ return clonal_m
339
+
340
+ def _clonal_suppression(self, antigen: npt.NDArray, clones: list):
341
+ """
342
+ Suppresses redundant clones based on affinity thresholds.
343
+
344
+ This function removes clones whose affinity with the antigen is lower than the defined
345
+ threshold (affinity_threshold) and eliminates redundant clones whose similarity with the
346
+ clones already selected exceeds the suppression threshold (suppression_threshold).
347
+
348
+ Parameters
349
+ ----------
350
+ antigen : npt.NDArray
351
+ The antigen for which affinities will be calculated.
352
+ clones : list
353
+ The list of candidate clones to be suppressed.
354
+
355
+ Returns
356
+ -------
357
+ list
358
+ Non-redundant, high-affinity clones.
359
+ """
360
+ suppression_affinity = [
361
+ clone for clone in clones
362
+ if self._affinity(clone, antigen) > self.affinity_threshold
363
+ ]
364
+ return self._memory_suppression(suppression_affinity)
365
+
366
+ def _memory_suppression(self, pool_memory: list) -> list:
367
+ """
368
+ Remove redundant antibodies from memory pool.
369
+
370
+ Calculate the affinity between all memory antibodies and remove redundant antibodies
371
+ whose similarity exceeds the suppression threshold.
372
+
373
+ Parameters
374
+ ----------
375
+ pool_memory : list
376
+ Pool of candidate memory antibodies to filter.
377
+
378
+ Returns
379
+ -------
380
+ list
381
+ Memory pool without redundant antibodies.
382
+ """
383
+ if not pool_memory:
384
+ return []
385
+ suppressed_memory = [pool_memory[0]]
386
+ for candidate in pool_memory[1:]:
387
+ affinities = self._calculate_affinities(
388
+ candidate.reshape(1, -1),
389
+ np.asarray(suppressed_memory)
390
+ )
391
+
392
+ if not np.any(affinities > self.suppression_threshold):
393
+ suppressed_memory.append(candidate)
394
+ return suppressed_memory
395
+
396
+ def _diversity_introduction(self):
397
+ """
398
+ Introduce diversity into the antibody population.
399
+
400
+ Returns
401
+ -------
402
+ npt.NDArray
403
+ Array of new random antibodies for diversity introduction.
404
+ """
405
+ return self._generate_random_antibodies(
406
+ self.n_diversity_injection,
407
+ self._n_features,
408
+ self._feature_type,
409
+ self._bounds
410
+ )
411
+
412
+ def _affinity(self, u: npt.NDArray, v: npt.NDArray) -> float:
413
+ """
414
+ Calculate the affinity between two vectors using the configured distance metric.
415
+
416
+ Parameters
417
+ ----------
418
+ u : npt.NDArray
419
+ Coordinates of the first point.
420
+ v : npt.NDArray
421
+ Coordinates of the second point.
422
+
423
+ Returns
424
+ -------
425
+ float
426
+ Affinity score in [0, 1], where higher means more similar.
427
+ """
428
+ distance: float
429
+ if self._feature_type == "binary-features":
430
+ distance = hamming(u, v)
431
+ else:
432
+ distance = compute_metric_distance(
433
+ u, v, get_metric_code(self.metric), self.p
434
+ )
435
+
436
+ return 1 - (distance / (1 + distance))
437
+
438
+ def _calculate_affinities(self, u: npt.NDArray, v: npt.NDArray) -> npt.NDArray:
439
+ """
440
+ Calculate the affinity matrix between a reference vector and a set of target vectors.
441
+
442
+ Parameters
443
+ ----------
444
+ u : npt.NDArray
445
+ An array with shape (n_features).
446
+ v : npt.NDArray
447
+ An array of vectors with shape (n_samples, n_features).
448
+
449
+
450
+ Returns
451
+ -------
452
+ npt.NDArray
453
+ One-dimensional array of shape (n_samples,), containing the affinities between `u`
454
+ and each vector in `v`.
455
+ """
456
+ u = np.reshape(u, (1, -1))
457
+ v = np.atleast_2d(v)
458
+ distances = cdist(u, v, metric=self.metric, **self._metric_params)[0]
459
+
460
+ return 1 - (distances / (1 + distances))
461
+
462
+ def _clone_and_mutate(self, antibody: npt.NDArray, n_clone: int) -> npt.NDArray:
463
+ """
464
+ Generate mutated clones from an antibody, based on the feature type.
465
+
466
+ Parameters
467
+ ----------
468
+ antibody : npt.NDArray
469
+ Original antibody vector to be cloned and mutated.
470
+ n_clone : int
471
+ Number of clones to generate.
472
+
473
+ Returns
474
+ -------
475
+ npt.NDArray
476
+ Array of shape (n_clone, len(antibody)) containing mutated clones
477
+ """
478
+ if self._feature_type == "binary-features":
479
+ return clone_and_mutate_binary(antibody, n_clone)
480
+ if self._feature_type == "ranged-features" and self._bounds is not None:
481
+ return clone_and_mutate_ranged(antibody, n_clone, self._bounds)
482
+ return clone_and_mutate_continuous(antibody, n_clone)
483
+
484
+ def _build_mst(self):
485
+ """Construct the Minimum Spanning Tree (MST) for the antibody population.
486
+
487
+ Computes the pairwise distances between antibodies, builds the MST from
488
+ these distances, and stores the MST structure along with the mean and
489
+ standard deviation of its edge weights.
490
+
491
+ Raises
492
+ ------
493
+ ValueError
494
+ If the antibody population is empty.
495
+ """
496
+ if self._population_antibodies is None or len(self._population_antibodies) == 0:
497
+ raise ValueError("Population of antibodies is empty")
498
+
499
+ antibodies_matrix = squareform(
500
+ pdist(self._population_antibodies, metric=self.metric, **self._metric_params)
501
+ )
502
+ antibodies_mst = minimum_spanning_tree(antibodies_matrix).toarray()
503
+ self._mst_structure = antibodies_mst
504
+ nonzero_edges = antibodies_mst[antibodies_mst > 0]
505
+ self._mst_mean_distance = float(np.mean(nonzero_edges)) if nonzero_edges.size else 0.0
506
+ self._mst_std_distance = float(np.std(nonzero_edges)) if nonzero_edges.size else 0.0
507
+
508
+ def update_clusters(self, mst_inconsistency_factor: Optional[float] = None):
509
+ """Partition the clusters based on the MST inconsistency factor.
510
+
511
+ Uses the precomputed Minimum Spanning Tree (MST) of the antibody population
512
+ to redefine clusters. Edges whose weights exceed the mean plus the
513
+ `mst_inconsistency_factor` multiplied by the standard deviation of MST edge
514
+ weights are removed. Each connected component after pruning is treated as a
515
+ distinct cluster.
516
+
517
+ Parameters
518
+ ----------
519
+ mst_inconsistency_factor : float, optional
520
+ If provided, overrides the current inconsistency factor.
521
+
522
+ Raises
523
+ ------
524
+ ValueError
525
+ If the Minimum Spanning Tree (MST) has not yet been created
526
+
527
+ Updates
528
+ -------
529
+ self._memory_network : dict[int, npt.NDArray]
530
+ Dictionary mapping cluster labels to antibody arrays.
531
+ self.classes : npt.NDArray
532
+ Array of cluster labels.
533
+ """
534
+ if self._mst_structure is None:
535
+ raise ValueError("The Minimum Spanning Tree (MST) has not yet been created.")
536
+
537
+ if mst_inconsistency_factor is not None:
538
+ self.mst_inconsistency_factor = mst_inconsistency_factor
539
+
540
+ antibodies_mst = self._mst_structure.copy()
541
+
542
+ thresholds = antibodies_mst > (
543
+ self._mst_mean_distance + self.mst_inconsistency_factor * self._mst_std_distance
544
+ )
545
+ antibodies_mst[thresholds] = 0
546
+
547
+ n_antibodies, labels = connected_components(csgraph=antibodies_mst, directed=False)
548
+
549
+ self._memory_network = {
550
+ label: self._population_antibodies[labels == label]
551
+ for label in range(n_antibodies)
552
+ }
553
+ self.classes = np.array(list(self._memory_network.keys()))