deepal6 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepal6/__init__.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ DeepAL6 — Deep Active Learning Library
3
+ ======================================
4
+ A flexible, research-grade active learning framework supporting 6 query
5
+ strategies (5 informed strategies plus a uniform-random baseline), for both
6
+ tabular and image domains.
7
+
8
+ Strategies
9
+ ----------
10
+ - Random : Uniform random draw (baseline)
11
+ - Entropy : Highest predictive entropy H[y|x]
12
+ - Margin : Smallest top-2 class probability gap
13
+ - BALD : Mutual information I[y;θ|x,D] via MC Dropout
14
+ - CoreSet : Greedy k-center covering in embedding space
15
+ - BADGE : Gradient embeddings + k-means++ diversity
16
+
17
+ Quickstart
18
+ ----------
19
+ from deepal6 import ActiveLearner, TabularDataModule
20
+
21
+ data = TabularDataModule(X_train, y_train, X_test, y_test)
22
+ learner = ActiveLearner(data, strategy="BALD")
23
+ results = learner.run(initial_size=50, batch_size=20, n_rounds=20)
24
+ learner.plot(results)
25
+ """
26
+
27
+ import deepal6.strategies.query # registers all 6 strategies into STRATEGIES dict
28
+ from deepal6.learner import ActiveLearner
29
+ from deepal6.data.tabular import TabularDataModule
30
+ from deepal6.data.image import ImageDataModule
31
+ from deepal6.strategies.registry import STRATEGIES, list_strategies
32
+ from deepal6.config import ALConfig
33
+ from deepal6.exceptions import (
34
+ DeepALError,
35
+ ConfigurationError,
36
+ DataError,
37
+ StrategyError,
38
+ ModelError,
39
+ )
40
+
41
# Package metadata.
__version__ = "1.0.0"
__author__ = "Bob Philip Aila — AIMS Rwanda"

# Names exported by ``from deepal6 import *``, grouped by role.
__all__ = [
    # learner and data modules
    "ActiveLearner", "TabularDataModule", "ImageDataModule",
    # strategy registry and experiment configuration
    "STRATEGIES", "list_strategies", "ALConfig",
    # exception hierarchy
    "DeepALError", "ConfigurationError", "DataError", "StrategyError", "ModelError",
]
deepal6/config.py ADDED
@@ -0,0 +1,203 @@
1
+ """
2
+ deepal6.config
3
+ --------------
4
+ ALConfig is the single source of truth for all experiment hyper-parameters.
5
+ Every parameter has a sensible default drawn from the thesis experiments.
6
+ All values are validated on construction so errors surface early.
7
+ """
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import List, Optional, Dict, Any
11
+
12
+ from deepal6.exceptions import ConfigurationError
13
+ from deepal6.strategies.registry import STRATEGIES
14
+
15
+
16
@dataclass
class ALConfig:
    """
    Configuration for an active learning experiment.

    Every field is checked in ``__post_init__``; an invalid value raises
    ``ConfigurationError`` as soon as the object is constructed.

    Parameters
    ----------
    strategy : str or list of str
        Query strategy name(s). One of:
        'Random', 'Entropy', 'Margin', 'BALD', 'CoreSet', 'BADGE'.
        Pass a list to run multiple strategies in one experiment.
        A single name is normalised to a one-element list.
        Default: all 6 strategies.
    initial_size : int
        Number of samples in the initial labeled set L0.
        The draw is always stratified (equal class proportions) to
        prevent the random baseline from winning by luck.
        Default: 50.
    batch_size : int
        Number of samples queried from the pool per AL round.
        Smaller batches (10–20) favour uncertainty strategies;
        larger batches (50+) favour diversity strategies (CoreSet, BADGE).
        Default: 20.
    n_rounds : int
        Maximum number of active learning rounds.
        Total labelling budget = initial_size + n_rounds * batch_size.
        Default: 20.
    n_seeds : int
        Number of independent runs per strategy (different random seeds).
        Mean ± std across seeds is reported. ≥3 recommended for publication.
        Default: 5.
    train_epochs : int
        Epochs to train at each AL round.
        Default: 50 (tabular) — ImageDataModule overrides to 10.
    lr : float
        Adam learning rate. Default: 1e-3 (tabular), 1e-4 (image).
    weight_decay : float
        L2 regularisation strength, must be >= 0. Default: 1e-4.
    dropout_rate : float
        Dropout probability used in both CreditNet and ResNet-18 head.
        Also controls MC Dropout stochasticity for BALD.
        Default: 0.3.
    mc_passes : int
        Number of stochastic forward passes for BALD.
        Higher = lower variance estimate, higher cost.
        Default: 20.
    train_batch_size : int
        Mini-batch size used during model training (not AL batch size),
        must be >= 1. Default: 32.
    device : str or None
        'cuda', 'cpu', or None (auto-detect). Default: None.
    seed : int
        Base random seed. Each of the n_seeds runs uses seed+i.
        Default: 42.
    verbose : bool
        Print per-round metrics during the experiment. Default: True.
    save_checkpoints : bool
        Save the best model checkpoint per strategy per seed.
        Default: False.
    checkpoint_dir : str
        Directory for checkpoints (only used if save_checkpoints=True).
        Default: './checkpoints'.
    extra_strategy_kwargs : dict
        Extra keyword arguments forwarded to specific strategy functions.
        E.g., {'mc_passes': 30} overrides the top-level mc_passes for BALD.
        Default: {}.

    Raises
    ------
    ConfigurationError
        If any parameter is outside its accepted range, or if ``strategy``
        is empty, not iterable, or contains an unregistered name.

    Examples
    --------
    # Single strategy, quick experiment
    cfg = ALConfig(strategy='BALD', initial_size=30, batch_size=10, n_rounds=15)

    # Compare all strategies with publication-quality settings
    cfg = ALConfig(n_seeds=5, train_epochs=50)

    # Image domain — fewer epochs, lower LR
    cfg = ALConfig(strategy=['BALD', 'CoreSet', 'Random'],
                   train_epochs=10, lr=1e-4, batch_size=20)
    """

    # The factory is evaluated per instance, so the default reflects whatever
    # is registered in STRATEGIES at construction time.
    strategy: Any = field(default_factory=lambda: list(STRATEGIES.keys()))
    initial_size: int = 50
    batch_size: int = 20
    n_rounds: int = 20
    n_seeds: int = 5
    train_epochs: int = 50
    lr: float = 1e-3
    weight_decay: float = 1e-4
    dropout_rate: float = 0.3
    mc_passes: int = 20
    train_batch_size: int = 32
    device: Optional[str] = None
    seed: int = 42
    verbose: bool = True
    save_checkpoints: bool = False
    checkpoint_dir: str = "./checkpoints"
    extra_strategy_kwargs: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        self._validate()

    def _validate(self):
        """Validate all parameters; raise ConfigurationError with clear message."""
        # Normalise strategy to a list. A bare string is one name, not an
        # iterable of characters; any other iterable is copied into a list.
        if isinstance(self.strategy, str):
            self.strategy = [self.strategy]
        else:
            try:
                self.strategy = list(self.strategy)
            except TypeError:
                raise ConfigurationError(
                    f"strategy must be a name or a list of names "
                    f"(got {self.strategy!r})."
                ) from None

        # An empty selection would otherwise pass validation and yield an
        # experiment that runs nothing.
        if not self.strategy:
            raise ConfigurationError(
                "strategy must name at least one query strategy.\n"
                f"Available strategies: {list(STRATEGIES.keys())}."
            )

        # Non-string entries are reported as unknown instead of reaching the
        # registry lookup, where an unhashable value would raise TypeError.
        unknown = [
            s for s in self.strategy
            if not isinstance(s, str) or s not in STRATEGIES
        ]
        if unknown:
            raise ConfigurationError(
                f"Unknown strategy name(s): {unknown}.\n"
                f"Available strategies: {list(STRATEGIES.keys())}.\n"
                f"Tip: strategy names are case-sensitive. "
                f"Use deepal6.list_strategies() to see all options."
            )

        if self.initial_size < 2:
            raise ConfigurationError(
                f"initial_size must be at least 2 (got {self.initial_size}). "
                f"You need at least one sample per class for a meaningful start."
            )

        if self.batch_size < 1:
            raise ConfigurationError(
                f"batch_size must be >= 1 (got {self.batch_size}). "
                f"Typical values: 10–50."
            )

        if self.n_rounds < 1:
            raise ConfigurationError(
                f"n_rounds must be >= 1 (got {self.n_rounds})."
            )

        if self.n_seeds < 1:
            raise ConfigurationError(
                f"n_seeds must be >= 1 (got {self.n_seeds}). "
                f"Use n_seeds >= 3 for statistically meaningful results."
            )

        if self.train_epochs < 1:
            raise ConfigurationError(
                f"train_epochs must be >= 1 (got {self.train_epochs})."
            )

        if not (0.0 < self.lr < 1.0):
            raise ConfigurationError(
                f"lr={self.lr} looks suspicious. Typical range: 1e-5 to 1e-2."
            )

        if self.weight_decay < 0.0:
            raise ConfigurationError(
                f"weight_decay must be >= 0 (got {self.weight_decay})."
            )

        if not (0.0 <= self.dropout_rate < 1.0):
            raise ConfigurationError(
                f"dropout_rate must be in [0, 1) (got {self.dropout_rate})."
            )

        if self.mc_passes < 1:
            raise ConfigurationError(
                f"mc_passes must be >= 1 (got {self.mc_passes}). "
                f"BALD typically uses 20–50 passes."
            )

        if self.train_batch_size < 1:
            raise ConfigurationError(
                f"train_batch_size must be >= 1 (got {self.train_batch_size})."
            )

        if self.device not in (None, "cpu", "cuda"):
            raise ConfigurationError(
                f"device must be None, 'cpu', or 'cuda' (got '{self.device}')."
            )

    @property
    def total_budget(self) -> int:
        """Maximum number of labeled samples at end of experiment."""
        return self.initial_size + self.n_rounds * self.batch_size

    def summary(self) -> str:
        """Human-readable parameter summary."""
        lines = [
            "=" * 55,
            " ALConfig — Experiment Parameters",
            "=" * 55,
            f" Strategies : {self.strategy}",
            f" Initial size : {self.initial_size}",
            f" Batch size : {self.batch_size}",
            f" Rounds : {self.n_rounds}",
            f" Seeds : {self.n_seeds}",
            f" Total budget : {self.total_budget} labels",
            f" Train epochs : {self.train_epochs}",
            f" LR / dropout : {self.lr} / {self.dropout_rate}",
            f" MC passes : {self.mc_passes} (BALD)",
            f" Device : {self.device or 'auto'}",
            "=" * 55,
        ]
        return "\n".join(lines)
@@ -0,0 +1,5 @@
1
+ from deepal6.data.base import BaseDataModule
2
+ from deepal6.data.tabular import TabularDataModule
3
+ from deepal6.data.image import ImageDataModule
4
+
5
# Public names of the deepal6.data package.
__all__ = [
    "BaseDataModule",
    "TabularDataModule",
    "ImageDataModule",
]
deepal6/data/base.py ADDED
@@ -0,0 +1,119 @@
1
+ """
2
+ deepal6.data.base
3
+ -----------------
4
+ Abstract base class for all DataModules.
5
+
6
+ A DataModule wraps your dataset and exposes a standard interface that all
7
+ query strategies use:
8
+ - predict_proba(model, indices, mc_passes=1)
9
+ - get_embeddings(model, indices)
10
+ - get_gradient_embeddings(model, indices)
11
+ - labels : np.ndarray of all training labels
12
+ - n_train : total pool size
13
+ """
14
+
15
+ from abc import ABC, abstractmethod
16
+ import numpy as np
17
+
18
+
19
class BaseDataModule(ABC):
    """
    Contract that every DataModule has to fulfil.

    Derive from this class and implement each abstract member to add
    support for a new kind of data (e.g. text, graphs).
    """

    @property
    @abstractmethod
    def labels(self) -> np.ndarray:
        """Training labels for the whole pool (one entry per sample, length n_train)."""

    @property
    @abstractmethod
    def n_train(self) -> int:
        """Size of the training pool, i.e. the total number of training samples."""

    @abstractmethod
    def predict_proba(self, model, indices, mc_passes: int = 1) -> np.ndarray:
        """
        Return class probabilities for the requested pool samples.

        Parameters
        ----------
        model : nn.Module
            A trained PyTorch model.
        indices : array-like of int
            Global positions of the samples inside the training pool.
        mc_passes : int
            With 1, plain deterministic inference; result shape (N,).
            With a value above 1, that many MC Dropout passes; result
            shape (mc_passes, N).

        Returns
        -------
        np.ndarray of float in (0, 1).
        """

    @abstractmethod
    def get_embeddings(self, model, indices) -> np.ndarray:
        """
        Return the penultimate-layer embeddings of the requested samples.

        Returns
        -------
        np.ndarray, shape (len(indices), embedding_dim).
        """

    @abstractmethod
    def get_gradient_embeddings(self, model, indices) -> np.ndarray:
        """
        Return the gradient embeddings (∇_{θ_last} BCE) consumed by BADGE.

        Returns
        -------
        np.ndarray, shape (len(indices), n_last_weights).
        """

    @abstractmethod
    def train_model(self, model, labeled_idx, config) -> None:
        """
        Fit the model in-place using the currently labeled samples.

        Parameters
        ----------
        model : nn.Module
        labeled_idx : list of int
        config : ALConfig
        """

    @abstractmethod
    def evaluate(self, model) -> dict:
        """
        Score the model on the held-out test set.

        Returns
        -------
        dict with keys: 'accuracy', 'auc', 'bal_acc', 'recall', 'ece'.
        """

    @abstractmethod
    def build_model(self, config) -> "nn.Module":
        """
        Create and return a brand-new model (weights are reset each AL round).

        Parameters
        ----------
        config : ALConfig
        """