PyPI - AbstractIntegratedModule - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

AbstractIntegratedModule 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

AbstractIntegratedModule.py CHANGED Viewed

@@ -1,52 +1,3 @@
-"""
-Advanced Integrated AI Module (AbstractIntegratedModule)
-Multi-agent P2P inference system with geometric deep learning
-Installation:
-    pip install aiml
-Usage:
-    from AbstractIntegratedModule import IntegratedPipeline, CohesiveAgentDeployment
-"""
-__version__ = "0.1.5"
-__author__ = "Micro-Novelty"
-__all__ = [
-    # Main user-facing classes
-    "IntegratedPipeline",
-    "CohesiveAgentDeployment",
-    "AgentDistributedInference",
-    "WeightedEnsemblePredictor",
-    # Models
-    "Transformer",
-    "MLP",
-    "GeometricWeightShaping",
-    # Security (user may need)
-    "SecurityLevel",
-    "SecurityConfig",
-    "TrustLevel",
-    # Storage (user may need)
-    "ModelStorage",
-    # Fallback
-    "ConsecutivePeerAgent",
-    # Singleton base (if users want to extend)
-    "Singleton",
-    # Version
-    "__version__",
-]
-# THIS IS THE SOURCE CODE OF ABSTRACTINTEGRATEDMODULE
-# YOU ARE HEREBY GRANTED TO AUDIT, REVIEW, AND INITIATE PULL REQUESTS AND ISSUES
-# LICENSE: MIT, PROVIDED.
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 import pandas as pd
@@ -299,11 +250,7 @@ class SingletonMeta(type):
     _lock: threading.Lock = threading.Lock()
     def __call__(cls, *args, **kwargs):
-        # Double-checked locking pattern:
-        #   Fast path (no lock) — if the instance already exists, return immediately.
-        #   Slow path (with lock) — only one thread can create the instance; a second
-        #   check inside the lock guards against two threads both passing the fast path
-        #   before either acquires the lock.
+        # Fast path: instance already exists
         if cls in cls._instances:
             return cls._instances[cls]
@@ -353,23 +300,6 @@ class GeometricWeightShaping:
         self.floating_context = None
     def eigenvalue_encoder(self, x):
-        # Encodes the geometric complexity of the input data into a scalar (trC) and a
-        # principal component count (k). The scalar trC is later used as the upper bound
-        # for the random floating-point context in abstract_weight_shaping.
-        #
-        # Step-by-step logic:
-        #   1. Augment input with magnitude-scaled structured noise so the covariance
-        #      matrix is never degenerate even on very small or homogeneous datasets.
-        #   2. Run eigendecomposition on the augmented covariance, sort eigenvalues
-        #      descending, then find k = the number of principal components that
-        #      capture 90% of cumulative variance.  k is a compact measure of
-        #      intrinsic dimensionality.
-        #   3. Derive three chained scalars (trA → trB → trC) that compress k and the
-        #      data anisotropy into a single weight-shaping magnitude.
-        #      - trA  : scales k by directional variation; high anisotropy → large trA
-        #      - trB  : dampens trA²; keeps the signal in a bounded range
-        #      - trC  : final scalar — NOTE: trB² - 1.0 can equal zero when trB == ±1,
-        #               causing division-by-zero (known fragility flagged in code review)
         eps = 1e-5
         X = np.asarray(x)
         if X.ndim > 2:
@@ -379,32 +309,22 @@ class GeometricWeightShaping:
         anisotropy = self.anisotropy_measurement(X)
-        # Augment data with noise proportional to its magnitude to avoid a singular
-        # covariance matrix when the dataset is small or nearly constant.
         structured_noise = np.random.uniform(0, mag, size=X.shape)
         X = np.vstack((X, structured_noise))
         cov = np.cov(X, rowvar=False)
-        # eigh is used instead of eig because cov is symmetric; it returns real eigenvalues
-        # and is numerically more stable than the general eigensolver.
         eigenvalues, eigenvectors = np.linalg.eigh(cov)
-        idx = np.argsort(eigenvalues)[::-1]  # sort largest-first
+        idx = np.argsort(eigenvalues)[::-1]
         eigenvalues = eigenvalues[idx]
-        # Cumulative explained variance ratio; searchsorted finds the elbow at 90 %.
         energy = np.cumsum(eigenvalues) / np.sum(eigenvalues)
-        k = np.searchsorted(energy, 0.90) + 1     # +1 converts 0-based index to count
+        k = np.searchsorted(energy, 0.90) + 1
-        # K_G: normalised inverse of k — small k (low-dim data) → K_G near 1,
-        #       large k (high-dim data) → K_G near 0.
         K_G = 1.0 / (1.0 + k)
-        mag_G = 1.0 / (1.0 + K_G)  # secondary magnitude dampener
+        mag_G = 1.0 / (1.0 + K_G)
-        # Three-stage compression cascade that maps (k, anisotropy) → trC scalar.
-        trA = k / (1.0 - anisotropy) + eps   # anisotropy close to 1 inflates trA
-        trB = (1/2 + mag_G) / (1.0 + trA**2) # quadratic dampener keeps trB < 0.5
-        # WARNING: trB² - 1.0 is negative for all typical trB values (|trB| < 1),
-        # so trC ends up negative.  When trB == ±1 exactly this divides by zero.
+        trA = k / (1.0 - anisotropy) + eps
+        trB = (1/2 + mag_G) / (1.0 + trA**2)
         trC = (1/6 + K_G) / (trB**2 - 1.0)
         return trC, k
@@ -455,23 +375,6 @@ class GeometricWeightShaping:
     # Weight shaping provides directional context for how the data should be processed so that it aligns with the data geometry.
     def abstract_weight_shaping(self, x):
-        # Derives a data-adaptive random weight matrix whose range is governed by
-        # the geometric complexity of the input batch x.
-        #
-        # Key scalars produced along the way:
-        #   anisotropy  — directional spread of gradients across x (higher = more varied)
-        #   trC, k      — eigenvalue-derived complexity scalar and intrinsic dimensionality
-        #   AME         — Abstract Modelling Error: log-product of magnitude × gradient energy
-        #   AEL         — Adaptive Energy Level: blends spectral similarity with anisotropy;
-        #                 measures how much the data geometry resembles random noise
-        #   AMR         — sigmoid-scaled AME; used as a soft gate between 0 and 1
-        #   efficient_distributed_energy — the upper bound fed to the final uniform sampler;
-        #                 equals k + AEL*(1 - AMR): dominated by intrinsic dimensionality
-        #                 when the model rate (AMR) is high, shifts to AEL when AMR is low.
-        #
-        # The resulting weight matrix (shape: input_size × output_size) is drawn from
-        # Uniform[0, efficient_distributed_energy], which gives the downstream Dense layer
-        # a geometry-aware initialisation instead of a fixed scale like He/Xavier.
         input_size = self.input_size
         output_size = self.output_size
@@ -483,19 +386,13 @@ class GeometricWeightShaping:
         trC, k = self.eigenvalue_encoder(x)
         AME = self.AME_Encoder(x)
-        # floating_point: noise draw bounded by trC; used only to compute spectral
-        # similarity (how much the real data "looks like" noise geometrically).
         floating_point = np.random.uniform(0, trC, size=x.shape)
         spectral_similarity = self.spectral_similarity(x, floating_point)
-        # AEL rises when data is both spectrally noise-like and highly anisotropic.
         AEL = 0.3 + spectral_similarity * anisotropy
-        scaled_anisotropy = anisotropy / (anisotropy + 1.0)  # unused below; kept for potential future use
-        AMR = 1.0 / (1.0 + np.exp(-AME))  # abstract modelling rate — sigmoid gate on AME
+        scaled_anisotropy = anisotropy / (anisotropy + 1.0)
+        AMR = 1.0 / (1.0 + np.exp(-AME)) # abstract modelling rate
-        # Upper bound of the weight distribution.
-        # When data complexity is low (AMR → 1), the AEL term vanishes → bound ≈ k.
-        # When data is geometrically rich (AMR → 0), AEL contributes more → wider init.
         efficient_distributed_energy = k + AEL * (1.0 - AMR)
         floating_context = rng.uniform(0, efficient_distributed_energy, size=(input_size, output_size))
         self.floating_context = floating_context
@@ -579,9 +476,13 @@ class Loss:
 class Transformer:
-    def __init__(self, vocab_size, d_model=32, n_heads=4, num_classes=7):
+    def __init__(self, vocab_size, d_model=8, n_heads=2, num_classes=7, learning_rate=0.01, attn_dropout=0.0, ffn_dropout=0.0, weight_decay=1e-4):
         self.d_model = d_model  # Embedding dimension
         self.n_heads = n_heads
+        self.attn_dropout_rate = attn_dropout
+        self.ffn_dropout_rate  = ffn_dropout
+        self.transformer_lr = learning_rate
+        self.weight_decay = weight_decay
         self.token_embedding = np.random.randn(vocab_size, d_model) * 0.02
@@ -620,7 +521,28 @@ class Transformer:
         mean = np.mean(x, axis=-1, keepdims=True)
         var = np.var(x, axis=-1, keepdims=True)
         return scale * (x - mean) / np.sqrt(var + 1e-5) + shift
+    def apply_update(self, param, grad, lr):
+        # L2 weight decay applied directly at update time
+        # equivalent to: grad += weight_decay * param
+        return param - lr * (grad + self.weight_decay * param)
+    def dropout(self, x, rate=0.1, training=True, alpha=None):
+        if not training or rate == 0.0:
+            return x, None
+        # If alpha provided, scale the effective drop rate by it
+        # low alpha (early training, fixed attention) → very light dropout
+        # high alpha (dynamic attention active)       → full dropout rate
+        effective_rate = rate * alpha if alpha is not None else rate
+        if effective_rate == 0.0:
+            return x, None
+        mask = (np.random.rand(*x.shape) > effective_rate).astype(np.float32)
+        return x * mask / (1.0 - effective_rate), mask
     def softmax(self, x):
         if x.ndim == 3:
             shifted = x - np.max(x, axis=-1, keepdims=True)
@@ -642,29 +564,13 @@ class Transformer:
         return output, weights
-    def multi_head_attention(self, x, mask=None):
+    def multi_head_attention(self, x, mask=None, alpha=None):
         batch_size, seq_len, d_model = x.shape
-        try:
-            alpha = self.alpha  # between 0 and 1
-        except:
-            # alpha not yet set (first call before any train_step); derive it from the
-            # data's geometric complexity via AME so we start with a meaningful blend.
-            AME = self.AME_Encoder(x)
-            AMR = 1.0 / (1.0 + np.exp(-AME))
-            alpha = AMR
-            self.alpha = alpha
-        # Interpolate between the frozen initial projections (W_q/k/v_fixed) and the
-        # learnable ones (W_q/k/v).  alpha starts near 0 and ramps toward 1 during
-        # training (see train() where alpha = min(1.0, epoch/100)), so early epochs
-        # lean on the stable fixed projections and later epochs use the learned ones.
         W_q_mix = (1 - alpha) * self.W_q_fixed + alpha * self.W_q
         W_k_mix = (1 - alpha) * self.W_k_fixed + alpha * self.W_k
         W_v_mix = (1 - alpha) * self.W_v_fixed + alpha * self.W_v
-        # Project input into multi-head Q, K, V spaces.
-        # einsum notation 'bsd,hdm->bhsm': batch × seq × d_model dotted with
-        # n_heads × d_model × head_dim → batch × n_heads × seq × head_dim
         Q = np.einsum('bsd,hdm->bhsm', x, W_q_mix)
         K = np.einsum('bsd,hdm->bhsm', x, W_k_mix)
         V = np.einsum('bsd,hdm->bhsm', x, W_v_mix)
@@ -680,65 +586,75 @@ class Transformer:
         self.cache['attn_weights'] = attn_weights
         self.cache['attn_output'] = attn_output
-        # Concatenate heads: (batch, n_heads, seq, head_dim) → (batch, seq, d_model)
+        # Concatenate heads
         attn_output = attn_output.transpose(0, 2,1, 3).reshape(batch_size, seq_len, -1)
         self.cache['attn_concat'] = attn_output
-        # Final linear projection: mixes head outputs back into d_model space.
-        # W_o is geometry-initialised via GWS on the first training call (see train()).
+        # Final linear projection
         output = np.matmul(attn_output, self.W_o)
         self.cache['attn_out'] = output
         return output, attn_weights
-    def forward(self, input_ids, embedded=False):
+    def forward(self, input_ids, embedded=False, pad_token_id=0, training=True, attn_dropout=0.1, ffn_dropout=0.1):
         if embedded:
-            # Accept pre-computed embeddings directly (e.g. TF-IDF vectors passed as
-            # float arrays) instead of integer token IDs.  Reshape to 3-D if needed.
             x = np.asarray(input_ids)
             if x.ndim == 2:
                 x = x[np.newaxis, ...]
             batch_size, seq_len, _ = x.shape
             self.cache['embedded_input'] = x
             self.cache['input_ids'] = None
+            mask = None
         else:
+            input_ids = np.asarray(input_ids, dtype=np.int32)
             if input_ids.ndim == 1:
-                input_ids = input_ids.reshape(1, -1)
-            # Standard token-embedding lookup + additive positional encoding.
+                input_ids = input_ids[np.newaxis, :]
             x = self.token_embedding[input_ids]
             x = x + self.pos_embedding[:x.shape[1]]
             batch_size, seq_len = input_ids.shape
             self.cache['embedded_input'] = None
             self.cache['input_ids'] = input_ids
+            mask = self.padding_mask_utility(input_ids, pad_token_id)  # (B,1,1,T)
+        self.cache['mask'] = mask if not embedded else None
         self.cache['seq_len'] = seq_len
         self.cache['batch_size'] = batch_size
         self.cache['x_token'] = x
         self.cache['x_pos'] = x
-        # Multi-head attention with residual
-        attn_out, attn_weights = self.multi_head_attention(x)
+        # Multi-head attention with residual
+        AME = self.AME_Encoder(x)
+        alpha = 1.0 / (1.0 + np.exp(-AME))
+        attn_out, attn_weights = self.multi_head_attention(x, mask=mask, alpha=alpha)
+        current_alpha = self.cache.get('alpha', 0.0)
+        attn_out, attn_drop_mask = self.dropout(attn_out, rate=self.attn_dropout_rate, training=training, alpha=current_alpha)
+        self.cache['attn_drop_mask'] = attn_drop_mask
-        # Exponential moving average update for alpha using the attention quality score.
-        # Keeps alpha stable: 95 % of the old value + 5 % of the current quality signal.
-        # This means alpha slowly tracks how well-focused the current attention is,
-        # rather than jumping abruptly each step.
-        self.alpha = 0.95 * self.alpha + 0.05 * self.attention_quality_computing(attn_weights)
+        alpha = 0.95 * alpha + 0.05 * self.attention_quality_computing(attn_weights, mask=mask)
+        self.alpha = alpha
+        self.cache['alpha'] = alpha  # store in cache
-        # Pre-norm residual: cache the sum before normalising so backward can recover it.
         self.cache['x_ln1_input'] = x + attn_out
         x = self.layer_norm(x + attn_out, self.ln1_scale, self.ln1_shift)
         self.cache['x_after_ln1'] = x
-        # Feed-forward network: expand to 4×d_model, apply ReLU, project back.
+        # Feed-forward with residual
         self.cache['ffn_input'] = x
         ffn_pre = np.matmul(x, self.ffn1)
         self.cache['ffn_pre'] = ffn_pre
         ffn_act = np.maximum(0, ffn_pre)  # ReLU
+        ffn_act, ffn_drop_mask = self.dropout(ffn_act, rate=self.ffn_dropout_rate, training=training, alpha=current_alpha)
         self.cache['ffn_act'] = ffn_act
+        self.cache['ffn_drop_mask'] = ffn_drop_mask
         ffn_out = np.matmul(ffn_act, self.ffn2)
         self.cache['ffn_out'] = ffn_out
@@ -746,9 +662,15 @@ class Transformer:
         x = self.layer_norm(x + ffn_out, self.ln2_scale, self.ln2_shift)
         self.cache['x_after_ln2'] = x
-        # Mean-pool across the sequence dimension to get a fixed-size representation
-        # regardless of input length.  Shape: (batch, d_model).
-        x_pooled = np.mean(x, axis=1)  # (batch, d_model)
+        if mask is not None:
+            # Reshape mask to (B, T, 1) for broadcasting against (B, T, D)
+            token_mask = mask[:, 0, 0, :, np.newaxis]        # (B, T, 1)
+            x_masked   = x * token_mask                       # zero out padding
+            lengths    = token_mask.sum(axis=1)               # (B, 1) valid token counts
+            x_pooled   = x_masked.sum(axis=1) / (lengths + 1e-6)  # (B, D)
+        else:
+            x_pooled = np.mean(x, axis=1)
         self.cache['x_pooled'] = x_pooled
         # Output projection
@@ -762,18 +684,6 @@ class Transformer:
     def layer_norm_backward(self, d_out, x, scale, shift):
-        # Backpropagates through layer normalisation.
-        # LayerNorm forward: y = scale * (x - mean) / sqrt(var + eps) + shift
-        #
-        # Gradient derivation (standard result, often omitted from textbooks):
-        #   dx_hat  = d_out * scale         — upstream grad scaled by learned gamma
-        #   dvar    = sum(dx_hat * (x-mean) * -0.5 * std^{-3})   — chain rule on variance
-        #   dmean   = sum(dx_hat * -1/std) + dvar * mean(-2*(x-mean))
-        #             — two paths: direct via x_hat, indirect via variance
-        #   dx      = dx_hat/std + dvar * 2*(x-mean)/N + dmean/N
-        #             — three additive terms, each from a different path through the graph
-        #
-        # N is the feature dimension (last axis) — normalization is per-sample, per-position.
         eps = 1e-5
         mean = np.mean(x, axis=-1, keepdims=True)
         var = np.var(x, axis=-1, keepdims=True)
@@ -784,10 +694,7 @@ class Transformer:
         N = x.shape[-1]
         dx_hat = d_out * scale
         dvar = np.sum(dx_hat * (x - mean) * -0.5 * std**-3, axis=-1, keepdims=True)
-        dmean = (
-        np.sum(dx_hat * -1/std, axis=-1, keepdims=True)
-        + dvar * np.mean(-2*(x-mean), axis=-1, keepdims=True)
-        )
+        dmean = np.sum(dx_hat * (-1.0 / std), axis=-1, keepdims=True)
         dx = (
         dx_hat / std
@@ -798,10 +705,12 @@ class Transformer:
         return dx
     # Fixed-attention backward does not update the transformer's Q, K, V projections, giving much more stable attention at the cost of flexibility.
-    def fixed_attention_backward(self, d_logits, lr=0.001):
+    def fixed_attention_backward(self, d_logits, lr=0.01, max_norm=1.0):
         # Gradient for output layer
         d_output = d_logits
+        alpha = self.cache.get('alpha', 1.0)
         d_Wo = np.dot(self.cache['x_pooled'].T, d_output)
         d_bo = np.sum(d_output, axis=0, keepdims=True)
@@ -823,8 +732,12 @@ class Transformer:
         # Gradient for FFN1 through ReLU
         d_ffn_act = np.matmul(d_ffn, self.ffn2.T)
-        d_ffn_pre = d_ffn_act
-        d_ffn_pre[self.cache['ffn_pre'] <= 0] = 0
+        ffn_drop_mask = self.cache.get('ffn_drop_mask')
+        if ffn_drop_mask is not None:
+            d_ffn_act = d_ffn_act * ffn_drop_mask / (1.0 - self.ffn_dropout_rate)
+        d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0)   # ReLU backward unchanged
         d_prev = np.matmul(d_ffn_pre, self.ffn1.T)
         d_ffn1 = np.sum(np.matmul(self.cache['ffn_input'].transpose(0, 2, 1), d_ffn_pre), axis=0)
@@ -839,23 +752,36 @@ class Transformer:
         d_attn = dx
         # Gradient for attention output projection
-        d_Wo_attn = np.sum(np.matmul(self.cache['attn_concat'].transpose(0, 2, 1), d_attn), axis=0)
+        attn_drop_mask = self.cache.get('attn_drop_mask')
+        if attn_drop_mask is not None:
+            d_attn = d_attn * attn_drop_mask / (1.0 - self.attn_dropout_rate)
+        d_Wo_attn = np.sum(np.matmul(self.cache['attn_concat'].transpose(0,2,1), d_attn), axis=0)
+        grads = {
+                'output':  d_Wo,
+                'ffn2':    d_ffn2,
+                'ffn1':    d_ffn1,
+                'W_o':     d_Wo_attn,
+            }
+        grads, norm = self.clip_gradients(grads, max_norm)
         # Update weights
-        self.output -= lr * d_Wo
-        self.output_bias -= lr * d_bo.squeeze()
-        self.ffn2 -= lr * d_ffn2
-        self.ffn1 -= lr * d_ffn1
-        self.W_o -= lr * d_Wo_attn
+        self.output = self.apply_update(self.output, grads['output'], lr)
+        self.ffn2   = self.apply_update(self.ffn2,   grads['ffn2'],   lr)
+        self.ffn1   = self.apply_update(self.ffn1,   grads['ffn1'],   lr)
+        self.W_o    = self.apply_update(self.W_o,    grads['W_o'],    lr)
+        # output_bias intentionally excluded — biases don't get weight decay
         return d_x
-    def dynamic_backward(self, d_logits, lr=0.001):
+    def dynamic_backward(self, d_logits, lr=0.01, max_norm=1.0):
         # Gradient for output layer
         d_output = d_logits
+        alpha = self.cache.get('alpha', 1.0)
         d_Wo = np.dot(self.cache['x_pooled'].T, d_output)
         d_bo = np.sum(d_output, axis=0)
@@ -863,8 +789,14 @@ class Transformer:
         d_pooled = np.dot(d_output, self.output.T)
         # Expand pooled gradient to all positions
-        d_x = np.repeat(d_pooled[:, np.newaxis, :] / self.cache['seq_len'], self.cache['seq_len'], axis=1)
+        mask = self.cache['mask']  # (B, 1, 1, T)
+        if mask is not None:
+            token_mask = mask[:, 0, 0, :, np.newaxis]             # (B, T, 1)
+            lengths    = token_mask.sum(axis=1, keepdims=True)    # (B, 1, 1)
+            d_x        = (d_pooled[:, np.newaxis, :] / (lengths + 1e-6)) * token_mask
+        else:
+            d_x = np.repeat(d_pooled[:, np.newaxis, :] / self.cache['seq_len'], self.cache['seq_len'], axis=1)
         # Layer norm 2 gradient
         d_x = self.layer_norm_backward(d_x, self.cache['x_ln2_input'],
                                         self.ln2_scale, self.ln2_shift)
@@ -877,8 +809,12 @@ class Transformer:
         # Gradient for FFN1 through ReLU
         d_ffn_act = np.matmul(d_ffn, self.ffn2.T)
+        ffn_drop_mask = self.cache.get('ffn_drop_mask')
+        if ffn_drop_mask is not None:
+            d_ffn_act = d_ffn_act * ffn_drop_mask / (1.0 - self.ffn_dropout_rate)
+        d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0)   # ReLU backward unchanged
-        d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0)
         d_prev = np.matmul(d_ffn_pre, self.ffn1.T)
         d_ffn1 = np.sum(np.matmul(self.cache['ffn_input'].transpose(0, 2, 1), d_ffn_pre), axis=0)
@@ -890,6 +826,10 @@ class Transformer:
         dx = d_prev + d_residual
         # Gradient for attention output projection
+        attn_drop_mask = self.cache.get('attn_drop_mask')
+        if attn_drop_mask is not None:
+            d_attn = d_attn * attn_drop_mask / (1.0 - self.attn_dropout_rate)
         d_Wo_attn = np.sum(np.matmul(self.cache['attn_concat'].transpose(0, 2, 1), d_attn), axis=0)
         d_attn_concat = np.matmul(d_attn, self.W_o.T)
@@ -897,8 +837,6 @@ class Transformer:
         d_head = self.n_heads
         d_dim = self.d_model // self.n_heads
-        # Reshape concatenated head gradients back to per-head form before computing
-        # QKV gradients.  Reverses the transpose+reshape done in multi_head_attention.
         d_attn_heads = d_attn_concat.reshape(batch, seq_len, d_head, d_dim) .transpose(0, 2, 1, 3)
         V = self.cache['V']
@@ -906,84 +844,150 @@ class Transformer:
         Q = self.cache['Q']
         weight = self.cache['attn_weights']
-        # Gradient of attention output w.r.t. V: dL/dV = attn_weights^T · dL/d_attn_out
         d_V = np.matmul(weight.transpose(0, 1, 3, 2), d_attn_heads)
-        # Gradient w.r.t. pre-softmax attention scores via the product rule on Vᵀ.
         d_weights = np.matmul(d_attn_heads, V.transpose(0, 1, 3, 2))
-        # Softmax Jacobian shortcut: d(softmax) = softmax * (d_out - sum(d_out * softmax))
-        # Scaled by 1/sqrt(d_k) matching the forward-pass scaling.
         d_scores = weight * (d_weights - np.sum(d_weights * weight, axis=-1, keepdims=True))
         d_scores /= np.sqrt(Q.shape[-1])
-        # Back-propagate through Q·Kᵀ to get per-head Q and K gradients.
         d_Q = np.matmul(d_scores, K)
         d_K = np.matmul(d_scores.transpose(0, 1, 3, 2), Q)
         x = self.cache['x_attn_input']
-        # Project head-space gradients back to projection-weight gradients using einsum.
-        # 'bsd, bhsm->hdm': accumulate over batch and sequence dimensions.
         d_W_q = np.einsum('bsd, bhsm->hdm', x, d_Q)
         d_W_k = np.einsum('bsd, bhsm->hdm', x, d_K)
         d_W_v = np.einsum('bsd, bhsm->hdm', x, d_V)
-        # Project head-space gradients back to input space (for the layer below).
         d_x_q = np.einsum('bhsm, hdm->bsd', d_Q, self.W_q)
         d_x_k = np.einsum('bhsm, hdm->bsd', d_K, self.W_k)
         d_x_v = np.einsum('bhsm, hdm->bsd', d_V, self.W_v)
-        # Sum Q, K, V contributions — each head projection touches the same input x.
         d_x_attn_input = d_x_q + d_x_k + d_x_v
         d_x_total = d_x_attn_input + d_residual
         input_ids = self.cache.get('input_ids')
         if input_ids is not None:
-            for b in range(input_ids.shape[0]):
-                for t in range(input_ids.shape[1]):
-                    idx = int(input_ids[b, t])
-                    self.token_embedding[idx] -= lr * d_x_total[b, t] / self.cache['seq_len']
+            flat_ids = input_ids.flatten()          # (B*T,)
+            flat_grads = d_x_total.reshape(-1, self.d_model) / self.cache['seq_len']
+            np.add.at(self.token_embedding, flat_ids, -lr * flat_grads)
         # Update weights
-        self.output -= lr * d_Wo
-        self.output_bias -= lr * d_bo.squeeze()
-        self.ffn2 -= lr * d_ffn2
-        self.ffn1 -= lr * d_ffn1
-        self.W_o -= lr * d_Wo_attn
+        grads = {
+                'output': d_Wo,
+                'ffn2':   d_ffn2,
+                'ffn1':   d_ffn1,
+                'W_o':    d_Wo_attn,
+                'W_q':    alpha * d_W_q,   # already alpha-scaled, clip the combined thing
+                'W_k':    alpha * d_W_k,
+                'W_v':    alpha * d_W_v,
+            }
+        grads, norm = self.clip_gradients(grads, max_norm)
-        alpha = self.alpha
+        self.output = self.apply_update(self.output, grads['output'], lr)
+        self.ffn2   = self.apply_update(self.ffn2,   grads['ffn2'],   lr)
+        self.ffn1   = self.apply_update(self.ffn1,   grads['ffn1'],   lr)
+        self.W_o    = self.apply_update(self.W_o,    grads['W_o'],    lr)
+        self.W_q    = self.apply_update(self.W_q,    grads['W_q'],    lr)
+        self.W_k    = self.apply_update(self.W_k,    grads['W_k'],    lr)
+        self.W_v    = self.apply_update(self.W_v,    grads['W_v'],    lr)
-        self.W_q -= lr * alpha * d_W_q
-        self.W_k -= lr * alpha * d_W_k
-        self.W_v -= lr * alpha * d_W_v
+        if input_ids is not None:
+            emb_norm = np.linalg.norm(d_x_total)
+            emb_coef = min(1.0, max_norm / (emb_norm + 1e-6))
-        self.pos_embedding[:seq_len] -= lr * d_x_total.mean(axis=0)
+            flat_ids   = input_ids.flatten()                          # (B*T,)
+            flat_grads = d_x_total.reshape(-1, self.d_model) / self.cache['seq_len']  # (B*T, D)
-        return d_x_total
+            np.add.at(self.token_embedding, flat_ids, -lr * emb_coef * flat_grads)
+            self.pos_embedding[:seq_len] -= lr * emb_coef * d_x_total.mean(axis=0)
+        else:
+            self.pos_embedding[:seq_len] -= lr * d_x_total.mean(axis=0)
+            norm = d_x_total
+        return norm
-    def train_step(self, input_ids, epoch, y_true, lr=0.001, mode=None, embedded=False):
-        probs, attn_weights = self.forward(input_ids, embedded=embedded)
+    def smoothing_labels_utility(self, y_true, smoothing=0.1):
+        # y_true: (B, num_classes) one-hot
+        num_classes = y_true.shape[1]
+        return y_true * (1.0 - smoothing) + smoothing / num_classes
+    def learning_rate_warm_up(self, epoch, epochs, lr_base, schedule='cosine_warmup', warmup_frac=0.1):
+        warmup_epochs = int(epochs * warmup_frac)
-        # Loss (cross-entropy)
+        if schedule == 'cosine_warmup':
+            if epoch < warmup_epochs:
+                # Linear warmup
+                return lr_base * (epoch + 1) / warmup_epochs
+            else:
+                # Cosine decay after warmup
+                progress = (epoch - warmup_epochs) / (epochs - warmup_epochs)
+                return lr_base * 0.5 * (1 + np.cos(np.pi * progress))
+        elif schedule == 'step':
+            # Halve lr every 30% of training
+            step = int(epochs * 0.3)
+            return lr_base * (0.5 ** (epoch // step))
+        elif schedule == 'constant':
+            return lr_base
+        return lr_base
+    def padding_mask_utility(self, input_ids, pad_token_id=0):
+        # input_ids: (B, T)
+        # Returns: (B, 1, 1, T) — broadcast-ready for (B, heads, T_q, T_k)
+        mask = (input_ids != pad_token_id).astype(np.float32)
+        return mask[:, np.newaxis, np.newaxis, :]   # (B, 1, 1, T)
+    def clip_gradients(self, grads: dict, max_norm: float = 1.0) -> dict:
+        # Compute global norm across all gradient tensors
+        total_norm = np.sqrt(sum(
+            np.sum(g ** 2) for g in grads.values()
+        ))
+        clip_coef = max_norm / (total_norm + 1e-6)
+        # scale down, never up
+        if clip_coef < 1.0:
+            grads = {k: g * clip_coef for k, g in grads.items()}
+        return grads, total_norm  # return norm for monitoring
+    def train_step(self, input_ids, epoch, y_true, lr=0.01, mode=None, embedded=False, max_norm=1.0, pad_token_id=0):
+        if not embedded and input_ids.ndim == 1:
+            input_ids = input_ids[np.newaxis, :]   # (1, T), single sample
+        if y_true.ndim == 1:
+            y_true = y_true[np.newaxis, :]
+        probs, attn_weights = self.forward(input_ids, embedded=embedded, pad_token_id=pad_token_id, training=True, attn_dropout=self.attn_dropout_rate, ffn_dropout=self.ffn_dropout_rate)
+        y_true_smooth = self.smoothing_labels_utility(y_true, smoothing=0.1)
+        if y_true_smooth.shape[0] and y_true_smooth.shape[1] != probs.shape[0] and probs.shape[1]:
+            if y_true_smooth.shape[1] > probs.shape[1]:
+               y_true_smooth = y_true_smooth[:, :probs.shape[1]]
+            else:
+               y_true_smooth = np.pad(y_true_smooth, ((0, 0), (0, probs.shape[1] - y_true_smooth.shape[1])), mode='constant')
         if y_true.shape[0] and y_true.shape[1] != probs.shape[0] and probs.shape[1]:
             if y_true.shape[1] > probs.shape[1]:
-                y_true = y_true[:, :probs.shape[1]]
+               y_true = y_true[:, :probs.shape[1]]
             else:
-                y_true = np.pad(y_true, ((0, 0), (0, probs.shape[1] - y_true.shape[1])), mode='constant')
-        loss = -np.mean(np.sum(y_true * np.log(probs + 1e-8), axis=1))
+               y_true = np.pad(y_true, ((0, 0), (0, probs.shape[1] - y_true.shape[1])), mode='constant')
+        # Loss (cross-entropy)
+        loss = -np.mean(np.sum(y_true_smooth * np.log(probs + 1e-8), axis=1))
         # Gradient of loss w.r.t. logits
-        d_logits = (probs - y_true) / y_true.shape[0]
+        d_logits = (probs - y_true_smooth) / y_true_smooth.shape[0]
         # Backward pass
         if mode == 'fixed_backward':
-            self.fixed_attention_backward(d_logits, lr)
+            self.fixed_attention_backward(d_logits, lr, max_norm=max_norm)
         else:
-            self.dynamic_backward(d_logits, lr)
+            self.dynamic_backward(d_logits, lr, max_norm=max_norm)
         # Accuracy
         preds = np.argmax(probs, axis=1)
@@ -992,8 +996,16 @@ class Transformer:
         return loss, acc
+    def batch_padding_utility(self, sequences, pad_token_id=0):
+        # sequences: list of 1-D np arrays of varying length
+        max_len = max(len(s) for s in sequences)
+        padded  = np.full((len(sequences), max_len), pad_token_id, dtype=np.int32)
+        for i, s in enumerate(sequences):
+            padded[i, :len(s)] = s
+        return padded   # (B, T)
-    def train(self, input_ids_list, y_true_list, epochs=100, mode=None, lr=0.001, embedded=False):
+    def train(self, input_ids_list, y_true_list, epochs=100, mode=None, lr=0.01, embedded=False, max_norm=1.0, schedule='cosine_warmup', pad_token_id=0, batch_size=None):
         losses = []
         accs = []
         d_model = self.d_model
@@ -1004,23 +1016,48 @@ class Transformer:
             self.shaping = GeometricWeightShaping(d_model, d_model)
             shaping_input = input_ids_list
             if embedded:
-                shaping_input = np.vstack([x.reshape(1, -1) if x.ndim > 2 else x for x in input_ids_list])
+                shaping_input = np.vstack([
+                    x.reshape(-1, x.shape[-1]) if x.ndim >= 2 else x
+                    for x in input_ids_list
+                    ])
+            else:
+                shaping_input = input_ids_list
             self.W_o = self.shaping.weight_shaping(shaping_input)
             self.encoded = True
+        # Pre-pad all sequences once before training starts
+        # only when batch_size is set and not in embedded mode
+        if batch_size is not None and not embedded:
+            input_ids_list = [
+                self.batch_padding_utility(input_ids_list[i:i+batch_size], pad_token_id)
+                for i in range(0, len(input_ids_list), batch_size)
+            ]
+            y_true_list = [
+                np.stack(y_true_list[i:i+batch_size])
+                for i in range(0, len(y_true_list), batch_size)
+            ]
+            # input_ids_list is now a list of (B, T) arrays
+            # y_true_list is now a list of (B, num_classes) arrays
+        print(f"[==] Starting comprehensive training for {epochs} epochs with mode: {mode}, learning rate: {lr}, schedule: {schedule}")
         for epoch in range(epochs):
             epoch_losses = []
             epoch_accs = []
+            current_lr = self.learning_rate_warm_up(epoch, epochs, lr, schedule)
             self.alpha = min(1.0, epoch / 100)
             for input_ids, y_true in zip(input_ids_list, y_true_list):
                 if input_ids.ndim == 1:
-                    input_ids = input_ids.reshape(1, -1)
+                    input_ids = input_ids[np.newaxis, :]
                 if y_true.ndim == 1:
-                    y_true = y_true.reshape(1, -1)
+                    y_true = y_true[np.newaxis, :]
-                loss, acc = self.train_step(input_ids, epoch, y_true, lr, mode, embedded=embedded)
+                loss, acc = self.train_step(input_ids, epoch, y_true, current_lr, mode,
+                                            embedded=embedded, max_norm=max_norm, pad_token_id=pad_token_id)
                 epoch_losses.append(loss)
                 epoch_accs.append(acc)
@@ -1031,7 +1068,7 @@ class Transformer:
             if epoch % 10 == 0:
                 print(f"[=] Epoch {epoch} | loss: {avg_loss:.4f} | Acc: {avg_acc:.2%}")
         return losses, accs
@@ -1039,7 +1076,7 @@ class Transformer:
         if not embedded and input_ids.ndim == 1:
             input_ids = input_ids.reshape(1, -1)
-        probs, attn_weights = self.forward(input_ids, embedded=embedded)
+        probs, attn_weights = self.forward(input_ids, embedded=embedded, training=False, attn_dropout=0.0, ffn_dropout=0.0)
         preds = np.argmax(probs, axis=1)
         return preds, probs, attn_weights
@@ -1048,10 +1085,6 @@ class Transformer:
     def AME_Encoder(self, x):
         X = np.asarray(x)
-        if x.shape[1] == 1:
-            x = x.T
-            x= x.flatten()
         gradient = np.gradient(x, axis=-1)
         grad_energy = np.mean(np.linalg.norm(gradient, axis=-1))
         X_mag = np.mean(np.linalg.norm(X, axis=-1))
@@ -1075,26 +1108,23 @@ class Transformer:
     # Attention quality computing provides the transformer with a robust geometric-complexity alignment scalar.
     # This scalar can be used to compute alpha for a much more stable forward pass in scarce-data environments, allowing it to complement the AWE MLP below.
-    def attention_quality_computing(self, attn_weights):
-        # Produces a scalar in [0, 1] that summarises how "high quality" the current
-        # attention distribution is.  This value is used to update self.alpha via EMA
-        # in forward(), which in turn controls the frozen-vs-learned projection blend.
-        #
-        # Four complementary signals are combined:
-        #   norm_entropy  — 1 − normalised entropy; near 1 when attention is focused
-        #                   on a small number of tokens (confident), near 0 when flat.
-        #   avg_max       — mean of per-head max weights; a direct focus indicator.
-        #   norm_var      — clipped variance scaled by seq_len; captures head diversity.
-        #   qualified     — geometric factor: (1 - AMR) * anisotropy;
-        #                   high when data geometry is complex but AMR (model rate) is low,
-        #                   effectively weighting quality higher when the model is in an
-        #                   exploratory (low-AMR) regime over anisotropic data.
-        #
-        # Final score = qualified*(norm_entropy + avg_max) + anisotropy*norm_var
-        # Clipped to [0, 1] and returned as dynamic_alpha.
+    def attention_quality_computing(self, attn_weights, mask=None):
         eps = 1e-5
         eps = 1e-5
         batch, heads, seq_len, _ = attn_weights.shape
+        if mask is not None:
+            # mask: (B, 1, 1, T) → expand to (B, heads, T, T)
+            mask_expanded = np.broadcast_to(
+                mask, (batch, heads, seq_len, seq_len)
+            )
+            # Zero out padding positions before computing stats
+            attn_weights = attn_weights * mask_expanded
+            # Renormalise so rows still sum to 1 over valid tokens only
+            row_sums = attn_weights.sum(axis=-1, keepdims=True) + eps
+            attn_weights = attn_weights / row_sums
         AME = self.AME_Encoder(attn_weights)
         anisotropy = self.anisotropy_measurement(attn_weights)
@@ -1109,8 +1139,6 @@ class Transformer:
         norm_var = np.clip(var_attn * seq_len, 0, 1)
         AMR = 1.0 / (1.0 + np.exp(-AME))  # abstract modelling rate
-        # qualified is high when AMR is low (geometry complex, model still learning)
-        # and anisotropy is high (strongly directional gradients in the attention map).
         qualified = (1.0 - AMR) + eps * anisotropy
         quality_score = qualified * norm_entropy + qualified * avg_max + anisotropy * norm_var
@@ -1138,35 +1166,17 @@ class Dense:
             self.activation_derivative = None
     def multi_modal_linear_transformation(self, x):
-        # Standard linear layer z = xW + b, but with a multi-level shape-mismatch
-        # recovery cascade.  This is needed because the GWS weight matrix W is shaped
-        # at construction time from the training data, and at inference time the input
-        # may have a different number of features (e.g. after vocabulary drift or
-        # when calling the model with embedded TF-IDF vectors vs raw token IDs).
-        #
-        # Recovery hierarchy (outermost try wins):
-        #   Level 1 (primary): normal dot(x, W) + b.
-        #   Level 2 (first fallback): column-slice W to match x.shape[1], then add
-        #             a matching slice of b.
-        #   Level 3 (deep fallback): slice both x and W along whichever dimension fits,
-        #             then add a b slice.  Covers edge cases where both x and W need trimming.
-        #
-        # The guard at the top reshapes W in-place if shapes are obviously mismatched
-        # (x.shape[1] != W.shape[0]), preferring slicing over re-initialisation.
         if len(x.shape) > 1 and x.shape[1] != self.W.shape[0]:
             V1, V2 = x.shape[0], x.shape[1]
             try:
-                # Trim W's rows to match the feature dimension of x.
                 self.W = self.W[:V2, :]
             except:
-                # If trimming fails (W is already smaller), re-initialise with correct dims.
                 self.special_weight = GeometricWeightShaping(V2, V1)
                 self.W = self.special_weight.weight_shaping(x)
         try:
             try:
                 z = np.dot(x, self.W) + self.b
             except:
-                # W has more rows than x has columns; trim and add matching bias slice.
                 subnet_W = self.W[:x.shape[1], :x.shape[0]]
                 sub_z = np.dot(x, subnet_W)
@@ -1179,7 +1189,6 @@ class Dense:
                 subnet_W = self.W[:x.shape[1]:, :x.shape[0]]
                 sub_z = np.dot(x, subnet_W)
             except:
-                # Last resort: trim x to fit W or vice versa, whichever succeeds first.
                 weight = self.W
                 try:
@@ -1340,19 +1349,6 @@ class MLP:
     def train(self, X, y, epochs=1000, lr=0.01, verbose=True):
-        # Decide whether to use the "focused" sub-network (feed_layers) or the
-        # standard full network (layers) for this training run.
-        #
-        # focused_fit_condition is True when ALL three hold:
-        #   1. feed_layers is non-empty  — a focused sub-network exists
-        #   2. anisotropy > 0.25         — data has sufficient directional variation
-        #                                  (flat/isotropic data doesn't benefit from focus)
-        #   3. AME > 0.25               — combined magnitude × gradient energy is above
-        #                                  a minimum threshold (data is complex enough)
-        #
-        # When True, only feed_layers are updated via focused_forward/focused_backward,
-        # letting the model concentrate its learning capacity on high-complexity data
-        # without disrupting the full network's previously learned representations.
         focused_fit_condition = len(self.feed_layers) > 0 and self.anisotropy_measurement(X) > 0.25 and self.AME_Encoder(X) > 0.25
         print(f'[+] Focused fit condition: {focused_fit_condition} || Anisotropy: {self.anisotropy_measurement(X):.4f} || AME: {self.AME_Encoder(X):.4f}')
         for epoch in range(epochs):
@@ -1395,19 +1391,6 @@ class WeightedEnsemblePredictor:
     def attention_memory_gate(self, probs, x):
-        # Fast-path cache lookup: checks whether a previously seen input (stored under
-        # prefix 'TA' in self.memory) is geometrically similar to the current input x.
-        # Similarity is measured by cosine similarity ≥ 0.85 (tight threshold to avoid
-        # false hits on unrelated inputs that happen to share some features).
-        #
-        # If a match is found, the cached attention outputs (texts, x2, x3, x4) are
-        # returned directly, skipping a full forward pass through the transformer.
-        # This also acts as a continual memory mechanism: the pipeline "remembers"
-        # past attention patterns and reuses them for similar future inputs.
-        #
-        # Cache miss path:
-        #   - If self_attn_weights was set by a prior call, return it as a warm fallback.
-        #   - Otherwise return (None, None, None, None) signalling a full inference needed.
         memory = self.memory
         cache_attn_memory = [key for key, (_, inp, _, _, _) in memory.items() if key.startswith('TA') and self.pipeline.cosine_similarity(x, inp) >= 0.85]
@@ -1589,6 +1572,7 @@ class WeightedEnsemblePredictor:
                 self.credibility_summarized_prediction(input_ids, mlp_probs, trans_probs, attn_weights, type='pipeline')
             except Exception as e:
                 print(f'[-] Cant get explainability features! : {e}')
+                traceback.print_exc()
         else:
             print('[-] No agreement established, skipping explainability features.')
@@ -1623,28 +1607,6 @@ class WeightedEnsemblePredictor:
         return anisotropy
     def _dynamic_weighted_ensemble(self, trans_probs, mlp_probs, attn_weights, input_ids):
-        # Per-sample dynamic weighting of Transformer and MLP predictions.
-        # Unlike the static self.transformer_weight / self.mlp_weight used in
-        # calibrate_weights(), this method derives weights on-the-fly from three signals:
-        #
-        #   trans_conf_factor  — derived from attention statistics:
-        #                         attn_focus    = std of the attention map (0 = flat, high = peaked)
-        #                         attn_growth   = sigmoid(attn_focus) — bounded confidence signal
-        #                         attn_limit    = (1 - attn_focus + attn_growth) * anisotropy
-        #                         factor        = attn_growth + attn_limit * attn_focus
-        #                        Intuitively: the transformer earns more weight when its attention
-        #                        is peaked (focused) AND the distribution is geometrically varied.
-        #
-        #   mlp_conf_factor    — derived from MLP output entropy:
-        #                         lower entropy → sharper distribution → higher confidence → higher weight.
-        #                         formula: 1 / (1 + entropy)
-        #
-        #   agreement          — 1.0 if both models predict the same class, else 0.3.
-        #                        Acts as a confidence multiplier: agreement boosts both weights
-        #                        proportionally, disagreement dampens the overall contribution.
-        #
-        # Both factors are multiplied by (1 + agreement) / 2, then normalised so they sum to 1.
-        # The final ensemble for sample i is: trans_weight * trans_row + mlp_weight * mlp_row.
         batch_size = trans_probs.shape[0]
         try:
             n_trans_classes = trans_probs.shape[1]
@@ -1653,8 +1615,6 @@ class WeightedEnsemblePredictor:
             n_trans_classes = trans_probs.shape[-1]
             n_mlp_classes = mlp_probs.shape[-1]
-        # Align probability vectors to the same class count (the larger of the two).
-        # Necessary when the transformer and MLP were trained with different label sets.
         n_classes = max(n_trans_classes, n_mlp_classes)
         print(f"🔄 Aligning classes: {n_trans_classes} and {n_mlp_classes} → {n_classes}")
@@ -1663,7 +1623,6 @@ class WeightedEnsemblePredictor:
             trans_row = np.zeros(n_classes)
             mlp_row = np.zeros(n_classes)
-            # Zero-pad shorter probability vectors to n_classes, then re-normalise.
             trans_row[:n_trans_classes] = trans_probs[i]
             mlp_row[:n_mlp_classes] = mlp_probs[i]
@@ -1672,23 +1631,19 @@ class WeightedEnsemblePredictor:
             trans_pred = np.argmax(trans_probs[i])
             mlp_pred = np.argmax(mlp_probs[i])
-            # agreement is a binary multiplier; 1.0 when models agree, 0.3 when they differ.
             agreement = 1.0 if trans_pred == mlp_pred else 0.3
             if attn_weights is not None and i < len(attn_weights):
                 print('🔄 Sophisticated confidence assembling')
                 attn = attn_weights[i]
-                # Geometric variation in the attention map itself.
                 anisotropy = self.anisotropy_measurement(attn)
                 attn_focus = np.std(attn) if attn.size > 0 else 0.5
-                attn_growth = 1.0 / (1.0 + np.exp(-attn_focus))  # sigmoid of focus
-                # attn_limit blends (1 - focus + growth) with anisotropy to bound the factor.
+                attn_growth = 1.0 / (1.0 + np.exp(-attn_focus))
                 attn_limit = (1.0 - attn_focus + attn_growth) * anisotropy
                 trans_conf_factor = attn_growth + attn_limit * attn_focus
             else:
-                # Fallback when per-sample attn slice is unavailable: use scalar attn_weights.
                 attn_growth = 1.0 / (1.0 + np.exp(-attn_weights))
                 anisotropy = self.anisotropy_measurement(attn_weights)
                 trans_conf_factor = attn_growth * anisotropy
@@ -1696,7 +1651,6 @@ class WeightedEnsemblePredictor:
             mlp_entropy = -np.sum(mlp_probs[i] * np.log(mlp_probs[i] + 1e-8))
             mlp_conf_factor = 1.0 / (1.0 + mlp_entropy)  # Lower entropy = higher confidence
-            # Scale both factors by the agreement bonus, then normalise.
             trans_weight = trans_conf_factor * (1.0 + agreement) / 2
             mlp_weight = mlp_conf_factor * (1.0 + agreement) / 2
@@ -1758,32 +1712,9 @@ class WeightedEnsemblePredictor:
         return ensemble
     def _meta_ensemble(self, trans_probs, mlp_probs, attn_weights, X_mlp):
-        # Second-level ("stacking") ensemble.  Instead of computing weights from raw
-        # attention or entropy signals, it builds a meta-feature vector for each sample
-        # that summarises both models' outputs and their relationship, then derives
-        # sample-specific weights from those features.
-        #
-        # Meta-features per sample (up to 7 values):
-        #   [0] max(trans_row)             — transformer peak confidence
-        #   [1] max(mlp_row)               — MLP peak confidence
-        #   [2] std(trans_row)             — transformer output spread (uncertainty proxy)
-        #   [3] std(mlp_row)               — MLP output spread
-        #   [4] 1.0 if both agree, else 0  — inter-model agreement flag
-        #   [5] std(attn[i])               — attention map spread (if available)
-        #   [6] max(attn[i])               — peak attention value (if available)
-        #
-        # Weight derivation:
-        #   base_weight = 0.5 + 0.3 * agreement  → 0.5 (disagree) or 0.8 (agree)
-        #   Whichever model has higher confidence gets base_weight;
-        #   the other gets 1 - base_weight.
-        #
-        # NOTE: there is a scoping bug here — trans_row / mlp_row from the loop above
-        # are used outside the loop in the weight application (line ~1582).  On the last
-        # iteration they hold values for sample batch_size-1, but for earlier iterations
-        # the wrong row is applied.  Flagged in code review.
         batch_size = trans_probs.shape[0]
         n_classes = trans_probs.shape[1]
-        threshold_feature = 0.1 + self.pipeline.confidence_threshold
+        threshold_feature = 0.1 + self.pipeline.confidence_threshold
         n_trans_classes = trans_probs.shape[1]
         n_mlp_classes = mlp_probs.shape[1]
@@ -1798,7 +1729,6 @@ class WeightedEnsemblePredictor:
             trans_row[:n_trans_classes] = trans_probs[i]
             mlp_row[:n_mlp_classes] = mlp_probs[i]
-            # Re-normalise after zero-padding to maintain valid probability distributions.
             trans_row = trans_row / (trans_row.sum() + 1e-8)
             mlp_row = mlp_row / (mlp_row.sum() + 1e-8)
@@ -2060,14 +1990,6 @@ class ExplainabilityModule:
         # 3. IMMEDIATE TRAINING (single step with higher Learning Rate)
         anisotropy = self.pipeline.anisotropy_measurement(X)
-        # Derive a geometry-aware learning rate for the correction step.
-        # anisotropy_dist: sigmoid of anisotropy — saturates to 1 for strongly directional data.
-        # deviation:       inverse of std; near 1 when features are tightly clustered (low spread).
-        # AEL (Adaptive Error Level): high when data is variable (low deviation) AND anisotropic.
-        #   AEL → 1  ⟹  corrective LR = 2/(1+1) = 1.0  (fast correction on complex data)
-        #   AEL → 0  ⟹  corrective LR = 2/(1+0) = 2.0  (even faster on flat/simple data)
-        # This intentionally boosts the correction LR above the normal training LR so
-        # a single wrong prediction can be overridden quickly without many epochs.
         anisotropy_dist = 1.0 / (1.0 + np.exp(-anisotropy))
         deviation = 1.0 / (1.0 + np.std(X))
         AEL = (1.0 - deviation) * anisotropy_dist + eps
@@ -2363,26 +2285,6 @@ class ExplainabilityModule:
     def _get_final_output(self, mlp_pred, mlp_conf, trans_pred, trans_conf, attn_weights):
-        # Resolves the final prediction when the two models disagree.
-        # When they agree, the higher-confidence model's score is taken directly.
-        # When they disagree, an "Abstract Attention Transformation" (AAT) scalar
-        # is computed to determine which model to trust more:
-        #
-        #   sliced_anisotropy — directional variation in the first attention slice;
-        #                        high → attention is non-uniform / informative.
-        #   deviation         — 1/(1 + std(attn_weights)); near 1 when attention is tightly
-        #                        concentrated, near 0 when it is spread out.
-        #   attn_quality      — overall quality score from attention_quality_computing.
-        #   AAT               — deviation * (1 - sliced_anisotropy):
-        #                        high when attention is concentrated (low anisotropy) AND
-        #                        tightly distributed (low spread); this configuration favours
-        #                        the transformer's focused contextual prediction.
-        #
-        # Confidence blending on disagreement:
-        #   If MLP wins:         final_conf = mlp_conf * (1 - trans_conf) * (1 - AAT)
-        #       → lower AAT (diffuse attention) → MLP gets more room to dominate.
-        #   If Transformer wins: final_conf = trans_conf * (1 - mlp_conf) * AAT
-        #       → higher AAT (focused attention) → transformer earns a larger share.
         eps = 1e-5
         if isinstance(mlp_conf, np.ndarray):
             mlp_conf = np.clip(np.mean(mlp_conf), 0, 1)
@@ -2940,18 +2842,6 @@ class ModelStorage:
     def save_model_dict(self, memory_name, model_dict, type=None, model_type='mlp'):
-        # Persists a model's in-memory dict to SQLite using an "active record" versioning
-        # pattern: each save inserts a new row marked is_active=1, then immediately
-        # deactivates all other rows for the same memory_name via a secondary UPDATE.
-        # This means only the most recent save is "live" — reads always fetch is_active=1.
-        #
-        # Two destination tables depending on the `type` argument:
-        #   type == 'Transformer' → model_attn_storage  (stores attention-related weights)
-        #   else                  → model_storage        (stores MLP / pipeline weights)
-        #
-        # numpy arrays inside model_dict are recursively converted to Python lists
-        # by _prepare_for_serialization() before json.dumps, ensuring they round-trip
-        # correctly when loaded back via _convert_to_arrays().
         try:
             db_path = self.get_database_path()
             conn = sqlite3.connect(db_path)
@@ -2970,7 +2860,6 @@ class ModelStorage:
                     VALUES (?, ?, ?, ?)
                 """, (memory_name, model_type, model_json, 1))
-                # Deactivate all other rows for this memory_name (soft-delete old versions).
                 c.execute("""
                     UPDATE model_attn_storage
                     SET is_active = 0
@@ -2988,7 +2877,6 @@ class ModelStorage:
                     VALUES (?, ?, ?, ?)
                 """, (memory_name, model_type, model_json, 1))
-                # Deactivate all other rows for this memory_name (soft-delete old versions).
                 c.execute("""
                     UPDATE model_storage
                     SET is_active = 0
@@ -3068,24 +2956,14 @@ class ModelStorage:
     def _parse_array_string(self, s):
-        # Attempts to recover a numpy array from a string representation that may have
-        # been serialised in one of several formats (JSON, Python literal, space-separated,
-        # or comma-separated).  This is necessary because model weights and probability
-        # vectors are stored in SQLite as JSON strings and must be reconstructed precisely.
-        #
-        # Strategy order (first success wins):
-        #   1. JSON array  — handles standard serialisation from json.dumps.
-        #   2. ast.literal_eval — handles Python repr output, e.g. "[0.1, 0.2, ...]".
-        #   3. Space/bracket-separated floats — covers numpy __str__ output like
-        #      "[ 0.1  0.2  0.3]" (spaces instead of commas, optional brackets).
-        #   4. Comma-separated floats — fallback for CSV-style strings.
-        #
-        # Returns the original string unchanged if all strategies fail, letting the
-        # caller handle the type mismatch rather than silently producing garbage data.
+        """
+        Parse string representation of array back to numpy array.
+        Returns original string if parsing fails.
+        """
         if not isinstance(s, str) or not s:
             return s
-        # Normalise whitespace before parsing — strip newlines, tabs, collapse spaces.
+        # Clean the string
         s = s.replace('\n', '').replace('\r', '').replace('\t', '')
         s = ' '.join(s.split()).strip()
@@ -3346,14 +3224,6 @@ class ModelStorage:
             pass
     def load_peer_request_dict(self, memory_name, agent_id):
-        # Retrieves a peer agent's stored prediction request from agent_attn_storage,
-        # excluding rows whose agent_id matches any ID in the provided list.
-        # The exclusion prevents an agent from retrieving its own previously stored
-        # request, ensuring it only receives data from *other* agents in the network.
-        #
-        # The IN clause is constructed dynamically with one '?' placeholder per agent_id
-        # entry, which is safe against SQL injection via parameterised queries.
-        # Returns (model_attn_data, model_target_pred) parsed from JSON, or (None, None).
         print(f'|| Peer request with Agent')
         try:
             try:
@@ -3820,10 +3690,7 @@ class AsyncMessageQueue:
         if not success:
             self._stats['messages_failed'] += 1
-        # Exponential moving average of message latency.
-        # alpha = 0.1 means the current measurement contributes 10 % to the running average,
-        # providing a smoothed latency estimate that is robust to spikes without requiring
-        # a fixed-size history window.
+        # Update moving average
         alpha = 0.1  # Smoothing factor
         self._stats['avg_latency'] = alpha * latency + (1 - alpha) * self._stats['avg_latency']
@@ -4069,7 +3936,7 @@ class AgentDistributedInference:
         # Security: Audit log
         self.security_log = []
-        self.enable_ssl = False # Set to True to enable SSL encryption
+        self.enable_ssl = False  # Set to True to enable SSL encryption
         # Only a basic cert file and key are provided, since there are other security layers besides SSL and external connections are infrequent.
         self.ssl_cert_file = ssl_cert_file
         self.ssl_key_file = ssl_key_file
@@ -4310,7 +4177,7 @@ class AgentDistributedInference:
         key = self.secret_key.encode() if isinstance(self.secret_key, str) else self.secret_key
         signature = hmac.new(key, message_bytes, hashlib.sha256).hexdigest()
-        print(f'|| Signing message with: {len(message)} total of size, with signature: {signature}')
+        print(f'|| Signing message with: {len(message)} total of size')
         logger.info(f"[=] Signing message: {len(message)}")
         return signature
@@ -4579,7 +4446,7 @@ class AgentDistributedInference:
                 print(f"[-] Connection attempt to blocked IP: {host}")
                 self._log_security_event('connection_blocked', {'ip': host})
                 return None
             # Socket creation
             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -5366,7 +5233,7 @@ class AgentDistributedInference:
             print(f'[||] Successfully calibrate probs with previous Peer using database!')
             self.save_to_local_peer(self.memory_name, probs)
         else:
-            print(f'[-] Connection to peer agent {self.temporary_agent_id} failed or not permitted, returning regular probs...')
+            print(f'[-] Connection to peer agent {self.temporary_agent_id} is not permitted, returning regular probs...')
         return probs
@@ -6160,13 +6027,15 @@ class IntegratedPipeline:
         self.titles = None
         self.labels = None
-        self.use_transformer = False
+        self.use_transformer = True
         self.agreement = False
         self.external_peer_enabled = False
         self.autonomous = False
         self.show_explainability_details = True
         self.temperature = 1.0
+        self.transformer_lr = 0.1
         self.memory_name = memory_name
         self.pending_batch = []
@@ -8072,16 +7941,15 @@ class IntegratedPipeline:
         _, y_true = self.input_encoding(datasets)
         sequence_inputs = self.sequence_encoding(datasets)
         unsuitable_training = self.training_necessary_condition(sequence_inputs, X_raw)
+        lr = self.model2.transformer_lr if self.model2 else self.transformer_lr
         if not unsuitable_training:
             print(f'🚀 Training Transformer with {len(sequence_inputs)} Samples: ')
             conditional_anisotropy = self.anisotropy_measurement(sequence_inputs)
             if conditional_anisotropy >= self.confidence_threshold:
-                lr = 1e-4
                 print('[+] Dynamic Backward')
                 mode = 'dynamic_backward'
             else:
-                lr = 0.1
                 print('[-] Fixed Backward')
                 mode = 'fixed_backward'
@@ -11092,7 +10960,7 @@ class ConsecutivePeerAgent:
         # Verify message signature
         expected = self._sign_message({k: v for k, v in message.items() if k != 'signature'})
-        print(f'[ConsecutivePeerAgent] Comparing Signature and verfiying...')
+        print(f'[ConsecutivePeerAgent] Comparing Signature and verifying...')
         return hmac.compare_digest(expected, signature)
     def _send_message(self, sock: socket.socket, message: dict) -> bool:
@@ -13167,39 +13035,9 @@ def PermissiveTest():
             pass
-def main_cli():
-    """Command-line interface entry point"""
-    import argparse
-    import asyncio
-    parser = argparse.ArgumentParser(description="AbstractIntegratedModule - AI Multi-agent System")
-    parser.add_argument("--version", action="store_true", help="Show version")
-    parser.add_argument("--train", help="Training data file")
-    parser.add_argument("--predict", help="Text to predict")
-    args = parser.parse_args()
-    if args.version:
-        print(f"AbstractIntegratedModule version {__version__}")
-        return
-    if args.predict:
-        # Simple prediction example
-        pipeline = IntegratedPipeline("temp", use_async=False)
-        result = pipeline.predict_single(args.predict)
-        print(f"[=] Prediction: {result}")
-    if args.train:
-        print(f"[=] Training with {args.train}")
 if __name__ == "__main__":
     try:
         PermissiveTest()
-        main_cli()
     except Exception as e:
         print(f'|| Program Crashed...,  Error: {e}')
         traceback.print_exc()

AbstractIntegratedModule 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

AbstractIntegratedModule 0.1.7py3-none-any.whl → 0.1.9py3-none-any.whl