AbstractIntegratedModule 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,52 +1,3 @@
1
-
2
- """
3
- Advanced Integrated AI Module (AbstractIntegratedModule)
4
- Multi-agent P2P inference system with geometric deep learning
5
-
6
- Installation:
7
- pip install aiml
8
-
9
- Usage:
10
- from AbstractIntegratedModule import IntegratedPipeline, CohesiveAgentDeployment
11
- """
12
-
13
- __version__ = "0.1.5"
14
- __author__ = "Micro-Novelty"
15
- __all__ = [
16
- # Main user-facing classes
17
- "IntegratedPipeline",
18
- "CohesiveAgentDeployment",
19
- "AgentDistributedInference",
20
- "WeightedEnsemblePredictor",
21
-
22
- # Models
23
- "Transformer",
24
- "MLP",
25
- "GeometricWeightShaping",
26
-
27
- # Security (user may need)
28
- "SecurityLevel",
29
- "SecurityConfig",
30
- "TrustLevel",
31
-
32
- # Storage (user may need)
33
- "ModelStorage",
34
-
35
- # Fallback
36
- "ConsecutivePeerAgent",
37
-
38
- # Singleton base (if users want to extend)
39
- "Singleton",
40
-
41
- # Version
42
- "__version__",
43
- ]
44
-
45
- # THIS IS THE SOURCE CODE OF ABSTRACTINTEGRATEDMODULE
46
- # YOU ARE HEREBY GRANTED TO AUDIT, REVIEW, AND INITIATE PULL REQUESTS AND ISSUES
47
- # LICENSE: MIT, PROVIDED.
48
-
49
-
50
1
  import numpy as np
51
2
  from sklearn.preprocessing import StandardScaler
52
3
  import pandas as pd
@@ -299,11 +250,7 @@ class SingletonMeta(type):
299
250
  _lock: threading.Lock = threading.Lock()
300
251
 
301
252
  def __call__(cls, *args, **kwargs):
302
- # Double-checked locking pattern:
303
- # Fast path (no lock) — if the instance already exists, return immediately.
304
- # Slow path (with lock) — only one thread can create the instance; a second
305
- # check inside the lock guards against two threads both passing the fast path
306
- # before either acquires the lock.
253
+ # Fast path: instance already exists
307
254
  if cls in cls._instances:
308
255
  return cls._instances[cls]
309
256
 
@@ -353,23 +300,6 @@ class GeometricWeightShaping:
353
300
  self.floating_context = None
354
301
 
355
302
  def eigenvalue_encoder(self, x):
356
- # Encodes the geometric complexity of the input data into a scalar (trC) and a
357
- # principal component count (k). The scalar trC is later used as the upper bound
358
- # for the random floating-point context in abstract_weight_shaping.
359
- #
360
- # Step-by-step logic:
361
- # 1. Augment input with magnitude-scaled structured noise so the covariance
362
- # matrix is never degenerate even on very small or homogeneous datasets.
363
- # 2. Run eigendecomposition on the augmented covariance, sort eigenvalues
364
- # descending, then find k = the number of principal components that
365
- # capture 90% of cumulative variance. k is a compact measure of
366
- # intrinsic dimensionality.
367
- # 3. Derive three chained scalars (trA → trB → trC) that compress k and the
368
- # data anisotropy into a single weight-shaping magnitude.
369
- # - trA : scales k by directional variation; high anisotropy → large trA
370
- # - trB : dampens trA²; keeps the signal in a bounded range
371
- # - trC : final scalar — NOTE: trB² - 1.0 can equal zero when trB == ±1,
372
- # causing division-by-zero (known fragility flagged in code review)
373
303
  eps = 1e-5
374
304
  X = np.asarray(x)
375
305
  if X.ndim > 2:
@@ -379,32 +309,22 @@ class GeometricWeightShaping:
379
309
 
380
310
  anisotropy = self.anisotropy_measurement(X)
381
311
 
382
- # Augment data with noise proportional to its magnitude to avoid a singular
383
- # covariance matrix when the dataset is small or nearly constant.
384
312
  structured_noise = np.random.uniform(0, mag, size=X.shape)
385
313
  X = np.vstack((X, structured_noise))
386
314
  cov = np.cov(X, rowvar=False)
387
315
 
388
- # eigh is used instead of eig because cov is symmetric; it returns real eigenvalues
389
- # and is numerically more stable than the general eigensolver.
390
316
  eigenvalues, eigenvectors = np.linalg.eigh(cov)
391
- idx = np.argsort(eigenvalues)[::-1] # sort largest-first
317
+ idx = np.argsort(eigenvalues)[::-1]
392
318
 
393
319
  eigenvalues = eigenvalues[idx]
394
- # Cumulative explained variance ratio; searchsorted finds the elbow at 90 %.
395
320
  energy = np.cumsum(eigenvalues) / np.sum(eigenvalues)
396
- k = np.searchsorted(energy, 0.90) + 1 # +1 converts 0-based index to count
321
+ k = np.searchsorted(energy, 0.90) + 1
397
322
 
398
- # K_G: normalised inverse of k — small k (low-dim data) → K_G near 1,
399
- # large k (high-dim data) → K_G near 0.
400
323
  K_G = 1.0 / (1.0 + k)
401
- mag_G = 1.0 / (1.0 + K_G) # secondary magnitude dampener
324
+ mag_G = 1.0 / (1.0 + K_G)
402
325
 
403
- # Three-stage compression cascade that maps (k, anisotropy) trC scalar.
404
- trA = k / (1.0 - anisotropy) + eps # anisotropy close to 1 inflates trA
405
- trB = (1/2 + mag_G) / (1.0 + trA**2) # quadratic dampener keeps trB < 0.5
406
- # WARNING: trB² - 1.0 is negative for all typical trB values (|trB| < 1),
407
- # so trC ends up negative. When trB == ±1 exactly this divides by zero.
326
+ trA = k / (1.0 - anisotropy) + eps
327
+ trB = (1/2 + mag_G) / (1.0 + trA**2)
408
328
  trC = (1/6 + K_G) / (trB**2 - 1.0)
409
329
  return trC, k
410
330
 
@@ -455,23 +375,6 @@ class GeometricWeightShaping:
455
375
 
456
376
  # Weight shaping provides directional context for how the data should be processed so that it aligns with the data geometry.
457
377
  def abstract_weight_shaping(self, x):
458
- # Derives a data-adaptive random weight matrix whose range is governed by
459
- # the geometric complexity of the input batch x.
460
- #
461
- # Key scalars produced along the way:
462
- # anisotropy — directional spread of gradients across x (higher = more varied)
463
- # trC, k — eigenvalue-derived complexity scalar and intrinsic dimensionality
464
- # AME — Abstract Modelling Error: log-product of magnitude × gradient energy
465
- # AEL — Adaptive Energy Level: blends spectral similarity with anisotropy;
466
- # measures how much the data geometry resembles random noise
467
- # AMR — sigmoid-scaled AME; used as a soft gate between 0 and 1
468
- # efficient_distributed_energy — the upper bound fed to the final uniform sampler;
469
- # equals k + AEL*(1 - AMR): dominated by intrinsic dimensionality
470
- # when the model rate (AMR) is high, shifts to AEL when AMR is low.
471
- #
472
- # The resulting weight matrix (shape: input_size × output_size) is drawn from
473
- # Uniform[0, efficient_distributed_energy], which gives the downstream Dense layer
474
- # a geometry-aware initialisation instead of a fixed scale like He/Xavier.
475
378
  input_size = self.input_size
476
379
  output_size = self.output_size
477
380
 
@@ -483,19 +386,13 @@ class GeometricWeightShaping:
483
386
  trC, k = self.eigenvalue_encoder(x)
484
387
  AME = self.AME_Encoder(x)
485
388
 
486
- # floating_point: noise draw bounded by trC; used only to compute spectral
487
- # similarity (how much the real data "looks like" noise geometrically).
488
389
  floating_point = np.random.uniform(0, trC, size=x.shape)
489
390
  spectral_similarity = self.spectral_similarity(x, floating_point)
490
391
 
491
- # AEL rises when data is both spectrally noise-like and highly anisotropic.
492
392
  AEL = 0.3 + spectral_similarity * anisotropy
493
- scaled_anisotropy = anisotropy / (anisotropy + 1.0) # unused below; kept for potential future use
494
- AMR = 1.0 / (1.0 + np.exp(-AME)) # abstract modelling rate — sigmoid gate on AME
393
+ scaled_anisotropy = anisotropy / (anisotropy + 1.0)
394
+ AMR = 1.0 / (1.0 + np.exp(-AME)) # abstract modelling rate
495
395
 
496
- # Upper bound of the weight distribution.
497
- # When data complexity is low (AMR → 1), the AEL term vanishes → bound ≈ k.
498
- # When data is geometrically rich (AMR → 0), AEL contributes more → wider init.
499
396
  efficient_distributed_energy = k + AEL * (1.0 - AMR)
500
397
  floating_context = rng.uniform(0, efficient_distributed_energy, size=(input_size, output_size))
501
398
  self.floating_context = floating_context
@@ -579,9 +476,13 @@ class Loss:
579
476
 
580
477
 
581
478
  class Transformer:
582
- def __init__(self, vocab_size, d_model=32, n_heads=4, num_classes=7):
479
+ def __init__(self, vocab_size, d_model=8, n_heads=2, num_classes=7, learning_rate=0.01, attn_dropout=0.0, ffn_dropout=0.0, weight_decay=1e-4):
583
480
  self.d_model = d_model # Embedding dimension
584
481
  self.n_heads = n_heads
482
+ self.attn_dropout_rate = attn_dropout
483
+ self.ffn_dropout_rate = ffn_dropout
484
+ self.transformer_lr = learning_rate
485
+ self.weight_decay = weight_decay
585
486
 
586
487
  self.token_embedding = np.random.randn(vocab_size, d_model) * 0.02
587
488
 
@@ -620,7 +521,28 @@ class Transformer:
620
521
  mean = np.mean(x, axis=-1, keepdims=True)
621
522
  var = np.var(x, axis=-1, keepdims=True)
622
523
  return scale * (x - mean) / np.sqrt(var + 1e-5) + shift
623
-
524
+
525
def apply_update(self, param, grad, lr):
    """Return ``param`` after one gradient step that includes L2 weight decay.

    The decay term ``self.weight_decay * param`` is added to ``grad`` before
    the step is scaled by ``lr``.  The result is returned as a new value;
    ``param`` itself is not modified in place.
    """
    decayed_grad = grad + self.weight_decay * param
    step = lr * decayed_grad
    return param - step
529
+
530
def dropout(self, x, rate=0.1, training=True, alpha=None):
    """Apply inverted dropout to ``x`` and return ``(output, mask)``.

    Parameters:
        x: NumPy array of activations.
        rate: nominal drop probability.
        training: when false, ``x`` is returned untouched.
        alpha: optional multiplier on ``rate``; the effective drop
            probability is ``rate * alpha``.

    Returns:
        ``(x, None)`` when nothing is dropped, otherwise
        ``(x * mask / (1 - effective_rate), mask)`` where ``mask`` is a
        float32 array of zeros and ones with the shape of ``x``.

    The effective rate is clamped to the valid range: values <= 0 disable
    dropout, values >= 1 drop everything.  Previously a rate of exactly 1
    divided by zero (NaN output) and a negative rate silently rescaled
    the activations.
    """
    if not training:
        return x, None

    effective_rate = rate if alpha is None else rate * alpha

    # Nothing to drop (covers rate == 0, alpha == 0 and negative products).
    if effective_rate <= 0.0:
        return x, None

    # Everything is dropped; avoid the 1 / (1 - rate) division by zero.
    if effective_rate >= 1.0:
        return np.zeros_like(x), np.zeros(x.shape, dtype=np.float32)

    mask = (np.random.rand(*x.shape) > effective_rate).astype(np.float32)
    # NOTE(review): the mask is returned unscaled; the backward passes in this
    # file rescale by the nominal self.*_dropout_rate, not by effective_rate,
    # so the two disagree whenever alpha != 1 -- confirm intended behaviour.
    return x * mask / (1.0 - effective_rate), mask
544
+
545
+
624
546
  def softmax(self, x):
625
547
  if x.ndim == 3:
626
548
  shifted = x - np.max(x, axis=-1, keepdims=True)
@@ -642,29 +564,13 @@ class Transformer:
642
564
  return output, weights
643
565
 
644
566
 
645
- def multi_head_attention(self, x, mask=None):
567
+ def multi_head_attention(self, x, mask=None, alpha=None):
646
568
  batch_size, seq_len, d_model = x.shape
647
- try:
648
- alpha = self.alpha # between 0 and 1
649
- except:
650
- # alpha not yet set (first call before any train_step); derive it from the
651
- # data's geometric complexity via AME so we start with a meaningful blend.
652
- AME = self.AME_Encoder(x)
653
- AMR = 1.0 / (1.0 + np.exp(-AME))
654
- alpha = AMR
655
- self.alpha = alpha
656
569
 
657
- # Interpolate between the frozen initial projections (W_q/k/v_fixed) and the
658
- # learnable ones (W_q/k/v). alpha starts near 0 and ramps toward 1 during
659
- # training (see train() where alpha = min(1.0, epoch/100)), so early epochs
660
- # lean on the stable fixed projections and later epochs use the learned ones.
661
570
  W_q_mix = (1 - alpha) * self.W_q_fixed + alpha * self.W_q
662
571
  W_k_mix = (1 - alpha) * self.W_k_fixed + alpha * self.W_k
663
572
  W_v_mix = (1 - alpha) * self.W_v_fixed + alpha * self.W_v
664
573
 
665
- # Project input into multi-head Q, K, V spaces.
666
- # einsum notation 'bsd,hdm->bhsm': batch × seq × d_model dotted with
667
- # n_heads × d_model × head_dim → batch × n_heads × seq × head_dim
668
574
  Q = np.einsum('bsd,hdm->bhsm', x, W_q_mix)
669
575
  K = np.einsum('bsd,hdm->bhsm', x, W_k_mix)
670
576
  V = np.einsum('bsd,hdm->bhsm', x, W_v_mix)
@@ -680,65 +586,75 @@ class Transformer:
680
586
  self.cache['attn_weights'] = attn_weights
681
587
  self.cache['attn_output'] = attn_output
682
588
 
683
- # Concatenate heads: (batch, n_heads, seq, head_dim) → (batch, seq, d_model)
589
+ # Concatenate heads
684
590
  attn_output = attn_output.transpose(0, 2,1, 3).reshape(batch_size, seq_len, -1)
685
591
  self.cache['attn_concat'] = attn_output
686
592
 
687
- # Final linear projection: mixes head outputs back into d_model space.
688
- # W_o is geometry-initialised via GWS on the first training call (see train()).
593
+ # Final linear projection
689
594
  output = np.matmul(attn_output, self.W_o)
690
595
  self.cache['attn_out'] = output
691
596
 
692
597
  return output, attn_weights
693
598
 
694
599
 
695
- def forward(self, input_ids, embedded=False):
600
+ def forward(self, input_ids, embedded=False, pad_token_id=0, training=True, attn_dropout=0.1, ffn_dropout=0.1):
696
601
  if embedded:
697
- # Accept pre-computed embeddings directly (e.g. TF-IDF vectors passed as
698
- # float arrays) instead of integer token IDs. Reshape to 3-D if needed.
699
602
  x = np.asarray(input_ids)
700
603
  if x.ndim == 2:
701
604
  x = x[np.newaxis, ...]
702
605
  batch_size, seq_len, _ = x.shape
703
606
  self.cache['embedded_input'] = x
704
607
  self.cache['input_ids'] = None
608
+ mask = None
705
609
  else:
610
+ input_ids = np.asarray(input_ids, dtype=np.int32)
706
611
  if input_ids.ndim == 1:
707
- input_ids = input_ids.reshape(1, -1)
708
- # Standard token-embedding lookup + additive positional encoding.
612
+ input_ids = input_ids[np.newaxis, :]
613
+
709
614
  x = self.token_embedding[input_ids]
710
615
  x = x + self.pos_embedding[:x.shape[1]]
711
616
  batch_size, seq_len = input_ids.shape
712
617
  self.cache['embedded_input'] = None
713
618
  self.cache['input_ids'] = input_ids
619
+ mask = self.padding_mask_utility(input_ids, pad_token_id) # (B,1,1,T)
714
620
 
621
+ self.cache['mask'] = mask if not embedded else None
715
622
  self.cache['seq_len'] = seq_len
716
623
  self.cache['batch_size'] = batch_size
717
624
  self.cache['x_token'] = x
718
625
  self.cache['x_pos'] = x
719
626
 
720
- # Multi-head attention with residual
721
- attn_out, attn_weights = self.multi_head_attention(x)
627
+ # Multi-head attention with residual
628
+ AME = self.AME_Encoder(x)
629
+ alpha = 1.0 / (1.0 + np.exp(-AME))
630
+ attn_out, attn_weights = self.multi_head_attention(x, mask=mask, alpha=alpha)
631
+
632
+ current_alpha = self.cache.get('alpha', 0.0)
633
+
634
+ attn_out, attn_drop_mask = self.dropout(attn_out, rate=self.attn_dropout_rate, training=training, alpha=current_alpha)
635
+ self.cache['attn_drop_mask'] = attn_drop_mask
722
636
 
723
- # Exponential moving average update for alpha using the attention quality score.
724
- # Keeps alpha stable: 95 % of the old value + 5 % of the current quality signal.
725
- # This means alpha slowly tracks how well-focused the current attention is,
726
- # rather than jumping abruptly each step.
727
- self.alpha = 0.95 * self.alpha + 0.05 * self.attention_quality_computing(attn_weights)
637
+ alpha = 0.95 * alpha + 0.05 * self.attention_quality_computing(attn_weights, mask=mask)
638
+
639
+ self.alpha = alpha
640
+ self.cache['alpha'] = alpha # store in cache
728
641
 
729
- # Pre-norm residual: cache the sum before normalising so backward can recover it.
730
642
  self.cache['x_ln1_input'] = x + attn_out
731
643
  x = self.layer_norm(x + attn_out, self.ln1_scale, self.ln1_shift)
732
644
  self.cache['x_after_ln1'] = x
733
645
 
734
- # Feed-forward network: expand to 4×d_model, apply ReLU, project back.
646
+ # Feed-forward with residual
735
647
  self.cache['ffn_input'] = x
736
648
  ffn_pre = np.matmul(x, self.ffn1)
737
649
  self.cache['ffn_pre'] = ffn_pre
738
650
 
739
651
  ffn_act = np.maximum(0, ffn_pre) # ReLU
652
+
653
+ ffn_act, ffn_drop_mask = self.dropout(ffn_act, rate=self.ffn_dropout_rate, training=training, alpha=current_alpha)
654
+
740
655
  self.cache['ffn_act'] = ffn_act
741
-
656
+ self.cache['ffn_drop_mask'] = ffn_drop_mask
657
+
742
658
  ffn_out = np.matmul(ffn_act, self.ffn2)
743
659
  self.cache['ffn_out'] = ffn_out
744
660
 
@@ -746,9 +662,15 @@ class Transformer:
746
662
  x = self.layer_norm(x + ffn_out, self.ln2_scale, self.ln2_shift)
747
663
  self.cache['x_after_ln2'] = x
748
664
 
749
- # Mean-pool across the sequence dimension to get a fixed-size representation
750
- # regardless of input length. Shape: (batch, d_model).
751
- x_pooled = np.mean(x, axis=1) # (batch, d_model)
665
+ if mask is not None:
666
+ # Reshape mask to (B, T, 1) for broadcasting against (B, T, D)
667
+ token_mask = mask[:, 0, 0, :, np.newaxis] # (B, T, 1)
668
+ x_masked = x * token_mask # zero out padding
669
+ lengths = token_mask.sum(axis=1) # (B, 1) valid token counts
670
+ x_pooled = x_masked.sum(axis=1) / (lengths + 1e-6) # (B, D)
671
+ else:
672
+ x_pooled = np.mean(x, axis=1)
673
+
752
674
  self.cache['x_pooled'] = x_pooled
753
675
 
754
676
  # Output projection
@@ -762,18 +684,6 @@ class Transformer:
762
684
 
763
685
 
764
686
  def layer_norm_backward(self, d_out, x, scale, shift):
765
- # Backpropagates through layer normalisation.
766
- # LayerNorm forward: y = scale * (x - mean) / sqrt(var + eps) + shift
767
- #
768
- # Gradient derivation (standard result, often omitted from textbooks):
769
- # dx_hat = d_out * scale — upstream grad scaled by learned gamma
770
- # dvar = sum(dx_hat * (x-mean) * -0.5 * std^{-3}) — chain rule on variance
771
- # dmean = sum(dx_hat * -1/std) + dvar * mean(-2*(x-mean))
772
- # — two paths: direct via x_hat, indirect via variance
773
- # dx = dx_hat/std + dvar * 2*(x-mean)/N + dmean/N
774
- # — three additive terms, each from a different path through the graph
775
- #
776
- # N is the feature dimension (last axis) — normalization is per-sample, per-position.
777
687
  eps = 1e-5
778
688
  mean = np.mean(x, axis=-1, keepdims=True)
779
689
  var = np.var(x, axis=-1, keepdims=True)
@@ -784,10 +694,7 @@ class Transformer:
784
694
  N = x.shape[-1]
785
695
  dx_hat = d_out * scale
786
696
  dvar = np.sum(dx_hat * (x - mean) * -0.5 * std**-3, axis=-1, keepdims=True)
787
- dmean = (
788
- np.sum(dx_hat * -1/std, axis=-1, keepdims=True)
789
- + dvar * np.mean(-2*(x-mean), axis=-1, keepdims=True)
790
- )
697
+ dmean = np.sum(dx_hat * (-1.0 / std), axis=-1, keepdims=True)
791
698
 
792
699
  dx = (
793
700
  dx_hat / std
@@ -798,10 +705,12 @@ class Transformer:
798
705
  return dx
799
706
 
800
707
  # The fixed-attention backward pass leaves the transformer's Q, K, V projections un-updated, giving much more stable attention at the cost of flexibility.
801
- def fixed_attention_backward(self, d_logits, lr=0.001):
708
+ def fixed_attention_backward(self, d_logits, lr=0.01, max_norm=1.0):
802
709
 
803
710
  # Gradient for output layer
804
711
  d_output = d_logits
712
+ alpha = self.cache.get('alpha', 1.0)
713
+
805
714
  d_Wo = np.dot(self.cache['x_pooled'].T, d_output)
806
715
  d_bo = np.sum(d_output, axis=0, keepdims=True)
807
716
 
@@ -823,8 +732,12 @@ class Transformer:
823
732
 
824
733
  # Gradient for FFN1 through ReLU
825
734
  d_ffn_act = np.matmul(d_ffn, self.ffn2.T)
826
- d_ffn_pre = d_ffn_act
827
- d_ffn_pre[self.cache['ffn_pre'] <= 0] = 0
735
+ ffn_drop_mask = self.cache.get('ffn_drop_mask')
736
+ if ffn_drop_mask is not None:
737
+ d_ffn_act = d_ffn_act * ffn_drop_mask / (1.0 - self.ffn_dropout_rate)
738
+
739
+ d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0) # ReLU backward unchanged
740
+
828
741
  d_prev = np.matmul(d_ffn_pre, self.ffn1.T)
829
742
  d_ffn1 = np.sum(np.matmul(self.cache['ffn_input'].transpose(0, 2, 1), d_ffn_pre), axis=0)
830
743
 
@@ -839,23 +752,36 @@ class Transformer:
839
752
  d_attn = dx
840
753
 
841
754
  # Gradient for attention output projection
842
- d_Wo_attn = np.sum(np.matmul(self.cache['attn_concat'].transpose(0, 2, 1), d_attn), axis=0)
843
-
755
+ attn_drop_mask = self.cache.get('attn_drop_mask')
756
+ if attn_drop_mask is not None:
757
+ d_attn = d_attn * attn_drop_mask / (1.0 - self.attn_dropout_rate)
758
+
759
+ d_Wo_attn = np.sum(np.matmul(self.cache['attn_concat'].transpose(0,2,1), d_attn), axis=0)
760
+
761
+ grads = {
762
+ 'output': d_Wo,
763
+ 'ffn2': d_ffn2,
764
+ 'ffn1': d_ffn1,
765
+ 'W_o': d_Wo_attn,
766
+ }
767
+
768
+ grads, norm = self.clip_gradients(grads, max_norm)
769
+
844
770
  # Update weights
845
- self.output -= lr * d_Wo
846
- self.output_bias -= lr * d_bo.squeeze()
847
- self.ffn2 -= lr * d_ffn2
848
- self.ffn1 -= lr * d_ffn1
849
- self.W_o -= lr * d_Wo_attn
850
-
851
-
771
+ self.output = self.apply_update(self.output, grads['output'], lr)
772
+ self.ffn2 = self.apply_update(self.ffn2, grads['ffn2'], lr)
773
+ self.ffn1 = self.apply_update(self.ffn1, grads['ffn1'], lr)
774
+ self.W_o = self.apply_update(self.W_o, grads['W_o'], lr)
775
+ # output_bias intentionally excluded — biases don't get weight decay
776
+
852
777
  return d_x
853
778
 
854
779
 
855
- def dynamic_backward(self, d_logits, lr=0.001):
856
-
780
+ def dynamic_backward(self, d_logits, lr=0.01, max_norm=1.0):
857
781
  # Gradient for output layer
858
782
  d_output = d_logits
783
+ alpha = self.cache.get('alpha', 1.0)
784
+
859
785
  d_Wo = np.dot(self.cache['x_pooled'].T, d_output)
860
786
  d_bo = np.sum(d_output, axis=0)
861
787
 
@@ -863,8 +789,14 @@ class Transformer:
863
789
  d_pooled = np.dot(d_output, self.output.T)
864
790
 
865
791
  # Expand pooled gradient to all positions
866
- d_x = np.repeat(d_pooled[:, np.newaxis, :] / self.cache['seq_len'], self.cache['seq_len'], axis=1)
867
-
792
+ mask = self.cache['mask'] # (B, 1, 1, T)
793
+ if mask is not None:
794
+ token_mask = mask[:, 0, 0, :, np.newaxis] # (B, T, 1)
795
+ lengths = token_mask.sum(axis=1, keepdims=True) # (B, 1, 1)
796
+ d_x = (d_pooled[:, np.newaxis, :] / (lengths + 1e-6)) * token_mask
797
+ else:
798
+ d_x = np.repeat(d_pooled[:, np.newaxis, :] / self.cache['seq_len'], self.cache['seq_len'], axis=1)
799
+
868
800
  # Layer norm 2 gradient
869
801
  d_x = self.layer_norm_backward(d_x, self.cache['x_ln2_input'],
870
802
  self.ln2_scale, self.ln2_shift)
@@ -877,8 +809,12 @@ class Transformer:
877
809
 
878
810
  # Gradient for FFN1 through ReLU
879
811
  d_ffn_act = np.matmul(d_ffn, self.ffn2.T)
812
+ ffn_drop_mask = self.cache.get('ffn_drop_mask')
813
+ if ffn_drop_mask is not None:
814
+ d_ffn_act = d_ffn_act * ffn_drop_mask / (1.0 - self.ffn_dropout_rate)
815
+
816
+ d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0) # ReLU backward unchanged
880
817
 
881
- d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0)
882
818
  d_prev = np.matmul(d_ffn_pre, self.ffn1.T)
883
819
  d_ffn1 = np.sum(np.matmul(self.cache['ffn_input'].transpose(0, 2, 1), d_ffn_pre), axis=0)
884
820
 
@@ -890,6 +826,10 @@ class Transformer:
890
826
  dx = d_prev + d_residual
891
827
 
892
828
  # Gradient for attention output projection
829
+ attn_drop_mask = self.cache.get('attn_drop_mask')
830
+ if attn_drop_mask is not None:
831
+ d_attn = d_attn * attn_drop_mask / (1.0 - self.attn_dropout_rate)
832
+
893
833
  d_Wo_attn = np.sum(np.matmul(self.cache['attn_concat'].transpose(0, 2, 1), d_attn), axis=0)
894
834
 
895
835
  d_attn_concat = np.matmul(d_attn, self.W_o.T)
@@ -897,8 +837,6 @@ class Transformer:
897
837
  d_head = self.n_heads
898
838
  d_dim = self.d_model // self.n_heads
899
839
 
900
- # Reshape concatenated head gradients back to per-head form before computing
901
- # QKV gradients. Reverses the transpose+reshape done in multi_head_attention.
902
840
  d_attn_heads = d_attn_concat.reshape(batch, seq_len, d_head, d_dim) .transpose(0, 2, 1, 3)
903
841
 
904
842
  V = self.cache['V']
@@ -906,84 +844,150 @@ class Transformer:
906
844
  Q = self.cache['Q']
907
845
  weight = self.cache['attn_weights']
908
846
 
909
- # Gradient of attention output w.r.t. V: dL/dV = attn_weights^T · dL/d_attn_out
910
847
  d_V = np.matmul(weight.transpose(0, 1, 3, 2), d_attn_heads)
911
- # Gradient w.r.t. pre-softmax attention scores via the product rule on Vᵀ.
912
848
  d_weights = np.matmul(d_attn_heads, V.transpose(0, 1, 3, 2))
913
849
 
914
- # Softmax Jacobian shortcut: d(softmax) = softmax * (d_out - sum(d_out * softmax))
915
- # Scaled by 1/sqrt(d_k) matching the forward-pass scaling.
916
850
  d_scores = weight * (d_weights - np.sum(d_weights * weight, axis=-1, keepdims=True))
917
851
  d_scores /= np.sqrt(Q.shape[-1])
918
852
 
919
- # Back-propagate through Q·Kᵀ to get per-head Q and K gradients.
920
853
  d_Q = np.matmul(d_scores, K)
921
854
  d_K = np.matmul(d_scores.transpose(0, 1, 3, 2), Q)
922
855
 
923
856
  x = self.cache['x_attn_input']
924
857
 
925
- # Project head-space gradients back to projection-weight gradients using einsum.
926
- # 'bsd, bhsm->hdm': accumulate over batch and sequence dimensions.
927
858
  d_W_q = np.einsum('bsd, bhsm->hdm', x, d_Q)
928
859
  d_W_k = np.einsum('bsd, bhsm->hdm', x, d_K)
929
860
  d_W_v = np.einsum('bsd, bhsm->hdm', x, d_V)
930
861
 
931
- # Project head-space gradients back to input space (for the layer below).
932
862
  d_x_q = np.einsum('bhsm, hdm->bsd', d_Q, self.W_q)
933
863
  d_x_k = np.einsum('bhsm, hdm->bsd', d_K, self.W_k)
934
864
  d_x_v = np.einsum('bhsm, hdm->bsd', d_V, self.W_v)
935
865
 
936
- # Sum Q, K, V contributions — each head projection touches the same input x.
937
866
  d_x_attn_input = d_x_q + d_x_k + d_x_v
938
867
  d_x_total = d_x_attn_input + d_residual
939
868
 
940
869
  input_ids = self.cache.get('input_ids')
941
870
 
942
871
  if input_ids is not None:
943
- for b in range(input_ids.shape[0]):
944
- for t in range(input_ids.shape[1]):
945
- idx = int(input_ids[b, t])
946
- self.token_embedding[idx] -= lr * d_x_total[b, t] / self.cache['seq_len']
872
+ flat_ids = input_ids.flatten() # (B*T,)
873
+ flat_grads = d_x_total.reshape(-1, self.d_model) / self.cache['seq_len']
874
+ np.add.at(self.token_embedding, flat_ids, -lr * flat_grads)
947
875
 
948
876
  # Update weights
949
- self.output -= lr * d_Wo
950
- self.output_bias -= lr * d_bo.squeeze()
951
- self.ffn2 -= lr * d_ffn2
952
- self.ffn1 -= lr * d_ffn1
953
- self.W_o -= lr * d_Wo_attn
877
+ grads = {
878
+ 'output': d_Wo,
879
+ 'ffn2': d_ffn2,
880
+ 'ffn1': d_ffn1,
881
+ 'W_o': d_Wo_attn,
882
+ 'W_q': alpha * d_W_q, # already alpha-scaled, clip the combined thing
883
+ 'W_k': alpha * d_W_k,
884
+ 'W_v': alpha * d_W_v,
885
+ }
886
+ grads, norm = self.clip_gradients(grads, max_norm)
954
887
 
955
- alpha = self.alpha
888
+ self.output = self.apply_update(self.output, grads['output'], lr)
889
+ self.ffn2 = self.apply_update(self.ffn2, grads['ffn2'], lr)
890
+ self.ffn1 = self.apply_update(self.ffn1, grads['ffn1'], lr)
891
+ self.W_o = self.apply_update(self.W_o, grads['W_o'], lr)
892
+ self.W_q = self.apply_update(self.W_q, grads['W_q'], lr)
893
+ self.W_k = self.apply_update(self.W_k, grads['W_k'], lr)
894
+ self.W_v = self.apply_update(self.W_v, grads['W_v'], lr)
956
895
 
957
- self.W_q -= lr * alpha * d_W_q
958
- self.W_k -= lr * alpha * d_W_k
959
- self.W_v -= lr * alpha * d_W_v
896
+ if input_ids is not None:
897
+ emb_norm = np.linalg.norm(d_x_total)
898
+ emb_coef = min(1.0, max_norm / (emb_norm + 1e-6))
960
899
 
961
- self.pos_embedding[:seq_len] -= lr * d_x_total.mean(axis=0)
900
+ flat_ids = input_ids.flatten() # (B*T,)
901
+ flat_grads = d_x_total.reshape(-1, self.d_model) / self.cache['seq_len'] # (B*T, D)
962
902
 
963
-
964
- return d_x_total
965
-
903
+ np.add.at(self.token_embedding, flat_ids, -lr * emb_coef * flat_grads)
904
+ self.pos_embedding[:seq_len] -= lr * emb_coef * d_x_total.mean(axis=0)
905
+ else:
906
+ self.pos_embedding[:seq_len] -= lr * d_x_total.mean(axis=0)
907
+ norm = d_x_total
908
+
909
+ return norm
966
910
 
967
- def train_step(self, input_ids, epoch, y_true, lr=0.001, mode=None, embedded=False):
968
- probs, attn_weights = self.forward(input_ids, embedded=embedded)
911
def smoothing_labels_utility(self, y_true, smoothing=0.1):
    """Return label-smoothed targets.

    Each target is mapped to ``y * (1 - smoothing) + smoothing / num_classes``.

    Parameters:
        y_true: one-hot targets, normally ``(B, num_classes)``.  A single
            1-D one-hot vector and plain Python lists are also accepted
            (previously these raised on ``y_true.shape[1]``).
        smoothing: probability mass spread uniformly over the classes.

    Returns:
        Float array with the same shape as ``y_true``.
    """
    targets = np.asarray(y_true, dtype=float)
    # Axis 1 holds the classes for batched input; a bare vector is its own class axis.
    num_classes = targets.shape[1] if targets.ndim >= 2 else targets.shape[0]
    if num_classes == 0:
        # Nothing to smooth; avoids smoothing / 0.
        return targets
    return targets * (1.0 - smoothing) + smoothing / num_classes
915
+
916
def learning_rate_warm_up(self, epoch, epochs, lr_base, schedule='cosine_warmup', warmup_frac=0.1):
    """Return the learning rate to use for ``epoch`` (0-based) out of ``epochs``.

    Schedules:
        'cosine_warmup': linear ramp to ``lr_base`` over the first
            ``int(epochs * warmup_frac)`` epochs, then cosine decay to 0.
        'step': halve ``lr_base`` every 30% of training.
        'constant' or any unrecognised name: ``lr_base`` unchanged.

    Short runs are handled explicitly: the step interval and the cosine
    decay span are both floored at one epoch, so ``epochs < 4`` with
    'step' no longer raises ZeroDivisionError.
    """
    warmup_epochs = int(epochs * warmup_frac)

    if schedule == 'cosine_warmup':
        if epoch < warmup_epochs:
            # Linear warmup; warmup_epochs >= 1 here because epoch >= 0.
            return lr_base * (epoch + 1) / warmup_epochs
        # Cosine decay after warmup.  Floor the span at 1 and cap progress at 1
        # so an out-of-range epoch cannot divide by zero or raise the rate again.
        decay_epochs = max(1, epochs - warmup_epochs)
        progress = min(1.0, (epoch - warmup_epochs) / decay_epochs)
        return lr_base * 0.5 * (1 + np.cos(np.pi * progress))

    if schedule == 'step':
        # int(epochs * 0.3) is 0 for epochs < 4; floor at 1 to keep // defined.
        step = max(1, int(epochs * 0.3))
        return lr_base * (0.5 ** (epoch // step))

    # 'constant' and unknown schedule names fall through to the base rate.
    return lr_base
937
+
938
def padding_mask_utility(self, input_ids, pad_token_id=0):
    """Build a float32 padding mask from token ids.

    Parameters:
        input_ids: ``(B, T)`` token ids; a single 1-D sequence or a nested
            list is also accepted and treated as a batch.
        pad_token_id: id that marks padding positions.

    Returns:
        Array of shape ``(B, 1, 1, T)`` holding 1.0 at real tokens and 0.0
        at padding.  The two singleton axes let it broadcast against
        ``(B, heads, T_q, T_k)``.
    """
    ids = np.asarray(input_ids)
    if ids.ndim == 1:
        # Promote a lone sequence to a batch of one so the 4-axis indexing below is valid.
        ids = ids[np.newaxis, :]
    keep = (ids != pad_token_id).astype(np.float32)
    return keep[:, np.newaxis, np.newaxis, :]
943
+
944
def clip_gradients(self, grads: dict, max_norm: float = 1.0) -> tuple:
    """Clip a set of gradients by their global L2 norm.

    Parameters:
        grads: mapping of parameter name to gradient array.
        max_norm: largest allowed global norm.

    Returns:
        ``(grads, total_norm)``.  ``grads`` is the input mapping when no
        clipping is needed, otherwise a new dict with every gradient scaled
        by the same factor.  ``total_norm`` is the norm before clipping and
        is returned for monitoring.

    If the global norm is NaN or infinite, every gradient is replaced by
    zeros so that one bad batch cannot write non-finite values into the
    weights; the non-finite norm is still returned so the caller can see it.
    """
    # Global norm across all gradient tensors.
    total_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads.values()))

    if not np.isfinite(total_norm):
        # Scaling cannot repair NaN/inf (inf * 0 is NaN); drop the step instead.
        return {name: np.zeros_like(g) for name, g in grads.items()}, total_norm

    clip_coef = max_norm / (total_norm + 1e-6)

    # Scale down, never up.
    if clip_coef < 1.0:
        grads = {name: g * clip_coef for name, g in grads.items()}

    return grads, total_norm
957
+
958
+
959
+ def train_step(self, input_ids, epoch, y_true, lr=0.01, mode=None, embedded=False, max_norm=1.0, pad_token_id=0):
960
+ if not embedded and input_ids.ndim == 1:
961
+ input_ids = input_ids[np.newaxis, :] # (1, T), single sample
962
+ if y_true.ndim == 1:
963
+ y_true = y_true[np.newaxis, :]
964
+
965
+ probs, attn_weights = self.forward(input_ids, embedded=embedded, pad_token_id=pad_token_id, training=True, attn_dropout=self.attn_dropout_rate, ffn_dropout=self.ffn_dropout_rate)
966
+ y_true_smooth = self.smoothing_labels_utility(y_true, smoothing=0.1)
967
+
968
+ if y_true_smooth.shape[0] and y_true_smooth.shape[1] != probs.shape[0] and probs.shape[1]:
969
+ if y_true_smooth.shape[1] > probs.shape[1]:
970
+ y_true_smooth = y_true_smooth[:, :probs.shape[1]]
971
+ else:
972
+ y_true_smooth = np.pad(y_true_smooth, ((0, 0), (0, probs.shape[1] - y_true_smooth.shape[1])), mode='constant')
973
+
971
974
  if y_true.shape[0] and y_true.shape[1] != probs.shape[0] and probs.shape[1]:
972
975
  if y_true.shape[1] > probs.shape[1]:
973
- y_true = y_true[:, :probs.shape[1]]
976
+ y_true = y_true[:, :probs.shape[1]]
974
977
  else:
975
- y_true = np.pad(y_true, ((0, 0), (0, probs.shape[1] - y_true.shape[1])), mode='constant')
976
-
977
- loss = -np.mean(np.sum(y_true * np.log(probs + 1e-8), axis=1))
978
+ y_true = np.pad(y_true, ((0, 0), (0, probs.shape[1] - y_true.shape[1])), mode='constant')
979
+
980
+ # Loss (cross-entropy)
981
+ loss = -np.mean(np.sum(y_true_smooth * np.log(probs + 1e-8), axis=1))
978
982
 
979
983
  # Gradient of loss w.r.t. logits
980
- d_logits = (probs - y_true) / y_true.shape[0]
984
+ d_logits = (probs - y_true_smooth) / y_true_smooth.shape[0]
981
985
 
982
986
  # Backward pass
983
987
  if mode == 'fixed_backward':
984
- self.fixed_attention_backward(d_logits, lr)
988
+ self.fixed_attention_backward(d_logits, lr, max_norm=max_norm)
985
989
  else:
986
- self.dynamic_backward(d_logits, lr)
990
+ self.dynamic_backward(d_logits, lr, max_norm=max_norm)
987
991
 
988
992
  # Accuracy
989
993
  preds = np.argmax(probs, axis=1)
@@ -992,8 +996,16 @@ class Transformer:
992
996
 
993
997
  return loss, acc
994
998
 
999
def batch_padding_utility(self, sequences, pad_token_id=0):
    """Right-pad variable-length token sequences into one ``(B, T)`` int32 array.

    Parameters:
        sequences: iterable of token-id sequences (arrays or lists) of
            varying length.  Each item is flattened first, so a ``(1, T)``
            array counts as a sequence of length ``T``.
        pad_token_id: value written into the unused tail of each row.

    Returns:
        int32 array of shape ``(len(sequences), max_len)``; an empty batch
        yields a ``(0, 0)`` array instead of raising from ``max()``.
    """
    rows = [np.asarray(seq).reshape(-1) for seq in sequences]
    max_len = max((row.shape[0] for row in rows), default=0)
    padded = np.full((len(rows), max_len), pad_token_id, dtype=np.int32)
    for i, row in enumerate(rows):
        padded[i, :row.shape[0]] = row
    return padded  # (B, T)
995
1006
 
996
- def train(self, input_ids_list, y_true_list, epochs=100, mode=None, lr=0.001, embedded=False):
1007
+
1008
+ def train(self, input_ids_list, y_true_list, epochs=100, mode=None, lr=0.01, embedded=False, max_norm=1.0, schedule='cosine_warmup', pad_token_id=0, batch_size=None):
997
1009
  losses = []
998
1010
  accs = []
999
1011
  d_model = self.d_model
@@ -1004,23 +1016,48 @@ class Transformer:
1004
1016
  self.shaping = GeometricWeightShaping(d_model, d_model)
1005
1017
  shaping_input = input_ids_list
1006
1018
  if embedded:
1007
- shaping_input = np.vstack([x.reshape(1, -1) if x.ndim > 2 else x for x in input_ids_list])
1019
+ shaping_input = np.vstack([
1020
+ x.reshape(-1, x.shape[-1]) if x.ndim >= 2 else x
1021
+ for x in input_ids_list
1022
+ ])
1023
+ else:
1024
+ shaping_input = input_ids_list
1025
+
1008
1026
  self.W_o = self.shaping.weight_shaping(shaping_input)
1009
1027
  self.encoded = True
1028
+
1029
+ # Pre-pad all sequences once before training starts
1030
+ # only when batch_size is set and not in embedded mode
1031
+ if batch_size is not None and not embedded:
1032
+ input_ids_list = [
1033
+ self.batch_padding_utility(input_ids_list[i:i+batch_size], pad_token_id)
1034
+ for i in range(0, len(input_ids_list), batch_size)
1035
+ ]
1036
+ y_true_list = [
1037
+ np.stack(y_true_list[i:i+batch_size])
1038
+ for i in range(0, len(y_true_list), batch_size)
1039
+ ]
1040
+ # input_ids_list is now a list of (B, T) arrays
1041
+ # y_true_list is now a list of (B, num_classes) arrays
1042
+
1043
+
1044
+ print(f"[==] Starting comprehensive training for {epochs} epochs with mode: {mode}, learning rate: {lr}, schedule: {schedule}")
1010
1045
 
1011
1046
  for epoch in range(epochs):
1012
1047
  epoch_losses = []
1013
1048
  epoch_accs = []
1049
+ current_lr = self.learning_rate_warm_up(epoch, epochs, lr, schedule)
1014
1050
  self.alpha = min(1.0, epoch / 100)
1015
1051
 
1016
1052
  for input_ids, y_true in zip(input_ids_list, y_true_list):
1017
1053
 
1018
1054
  if input_ids.ndim == 1:
1019
- input_ids = input_ids.reshape(1, -1)
1055
+ input_ids = input_ids[np.newaxis, :]
1020
1056
  if y_true.ndim == 1:
1021
- y_true = y_true.reshape(1, -1)
1057
+ y_true = y_true[np.newaxis, :]
1022
1058
 
1023
- loss, acc = self.train_step(input_ids, epoch, y_true, lr, mode, embedded=embedded)
1059
+ loss, acc = self.train_step(input_ids, epoch, y_true, current_lr, mode,
1060
+ embedded=embedded, max_norm=max_norm, pad_token_id=pad_token_id)
1024
1061
  epoch_losses.append(loss)
1025
1062
  epoch_accs.append(acc)
1026
1063
 
@@ -1031,7 +1068,7 @@ class Transformer:
1031
1068
 
1032
1069
  if epoch % 10 == 0:
1033
1070
  print(f"[=] Epoch {epoch} | loss: {avg_loss:.4f} | Acc: {avg_acc:.2%}")
1034
-
1071
+
1035
1072
  return losses, accs
1036
1073
 
1037
1074
 
@@ -1039,7 +1076,7 @@ class Transformer:
1039
1076
  if not embedded and input_ids.ndim == 1:
1040
1077
  input_ids = input_ids.reshape(1, -1)
1041
1078
 
1042
- probs, attn_weights = self.forward(input_ids, embedded=embedded)
1079
+ probs, attn_weights = self.forward(input_ids, embedded=embedded, training=False, attn_dropout=0.0, ffn_dropout=0.0)
1043
1080
  preds = np.argmax(probs, axis=1)
1044
1081
 
1045
1082
  return preds, probs, attn_weights
@@ -1048,10 +1085,6 @@ class Transformer:
1048
1085
  def AME_Encoder(self, x):
1049
1086
  X = np.asarray(x)
1050
1087
 
1051
- if x.shape[1] == 1:
1052
- x = x.T
1053
- x= x.flatten()
1054
-
1055
1088
  gradient = np.gradient(x, axis=-1)
1056
1089
  grad_energy = np.mean(np.linalg.norm(gradient, axis=-1))
1057
1090
  X_mag = np.mean(np.linalg.norm(X, axis=-1))
@@ -1075,26 +1108,23 @@ class Transformer:
1075
1108
 
1076
1109
  # attention quality computing provides the transformer a robust geometric complexity alignment scalar,
1077
1110
  # this scalar can be used to compute alpha for a much stable forward pass in scarce data environment, allowing it to complement with AWE MLP below.
1078
- def attention_quality_computing(self, attn_weights):
1079
- # Produces a scalar in [0, 1] that summarises how "high quality" the current
1080
- # attention distribution is. This value is used to update self.alpha via EMA
1081
- # in forward(), which in turn controls the frozen-vs-learned projection blend.
1082
- #
1083
- # Four complementary signals are combined:
1084
- # norm_entropy — 1 − normalised entropy; near 1 when attention is focused
1085
- # on a small number of tokens (confident), near 0 when flat.
1086
- # avg_max — mean of per-head max weights; a direct focus indicator.
1087
- # norm_var — clipped variance scaled by seq_len; captures head diversity.
1088
- # qualified — geometric factor: (1 - AMR) * anisotropy;
1089
- # high when data geometry is complex but AMR (model rate) is low,
1090
- # effectively weighting quality higher when the model is in an
1091
- # exploratory (low-AMR) regime over anisotropic data.
1092
- #
1093
- # Final score = qualified*(norm_entropy + avg_max) + anisotropy*norm_var
1094
- # Clipped to [0, 1] and returned as dynamic_alpha.
1111
+ def attention_quality_computing(self, attn_weights, mask=None):
1095
1112
  eps = 1e-5
1096
1113
  eps = 1e-5
1097
1114
  batch, heads, seq_len, _ = attn_weights.shape
1115
+
1116
+ if mask is not None:
1117
+ # mask: (B, 1, 1, T) → expand to (B, heads, T, T)
1118
+ mask_expanded = np.broadcast_to(
1119
+ mask, (batch, heads, seq_len, seq_len)
1120
+ )
1121
+ # Zero out padding positions before computing stats
1122
+ attn_weights = attn_weights * mask_expanded
1123
+
1124
+ # Renormalise so rows still sum to 1 over valid tokens only
1125
+ row_sums = attn_weights.sum(axis=-1, keepdims=True) + eps
1126
+ attn_weights = attn_weights / row_sums
1127
+
1098
1128
  AME = self.AME_Encoder(attn_weights)
1099
1129
  anisotropy = self.anisotropy_measurement(attn_weights)
1100
1130
 
@@ -1109,8 +1139,6 @@ class Transformer:
1109
1139
  norm_var = np.clip(var_attn * seq_len, 0, 1)
1110
1140
 
1111
1141
  AMR = 1.0 / (1.0 + np.exp(-AME)) # abstract modelling rate
1112
- # qualified is high when AMR is low (geometry complex, model still learning)
1113
- # and anisotropy is high (strongly directional gradients in the attention map).
1114
1142
  qualified = (1.0 - AMR) + eps * anisotropy
1115
1143
 
1116
1144
  quality_score = qualified * norm_entropy + qualified * avg_max + anisotropy * norm_var
@@ -1138,35 +1166,17 @@ class Dense:
1138
1166
  self.activation_derivative = None
1139
1167
 
1140
1168
  def multi_modal_linear_transformation(self, x):
1141
- # Standard linear layer z = xW + b, but with a multi-level shape-mismatch
1142
- # recovery cascade. This is needed because the GWS weight matrix W is shaped
1143
- # at construction time from the training data, and at inference time the input
1144
- # may have a different number of features (e.g. after vocabulary drift or
1145
- # when calling the model with embedded TF-IDF vectors vs raw token IDs).
1146
- #
1147
- # Recovery hierarchy (outermost try wins):
1148
- # Level 1 (primary): normal dot(x, W) + b.
1149
- # Level 2 (first fallback): column-slice W to match x.shape[1], then add
1150
- # a matching slice of b.
1151
- # Level 3 (deep fallback): slice both x and W along whichever dimension fits,
1152
- # then add a b slice. Covers edge cases where both x and W need trimming.
1153
- #
1154
- # The guard at the top reshapes W in-place if shapes are obviously mismatched
1155
- # (x.shape[1] != W.shape[0]), preferring slicing over re-initialisation.
1156
1169
  if len(x.shape) > 1 and x.shape[1] != self.W.shape[0]:
1157
1170
  V1, V2 = x.shape[0], x.shape[1]
1158
1171
  try:
1159
- # Trim W's rows to match the feature dimension of x.
1160
1172
  self.W = self.W[:V2, :]
1161
1173
  except:
1162
- # If trimming fails (W is already smaller), re-initialise with correct dims.
1163
1174
  self.special_weight = GeometricWeightShaping(V2, V1)
1164
1175
  self.W = self.special_weight.weight_shaping(x)
1165
1176
  try:
1166
1177
  try:
1167
1178
  z = np.dot(x, self.W) + self.b
1168
1179
  except:
1169
- # W has more rows than x has columns; trim and add matching bias slice.
1170
1180
  subnet_W = self.W[:x.shape[1], :x.shape[0]]
1171
1181
 
1172
1182
  sub_z = np.dot(x, subnet_W)
@@ -1179,7 +1189,6 @@ class Dense:
1179
1189
  subnet_W = self.W[:x.shape[1]:, :x.shape[0]]
1180
1190
  sub_z = np.dot(x, subnet_W)
1181
1191
  except:
1182
- # Last resort: trim x to fit W or vice versa, whichever succeeds first.
1183
1192
  weight = self.W
1184
1193
 
1185
1194
  try:
@@ -1340,19 +1349,6 @@ class MLP:
1340
1349
 
1341
1350
 
1342
1351
  def train(self, X, y, epochs=1000, lr=0.01, verbose=True):
1343
- # Decide whether to use the "focused" sub-network (feed_layers) or the
1344
- # standard full network (layers) for this training run.
1345
- #
1346
- # focused_fit_condition is True when ALL three hold:
1347
- # 1. feed_layers is non-empty — a focused sub-network exists
1348
- # 2. anisotropy > 0.25 — data has sufficient directional variation
1349
- # (flat/isotropic data doesn't benefit from focus)
1350
- # 3. AME > 0.25 — combined magnitude × gradient energy is above
1351
- # a minimum threshold (data is complex enough)
1352
- #
1353
- # When True, only feed_layers are updated via focused_forward/focused_backward,
1354
- # letting the model concentrate its learning capacity on high-complexity data
1355
- # without disrupting the full network's previously learned representations.
1356
1352
  focused_fit_condition = len(self.feed_layers) > 0 and self.anisotropy_measurement(X) > 0.25 and self.AME_Encoder(X) > 0.25
1357
1353
  print(f'[+] Focused fit condition: {focused_fit_condition} || Anisotropy: {self.anisotropy_measurement(X):.4f} || AME: {self.AME_Encoder(X):.4f}')
1358
1354
  for epoch in range(epochs):
@@ -1395,19 +1391,6 @@ class WeightedEnsemblePredictor:
1395
1391
 
1396
1392
 
1397
1393
  def attention_memory_gate(self, probs, x):
1398
- # Fast-path cache lookup: checks whether a previously seen input (stored under
1399
- # prefix 'TA' in self.memory) is geometrically similar to the current input x.
1400
- # Similarity is measured by cosine similarity ≥ 0.85 (tight threshold to avoid
1401
- # false hits on unrelated inputs that happen to share some features).
1402
- #
1403
- # If a match is found, the cached attention outputs (texts, x2, x3, x4) are
1404
- # returned directly, skipping a full forward pass through the transformer.
1405
- # This also acts as a continual memory mechanism: the pipeline "remembers"
1406
- # past attention patterns and reuses them for similar future inputs.
1407
- #
1408
- # Cache miss path:
1409
- # - If self_attn_weights was set by a prior call, return it as a warm fallback.
1410
- # - Otherwise return (None, None, None, None) signalling a full inference needed.
1411
1394
  memory = self.memory
1412
1395
  cache_attn_memory = [key for key, (_, inp, _, _, _) in memory.items() if key.startswith('TA') and self.pipeline.cosine_similarity(x, inp) >= 0.85]
1413
1396
 
@@ -1589,6 +1572,7 @@ class WeightedEnsemblePredictor:
1589
1572
  self.credibility_summarized_prediction(input_ids, mlp_probs, trans_probs, attn_weights, type='pipeline')
1590
1573
  except Exception as e:
1591
1574
  print(f'[-] Cant get explainability features! : {e}')
1575
+ traceback.print_exc()
1592
1576
  else:
1593
1577
  print('[-] No agreement established, skipping explainability features.')
1594
1578
 
@@ -1623,28 +1607,6 @@ class WeightedEnsemblePredictor:
1623
1607
  return anisotropy
1624
1608
 
1625
1609
  def _dynamic_weighted_ensemble(self, trans_probs, mlp_probs, attn_weights, input_ids):
1626
- # Per-sample dynamic weighting of Transformer and MLP predictions.
1627
- # Unlike the static self.transformer_weight / self.mlp_weight used in
1628
- # calibrate_weights(), this method derives weights on-the-fly from three signals:
1629
- #
1630
- # trans_conf_factor — derived from attention statistics:
1631
- # attn_focus = std of the attention map (0 = flat, high = peaked)
1632
- # attn_growth = sigmoid(attn_focus) — bounded confidence signal
1633
- # attn_limit = (1 - attn_focus + attn_growth) * anisotropy
1634
- # factor = attn_growth + attn_limit * attn_focus
1635
- # Intuitively: the transformer earns more weight when its attention
1636
- # is peaked (focused) AND the distribution is geometrically varied.
1637
- #
1638
- # mlp_conf_factor — derived from MLP output entropy:
1639
- # lower entropy → sharper distribution → higher confidence → higher weight.
1640
- # formula: 1 / (1 + entropy)
1641
- #
1642
- # agreement — 1.0 if both models predict the same class, else 0.3.
1643
- # Acts as a confidence multiplier: agreement boosts both weights
1644
- # proportionally, disagreement dampens the overall contribution.
1645
- #
1646
- # Both factors are multiplied by (1 + agreement) / 2, then normalised so they sum to 1.
1647
- # The final ensemble for sample i is: trans_weight * trans_row + mlp_weight * mlp_row.
1648
1610
  batch_size = trans_probs.shape[0]
1649
1611
  try:
1650
1612
  n_trans_classes = trans_probs.shape[1]
@@ -1653,8 +1615,6 @@ class WeightedEnsemblePredictor:
1653
1615
  n_trans_classes = trans_probs.shape[-1]
1654
1616
  n_mlp_classes = mlp_probs.shape[-1]
1655
1617
 
1656
- # Align probability vectors to the same class count (the larger of the two).
1657
- # Necessary when the transformer and MLP were trained with different label sets.
1658
1618
  n_classes = max(n_trans_classes, n_mlp_classes)
1659
1619
 
1660
1620
  print(f"🔄 Aligning classes: {n_trans_classes} and {n_mlp_classes} → {n_classes}")
@@ -1663,7 +1623,6 @@ class WeightedEnsemblePredictor:
1663
1623
  trans_row = np.zeros(n_classes)
1664
1624
  mlp_row = np.zeros(n_classes)
1665
1625
 
1666
- # Zero-pad shorter probability vectors to n_classes, then re-normalise.
1667
1626
  trans_row[:n_trans_classes] = trans_probs[i]
1668
1627
  mlp_row[:n_mlp_classes] = mlp_probs[i]
1669
1628
 
@@ -1672,23 +1631,19 @@ class WeightedEnsemblePredictor:
1672
1631
 
1673
1632
  trans_pred = np.argmax(trans_probs[i])
1674
1633
  mlp_pred = np.argmax(mlp_probs[i])
1675
- # agreement is a binary multiplier; 1.0 when models agree, 0.3 when they differ.
1676
1634
  agreement = 1.0 if trans_pred == mlp_pred else 0.3
1677
1635
 
1678
1636
  if attn_weights is not None and i < len(attn_weights):
1679
1637
  print('🔄 Sophisticated confidence assembling')
1680
1638
  attn = attn_weights[i]
1681
- # Geometric variation in the attention map itself.
1682
1639
  anisotropy = self.anisotropy_measurement(attn)
1683
1640
 
1684
1641
  attn_focus = np.std(attn) if attn.size > 0 else 0.5
1685
- attn_growth = 1.0 / (1.0 + np.exp(-attn_focus)) # sigmoid of focus
1686
- # attn_limit blends (1 - focus + growth) with anisotropy to bound the factor.
1642
+ attn_growth = 1.0 / (1.0 + np.exp(-attn_focus))
1687
1643
  attn_limit = (1.0 - attn_focus + attn_growth) * anisotropy
1688
1644
 
1689
1645
  trans_conf_factor = attn_growth + attn_limit * attn_focus
1690
1646
  else:
1691
- # Fallback when per-sample attn slice is unavailable: use scalar attn_weights.
1692
1647
  attn_growth = 1.0 / (1.0 + np.exp(-attn_weights))
1693
1648
  anisotropy = self.anisotropy_measurement(attn_weights)
1694
1649
  trans_conf_factor = attn_growth * anisotropy
@@ -1696,7 +1651,6 @@ class WeightedEnsemblePredictor:
1696
1651
  mlp_entropy = -np.sum(mlp_probs[i] * np.log(mlp_probs[i] + 1e-8))
1697
1652
  mlp_conf_factor = 1.0 / (1.0 + mlp_entropy) # Lower entropy = higher confidence
1698
1653
 
1699
- # Scale both factors by the agreement bonus, then normalise.
1700
1654
  trans_weight = trans_conf_factor * (1.0 + agreement) / 2
1701
1655
  mlp_weight = mlp_conf_factor * (1.0 + agreement) / 2
1702
1656
 
@@ -1758,32 +1712,9 @@ class WeightedEnsemblePredictor:
1758
1712
  return ensemble
1759
1713
 
1760
1714
  def _meta_ensemble(self, trans_probs, mlp_probs, attn_weights, X_mlp):
1761
- # Second-level ("stacking") ensemble. Instead of computing weights from raw
1762
- # attention or entropy signals, it builds a meta-feature vector for each sample
1763
- # that summarises both models' outputs and their relationship, then derives
1764
- # sample-specific weights from those features.
1765
- #
1766
- # Meta-features per sample (up to 7 values):
1767
- # [0] max(trans_row) — transformer peak confidence
1768
- # [1] max(mlp_row) — MLP peak confidence
1769
- # [2] std(trans_row) — transformer output spread (uncertainty proxy)
1770
- # [3] std(mlp_row) — MLP output spread
1771
- # [4] 1.0 if both agree, else 0 — inter-model agreement flag
1772
- # [5] std(attn[i]) — attention map spread (if available)
1773
- # [6] max(attn[i]) — peak attention value (if available)
1774
- #
1775
- # Weight derivation:
1776
- # base_weight = 0.5 + 0.3 * agreement → 0.5 (disagree) or 0.8 (agree)
1777
- # Whichever model has higher confidence gets base_weight;
1778
- # the other gets 1 - base_weight.
1779
- #
1780
- # NOTE: there is a scoping bug here — trans_row / mlp_row from the loop above
1781
- # are used outside the loop in the weight application (line ~1582). On the last
1782
- # iteration they hold values for sample batch_size-1, but for earlier iterations
1783
- # the wrong row is applied. Flagged in code review.
1784
1715
  batch_size = trans_probs.shape[0]
1785
1716
  n_classes = trans_probs.shape[1]
1786
- threshold_feature = 0.1 + self.pipeline.confidence_threshold
1717
+ threshold_feature = 0.1 + self.pipeline.confidence_threshold
1787
1718
 
1788
1719
  n_trans_classes = trans_probs.shape[1]
1789
1720
  n_mlp_classes = mlp_probs.shape[1]
@@ -1798,7 +1729,6 @@ class WeightedEnsemblePredictor:
1798
1729
  trans_row[:n_trans_classes] = trans_probs[i]
1799
1730
  mlp_row[:n_mlp_classes] = mlp_probs[i]
1800
1731
 
1801
- # Re-normalise after zero-padding to maintain valid probability distributions.
1802
1732
  trans_row = trans_row / (trans_row.sum() + 1e-8)
1803
1733
  mlp_row = mlp_row / (mlp_row.sum() + 1e-8)
1804
1734
 
@@ -2060,14 +1990,6 @@ class ExplainabilityModule:
2060
1990
  # 3. IMMEDIATE TRAINING (single step with higher Learning Rate)
2061
1991
  anisotropy = self.pipeline.anisotropy_measurement(X)
2062
1992
 
2063
- # Derive a geometry-aware learning rate for the correction step.
2064
- # anisotropy_dist: sigmoid of anisotropy — saturates to 1 for strongly directional data.
2065
- # deviation: inverse of std; near 1 when features are tightly clustered (low spread).
2066
- # AEL (Adaptive Error Level): high when data is variable (low deviation) AND anisotropic.
2067
- # AEL → 1 ⟹ corrective LR = 2/(1+1) = 1.0 (fast correction on complex data)
2068
- # AEL → 0 ⟹ corrective LR = 2/(1+0) = 2.0 (even faster on flat/simple data)
2069
- # This intentionally boosts the correction LR above the normal training LR so
2070
- # a single wrong prediction can be overridden quickly without many epochs.
2071
1993
  anisotropy_dist = 1.0 / (1.0 + np.exp(-anisotropy))
2072
1994
  deviation = 1.0 / (1.0 + np.std(X))
2073
1995
  AEL = (1.0 - deviation) * anisotropy_dist + eps
@@ -2363,26 +2285,6 @@ class ExplainabilityModule:
2363
2285
 
2364
2286
 
2365
2287
  def _get_final_output(self, mlp_pred, mlp_conf, trans_pred, trans_conf, attn_weights):
2366
- # Resolves the final prediction when the two models disagree.
2367
- # When they agree, the higher-confidence model's score is taken directly.
2368
- # When they disagree, an "Abstract Attention Transformation" (AAT) scalar
2369
- # is computed to determine which model to trust more:
2370
- #
2371
- # sliced_anisotropy — directional variation in the first attention slice;
2372
- # high → attention is non-uniform / informative.
2373
- # deviation — 1/(1 + std(attn_weights)); near 1 when attention is tightly
2374
- # concentrated, near 0 when it is spread out.
2375
- # attn_quality — overall quality score from attention_quality_computing.
2376
- # AAT — deviation * (1 - sliced_anisotropy):
2377
- # high when attention is concentrated (low anisotropy) AND
2378
- # tightly distributed (low spread); this configuration favours
2379
- # the transformer's focused contextual prediction.
2380
- #
2381
- # Confidence blending on disagreement:
2382
- # If MLP wins: final_conf = mlp_conf * (1 - trans_conf) * (1 - AAT)
2383
- # → lower AAT (diffuse attention) → MLP gets more room to dominate.
2384
- # If Transformer wins: final_conf = trans_conf * (1 - mlp_conf) * AAT
2385
- # → higher AAT (focused attention) → transformer earns a larger share.
2386
2288
  eps = 1e-5
2387
2289
  if isinstance(mlp_conf, np.ndarray):
2388
2290
  mlp_conf = np.clip(np.mean(mlp_conf), 0, 1)
@@ -2940,18 +2842,6 @@ class ModelStorage:
2940
2842
 
2941
2843
 
2942
2844
  def save_model_dict(self, memory_name, model_dict, type=None, model_type='mlp'):
2943
- # Persists a model's in-memory dict to SQLite using an "active record" versioning
2944
- # pattern: each save inserts a new row marked is_active=1, then immediately
2945
- # deactivates all other rows for the same memory_name via a secondary UPDATE.
2946
- # This means only the most recent save is "live" — reads always fetch is_active=1.
2947
- #
2948
- # Two destination tables depending on the `type` argument:
2949
- # type == 'Transformer' → model_attn_storage (stores attention-related weights)
2950
- # else → model_storage (stores MLP / pipeline weights)
2951
- #
2952
- # numpy arrays inside model_dict are recursively converted to Python lists
2953
- # by _prepare_for_serialization() before json.dumps, ensuring they round-trip
2954
- # correctly when loaded back via _convert_to_arrays().
2955
2845
  try:
2956
2846
  db_path = self.get_database_path()
2957
2847
  conn = sqlite3.connect(db_path)
@@ -2970,7 +2860,6 @@ class ModelStorage:
2970
2860
  VALUES (?, ?, ?, ?)
2971
2861
  """, (memory_name, model_type, model_json, 1))
2972
2862
 
2973
- # Deactivate all other rows for this memory_name (soft-delete old versions).
2974
2863
  c.execute("""
2975
2864
  UPDATE model_attn_storage
2976
2865
  SET is_active = 0
@@ -2988,7 +2877,6 @@ class ModelStorage:
2988
2877
  VALUES (?, ?, ?, ?)
2989
2878
  """, (memory_name, model_type, model_json, 1))
2990
2879
 
2991
- # Deactivate all other rows for this memory_name (soft-delete old versions).
2992
2880
  c.execute("""
2993
2881
  UPDATE model_storage
2994
2882
  SET is_active = 0
@@ -3068,24 +2956,14 @@ class ModelStorage:
3068
2956
 
3069
2957
 
3070
2958
  def _parse_array_string(self, s):
3071
- # Attempts to recover a numpy array from a string representation that may have
3072
- # been serialised in one of several formats (JSON, Python literal, space-separated,
3073
- # or comma-separated). This is necessary because model weights and probability
3074
- # vectors are stored in SQLite as JSON strings and must be reconstructed precisely.
3075
- #
3076
- # Strategy order (first success wins):
3077
- # 1. JSON array — handles standard serialisation from json.dumps.
3078
- # 2. ast.literal_eval — handles Python repr output, e.g. "[0.1, 0.2, ...]".
3079
- # 3. Space/bracket-separated floats — covers numpy __str__ output like
3080
- # "[ 0.1 0.2 0.3]" (spaces instead of commas, optional brackets).
3081
- # 4. Comma-separated floats — fallback for CSV-style strings.
3082
- #
3083
- # Returns the original string unchanged if all strategies fail, letting the
3084
- # caller handle the type mismatch rather than silently producing garbage data.
2959
+ """
2960
+ Parse string representation of array back to numpy array.
2961
+ Returns original string if parsing fails.
2962
+ """
3085
2963
  if not isinstance(s, str) or not s:
3086
2964
  return s
3087
2965
 
3088
- # Normalise whitespace before parsing — strip newlines, tabs, collapse spaces.
2966
+ # Clean the string
3089
2967
  s = s.replace('\n', '').replace('\r', '').replace('\t', '')
3090
2968
  s = ' '.join(s.split()).strip()
3091
2969
 
@@ -3346,14 +3224,6 @@ class ModelStorage:
3346
3224
  pass
3347
3225
 
3348
3226
  def load_peer_request_dict(self, memory_name, agent_id):
3349
- # Retrieves a peer agent's stored prediction request from agent_attn_storage,
3350
- # excluding rows whose agent_id matches any ID in the provided list.
3351
- # The exclusion prevents an agent from retrieving its own previously stored
3352
- # request, ensuring it only receives data from *other* agents in the network.
3353
- #
3354
- # The IN clause is constructed dynamically with one '?' placeholder per agent_id
3355
- # entry, which is safe against SQL injection via parameterised queries.
3356
- # Returns (model_attn_data, model_target_pred) parsed from JSON, or (None, None).
3357
3227
  print(f'|| Peer request with Agent')
3358
3228
  try:
3359
3229
  try:
@@ -3820,10 +3690,7 @@ class AsyncMessageQueue:
3820
3690
  if not success:
3821
3691
  self._stats['messages_failed'] += 1
3822
3692
 
3823
- # Exponential moving average of message latency.
3824
- # alpha = 0.1 means the current measurement contributes 10 % to the running average,
3825
- # providing a smoothed latency estimate that is robust to spikes without requiring
3826
- # a fixed-size history window.
3693
+ # Update moving average
3827
3694
  alpha = 0.1 # Smoothing factor
3828
3695
  self._stats['avg_latency'] = alpha * latency + (1 - alpha) * self._stats['avg_latency']
3829
3696
 
@@ -4069,7 +3936,7 @@ class AgentDistributedInference:
4069
3936
  # Security: Audit log
4070
3937
  self.security_log = []
4071
3938
 
4072
- self.enable_ssl = False # Set to True to enable SSL encryption
3939
+ self.enable_ssl = False # Set to True to enable SSL encryption
4073
3940
  # i provided basic cert file and key since there are other layered security other than ssl, and also due to infrequent external connections.
4074
3941
  self.ssl_cert_file = ssl_cert_file
4075
3942
  self.ssl_key_file = ssl_key_file
@@ -4310,7 +4177,7 @@ class AgentDistributedInference:
4310
4177
  key = self.secret_key.encode() if isinstance(self.secret_key, str) else self.secret_key
4311
4178
  signature = hmac.new(key, message_bytes, hashlib.sha256).hexdigest()
4312
4179
 
4313
- print(f'|| Signing message with: {len(message)} total of size, with signature: {signature}')
4180
+ print(f'|| Signing message with: {len(message)} total of size')
4314
4181
  logger.info(f"[=] Signing message: {len(message)}")
4315
4182
  return signature
4316
4183
 
@@ -4579,7 +4446,7 @@ class AgentDistributedInference:
4579
4446
  print(f"[-] Connection attempt to blocked IP: {host}")
4580
4447
  self._log_security_event('connection_blocked', {'ip': host})
4581
4448
  return None
4582
-
4449
+
4583
4450
  # Socket creation
4584
4451
  sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
4585
4452
 
@@ -5366,7 +5233,7 @@ class AgentDistributedInference:
5366
5233
  print(f'[||] Successfully calibrate probs with previous Peer using database!')
5367
5234
  self.save_to_local_peer(self.memory_name, probs)
5368
5235
  else:
5369
- print(f'[-] Connection to peer agent {self.temporary_agent_id} failed or not permitted, returning regular probs...')
5236
+ print(f'[-] Connection to peer agent {self.temporary_agent_id} is not permitted, returning regular probs...')
5370
5237
 
5371
5238
  return probs
5372
5239
 
@@ -6160,13 +6027,15 @@ class IntegratedPipeline:
6160
6027
  self.titles = None
6161
6028
  self.labels = None
6162
6029
 
6163
- self.use_transformer = False
6030
+ self.use_transformer = True
6164
6031
  self.agreement = False
6165
6032
  self.external_peer_enabled = False
6166
6033
  self.autonomous = False
6167
6034
  self.show_explainability_details = True
6168
6035
 
6169
6036
  self.temperature = 1.0
6037
+ self.transformer_lr = 0.1
6038
+
6170
6039
  self.memory_name = memory_name
6171
6040
 
6172
6041
  self.pending_batch = []
@@ -8072,16 +7941,15 @@ class IntegratedPipeline:
8072
7941
  _, y_true = self.input_encoding(datasets)
8073
7942
  sequence_inputs = self.sequence_encoding(datasets)
8074
7943
  unsuitable_training = self.training_necessary_condition(sequence_inputs, X_raw)
7944
+ lr = self.model2.transformer_lr if self.model2 else self.transformer_lr
8075
7945
 
8076
7946
  if not unsuitable_training:
8077
7947
  print(f'🚀 Training Transformer with {len(sequence_inputs)} Samples: ')
8078
7948
  conditional_anisotropy = self.anisotropy_measurement(sequence_inputs)
8079
7949
  if conditional_anisotropy >= self.confidence_threshold:
8080
- lr = 1e-4
8081
7950
  print('[+] Dynamic Backward')
8082
7951
  mode = 'dynamic_backward'
8083
7952
  else:
8084
- lr = 0.1
8085
7953
  print('[-] Fixed Backward')
8086
7954
  mode = 'fixed_backward'
8087
7955
 
@@ -11092,7 +10960,7 @@ class ConsecutivePeerAgent:
11092
10960
  # Verify message signature
11093
10961
  expected = self._sign_message({k: v for k, v in message.items() if k != 'signature'})
11094
10962
 
11095
- print(f'[ConsecutivePeerAgent] Comparing Signature and verfiying...')
10963
+ print(f'[ConsecutivePeerAgent] Comparing Signature and verifying...')
11096
10964
  return hmac.compare_digest(expected, signature)
11097
10965
 
11098
10966
  def _send_message(self, sock: socket.socket, message: dict) -> bool:
@@ -13167,39 +13035,9 @@ def PermissiveTest():
13167
13035
  pass
13168
13036
 
13169
13037
 
13170
- def main_cli():
13171
- """Command-line interface entry point"""
13172
- import argparse
13173
- import asyncio
13174
-
13175
- parser = argparse.ArgumentParser(description="AbstractIntegratedModule - AI Multi-agent System")
13176
- parser.add_argument("--version", action="store_true", help="Show version")
13177
- parser.add_argument("--train", help="Training data file")
13178
- parser.add_argument("--predict", help="Text to predict")
13179
-
13180
- args = parser.parse_args()
13181
-
13182
- if args.version:
13183
- print(f"AbstractIntegratedModule version {__version__}")
13184
- return
13185
-
13186
- if args.predict:
13187
- # Simple prediction example
13188
- pipeline = IntegratedPipeline("temp", use_async=False)
13189
- result = pipeline.predict_single(args.predict)
13190
- print(f"[=] Prediction: {result}")
13191
-
13192
- if args.train:
13193
- print(f"[=] Training with {args.train}")
13194
-
13195
-
13196
-
13197
-
13198
-
13199
13038
  if __name__ == "__main__":
13200
13039
  try:
13201
13040
  PermissiveTest()
13202
- main_cli()
13203
13041
  except Exception as e:
13204
13042
  print(f'|| Program Crashed..., Error: {e}')
13205
13043
  traceback.print_exc()