AbstractIntegratedModule 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AbstractIntegratedModule.py +298 -460
- {abstractintegratedmodule-0.1.8.dist-info → abstractintegratedmodule-0.1.9.dist-info}/METADATA +1 -1
- abstractintegratedmodule-0.1.9.dist-info/RECORD +5 -0
- abstractintegratedmodule-0.1.8.dist-info/RECORD +0 -5
- {abstractintegratedmodule-0.1.8.dist-info → abstractintegratedmodule-0.1.9.dist-info}/WHEEL +0 -0
- {abstractintegratedmodule-0.1.8.dist-info → abstractintegratedmodule-0.1.9.dist-info}/top_level.txt +0 -0
AbstractIntegratedModule.py
CHANGED
|
@@ -1,52 +1,3 @@
|
|
|
1
|
-
|
|
2
|
-
"""
|
|
3
|
-
Advanced Integrated AI Module (AbstractIntegratedModule)
|
|
4
|
-
Multi-agent P2P inference system with geometric deep learning
|
|
5
|
-
|
|
6
|
-
Installation:
|
|
7
|
-
pip install aiml
|
|
8
|
-
|
|
9
|
-
Usage:
|
|
10
|
-
from AbstractIntegratedModule import IntegratedPipeline, CohesiveAgentDeployment
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
__version__ = "0.1.5"
|
|
14
|
-
__author__ = "Micro-Novelty"
|
|
15
|
-
__all__ = [
|
|
16
|
-
# Main user-facing classes
|
|
17
|
-
"IntegratedPipeline",
|
|
18
|
-
"CohesiveAgentDeployment",
|
|
19
|
-
"AgentDistributedInference",
|
|
20
|
-
"WeightedEnsemblePredictor",
|
|
21
|
-
|
|
22
|
-
# Models
|
|
23
|
-
"Transformer",
|
|
24
|
-
"MLP",
|
|
25
|
-
"GeometricWeightShaping",
|
|
26
|
-
|
|
27
|
-
# Security (user may need)
|
|
28
|
-
"SecurityLevel",
|
|
29
|
-
"SecurityConfig",
|
|
30
|
-
"TrustLevel",
|
|
31
|
-
|
|
32
|
-
# Storage (user may need)
|
|
33
|
-
"ModelStorage",
|
|
34
|
-
|
|
35
|
-
# Fallback
|
|
36
|
-
"ConsecutivePeerAgent",
|
|
37
|
-
|
|
38
|
-
# Singleton base (if users want to extend)
|
|
39
|
-
"Singleton",
|
|
40
|
-
|
|
41
|
-
# Version
|
|
42
|
-
"__version__",
|
|
43
|
-
]
|
|
44
|
-
|
|
45
|
-
# THIS IS THE SOURCE CODE OF ABSTRACTINTEGRATEDMODULE
|
|
46
|
-
# YOU ARE HEREBY GRANTED TO AUDIT, REVIEW, AND INITIATE PULL REQUESTS AND ISSUES
|
|
47
|
-
# LICENSE: MIT, PROVIDED.
|
|
48
|
-
|
|
49
|
-
|
|
50
1
|
import numpy as np
|
|
51
2
|
from sklearn.preprocessing import StandardScaler
|
|
52
3
|
import pandas as pd
|
|
@@ -299,11 +250,7 @@ class SingletonMeta(type):
|
|
|
299
250
|
_lock: threading.Lock = threading.Lock()
|
|
300
251
|
|
|
301
252
|
def __call__(cls, *args, **kwargs):
|
|
302
|
-
#
|
|
303
|
-
# Fast path (no lock) — if the instance already exists, return immediately.
|
|
304
|
-
# Slow path (with lock) — only one thread can create the instance; a second
|
|
305
|
-
# check inside the lock guards against two threads both passing the fast path
|
|
306
|
-
# before either acquires the lock.
|
|
253
|
+
# Fast path: instance already exists
|
|
307
254
|
if cls in cls._instances:
|
|
308
255
|
return cls._instances[cls]
|
|
309
256
|
|
|
@@ -353,23 +300,6 @@ class GeometricWeightShaping:
|
|
|
353
300
|
self.floating_context = None
|
|
354
301
|
|
|
355
302
|
def eigenvalue_encoder(self, x):
|
|
356
|
-
# Encodes the geometric complexity of the input data into a scalar (trC) and a
|
|
357
|
-
# principal component count (k). The scalar trC is later used as the upper bound
|
|
358
|
-
# for the random floating-point context in abstract_weight_shaping.
|
|
359
|
-
#
|
|
360
|
-
# Step-by-step logic:
|
|
361
|
-
# 1. Augment input with magnitude-scaled structured noise so the covariance
|
|
362
|
-
# matrix is never degenerate even on very small or homogeneous datasets.
|
|
363
|
-
# 2. Run eigendecomposition on the augmented covariance, sort eigenvalues
|
|
364
|
-
# descending, then find k = the number of principal components that
|
|
365
|
-
# capture 90% of cumulative variance. k is a compact measure of
|
|
366
|
-
# intrinsic dimensionality.
|
|
367
|
-
# 3. Derive three chained scalars (trA → trB → trC) that compress k and the
|
|
368
|
-
# data anisotropy into a single weight-shaping magnitude.
|
|
369
|
-
# - trA : scales k by directional variation; high anisotropy → large trA
|
|
370
|
-
# - trB : dampens trA²; keeps the signal in a bounded range
|
|
371
|
-
# - trC : final scalar — NOTE: trB² - 1.0 can equal zero when trB == ±1,
|
|
372
|
-
# causing division-by-zero (known fragility flagged in code review)
|
|
373
303
|
eps = 1e-5
|
|
374
304
|
X = np.asarray(x)
|
|
375
305
|
if X.ndim > 2:
|
|
@@ -379,32 +309,22 @@ class GeometricWeightShaping:
|
|
|
379
309
|
|
|
380
310
|
anisotropy = self.anisotropy_measurement(X)
|
|
381
311
|
|
|
382
|
-
# Augment data with noise proportional to its magnitude to avoid a singular
|
|
383
|
-
# covariance matrix when the dataset is small or nearly constant.
|
|
384
312
|
structured_noise = np.random.uniform(0, mag, size=X.shape)
|
|
385
313
|
X = np.vstack((X, structured_noise))
|
|
386
314
|
cov = np.cov(X, rowvar=False)
|
|
387
315
|
|
|
388
|
-
# eigh is used instead of eig because cov is symmetric; it returns real eigenvalues
|
|
389
|
-
# and is numerically more stable than the general eigensolver.
|
|
390
316
|
eigenvalues, eigenvectors = np.linalg.eigh(cov)
|
|
391
|
-
idx = np.argsort(eigenvalues)[::-1]
|
|
317
|
+
idx = np.argsort(eigenvalues)[::-1]
|
|
392
318
|
|
|
393
319
|
eigenvalues = eigenvalues[idx]
|
|
394
|
-
# Cumulative explained variance ratio; searchsorted finds the elbow at 90 %.
|
|
395
320
|
energy = np.cumsum(eigenvalues) / np.sum(eigenvalues)
|
|
396
|
-
k = np.searchsorted(energy, 0.90) + 1
|
|
321
|
+
k = np.searchsorted(energy, 0.90) + 1
|
|
397
322
|
|
|
398
|
-
# K_G: normalised inverse of k — small k (low-dim data) → K_G near 1,
|
|
399
|
-
# large k (high-dim data) → K_G near 0.
|
|
400
323
|
K_G = 1.0 / (1.0 + k)
|
|
401
|
-
mag_G = 1.0 / (1.0 + K_G)
|
|
324
|
+
mag_G = 1.0 / (1.0 + K_G)
|
|
402
325
|
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
trB = (1/2 + mag_G) / (1.0 + trA**2) # quadratic dampener keeps trB < 0.5
|
|
406
|
-
# WARNING: trB² - 1.0 is negative for all typical trB values (|trB| < 1),
|
|
407
|
-
# so trC ends up negative. When trB == ±1 exactly this divides by zero.
|
|
326
|
+
trA = k / (1.0 - anisotropy) + eps
|
|
327
|
+
trB = (1/2 + mag_G) / (1.0 + trA**2)
|
|
408
328
|
trC = (1/6 + K_G) / (trB**2 - 1.0)
|
|
409
329
|
return trC, k
|
|
410
330
|
|
|
@@ -455,23 +375,6 @@ class GeometricWeightShaping:
|
|
|
455
375
|
|
|
456
376
|
# Weight shaping provides directional context for how the data should be processed so that it aligns with the data geometry.
|
|
457
377
|
def abstract_weight_shaping(self, x):
|
|
458
|
-
# Derives a data-adaptive random weight matrix whose range is governed by
|
|
459
|
-
# the geometric complexity of the input batch x.
|
|
460
|
-
#
|
|
461
|
-
# Key scalars produced along the way:
|
|
462
|
-
# anisotropy — directional spread of gradients across x (higher = more varied)
|
|
463
|
-
# trC, k — eigenvalue-derived complexity scalar and intrinsic dimensionality
|
|
464
|
-
# AME — Abstract Modelling Error: log-product of magnitude × gradient energy
|
|
465
|
-
# AEL — Adaptive Energy Level: blends spectral similarity with anisotropy;
|
|
466
|
-
# measures how much the data geometry resembles random noise
|
|
467
|
-
# AMR — sigmoid-scaled AME; used as a soft gate between 0 and 1
|
|
468
|
-
# efficient_distributed_energy — the upper bound fed to the final uniform sampler;
|
|
469
|
-
# equals k + AEL*(1 - AMR): dominated by intrinsic dimensionality
|
|
470
|
-
# when the model rate (AMR) is high, shifts to AEL when AMR is low.
|
|
471
|
-
#
|
|
472
|
-
# The resulting weight matrix (shape: input_size × output_size) is drawn from
|
|
473
|
-
# Uniform[0, efficient_distributed_energy], which gives the downstream Dense layer
|
|
474
|
-
# a geometry-aware initialisation instead of a fixed scale like He/Xavier.
|
|
475
378
|
input_size = self.input_size
|
|
476
379
|
output_size = self.output_size
|
|
477
380
|
|
|
@@ -483,19 +386,13 @@ class GeometricWeightShaping:
|
|
|
483
386
|
trC, k = self.eigenvalue_encoder(x)
|
|
484
387
|
AME = self.AME_Encoder(x)
|
|
485
388
|
|
|
486
|
-
# floating_point: noise draw bounded by trC; used only to compute spectral
|
|
487
|
-
# similarity (how much the real data "looks like" noise geometrically).
|
|
488
389
|
floating_point = np.random.uniform(0, trC, size=x.shape)
|
|
489
390
|
spectral_similarity = self.spectral_similarity(x, floating_point)
|
|
490
391
|
|
|
491
|
-
# AEL rises when data is both spectrally noise-like and highly anisotropic.
|
|
492
392
|
AEL = 0.3 + spectral_similarity * anisotropy
|
|
493
|
-
scaled_anisotropy = anisotropy / (anisotropy + 1.0)
|
|
494
|
-
AMR = 1.0 / (1.0 + np.exp(-AME))
|
|
393
|
+
scaled_anisotropy = anisotropy / (anisotropy + 1.0)
|
|
394
|
+
AMR = 1.0 / (1.0 + np.exp(-AME)) # abstract modelling rate
|
|
495
395
|
|
|
496
|
-
# Upper bound of the weight distribution.
|
|
497
|
-
# When data complexity is low (AMR → 1), the AEL term vanishes → bound ≈ k.
|
|
498
|
-
# When data is geometrically rich (AMR → 0), AEL contributes more → wider init.
|
|
499
396
|
efficient_distributed_energy = k + AEL * (1.0 - AMR)
|
|
500
397
|
floating_context = rng.uniform(0, efficient_distributed_energy, size=(input_size, output_size))
|
|
501
398
|
self.floating_context = floating_context
|
|
@@ -579,9 +476,13 @@ class Loss:
|
|
|
579
476
|
|
|
580
477
|
|
|
581
478
|
class Transformer:
|
|
582
|
-
def __init__(self, vocab_size, d_model=
|
|
479
|
+
def __init__(self, vocab_size, d_model=8, n_heads=2, num_classes=7, learning_rate=0.01, attn_dropout=0.0, ffn_dropout=0.0, weight_decay=1e-4):
|
|
583
480
|
self.d_model = d_model # Embedding dimension
|
|
584
481
|
self.n_heads = n_heads
|
|
482
|
+
self.attn_dropout_rate = attn_dropout
|
|
483
|
+
self.ffn_dropout_rate = ffn_dropout
|
|
484
|
+
self.transformer_lr = learning_rate
|
|
485
|
+
self.weight_decay = weight_decay
|
|
585
486
|
|
|
586
487
|
self.token_embedding = np.random.randn(vocab_size, d_model) * 0.02
|
|
587
488
|
|
|
@@ -620,7 +521,28 @@ class Transformer:
|
|
|
620
521
|
mean = np.mean(x, axis=-1, keepdims=True)
|
|
621
522
|
var = np.var(x, axis=-1, keepdims=True)
|
|
622
523
|
return scale * (x - mean) / np.sqrt(var + 1e-5) + shift
|
|
623
|
-
|
|
524
|
+
|
|
525
|
+
def apply_update(self, param, grad, lr):
    """Return ``param`` after one gradient step that includes L2 weight decay.

    Args:
        param: Current parameter value.
        grad: Gradient of the loss with respect to ``param``.
        lr: Step size.

    Returns:
        The updated parameter; ``param`` itself is not modified in place.
    """
    # Fold the decay term into the gradient first, then take a plain step.
    decayed_grad = grad + self.weight_decay * param
    return param - lr * decayed_grad
|
|
529
|
+
|
|
530
|
+
def dropout(self, x, rate=0.1, training=True, alpha=None):
    """Apply inverted dropout to ``x``.

    Args:
        x: Array of activations.
        rate: Nominal probability of dropping an element.
        training: When False the input is returned untouched.
        alpha: Optional multiplier on ``rate``; a low alpha gives light
            dropout, and an alpha of 1 gives the full rate.

    Returns:
        ``(out, mask)``. ``mask`` is a float32 0/1 keep-mask with the shape of
        ``x``, or ``None`` when no dropout was applied (in which case ``out``
        is ``x`` itself).
    """
    if not training or rate == 0.0:
        return x, None

    effective_rate = rate * alpha if alpha is not None else rate

    # A non-positive rate drops nothing. Treating it as a no-op also stops a
    # negative rate from silently rescaling the activations.
    if effective_rate <= 0.0:
        return x, None

    # A rate of 1 or more drops everything. Return explicit zeros instead of
    # dividing by (1 - rate) == 0, which would produce NaN/inf.
    if effective_rate >= 1.0:
        return np.zeros_like(x), np.zeros(x.shape, dtype=np.float32)

    mask = (np.random.rand(*x.shape) > effective_rate).astype(np.float32)
    # Inverted dropout: rescale survivors so the expected activation is unchanged.
    return x * mask / (1.0 - effective_rate), mask
|
|
544
|
+
|
|
545
|
+
|
|
624
546
|
def softmax(self, x):
|
|
625
547
|
if x.ndim == 3:
|
|
626
548
|
shifted = x - np.max(x, axis=-1, keepdims=True)
|
|
@@ -642,29 +564,13 @@ class Transformer:
|
|
|
642
564
|
return output, weights
|
|
643
565
|
|
|
644
566
|
|
|
645
|
-
def multi_head_attention(self, x, mask=None):
|
|
567
|
+
def multi_head_attention(self, x, mask=None, alpha=None):
|
|
646
568
|
batch_size, seq_len, d_model = x.shape
|
|
647
|
-
try:
|
|
648
|
-
alpha = self.alpha # between 0 and 1
|
|
649
|
-
except:
|
|
650
|
-
# alpha not yet set (first call before any train_step); derive it from the
|
|
651
|
-
# data's geometric complexity via AME so we start with a meaningful blend.
|
|
652
|
-
AME = self.AME_Encoder(x)
|
|
653
|
-
AMR = 1.0 / (1.0 + np.exp(-AME))
|
|
654
|
-
alpha = AMR
|
|
655
|
-
self.alpha = alpha
|
|
656
569
|
|
|
657
|
-
# Interpolate between the frozen initial projections (W_q/k/v_fixed) and the
|
|
658
|
-
# learnable ones (W_q/k/v). alpha starts near 0 and ramps toward 1 during
|
|
659
|
-
# training (see train() where alpha = min(1.0, epoch/100)), so early epochs
|
|
660
|
-
# lean on the stable fixed projections and later epochs use the learned ones.
|
|
661
570
|
W_q_mix = (1 - alpha) * self.W_q_fixed + alpha * self.W_q
|
|
662
571
|
W_k_mix = (1 - alpha) * self.W_k_fixed + alpha * self.W_k
|
|
663
572
|
W_v_mix = (1 - alpha) * self.W_v_fixed + alpha * self.W_v
|
|
664
573
|
|
|
665
|
-
# Project input into multi-head Q, K, V spaces.
|
|
666
|
-
# einsum notation 'bsd,hdm->bhsm': batch × seq × d_model dotted with
|
|
667
|
-
# n_heads × d_model × head_dim → batch × n_heads × seq × head_dim
|
|
668
574
|
Q = np.einsum('bsd,hdm->bhsm', x, W_q_mix)
|
|
669
575
|
K = np.einsum('bsd,hdm->bhsm', x, W_k_mix)
|
|
670
576
|
V = np.einsum('bsd,hdm->bhsm', x, W_v_mix)
|
|
@@ -680,65 +586,75 @@ class Transformer:
|
|
|
680
586
|
self.cache['attn_weights'] = attn_weights
|
|
681
587
|
self.cache['attn_output'] = attn_output
|
|
682
588
|
|
|
683
|
-
# Concatenate heads
|
|
589
|
+
# Concatenate heads
|
|
684
590
|
attn_output = attn_output.transpose(0, 2,1, 3).reshape(batch_size, seq_len, -1)
|
|
685
591
|
self.cache['attn_concat'] = attn_output
|
|
686
592
|
|
|
687
|
-
# Final linear projection
|
|
688
|
-
# W_o is geometry-initialised via GWS on the first training call (see train()).
|
|
593
|
+
# Final linear projection
|
|
689
594
|
output = np.matmul(attn_output, self.W_o)
|
|
690
595
|
self.cache['attn_out'] = output
|
|
691
596
|
|
|
692
597
|
return output, attn_weights
|
|
693
598
|
|
|
694
599
|
|
|
695
|
-
def forward(self, input_ids, embedded=False):
|
|
600
|
+
def forward(self, input_ids, embedded=False, pad_token_id=0, training=True, attn_dropout=0.1, ffn_dropout=0.1):
|
|
696
601
|
if embedded:
|
|
697
|
-
# Accept pre-computed embeddings directly (e.g. TF-IDF vectors passed as
|
|
698
|
-
# float arrays) instead of integer token IDs. Reshape to 3-D if needed.
|
|
699
602
|
x = np.asarray(input_ids)
|
|
700
603
|
if x.ndim == 2:
|
|
701
604
|
x = x[np.newaxis, ...]
|
|
702
605
|
batch_size, seq_len, _ = x.shape
|
|
703
606
|
self.cache['embedded_input'] = x
|
|
704
607
|
self.cache['input_ids'] = None
|
|
608
|
+
mask = None
|
|
705
609
|
else:
|
|
610
|
+
input_ids = np.asarray(input_ids, dtype=np.int32)
|
|
706
611
|
if input_ids.ndim == 1:
|
|
707
|
-
input_ids = input_ids.
|
|
708
|
-
|
|
612
|
+
input_ids = input_ids[np.newaxis, :]
|
|
613
|
+
|
|
709
614
|
x = self.token_embedding[input_ids]
|
|
710
615
|
x = x + self.pos_embedding[:x.shape[1]]
|
|
711
616
|
batch_size, seq_len = input_ids.shape
|
|
712
617
|
self.cache['embedded_input'] = None
|
|
713
618
|
self.cache['input_ids'] = input_ids
|
|
619
|
+
mask = self.padding_mask_utility(input_ids, pad_token_id) # (B,1,1,T)
|
|
714
620
|
|
|
621
|
+
self.cache['mask'] = mask if not embedded else None
|
|
715
622
|
self.cache['seq_len'] = seq_len
|
|
716
623
|
self.cache['batch_size'] = batch_size
|
|
717
624
|
self.cache['x_token'] = x
|
|
718
625
|
self.cache['x_pos'] = x
|
|
719
626
|
|
|
720
|
-
# Multi-head attention with residual
|
|
721
|
-
|
|
627
|
+
# Multi-head attention with residual
|
|
628
|
+
AME = self.AME_Encoder(x)
|
|
629
|
+
alpha = 1.0 / (1.0 + np.exp(-AME))
|
|
630
|
+
attn_out, attn_weights = self.multi_head_attention(x, mask=mask, alpha=alpha)
|
|
631
|
+
|
|
632
|
+
current_alpha = self.cache.get('alpha', 0.0)
|
|
633
|
+
|
|
634
|
+
attn_out, attn_drop_mask = self.dropout(attn_out, rate=self.attn_dropout_rate, training=training, alpha=current_alpha)
|
|
635
|
+
self.cache['attn_drop_mask'] = attn_drop_mask
|
|
722
636
|
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
self.alpha = 0.95 * self.alpha + 0.05 * self.attention_quality_computing(attn_weights)
|
|
637
|
+
alpha = 0.95 * alpha + 0.05 * self.attention_quality_computing(attn_weights, mask=mask)
|
|
638
|
+
|
|
639
|
+
self.alpha = alpha
|
|
640
|
+
self.cache['alpha'] = alpha # store in cache
|
|
728
641
|
|
|
729
|
-
# Pre-norm residual: cache the sum before normalising so backward can recover it.
|
|
730
642
|
self.cache['x_ln1_input'] = x + attn_out
|
|
731
643
|
x = self.layer_norm(x + attn_out, self.ln1_scale, self.ln1_shift)
|
|
732
644
|
self.cache['x_after_ln1'] = x
|
|
733
645
|
|
|
734
|
-
# Feed-forward
|
|
646
|
+
# Feed-forward with residual
|
|
735
647
|
self.cache['ffn_input'] = x
|
|
736
648
|
ffn_pre = np.matmul(x, self.ffn1)
|
|
737
649
|
self.cache['ffn_pre'] = ffn_pre
|
|
738
650
|
|
|
739
651
|
ffn_act = np.maximum(0, ffn_pre) # ReLU
|
|
652
|
+
|
|
653
|
+
ffn_act, ffn_drop_mask = self.dropout(ffn_act, rate=self.ffn_dropout_rate, training=training, alpha=current_alpha)
|
|
654
|
+
|
|
740
655
|
self.cache['ffn_act'] = ffn_act
|
|
741
|
-
|
|
656
|
+
self.cache['ffn_drop_mask'] = ffn_drop_mask
|
|
657
|
+
|
|
742
658
|
ffn_out = np.matmul(ffn_act, self.ffn2)
|
|
743
659
|
self.cache['ffn_out'] = ffn_out
|
|
744
660
|
|
|
@@ -746,9 +662,15 @@ class Transformer:
|
|
|
746
662
|
x = self.layer_norm(x + ffn_out, self.ln2_scale, self.ln2_shift)
|
|
747
663
|
self.cache['x_after_ln2'] = x
|
|
748
664
|
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
665
|
+
if mask is not None:
|
|
666
|
+
# Reshape mask to (B, T, 1) for broadcasting against (B, T, D)
|
|
667
|
+
token_mask = mask[:, 0, 0, :, np.newaxis] # (B, T, 1)
|
|
668
|
+
x_masked = x * token_mask # zero out padding
|
|
669
|
+
lengths = token_mask.sum(axis=1) # (B, 1) valid token counts
|
|
670
|
+
x_pooled = x_masked.sum(axis=1) / (lengths + 1e-6) # (B, D)
|
|
671
|
+
else:
|
|
672
|
+
x_pooled = np.mean(x, axis=1)
|
|
673
|
+
|
|
752
674
|
self.cache['x_pooled'] = x_pooled
|
|
753
675
|
|
|
754
676
|
# Output projection
|
|
@@ -762,18 +684,6 @@ class Transformer:
|
|
|
762
684
|
|
|
763
685
|
|
|
764
686
|
def layer_norm_backward(self, d_out, x, scale, shift):
|
|
765
|
-
# Backpropagates through layer normalisation.
|
|
766
|
-
# LayerNorm forward: y = scale * (x - mean) / sqrt(var + eps) + shift
|
|
767
|
-
#
|
|
768
|
-
# Gradient derivation (standard result, often omitted from textbooks):
|
|
769
|
-
# dx_hat = d_out * scale — upstream grad scaled by learned gamma
|
|
770
|
-
# dvar = sum(dx_hat * (x-mean) * -0.5 * std^{-3}) — chain rule on variance
|
|
771
|
-
# dmean = sum(dx_hat * -1/std) + dvar * mean(-2*(x-mean))
|
|
772
|
-
# — two paths: direct via x_hat, indirect via variance
|
|
773
|
-
# dx = dx_hat/std + dvar * 2*(x-mean)/N + dmean/N
|
|
774
|
-
# — three additive terms, each from a different path through the graph
|
|
775
|
-
#
|
|
776
|
-
# N is the feature dimension (last axis) — normalization is per-sample, per-position.
|
|
777
687
|
eps = 1e-5
|
|
778
688
|
mean = np.mean(x, axis=-1, keepdims=True)
|
|
779
689
|
var = np.var(x, axis=-1, keepdims=True)
|
|
@@ -784,10 +694,7 @@ class Transformer:
|
|
|
784
694
|
N = x.shape[-1]
|
|
785
695
|
dx_hat = d_out * scale
|
|
786
696
|
dvar = np.sum(dx_hat * (x - mean) * -0.5 * std**-3, axis=-1, keepdims=True)
|
|
787
|
-
dmean = (
|
|
788
|
-
np.sum(dx_hat * -1/std, axis=-1, keepdims=True)
|
|
789
|
-
+ dvar * np.mean(-2*(x-mean), axis=-1, keepdims=True)
|
|
790
|
-
)
|
|
697
|
+
dmean = np.sum(dx_hat * (-1.0 / std), axis=-1, keepdims=True)
|
|
791
698
|
|
|
792
699
|
dx = (
|
|
793
700
|
dx_hat / std
|
|
@@ -798,10 +705,12 @@ class Transformer:
|
|
|
798
705
|
return dx
|
|
799
706
|
|
|
800
707
|
# Fixed attention backward does not update the Q, K, V projections, which gives much more stable attention at the cost of flexibility.
|
|
801
|
-
def fixed_attention_backward(self, d_logits, lr=0.
|
|
708
|
+
def fixed_attention_backward(self, d_logits, lr=0.01, max_norm=1.0):
|
|
802
709
|
|
|
803
710
|
# Gradient for output layer
|
|
804
711
|
d_output = d_logits
|
|
712
|
+
alpha = self.cache.get('alpha', 1.0)
|
|
713
|
+
|
|
805
714
|
d_Wo = np.dot(self.cache['x_pooled'].T, d_output)
|
|
806
715
|
d_bo = np.sum(d_output, axis=0, keepdims=True)
|
|
807
716
|
|
|
@@ -823,8 +732,12 @@ class Transformer:
|
|
|
823
732
|
|
|
824
733
|
# Gradient for FFN1 through ReLU
|
|
825
734
|
d_ffn_act = np.matmul(d_ffn, self.ffn2.T)
|
|
826
|
-
|
|
827
|
-
|
|
735
|
+
ffn_drop_mask = self.cache.get('ffn_drop_mask')
|
|
736
|
+
if ffn_drop_mask is not None:
|
|
737
|
+
d_ffn_act = d_ffn_act * ffn_drop_mask / (1.0 - self.ffn_dropout_rate)
|
|
738
|
+
|
|
739
|
+
d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0) # ReLU backward unchanged
|
|
740
|
+
|
|
828
741
|
d_prev = np.matmul(d_ffn_pre, self.ffn1.T)
|
|
829
742
|
d_ffn1 = np.sum(np.matmul(self.cache['ffn_input'].transpose(0, 2, 1), d_ffn_pre), axis=0)
|
|
830
743
|
|
|
@@ -839,23 +752,36 @@ class Transformer:
|
|
|
839
752
|
d_attn = dx
|
|
840
753
|
|
|
841
754
|
# Gradient for attention output projection
|
|
842
|
-
|
|
843
|
-
|
|
755
|
+
attn_drop_mask = self.cache.get('attn_drop_mask')
|
|
756
|
+
if attn_drop_mask is not None:
|
|
757
|
+
d_attn = d_attn * attn_drop_mask / (1.0 - self.attn_dropout_rate)
|
|
758
|
+
|
|
759
|
+
d_Wo_attn = np.sum(np.matmul(self.cache['attn_concat'].transpose(0,2,1), d_attn), axis=0)
|
|
760
|
+
|
|
761
|
+
grads = {
|
|
762
|
+
'output': d_Wo,
|
|
763
|
+
'ffn2': d_ffn2,
|
|
764
|
+
'ffn1': d_ffn1,
|
|
765
|
+
'W_o': d_Wo_attn,
|
|
766
|
+
}
|
|
767
|
+
|
|
768
|
+
grads, norm = self.clip_gradients(grads, max_norm)
|
|
769
|
+
|
|
844
770
|
# Update weights
|
|
845
|
-
self.output
|
|
846
|
-
self.
|
|
847
|
-
self.
|
|
848
|
-
self.
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
771
|
+
self.output = self.apply_update(self.output, grads['output'], lr)
|
|
772
|
+
self.ffn2 = self.apply_update(self.ffn2, grads['ffn2'], lr)
|
|
773
|
+
self.ffn1 = self.apply_update(self.ffn1, grads['ffn1'], lr)
|
|
774
|
+
self.W_o = self.apply_update(self.W_o, grads['W_o'], lr)
|
|
775
|
+
# output_bias intentionally excluded — biases don't get weight decay
|
|
776
|
+
|
|
852
777
|
return d_x
|
|
853
778
|
|
|
854
779
|
|
|
855
|
-
def dynamic_backward(self, d_logits, lr=0.
|
|
856
|
-
|
|
780
|
+
def dynamic_backward(self, d_logits, lr=0.01, max_norm=1.0):
|
|
857
781
|
# Gradient for output layer
|
|
858
782
|
d_output = d_logits
|
|
783
|
+
alpha = self.cache.get('alpha', 1.0)
|
|
784
|
+
|
|
859
785
|
d_Wo = np.dot(self.cache['x_pooled'].T, d_output)
|
|
860
786
|
d_bo = np.sum(d_output, axis=0)
|
|
861
787
|
|
|
@@ -863,8 +789,14 @@ class Transformer:
|
|
|
863
789
|
d_pooled = np.dot(d_output, self.output.T)
|
|
864
790
|
|
|
865
791
|
# Expand pooled gradient to all positions
|
|
866
|
-
|
|
867
|
-
|
|
792
|
+
mask = self.cache['mask'] # (B, 1, 1, T)
|
|
793
|
+
if mask is not None:
|
|
794
|
+
token_mask = mask[:, 0, 0, :, np.newaxis] # (B, T, 1)
|
|
795
|
+
lengths = token_mask.sum(axis=1, keepdims=True) # (B, 1, 1)
|
|
796
|
+
d_x = (d_pooled[:, np.newaxis, :] / (lengths + 1e-6)) * token_mask
|
|
797
|
+
else:
|
|
798
|
+
d_x = np.repeat(d_pooled[:, np.newaxis, :] / self.cache['seq_len'], self.cache['seq_len'], axis=1)
|
|
799
|
+
|
|
868
800
|
# Layer norm 2 gradient
|
|
869
801
|
d_x = self.layer_norm_backward(d_x, self.cache['x_ln2_input'],
|
|
870
802
|
self.ln2_scale, self.ln2_shift)
|
|
@@ -877,8 +809,12 @@ class Transformer:
|
|
|
877
809
|
|
|
878
810
|
# Gradient for FFN1 through ReLU
|
|
879
811
|
d_ffn_act = np.matmul(d_ffn, self.ffn2.T)
|
|
812
|
+
ffn_drop_mask = self.cache.get('ffn_drop_mask')
|
|
813
|
+
if ffn_drop_mask is not None:
|
|
814
|
+
d_ffn_act = d_ffn_act * ffn_drop_mask / (1.0 - self.ffn_dropout_rate)
|
|
815
|
+
|
|
816
|
+
d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0) # ReLU backward unchanged
|
|
880
817
|
|
|
881
|
-
d_ffn_pre = d_ffn_act * (self.cache['ffn_pre'] >= 0)
|
|
882
818
|
d_prev = np.matmul(d_ffn_pre, self.ffn1.T)
|
|
883
819
|
d_ffn1 = np.sum(np.matmul(self.cache['ffn_input'].transpose(0, 2, 1), d_ffn_pre), axis=0)
|
|
884
820
|
|
|
@@ -890,6 +826,10 @@ class Transformer:
|
|
|
890
826
|
dx = d_prev + d_residual
|
|
891
827
|
|
|
892
828
|
# Gradient for attention output projection
|
|
829
|
+
attn_drop_mask = self.cache.get('attn_drop_mask')
|
|
830
|
+
if attn_drop_mask is not None:
|
|
831
|
+
d_attn = d_attn * attn_drop_mask / (1.0 - self.attn_dropout_rate)
|
|
832
|
+
|
|
893
833
|
d_Wo_attn = np.sum(np.matmul(self.cache['attn_concat'].transpose(0, 2, 1), d_attn), axis=0)
|
|
894
834
|
|
|
895
835
|
d_attn_concat = np.matmul(d_attn, self.W_o.T)
|
|
@@ -897,8 +837,6 @@ class Transformer:
|
|
|
897
837
|
d_head = self.n_heads
|
|
898
838
|
d_dim = self.d_model // self.n_heads
|
|
899
839
|
|
|
900
|
-
# Reshape concatenated head gradients back to per-head form before computing
|
|
901
|
-
# QKV gradients. Reverses the transpose+reshape done in multi_head_attention.
|
|
902
840
|
d_attn_heads = d_attn_concat.reshape(batch, seq_len, d_head, d_dim) .transpose(0, 2, 1, 3)
|
|
903
841
|
|
|
904
842
|
V = self.cache['V']
|
|
@@ -906,84 +844,150 @@ class Transformer:
|
|
|
906
844
|
Q = self.cache['Q']
|
|
907
845
|
weight = self.cache['attn_weights']
|
|
908
846
|
|
|
909
|
-
# Gradient of attention output w.r.t. V: dL/dV = attn_weights^T · dL/d_attn_out
|
|
910
847
|
d_V = np.matmul(weight.transpose(0, 1, 3, 2), d_attn_heads)
|
|
911
|
-
# Gradient w.r.t. pre-softmax attention scores via the product rule on Vᵀ.
|
|
912
848
|
d_weights = np.matmul(d_attn_heads, V.transpose(0, 1, 3, 2))
|
|
913
849
|
|
|
914
|
-
# Softmax Jacobian shortcut: d(softmax) = softmax * (d_out - sum(d_out * softmax))
|
|
915
|
-
# Scaled by 1/sqrt(d_k) matching the forward-pass scaling.
|
|
916
850
|
d_scores = weight * (d_weights - np.sum(d_weights * weight, axis=-1, keepdims=True))
|
|
917
851
|
d_scores /= np.sqrt(Q.shape[-1])
|
|
918
852
|
|
|
919
|
-
# Back-propagate through Q·Kᵀ to get per-head Q and K gradients.
|
|
920
853
|
d_Q = np.matmul(d_scores, K)
|
|
921
854
|
d_K = np.matmul(d_scores.transpose(0, 1, 3, 2), Q)
|
|
922
855
|
|
|
923
856
|
x = self.cache['x_attn_input']
|
|
924
857
|
|
|
925
|
-
# Project head-space gradients back to projection-weight gradients using einsum.
|
|
926
|
-
# 'bsd, bhsm->hdm': accumulate over batch and sequence dimensions.
|
|
927
858
|
d_W_q = np.einsum('bsd, bhsm->hdm', x, d_Q)
|
|
928
859
|
d_W_k = np.einsum('bsd, bhsm->hdm', x, d_K)
|
|
929
860
|
d_W_v = np.einsum('bsd, bhsm->hdm', x, d_V)
|
|
930
861
|
|
|
931
|
-
# Project head-space gradients back to input space (for the layer below).
|
|
932
862
|
d_x_q = np.einsum('bhsm, hdm->bsd', d_Q, self.W_q)
|
|
933
863
|
d_x_k = np.einsum('bhsm, hdm->bsd', d_K, self.W_k)
|
|
934
864
|
d_x_v = np.einsum('bhsm, hdm->bsd', d_V, self.W_v)
|
|
935
865
|
|
|
936
|
-
# Sum Q, K, V contributions — each head projection touches the same input x.
|
|
937
866
|
d_x_attn_input = d_x_q + d_x_k + d_x_v
|
|
938
867
|
d_x_total = d_x_attn_input + d_residual
|
|
939
868
|
|
|
940
869
|
input_ids = self.cache.get('input_ids')
|
|
941
870
|
|
|
942
871
|
if input_ids is not None:
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
self.token_embedding[idx] -= lr * d_x_total[b, t] / self.cache['seq_len']
|
|
872
|
+
flat_ids = input_ids.flatten() # (B*T,)
|
|
873
|
+
flat_grads = d_x_total.reshape(-1, self.d_model) / self.cache['seq_len']
|
|
874
|
+
np.add.at(self.token_embedding, flat_ids, -lr * flat_grads)
|
|
947
875
|
|
|
948
876
|
# Update weights
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
877
|
+
grads = {
|
|
878
|
+
'output': d_Wo,
|
|
879
|
+
'ffn2': d_ffn2,
|
|
880
|
+
'ffn1': d_ffn1,
|
|
881
|
+
'W_o': d_Wo_attn,
|
|
882
|
+
'W_q': alpha * d_W_q, # already alpha-scaled, clip the combined thing
|
|
883
|
+
'W_k': alpha * d_W_k,
|
|
884
|
+
'W_v': alpha * d_W_v,
|
|
885
|
+
}
|
|
886
|
+
grads, norm = self.clip_gradients(grads, max_norm)
|
|
954
887
|
|
|
955
|
-
|
|
888
|
+
self.output = self.apply_update(self.output, grads['output'], lr)
|
|
889
|
+
self.ffn2 = self.apply_update(self.ffn2, grads['ffn2'], lr)
|
|
890
|
+
self.ffn1 = self.apply_update(self.ffn1, grads['ffn1'], lr)
|
|
891
|
+
self.W_o = self.apply_update(self.W_o, grads['W_o'], lr)
|
|
892
|
+
self.W_q = self.apply_update(self.W_q, grads['W_q'], lr)
|
|
893
|
+
self.W_k = self.apply_update(self.W_k, grads['W_k'], lr)
|
|
894
|
+
self.W_v = self.apply_update(self.W_v, grads['W_v'], lr)
|
|
956
895
|
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
896
|
+
if input_ids is not None:
|
|
897
|
+
emb_norm = np.linalg.norm(d_x_total)
|
|
898
|
+
emb_coef = min(1.0, max_norm / (emb_norm + 1e-6))
|
|
960
899
|
|
|
961
|
-
|
|
900
|
+
flat_ids = input_ids.flatten() # (B*T,)
|
|
901
|
+
flat_grads = d_x_total.reshape(-1, self.d_model) / self.cache['seq_len'] # (B*T, D)
|
|
962
902
|
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
903
|
+
np.add.at(self.token_embedding, flat_ids, -lr * emb_coef * flat_grads)
|
|
904
|
+
self.pos_embedding[:seq_len] -= lr * emb_coef * d_x_total.mean(axis=0)
|
|
905
|
+
else:
|
|
906
|
+
self.pos_embedding[:seq_len] -= lr * d_x_total.mean(axis=0)
|
|
907
|
+
norm = d_x_total
|
|
908
|
+
|
|
909
|
+
return norm
|
|
966
910
|
|
|
967
|
-
def
|
|
968
|
-
|
|
911
|
+
def smoothing_labels_utility(self, y_true, smoothing=0.1):
    """Apply label smoothing to one-hot targets.

    Each target row is blended with a uniform distribution over classes:
    ``y * (1 - smoothing) + smoothing / num_classes``.

    Args:
        y_true: One-hot labels whose last axis indexes the classes, e.g.
            ``(B, num_classes)`` or a single ``(num_classes,)`` vector.
            Array-likes such as lists are accepted.
        smoothing: Fraction of probability mass spread uniformly.

    Returns:
        Array of the same shape as ``y_true`` holding the smoothed targets.
    """
    y_true = np.asarray(y_true)
    # Read the class count from the last axis so a 1-D target works as well
    # as a batched one.
    num_classes = y_true.shape[-1]
    return y_true * (1.0 - smoothing) + smoothing / num_classes
|
|
915
|
+
|
|
916
|
+
def learning_rate_warm_up(self, epoch, epochs, lr_base, schedule='cosine_warmup', warmup_frac=0.1):
    """Return the learning rate to use at ``epoch``.

    Args:
        epoch: Zero-based index of the current epoch.
        epochs: Total number of epochs planned.
        lr_base: Peak (base) learning rate.
        schedule: ``'cosine_warmup'`` (linear warm-up then cosine decay),
            ``'step'`` (halve every 30% of training) or ``'constant'``.
            Any other value behaves like ``'constant'``.
        warmup_frac: Fraction of ``epochs`` spent warming up
            (``'cosine_warmup'`` only).

    Returns:
        The learning rate for this epoch.
    """
    warmup_epochs = int(epochs * warmup_frac)

    if schedule == 'cosine_warmup':
        if warmup_epochs > 0 and epoch < warmup_epochs:
            # Linear warm-up: reaches lr_base on the last warm-up epoch.
            return lr_base * (epoch + 1) / warmup_epochs
        # Guard the span so warmup_epochs == epochs cannot divide by zero.
        decay_span = max(1, epochs - warmup_epochs)
        # Clamp so the rate stays at its floor instead of climbing again
        # if the caller runs past the planned number of epochs.
        progress = min(1.0, (epoch - warmup_epochs) / decay_span)
        return lr_base * 0.5 * (1 + np.cos(np.pi * progress))

    if schedule == 'step':
        # Halve lr every 30% of training. For very short runs int(epochs * 0.3)
        # is 0, so use a step of at least one epoch to keep `//` defined.
        step = max(1, int(epochs * 0.3))
        return lr_base * (0.5 ** (epoch // step))

    # 'constant' and any unrecognised schedule fall back to the base rate.
    return lr_base
|
|
937
|
+
|
|
938
|
+
def padding_mask_utility(self, input_ids, pad_token_id=0):
    """Build a padding mask from token ids.

    Args:
        input_ids: Token ids of shape ``(B, T)``. A single ``(T,)`` sequence
            or a nested list is also accepted and treated as a batch.
        pad_token_id: Id that marks padding positions.

    Returns:
        float32 array of shape ``(B, 1, 1, T)`` holding 1.0 at real tokens and
        0.0 at padding, ready to broadcast against ``(B, heads, T_q, T_k)``.
    """
    # Coerce first: comparing a plain list with an int yields one bool, not an
    # element-wise mask. Promote a 1-D sequence to a batch of one so the
    # 4-D indexing below is valid.
    ids = np.atleast_2d(np.asarray(input_ids))
    mask = (ids != pad_token_id).astype(np.float32)
    return mask[:, np.newaxis, np.newaxis, :]  # (B, 1, 1, T)
|
|
943
|
+
|
|
944
|
+
def clip_gradients(self, grads: dict, max_norm: float = 1.0) -> tuple:
    """Clip a set of gradients by their global L2 norm.

    Args:
        grads: Mapping from parameter name to gradient array.
        max_norm: Largest allowed global norm.

    Returns:
        ``(grads, total_norm)``. ``grads`` is a new dict of rescaled
        gradients when clipping was needed, otherwise the input dict itself.
        ``total_norm`` is the global norm measured before clipping, returned
        for monitoring.
    """
    # Global norm: square root of the summed squares over every tensor.
    total_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads.values()))

    clip_coef = max_norm / (total_norm + 1e-6)

    # Scale down, never up.
    if clip_coef < 1.0:
        grads = {name: np.asarray(g) * clip_coef for name, g in grads.items()}

    return grads, total_norm
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
def train_step(self, input_ids, epoch, y_true, lr=0.01, mode=None, embedded=False, max_norm=1.0, pad_token_id=0):
|
|
960
|
+
if not embedded and input_ids.ndim == 1:
|
|
961
|
+
input_ids = input_ids[np.newaxis, :] # (1, T), single sample
|
|
962
|
+
if y_true.ndim == 1:
|
|
963
|
+
y_true = y_true[np.newaxis, :]
|
|
964
|
+
|
|
965
|
+
probs, attn_weights = self.forward(input_ids, embedded=embedded, pad_token_id=pad_token_id, training=True, attn_dropout=self.attn_dropout_rate, ffn_dropout=self.ffn_dropout_rate)
|
|
966
|
+
y_true_smooth = self.smoothing_labels_utility(y_true, smoothing=0.1)
|
|
967
|
+
|
|
968
|
+
if y_true_smooth.shape[0] and y_true_smooth.shape[1] != probs.shape[0] and probs.shape[1]:
|
|
969
|
+
if y_true_smooth.shape[1] > probs.shape[1]:
|
|
970
|
+
y_true_smooth = y_true_smooth[:, :probs.shape[1]]
|
|
971
|
+
else:
|
|
972
|
+
y_true_smooth = np.pad(y_true_smooth, ((0, 0), (0, probs.shape[1] - y_true_smooth.shape[1])), mode='constant')
|
|
973
|
+
|
|
971
974
|
if y_true.shape[0] and y_true.shape[1] != probs.shape[0] and probs.shape[1]:
|
|
972
975
|
if y_true.shape[1] > probs.shape[1]:
|
|
973
|
-
|
|
976
|
+
y_true = y_true[:, :probs.shape[1]]
|
|
974
977
|
else:
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
+
y_true = np.pad(y_true, ((0, 0), (0, probs.shape[1] - y_true.shape[1])), mode='constant')
|
|
979
|
+
|
|
980
|
+
# Loss (cross-entropy)
|
|
981
|
+
loss = -np.mean(np.sum(y_true_smooth * np.log(probs + 1e-8), axis=1))
|
|
978
982
|
|
|
979
983
|
# Gradient of loss w.r.t. logits
|
|
980
|
-
d_logits = (probs -
|
|
984
|
+
d_logits = (probs - y_true_smooth) / y_true_smooth.shape[0]
|
|
981
985
|
|
|
982
986
|
# Backward pass
|
|
983
987
|
if mode == 'fixed_backward':
|
|
984
|
-
self.fixed_attention_backward(d_logits, lr)
|
|
988
|
+
self.fixed_attention_backward(d_logits, lr, max_norm=max_norm)
|
|
985
989
|
else:
|
|
986
|
-
self.dynamic_backward(d_logits, lr)
|
|
990
|
+
self.dynamic_backward(d_logits, lr, max_norm=max_norm)
|
|
987
991
|
|
|
988
992
|
# Accuracy
|
|
989
993
|
preds = np.argmax(probs, axis=1)
|
|
@@ -992,8 +996,16 @@ class Transformer:
|
|
|
992
996
|
|
|
993
997
|
return loss, acc
|
|
994
998
|
|
|
999
|
+
def batch_padding_utility(self, sequences, pad_token_id=0):
    """Right-pad variable-length token sequences into one rectangular batch.

    Args:
        sequences: List of 1-D arrays (or sequences) of token ids, possibly
            of different lengths.
        pad_token_id: Value used to fill positions past each sequence's end.

    Returns:
        ``int32`` array of shape (B, T) where ``B = len(sequences)`` and ``T``
        is the longest sequence length. An empty ``sequences`` list yields an
        empty array of shape (0, 0) instead of raising.
    """
    # default=0 keeps max() from raising ValueError on an empty batch.
    max_len = max((len(s) for s in sequences), default=0)
    padded = np.full((len(sequences), max_len), pad_token_id, dtype=np.int32)
    for i, s in enumerate(sequences):
        padded[i, :len(s)] = s
    return padded  # (B, T)
|
|
995
1006
|
|
|
996
|
-
|
|
1007
|
+
|
|
1008
|
+
def train(self, input_ids_list, y_true_list, epochs=100, mode=None, lr=0.01, embedded=False, max_norm=1.0, schedule='cosine_warmup', pad_token_id=0, batch_size=None):
|
|
997
1009
|
losses = []
|
|
998
1010
|
accs = []
|
|
999
1011
|
d_model = self.d_model
|
|
@@ -1004,23 +1016,48 @@ class Transformer:
|
|
|
1004
1016
|
self.shaping = GeometricWeightShaping(d_model, d_model)
|
|
1005
1017
|
shaping_input = input_ids_list
|
|
1006
1018
|
if embedded:
|
|
1007
|
-
shaping_input = np.vstack([
|
|
1019
|
+
shaping_input = np.vstack([
|
|
1020
|
+
x.reshape(-1, x.shape[-1]) if x.ndim >= 2 else x
|
|
1021
|
+
for x in input_ids_list
|
|
1022
|
+
])
|
|
1023
|
+
else:
|
|
1024
|
+
shaping_input = input_ids_list
|
|
1025
|
+
|
|
1008
1026
|
self.W_o = self.shaping.weight_shaping(shaping_input)
|
|
1009
1027
|
self.encoded = True
|
|
1028
|
+
|
|
1029
|
+
# Pre-pad all sequences once before training starts
|
|
1030
|
+
# only when batch_size is set and not in embedded mode
|
|
1031
|
+
if batch_size is not None and not embedded:
|
|
1032
|
+
input_ids_list = [
|
|
1033
|
+
self.batch_padding_utility(input_ids_list[i:i+batch_size], pad_token_id)
|
|
1034
|
+
for i in range(0, len(input_ids_list), batch_size)
|
|
1035
|
+
]
|
|
1036
|
+
y_true_list = [
|
|
1037
|
+
np.stack(y_true_list[i:i+batch_size])
|
|
1038
|
+
for i in range(0, len(y_true_list), batch_size)
|
|
1039
|
+
]
|
|
1040
|
+
# input_ids_list is now a list of (B, T) arrays
|
|
1041
|
+
# y_true_list is now a list of (B, num_classes) arrays
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
print(f"[==] Starting comprehensive training for {epochs} epochs with mode: {mode}, learning rate: {lr}, schedule: {schedule}")
|
|
1010
1045
|
|
|
1011
1046
|
for epoch in range(epochs):
|
|
1012
1047
|
epoch_losses = []
|
|
1013
1048
|
epoch_accs = []
|
|
1049
|
+
current_lr = self.learning_rate_warm_up(epoch, epochs, lr, schedule)
|
|
1014
1050
|
self.alpha = min(1.0, epoch / 100)
|
|
1015
1051
|
|
|
1016
1052
|
for input_ids, y_true in zip(input_ids_list, y_true_list):
|
|
1017
1053
|
|
|
1018
1054
|
if input_ids.ndim == 1:
|
|
1019
|
-
input_ids = input_ids.
|
|
1055
|
+
input_ids = input_ids[np.newaxis, :]
|
|
1020
1056
|
if y_true.ndim == 1:
|
|
1021
|
-
y_true = y_true.
|
|
1057
|
+
y_true = y_true[np.newaxis, :]
|
|
1022
1058
|
|
|
1023
|
-
loss, acc = self.train_step(input_ids, epoch, y_true,
|
|
1059
|
+
loss, acc = self.train_step(input_ids, epoch, y_true, current_lr, mode,
|
|
1060
|
+
embedded=embedded, max_norm=max_norm, pad_token_id=pad_token_id)
|
|
1024
1061
|
epoch_losses.append(loss)
|
|
1025
1062
|
epoch_accs.append(acc)
|
|
1026
1063
|
|
|
@@ -1031,7 +1068,7 @@ class Transformer:
|
|
|
1031
1068
|
|
|
1032
1069
|
if epoch % 10 == 0:
|
|
1033
1070
|
print(f"[=] Epoch {epoch} | loss: {avg_loss:.4f} | Acc: {avg_acc:.2%}")
|
|
1034
|
-
|
|
1071
|
+
|
|
1035
1072
|
return losses, accs
|
|
1036
1073
|
|
|
1037
1074
|
|
|
@@ -1039,7 +1076,7 @@ class Transformer:
|
|
|
1039
1076
|
if not embedded and input_ids.ndim == 1:
|
|
1040
1077
|
input_ids = input_ids.reshape(1, -1)
|
|
1041
1078
|
|
|
1042
|
-
probs, attn_weights = self.forward(input_ids, embedded=embedded)
|
|
1079
|
+
probs, attn_weights = self.forward(input_ids, embedded=embedded, training=False, attn_dropout=0.0, ffn_dropout=0.0)
|
|
1043
1080
|
preds = np.argmax(probs, axis=1)
|
|
1044
1081
|
|
|
1045
1082
|
return preds, probs, attn_weights
|
|
@@ -1048,10 +1085,6 @@ class Transformer:
|
|
|
1048
1085
|
def AME_Encoder(self, x):
|
|
1049
1086
|
X = np.asarray(x)
|
|
1050
1087
|
|
|
1051
|
-
if x.shape[1] == 1:
|
|
1052
|
-
x = x.T
|
|
1053
|
-
x= x.flatten()
|
|
1054
|
-
|
|
1055
1088
|
gradient = np.gradient(x, axis=-1)
|
|
1056
1089
|
grad_energy = np.mean(np.linalg.norm(gradient, axis=-1))
|
|
1057
1090
|
X_mag = np.mean(np.linalg.norm(X, axis=-1))
|
|
@@ -1075,26 +1108,23 @@ class Transformer:
|
|
|
1075
1108
|
|
|
1076
1109
|
# Attention quality computing provides the transformer with a robust geometric-complexity alignment scalar;
|
|
1077
1110
|
# this scalar can be used to compute alpha for a much more stable forward pass in data-scarce environments, allowing it to complement the AWE MLP below.
|
|
1078
|
-
def attention_quality_computing(self, attn_weights):
|
|
1079
|
-
# Produces a scalar in [0, 1] that summarises how "high quality" the current
|
|
1080
|
-
# attention distribution is. This value is used to update self.alpha via EMA
|
|
1081
|
-
# in forward(), which in turn controls the frozen-vs-learned projection blend.
|
|
1082
|
-
#
|
|
1083
|
-
# Four complementary signals are combined:
|
|
1084
|
-
# norm_entropy — 1 − normalised entropy; near 1 when attention is focused
|
|
1085
|
-
# on a small number of tokens (confident), near 0 when flat.
|
|
1086
|
-
# avg_max — mean of per-head max weights; a direct focus indicator.
|
|
1087
|
-
# norm_var — clipped variance scaled by seq_len; captures head diversity.
|
|
1088
|
-
# qualified — geometric factor: (1 - AMR) * anisotropy;
|
|
1089
|
-
# high when data geometry is complex but AMR (model rate) is low,
|
|
1090
|
-
# effectively weighting quality higher when the model is in an
|
|
1091
|
-
# exploratory (low-AMR) regime over anisotropic data.
|
|
1092
|
-
#
|
|
1093
|
-
# Final score = qualified*(norm_entropy + avg_max) + anisotropy*norm_var
|
|
1094
|
-
# Clipped to [0, 1] and returned as dynamic_alpha.
|
|
1111
|
+
def attention_quality_computing(self, attn_weights, mask=None):
|
|
1095
1112
|
eps = 1e-5
|
|
1096
1113
|
eps = 1e-5
|
|
1097
1114
|
batch, heads, seq_len, _ = attn_weights.shape
|
|
1115
|
+
|
|
1116
|
+
if mask is not None:
|
|
1117
|
+
# mask: (B, 1, 1, T) → expand to (B, heads, T, T)
|
|
1118
|
+
mask_expanded = np.broadcast_to(
|
|
1119
|
+
mask, (batch, heads, seq_len, seq_len)
|
|
1120
|
+
)
|
|
1121
|
+
# Zero out padding positions before computing stats
|
|
1122
|
+
attn_weights = attn_weights * mask_expanded
|
|
1123
|
+
|
|
1124
|
+
# Renormalise so rows still sum to 1 over valid tokens only
|
|
1125
|
+
row_sums = attn_weights.sum(axis=-1, keepdims=True) + eps
|
|
1126
|
+
attn_weights = attn_weights / row_sums
|
|
1127
|
+
|
|
1098
1128
|
AME = self.AME_Encoder(attn_weights)
|
|
1099
1129
|
anisotropy = self.anisotropy_measurement(attn_weights)
|
|
1100
1130
|
|
|
@@ -1109,8 +1139,6 @@ class Transformer:
|
|
|
1109
1139
|
norm_var = np.clip(var_attn * seq_len, 0, 1)
|
|
1110
1140
|
|
|
1111
1141
|
AMR = 1.0 / (1.0 + np.exp(-AME)) # abstract modelling rate
|
|
1112
|
-
# qualified is high when AMR is low (geometry complex, model still learning)
|
|
1113
|
-
# and anisotropy is high (strongly directional gradients in the attention map).
|
|
1114
1142
|
qualified = (1.0 - AMR) + eps * anisotropy
|
|
1115
1143
|
|
|
1116
1144
|
quality_score = qualified * norm_entropy + qualified * avg_max + anisotropy * norm_var
|
|
@@ -1138,35 +1166,17 @@ class Dense:
|
|
|
1138
1166
|
self.activation_derivative = None
|
|
1139
1167
|
|
|
1140
1168
|
def multi_modal_linear_transformation(self, x):
|
|
1141
|
-
# Standard linear layer z = xW + b, but with a multi-level shape-mismatch
|
|
1142
|
-
# recovery cascade. This is needed because the GWS weight matrix W is shaped
|
|
1143
|
-
# at construction time from the training data, and at inference time the input
|
|
1144
|
-
# may have a different number of features (e.g. after vocabulary drift or
|
|
1145
|
-
# when calling the model with embedded TF-IDF vectors vs raw token IDs).
|
|
1146
|
-
#
|
|
1147
|
-
# Recovery hierarchy (outermost try wins):
|
|
1148
|
-
# Level 1 (primary): normal dot(x, W) + b.
|
|
1149
|
-
# Level 2 (first fallback): column-slice W to match x.shape[1], then add
|
|
1150
|
-
# a matching slice of b.
|
|
1151
|
-
# Level 3 (deep fallback): slice both x and W along whichever dimension fits,
|
|
1152
|
-
# then add a b slice. Covers edge cases where both x and W need trimming.
|
|
1153
|
-
#
|
|
1154
|
-
# The guard at the top reshapes W in-place if shapes are obviously mismatched
|
|
1155
|
-
# (x.shape[1] != W.shape[0]), preferring slicing over re-initialisation.
|
|
1156
1169
|
if len(x.shape) > 1 and x.shape[1] != self.W.shape[0]:
|
|
1157
1170
|
V1, V2 = x.shape[0], x.shape[1]
|
|
1158
1171
|
try:
|
|
1159
|
-
# Trim W's rows to match the feature dimension of x.
|
|
1160
1172
|
self.W = self.W[:V2, :]
|
|
1161
1173
|
except:
|
|
1162
|
-
# If trimming fails (W is already smaller), re-initialise with correct dims.
|
|
1163
1174
|
self.special_weight = GeometricWeightShaping(V2, V1)
|
|
1164
1175
|
self.W = self.special_weight.weight_shaping(x)
|
|
1165
1176
|
try:
|
|
1166
1177
|
try:
|
|
1167
1178
|
z = np.dot(x, self.W) + self.b
|
|
1168
1179
|
except:
|
|
1169
|
-
# W has more rows than x has columns; trim and add matching bias slice.
|
|
1170
1180
|
subnet_W = self.W[:x.shape[1], :x.shape[0]]
|
|
1171
1181
|
|
|
1172
1182
|
sub_z = np.dot(x, subnet_W)
|
|
@@ -1179,7 +1189,6 @@ class Dense:
|
|
|
1179
1189
|
subnet_W = self.W[:x.shape[1]:, :x.shape[0]]
|
|
1180
1190
|
sub_z = np.dot(x, subnet_W)
|
|
1181
1191
|
except:
|
|
1182
|
-
# Last resort: trim x to fit W or vice versa, whichever succeeds first.
|
|
1183
1192
|
weight = self.W
|
|
1184
1193
|
|
|
1185
1194
|
try:
|
|
@@ -1340,19 +1349,6 @@ class MLP:
|
|
|
1340
1349
|
|
|
1341
1350
|
|
|
1342
1351
|
def train(self, X, y, epochs=1000, lr=0.01, verbose=True):
|
|
1343
|
-
# Decide whether to use the "focused" sub-network (feed_layers) or the
|
|
1344
|
-
# standard full network (layers) for this training run.
|
|
1345
|
-
#
|
|
1346
|
-
# focused_fit_condition is True when ALL three hold:
|
|
1347
|
-
# 1. feed_layers is non-empty — a focused sub-network exists
|
|
1348
|
-
# 2. anisotropy > 0.25 — data has sufficient directional variation
|
|
1349
|
-
# (flat/isotropic data doesn't benefit from focus)
|
|
1350
|
-
# 3. AME > 0.25 — combined magnitude × gradient energy is above
|
|
1351
|
-
# a minimum threshold (data is complex enough)
|
|
1352
|
-
#
|
|
1353
|
-
# When True, only feed_layers are updated via focused_forward/focused_backward,
|
|
1354
|
-
# letting the model concentrate its learning capacity on high-complexity data
|
|
1355
|
-
# without disrupting the full network's previously learned representations.
|
|
1356
1352
|
focused_fit_condition = len(self.feed_layers) > 0 and self.anisotropy_measurement(X) > 0.25 and self.AME_Encoder(X) > 0.25
|
|
1357
1353
|
print(f'[+] Focused fit condition: {focused_fit_condition} || Anisotropy: {self.anisotropy_measurement(X):.4f} || AME: {self.AME_Encoder(X):.4f}')
|
|
1358
1354
|
for epoch in range(epochs):
|
|
@@ -1395,19 +1391,6 @@ class WeightedEnsemblePredictor:
|
|
|
1395
1391
|
|
|
1396
1392
|
|
|
1397
1393
|
def attention_memory_gate(self, probs, x):
|
|
1398
|
-
# Fast-path cache lookup: checks whether a previously seen input (stored under
|
|
1399
|
-
# prefix 'TA' in self.memory) is geometrically similar to the current input x.
|
|
1400
|
-
# Similarity is measured by cosine similarity ≥ 0.85 (tight threshold to avoid
|
|
1401
|
-
# false hits on unrelated inputs that happen to share some features).
|
|
1402
|
-
#
|
|
1403
|
-
# If a match is found, the cached attention outputs (texts, x2, x3, x4) are
|
|
1404
|
-
# returned directly, skipping a full forward pass through the transformer.
|
|
1405
|
-
# This also acts as a continual memory mechanism: the pipeline "remembers"
|
|
1406
|
-
# past attention patterns and reuses them for similar future inputs.
|
|
1407
|
-
#
|
|
1408
|
-
# Cache miss path:
|
|
1409
|
-
# - If self_attn_weights was set by a prior call, return it as a warm fallback.
|
|
1410
|
-
# - Otherwise return (None, None, None, None) signalling a full inference needed.
|
|
1411
1394
|
memory = self.memory
|
|
1412
1395
|
cache_attn_memory = [key for key, (_, inp, _, _, _) in memory.items() if key.startswith('TA') and self.pipeline.cosine_similarity(x, inp) >= 0.85]
|
|
1413
1396
|
|
|
@@ -1589,6 +1572,7 @@ class WeightedEnsemblePredictor:
|
|
|
1589
1572
|
self.credibility_summarized_prediction(input_ids, mlp_probs, trans_probs, attn_weights, type='pipeline')
|
|
1590
1573
|
except Exception as e:
|
|
1591
1574
|
print(f'[-] Cant get explainability features! : {e}')
|
|
1575
|
+
traceback.print_exc()
|
|
1592
1576
|
else:
|
|
1593
1577
|
print('[-] No agreement established, skipping explainability features.')
|
|
1594
1578
|
|
|
@@ -1623,28 +1607,6 @@ class WeightedEnsemblePredictor:
|
|
|
1623
1607
|
return anisotropy
|
|
1624
1608
|
|
|
1625
1609
|
def _dynamic_weighted_ensemble(self, trans_probs, mlp_probs, attn_weights, input_ids):
|
|
1626
|
-
# Per-sample dynamic weighting of Transformer and MLP predictions.
|
|
1627
|
-
# Unlike the static self.transformer_weight / self.mlp_weight used in
|
|
1628
|
-
# calibrate_weights(), this method derives weights on-the-fly from three signals:
|
|
1629
|
-
#
|
|
1630
|
-
# trans_conf_factor — derived from attention statistics:
|
|
1631
|
-
# attn_focus = std of the attention map (0 = flat, high = peaked)
|
|
1632
|
-
# attn_growth = sigmoid(attn_focus) — bounded confidence signal
|
|
1633
|
-
# attn_limit = (1 - attn_focus + attn_growth) * anisotropy
|
|
1634
|
-
# factor = attn_growth + attn_limit * attn_focus
|
|
1635
|
-
# Intuitively: the transformer earns more weight when its attention
|
|
1636
|
-
# is peaked (focused) AND the distribution is geometrically varied.
|
|
1637
|
-
#
|
|
1638
|
-
# mlp_conf_factor — derived from MLP output entropy:
|
|
1639
|
-
# lower entropy → sharper distribution → higher confidence → higher weight.
|
|
1640
|
-
# formula: 1 / (1 + entropy)
|
|
1641
|
-
#
|
|
1642
|
-
# agreement — 1.0 if both models predict the same class, else 0.3.
|
|
1643
|
-
# Acts as a confidence multiplier: agreement boosts both weights
|
|
1644
|
-
# proportionally, disagreement dampens the overall contribution.
|
|
1645
|
-
#
|
|
1646
|
-
# Both factors are multiplied by (1 + agreement) / 2, then normalised so they sum to 1.
|
|
1647
|
-
# The final ensemble for sample i is: trans_weight * trans_row + mlp_weight * mlp_row.
|
|
1648
1610
|
batch_size = trans_probs.shape[0]
|
|
1649
1611
|
try:
|
|
1650
1612
|
n_trans_classes = trans_probs.shape[1]
|
|
@@ -1653,8 +1615,6 @@ class WeightedEnsemblePredictor:
|
|
|
1653
1615
|
n_trans_classes = trans_probs.shape[-1]
|
|
1654
1616
|
n_mlp_classes = mlp_probs.shape[-1]
|
|
1655
1617
|
|
|
1656
|
-
# Align probability vectors to the same class count (the larger of the two).
|
|
1657
|
-
# Necessary when the transformer and MLP were trained with different label sets.
|
|
1658
1618
|
n_classes = max(n_trans_classes, n_mlp_classes)
|
|
1659
1619
|
|
|
1660
1620
|
print(f"🔄 Aligning classes: {n_trans_classes} and {n_mlp_classes} → {n_classes}")
|
|
@@ -1663,7 +1623,6 @@ class WeightedEnsemblePredictor:
|
|
|
1663
1623
|
trans_row = np.zeros(n_classes)
|
|
1664
1624
|
mlp_row = np.zeros(n_classes)
|
|
1665
1625
|
|
|
1666
|
-
# Zero-pad shorter probability vectors to n_classes, then re-normalise.
|
|
1667
1626
|
trans_row[:n_trans_classes] = trans_probs[i]
|
|
1668
1627
|
mlp_row[:n_mlp_classes] = mlp_probs[i]
|
|
1669
1628
|
|
|
@@ -1672,23 +1631,19 @@ class WeightedEnsemblePredictor:
|
|
|
1672
1631
|
|
|
1673
1632
|
trans_pred = np.argmax(trans_probs[i])
|
|
1674
1633
|
mlp_pred = np.argmax(mlp_probs[i])
|
|
1675
|
-
# agreement is a binary multiplier; 1.0 when models agree, 0.3 when they differ.
|
|
1676
1634
|
agreement = 1.0 if trans_pred == mlp_pred else 0.3
|
|
1677
1635
|
|
|
1678
1636
|
if attn_weights is not None and i < len(attn_weights):
|
|
1679
1637
|
print('🔄 Sophisticated confidence assembling')
|
|
1680
1638
|
attn = attn_weights[i]
|
|
1681
|
-
# Geometric variation in the attention map itself.
|
|
1682
1639
|
anisotropy = self.anisotropy_measurement(attn)
|
|
1683
1640
|
|
|
1684
1641
|
attn_focus = np.std(attn) if attn.size > 0 else 0.5
|
|
1685
|
-
attn_growth = 1.0 / (1.0 + np.exp(-attn_focus))
|
|
1686
|
-
# attn_limit blends (1 - focus + growth) with anisotropy to bound the factor.
|
|
1642
|
+
attn_growth = 1.0 / (1.0 + np.exp(-attn_focus))
|
|
1687
1643
|
attn_limit = (1.0 - attn_focus + attn_growth) * anisotropy
|
|
1688
1644
|
|
|
1689
1645
|
trans_conf_factor = attn_growth + attn_limit * attn_focus
|
|
1690
1646
|
else:
|
|
1691
|
-
# Fallback when per-sample attn slice is unavailable: use scalar attn_weights.
|
|
1692
1647
|
attn_growth = 1.0 / (1.0 + np.exp(-attn_weights))
|
|
1693
1648
|
anisotropy = self.anisotropy_measurement(attn_weights)
|
|
1694
1649
|
trans_conf_factor = attn_growth * anisotropy
|
|
@@ -1696,7 +1651,6 @@ class WeightedEnsemblePredictor:
|
|
|
1696
1651
|
mlp_entropy = -np.sum(mlp_probs[i] * np.log(mlp_probs[i] + 1e-8))
|
|
1697
1652
|
mlp_conf_factor = 1.0 / (1.0 + mlp_entropy) # Lower entropy = higher confidence
|
|
1698
1653
|
|
|
1699
|
-
# Scale both factors by the agreement bonus, then normalise.
|
|
1700
1654
|
trans_weight = trans_conf_factor * (1.0 + agreement) / 2
|
|
1701
1655
|
mlp_weight = mlp_conf_factor * (1.0 + agreement) / 2
|
|
1702
1656
|
|
|
@@ -1758,32 +1712,9 @@ class WeightedEnsemblePredictor:
|
|
|
1758
1712
|
return ensemble
|
|
1759
1713
|
|
|
1760
1714
|
def _meta_ensemble(self, trans_probs, mlp_probs, attn_weights, X_mlp):
|
|
1761
|
-
# Second-level ("stacking") ensemble. Instead of computing weights from raw
|
|
1762
|
-
# attention or entropy signals, it builds a meta-feature vector for each sample
|
|
1763
|
-
# that summarises both models' outputs and their relationship, then derives
|
|
1764
|
-
# sample-specific weights from those features.
|
|
1765
|
-
#
|
|
1766
|
-
# Meta-features per sample (up to 7 values):
|
|
1767
|
-
# [0] max(trans_row) — transformer peak confidence
|
|
1768
|
-
# [1] max(mlp_row) — MLP peak confidence
|
|
1769
|
-
# [2] std(trans_row) — transformer output spread (uncertainty proxy)
|
|
1770
|
-
# [3] std(mlp_row) — MLP output spread
|
|
1771
|
-
# [4] 1.0 if both agree, else 0 — inter-model agreement flag
|
|
1772
|
-
# [5] std(attn[i]) — attention map spread (if available)
|
|
1773
|
-
# [6] max(attn[i]) — peak attention value (if available)
|
|
1774
|
-
#
|
|
1775
|
-
# Weight derivation:
|
|
1776
|
-
# base_weight = 0.5 + 0.3 * agreement → 0.5 (disagree) or 0.8 (agree)
|
|
1777
|
-
# Whichever model has higher confidence gets base_weight;
|
|
1778
|
-
# the other gets 1 - base_weight.
|
|
1779
|
-
#
|
|
1780
|
-
# NOTE: there is a scoping bug here — trans_row / mlp_row from the loop above
|
|
1781
|
-
# are used outside the loop in the weight application (line ~1582). On the last
|
|
1782
|
-
# iteration they hold values for sample batch_size-1, but for earlier iterations
|
|
1783
|
-
# the wrong row is applied. Flagged in code review.
|
|
1784
1715
|
batch_size = trans_probs.shape[0]
|
|
1785
1716
|
n_classes = trans_probs.shape[1]
|
|
1786
|
-
threshold_feature = 0.1 + self.pipeline.confidence_threshold
|
|
1717
|
+
threshold_feature = 0.1 + self.pipeline.confidence_threshold
|
|
1787
1718
|
|
|
1788
1719
|
n_trans_classes = trans_probs.shape[1]
|
|
1789
1720
|
n_mlp_classes = mlp_probs.shape[1]
|
|
@@ -1798,7 +1729,6 @@ class WeightedEnsemblePredictor:
|
|
|
1798
1729
|
trans_row[:n_trans_classes] = trans_probs[i]
|
|
1799
1730
|
mlp_row[:n_mlp_classes] = mlp_probs[i]
|
|
1800
1731
|
|
|
1801
|
-
# Re-normalise after zero-padding to maintain valid probability distributions.
|
|
1802
1732
|
trans_row = trans_row / (trans_row.sum() + 1e-8)
|
|
1803
1733
|
mlp_row = mlp_row / (mlp_row.sum() + 1e-8)
|
|
1804
1734
|
|
|
@@ -2060,14 +1990,6 @@ class ExplainabilityModule:
|
|
|
2060
1990
|
# 3. IMMEDIATE TRAINING (single step with higher Learning Rate)
|
|
2061
1991
|
anisotropy = self.pipeline.anisotropy_measurement(X)
|
|
2062
1992
|
|
|
2063
|
-
# Derive a geometry-aware learning rate for the correction step.
|
|
2064
|
-
# anisotropy_dist: sigmoid of anisotropy — saturates to 1 for strongly directional data.
|
|
2065
|
-
# deviation: inverse of std; near 1 when features are tightly clustered (low spread).
|
|
2066
|
-
# AEL (Adaptive Error Level): high when data is variable (low deviation) AND anisotropic.
|
|
2067
|
-
# AEL → 1 ⟹ corrective LR = 2/(1+1) = 1.0 (fast correction on complex data)
|
|
2068
|
-
# AEL → 0 ⟹ corrective LR = 2/(1+0) = 2.0 (even faster on flat/simple data)
|
|
2069
|
-
# This intentionally boosts the correction LR above the normal training LR so
|
|
2070
|
-
# a single wrong prediction can be overridden quickly without many epochs.
|
|
2071
1993
|
anisotropy_dist = 1.0 / (1.0 + np.exp(-anisotropy))
|
|
2072
1994
|
deviation = 1.0 / (1.0 + np.std(X))
|
|
2073
1995
|
AEL = (1.0 - deviation) * anisotropy_dist + eps
|
|
@@ -2363,26 +2285,6 @@ class ExplainabilityModule:
|
|
|
2363
2285
|
|
|
2364
2286
|
|
|
2365
2287
|
def _get_final_output(self, mlp_pred, mlp_conf, trans_pred, trans_conf, attn_weights):
|
|
2366
|
-
# Resolves the final prediction when the two models disagree.
|
|
2367
|
-
# When they agree, the higher-confidence model's score is taken directly.
|
|
2368
|
-
# When they disagree, an "Abstract Attention Transformation" (AAT) scalar
|
|
2369
|
-
# is computed to determine which model to trust more:
|
|
2370
|
-
#
|
|
2371
|
-
# sliced_anisotropy — directional variation in the first attention slice;
|
|
2372
|
-
# high → attention is non-uniform / informative.
|
|
2373
|
-
# deviation — 1/(1 + std(attn_weights)); near 1 when attention is tightly
|
|
2374
|
-
# concentrated, near 0 when it is spread out.
|
|
2375
|
-
# attn_quality — overall quality score from attention_quality_computing.
|
|
2376
|
-
# AAT — deviation * (1 - sliced_anisotropy):
|
|
2377
|
-
# high when attention is concentrated (low anisotropy) AND
|
|
2378
|
-
# tightly distributed (low spread); this configuration favours
|
|
2379
|
-
# the transformer's focused contextual prediction.
|
|
2380
|
-
#
|
|
2381
|
-
# Confidence blending on disagreement:
|
|
2382
|
-
# If MLP wins: final_conf = mlp_conf * (1 - trans_conf) * (1 - AAT)
|
|
2383
|
-
# → lower AAT (diffuse attention) → MLP gets more room to dominate.
|
|
2384
|
-
# If Transformer wins: final_conf = trans_conf * (1 - mlp_conf) * AAT
|
|
2385
|
-
# → higher AAT (focused attention) → transformer earns a larger share.
|
|
2386
2288
|
eps = 1e-5
|
|
2387
2289
|
if isinstance(mlp_conf, np.ndarray):
|
|
2388
2290
|
mlp_conf = np.clip(np.mean(mlp_conf), 0, 1)
|
|
@@ -2940,18 +2842,6 @@ class ModelStorage:
|
|
|
2940
2842
|
|
|
2941
2843
|
|
|
2942
2844
|
def save_model_dict(self, memory_name, model_dict, type=None, model_type='mlp'):
|
|
2943
|
-
# Persists a model's in-memory dict to SQLite using an "active record" versioning
|
|
2944
|
-
# pattern: each save inserts a new row marked is_active=1, then immediately
|
|
2945
|
-
# deactivates all other rows for the same memory_name via a secondary UPDATE.
|
|
2946
|
-
# This means only the most recent save is "live" — reads always fetch is_active=1.
|
|
2947
|
-
#
|
|
2948
|
-
# Two destination tables depending on the `type` argument:
|
|
2949
|
-
# type == 'Transformer' → model_attn_storage (stores attention-related weights)
|
|
2950
|
-
# else → model_storage (stores MLP / pipeline weights)
|
|
2951
|
-
#
|
|
2952
|
-
# numpy arrays inside model_dict are recursively converted to Python lists
|
|
2953
|
-
# by _prepare_for_serialization() before json.dumps, ensuring they round-trip
|
|
2954
|
-
# correctly when loaded back via _convert_to_arrays().
|
|
2955
2845
|
try:
|
|
2956
2846
|
db_path = self.get_database_path()
|
|
2957
2847
|
conn = sqlite3.connect(db_path)
|
|
@@ -2970,7 +2860,6 @@ class ModelStorage:
|
|
|
2970
2860
|
VALUES (?, ?, ?, ?)
|
|
2971
2861
|
""", (memory_name, model_type, model_json, 1))
|
|
2972
2862
|
|
|
2973
|
-
# Deactivate all other rows for this memory_name (soft-delete old versions).
|
|
2974
2863
|
c.execute("""
|
|
2975
2864
|
UPDATE model_attn_storage
|
|
2976
2865
|
SET is_active = 0
|
|
@@ -2988,7 +2877,6 @@ class ModelStorage:
|
|
|
2988
2877
|
VALUES (?, ?, ?, ?)
|
|
2989
2878
|
""", (memory_name, model_type, model_json, 1))
|
|
2990
2879
|
|
|
2991
|
-
# Deactivate all other rows for this memory_name (soft-delete old versions).
|
|
2992
2880
|
c.execute("""
|
|
2993
2881
|
UPDATE model_storage
|
|
2994
2882
|
SET is_active = 0
|
|
@@ -3068,24 +2956,14 @@ class ModelStorage:
|
|
|
3068
2956
|
|
|
3069
2957
|
|
|
3070
2958
|
def _parse_array_string(self, s):
|
|
3071
|
-
|
|
3072
|
-
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
#
|
|
3076
|
-
# Strategy order (first success wins):
|
|
3077
|
-
# 1. JSON array — handles standard serialisation from json.dumps.
|
|
3078
|
-
# 2. ast.literal_eval — handles Python repr output, e.g. "[0.1, 0.2, ...]".
|
|
3079
|
-
# 3. Space/bracket-separated floats — covers numpy __str__ output like
|
|
3080
|
-
# "[ 0.1 0.2 0.3]" (spaces instead of commas, optional brackets).
|
|
3081
|
-
# 4. Comma-separated floats — fallback for CSV-style strings.
|
|
3082
|
-
#
|
|
3083
|
-
# Returns the original string unchanged if all strategies fail, letting the
|
|
3084
|
-
# caller handle the type mismatch rather than silently producing garbage data.
|
|
2959
|
+
"""
|
|
2960
|
+
Parse string representation of array back to numpy array.
|
|
2961
|
+
Returns original string if parsing fails.
|
|
2962
|
+
"""
|
|
3085
2963
|
if not isinstance(s, str) or not s:
|
|
3086
2964
|
return s
|
|
3087
2965
|
|
|
3088
|
-
#
|
|
2966
|
+
# Clean the string
|
|
3089
2967
|
s = s.replace('\n', '').replace('\r', '').replace('\t', '')
|
|
3090
2968
|
s = ' '.join(s.split()).strip()
|
|
3091
2969
|
|
|
@@ -3346,14 +3224,6 @@ class ModelStorage:
|
|
|
3346
3224
|
pass
|
|
3347
3225
|
|
|
3348
3226
|
def load_peer_request_dict(self, memory_name, agent_id):
|
|
3349
|
-
# Retrieves a peer agent's stored prediction request from agent_attn_storage,
|
|
3350
|
-
# excluding rows whose agent_id matches any ID in the provided list.
|
|
3351
|
-
# The exclusion prevents an agent from retrieving its own previously stored
|
|
3352
|
-
# request, ensuring it only receives data from *other* agents in the network.
|
|
3353
|
-
#
|
|
3354
|
-
# The IN clause is constructed dynamically with one '?' placeholder per agent_id
|
|
3355
|
-
# entry, which is safe against SQL injection via parameterised queries.
|
|
3356
|
-
# Returns (model_attn_data, model_target_pred) parsed from JSON, or (None, None).
|
|
3357
3227
|
print(f'|| Peer request with Agent')
|
|
3358
3228
|
try:
|
|
3359
3229
|
try:
|
|
@@ -3820,10 +3690,7 @@ class AsyncMessageQueue:
|
|
|
3820
3690
|
if not success:
|
|
3821
3691
|
self._stats['messages_failed'] += 1
|
|
3822
3692
|
|
|
3823
|
-
#
|
|
3824
|
-
# alpha = 0.1 means the current measurement contributes 10 % to the running average,
|
|
3825
|
-
# providing a smoothed latency estimate that is robust to spikes without requiring
|
|
3826
|
-
# a fixed-size history window.
|
|
3693
|
+
# Update moving average
|
|
3827
3694
|
alpha = 0.1 # Smoothing factor
|
|
3828
3695
|
self._stats['avg_latency'] = alpha * latency + (1 - alpha) * self._stats['avg_latency']
|
|
3829
3696
|
|
|
@@ -4069,7 +3936,7 @@ class AgentDistributedInference:
|
|
|
4069
3936
|
# Security: Audit log
|
|
4070
3937
|
self.security_log = []
|
|
4071
3938
|
|
|
4072
|
-
self.enable_ssl = False
|
|
3939
|
+
self.enable_ssl = False # Set to True to enable SSL encryption
|
|
4073
3940
|
# i provided basic cert file and key since there are other layered security other than ssl, and also due to infrequent external connections.
|
|
4074
3941
|
self.ssl_cert_file = ssl_cert_file
|
|
4075
3942
|
self.ssl_key_file = ssl_key_file
|
|
@@ -4310,7 +4177,7 @@ class AgentDistributedInference:
|
|
|
4310
4177
|
key = self.secret_key.encode() if isinstance(self.secret_key, str) else self.secret_key
|
|
4311
4178
|
signature = hmac.new(key, message_bytes, hashlib.sha256).hexdigest()
|
|
4312
4179
|
|
|
4313
|
-
print(f'|| Signing message with: {len(message)} total of size
|
|
4180
|
+
print(f'|| Signing message with: {len(message)} total of size')
|
|
4314
4181
|
logger.info(f"[=] Signing message: {len(message)}")
|
|
4315
4182
|
return signature
|
|
4316
4183
|
|
|
@@ -4579,7 +4446,7 @@ class AgentDistributedInference:
|
|
|
4579
4446
|
print(f"[-] Connection attempt to blocked IP: {host}")
|
|
4580
4447
|
self._log_security_event('connection_blocked', {'ip': host})
|
|
4581
4448
|
return None
|
|
4582
|
-
|
|
4449
|
+
|
|
4583
4450
|
# Socket creation
|
|
4584
4451
|
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
4585
4452
|
|
|
@@ -5366,7 +5233,7 @@ class AgentDistributedInference:
|
|
|
5366
5233
|
print(f'[||] Successfully calibrate probs with previous Peer using database!')
|
|
5367
5234
|
self.save_to_local_peer(self.memory_name, probs)
|
|
5368
5235
|
else:
|
|
5369
|
-
print(f'[-] Connection to peer agent {self.temporary_agent_id}
|
|
5236
|
+
print(f'[-] Connection to peer agent {self.temporary_agent_id} is not permitted, returning regular probs...')
|
|
5370
5237
|
|
|
5371
5238
|
return probs
|
|
5372
5239
|
|
|
@@ -6160,13 +6027,15 @@ class IntegratedPipeline:
|
|
|
6160
6027
|
self.titles = None
|
|
6161
6028
|
self.labels = None
|
|
6162
6029
|
|
|
6163
|
-
self.use_transformer =
|
|
6030
|
+
self.use_transformer = True
|
|
6164
6031
|
self.agreement = False
|
|
6165
6032
|
self.external_peer_enabled = False
|
|
6166
6033
|
self.autonomous = False
|
|
6167
6034
|
self.show_explainability_details = True
|
|
6168
6035
|
|
|
6169
6036
|
self.temperature = 1.0
|
|
6037
|
+
self.transformer_lr = 0.1
|
|
6038
|
+
|
|
6170
6039
|
self.memory_name = memory_name
|
|
6171
6040
|
|
|
6172
6041
|
self.pending_batch = []
|
|
@@ -8072,16 +7941,15 @@ class IntegratedPipeline:
|
|
|
8072
7941
|
_, y_true = self.input_encoding(datasets)
|
|
8073
7942
|
sequence_inputs = self.sequence_encoding(datasets)
|
|
8074
7943
|
unsuitable_training = self.training_necessary_condition(sequence_inputs, X_raw)
|
|
7944
|
+
lr = self.model2.transformer_lr if self.model2 else self.transformer_lr
|
|
8075
7945
|
|
|
8076
7946
|
if not unsuitable_training:
|
|
8077
7947
|
print(f'🚀 Training Transformer with {len(sequence_inputs)} Samples: ')
|
|
8078
7948
|
conditional_anisotropy = self.anisotropy_measurement(sequence_inputs)
|
|
8079
7949
|
if conditional_anisotropy >= self.confidence_threshold:
|
|
8080
|
-
lr = 1e-4
|
|
8081
7950
|
print('[+] Dynamic Backward')
|
|
8082
7951
|
mode = 'dynamic_backward'
|
|
8083
7952
|
else:
|
|
8084
|
-
lr = 0.1
|
|
8085
7953
|
print('[-] Fixed Backward')
|
|
8086
7954
|
mode = 'fixed_backward'
|
|
8087
7955
|
|
|
@@ -11092,7 +10960,7 @@ class ConsecutivePeerAgent:
|
|
|
11092
10960
|
# Verify message signature
|
|
11093
10961
|
expected = self._sign_message({k: v for k, v in message.items() if k != 'signature'})
|
|
11094
10962
|
|
|
11095
|
-
print(f'[ConsecutivePeerAgent] Comparing Signature and
|
|
10963
|
+
print(f'[ConsecutivePeerAgent] Comparing Signature and verifying...')
|
|
11096
10964
|
return hmac.compare_digest(expected, signature)
|
|
11097
10965
|
|
|
11098
10966
|
def _send_message(self, sock: socket.socket, message: dict) -> bool:
|
|
@@ -13167,39 +13035,9 @@ def PermissiveTest():
|
|
|
13167
13035
|
pass
|
|
13168
13036
|
|
|
13169
13037
|
|
|
13170
|
-
def main_cli():
|
|
13171
|
-
"""Command-line interface entry point"""
|
|
13172
|
-
import argparse
|
|
13173
|
-
import asyncio
|
|
13174
|
-
|
|
13175
|
-
parser = argparse.ArgumentParser(description="AbstractIntegratedModule - AI Multi-agent System")
|
|
13176
|
-
parser.add_argument("--version", action="store_true", help="Show version")
|
|
13177
|
-
parser.add_argument("--train", help="Training data file")
|
|
13178
|
-
parser.add_argument("--predict", help="Text to predict")
|
|
13179
|
-
|
|
13180
|
-
args = parser.parse_args()
|
|
13181
|
-
|
|
13182
|
-
if args.version:
|
|
13183
|
-
print(f"AbstractIntegratedModule version {__version__}")
|
|
13184
|
-
return
|
|
13185
|
-
|
|
13186
|
-
if args.predict:
|
|
13187
|
-
# Simple prediction example
|
|
13188
|
-
pipeline = IntegratedPipeline("temp", use_async=False)
|
|
13189
|
-
result = pipeline.predict_single(args.predict)
|
|
13190
|
-
print(f"[=] Prediction: {result}")
|
|
13191
|
-
|
|
13192
|
-
if args.train:
|
|
13193
|
-
print(f"[=] Training with {args.train}")
|
|
13194
|
-
|
|
13195
|
-
|
|
13196
|
-
|
|
13197
|
-
|
|
13198
|
-
|
|
13199
13038
|
if __name__ == "__main__":
|
|
13200
13039
|
try:
|
|
13201
13040
|
PermissiveTest()
|
|
13202
|
-
main_cli()
|
|
13203
13041
|
except Exception as e:
|
|
13204
13042
|
print(f'|| Program Crashed..., Error: {e}')
|
|
13205
13043
|
traceback.print_exc()
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
AbstractIntegratedModule.py,sha256=c_YlNhlaZUyRfIG3r6niicfk-9yJ1Zu_jBqlUsbcjmQ,540197
|
|
2
|
+
abstractintegratedmodule-0.1.9.dist-info/METADATA,sha256=U_TIRHbZSR-YtzVztfxzskYcBcwlKyIK6AV2KgYclnM,57653
|
|
3
|
+
abstractintegratedmodule-0.1.9.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
4
|
+
abstractintegratedmodule-0.1.9.dist-info/top_level.txt,sha256=H2-a2eP316_DXtZSJz2ztHo8n32qVrJCtyyvuc8m87E,25
|
|
5
|
+
abstractintegratedmodule-0.1.9.dist-info/RECORD,,
|
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
AbstractIntegratedModule.py,sha256=AGmy7En70jha9YnTRT9knYHHd2K5yV6ZANI1HgnWams,553844
|
|
2
|
-
abstractintegratedmodule-0.1.8.dist-info/METADATA,sha256=6XbxKZ2G4wPAwlmhYOtQyq-7vNWLvp6LkBQCAHB1_Lg,57653
|
|
3
|
-
abstractintegratedmodule-0.1.8.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
4
|
-
abstractintegratedmodule-0.1.8.dist-info/top_level.txt,sha256=H2-a2eP316_DXtZSJz2ztHo8n32qVrJCtyyvuc8m87E,25
|
|
5
|
-
abstractintegratedmodule-0.1.8.dist-info/RECORD,,
|
|
File without changes
|
{abstractintegratedmodule-0.1.8.dist-info → abstractintegratedmodule-0.1.9.dist-info}/top_level.txt
RENAMED
|
File without changes
|