PyPI - dl-backtrace - Versions diffs - 0.0.18__py3-none-any.whl → 0.0.20.dev36__py3-none-any.whl - Mend - Supply Chain Defender

dl-backtrace 0.0.18__py3-none-any.whl → 0.0.20.dev36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dl-backtrace might be problematic. Click here for more details.

Files changed (11)

dl_backtrace/pytorch_backtrace/backtrace/utils/prop.py — CHANGED

@@ -1,42 +1,32 @@
 import gc
+import torch
 import numpy as np
-import tensorflow as tf
 from numpy.lib.stride_tricks import as_strided
-from tensorflow.keras import backend as K
 def np_swish(x, beta=0.75):
     z = 1 / (1 + np.exp(-(beta * x)))
     return x * z
 def np_wave(x, alpha=1.0):
     return (alpha * x * np.exp(1.0)) / (np.exp(-x) + np.exp(x))
 def np_pulse(x, alpha=1.0):
     return alpha * (1 - np.tanh(x) * np.tanh(x))
 def np_absolute(x, alpha=1.0):
     return alpha * x * np.tanh(x)
 def np_hard_sigmoid(x):
     return np.clip(0.2 * x + 0.5, 0, 1)
 def np_sigmoid(x):
     z = 1 / (1 + np.exp(-x))
     return z
 def np_tanh(x):
     z = np.tanh(x)
     return z.astype(np.float32)
 class LSTM_forward(object):
     def __init__(
         self, num_cells, units, weights, return_sequence=False, go_backwards=False
@@ -48,8 +38,8 @@ class LSTM_forward(object):
         self.bias = weights[2][1]
         self.return_sequence = return_sequence
         self.go_backwards = go_backwards
-        self.recurrent_activation = tf.math.sigmoid
-        self.activation = tf.math.tanh
+        self.recurrent_activation = torch.sigmoid()
+        self.activation = torch.tanh()
         self.compute_log = {}
         for i in range(self.num_cells):
             self.compute_log[i] = {}
@@ -63,23 +53,19 @@ class LSTM_forward(object):
         """Computes carry and output using split kernels."""
         x_i, x_f, x_c, x_o = x
         h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o = h_tm1
-        #print(self.recurrent_kernel[1][:, : self.units].shape)
-        #print(h_tm1_i.shape,self.recurrent_kernel[1][:, : self.units].shape)
-        w=tf.convert_to_tensor(self.recurrent_kernel[1], dtype=tf.float32)
-        #print(K.dot(h_tm1_i, w[:, : self.units]))
+        w=torch.as_tensor(self.recurrent_kernel[1], dtype=torch.float32)
         i = self.recurrent_activation(
-            x_i + K.dot(h_tm1_i, w[:, : self.units])
+            x_i + torch.dot(h_tm1_i, w[:, : self.units])
         )
         f = self.recurrent_activation(
-            x_f + K.dot(h_tm1_f, w[:, self.units : self.units * 2])
+            x_f + torch.dot(h_tm1_f, w[:, self.units : self.units * 2])
         )
         c = f * c_tm1 + i * self.activation(
             x_c
-            + K.dot(h_tm1_c, w[:, self.units * 2 : self.units * 3])
+            + torch.dot(h_tm1_c, w[:, self.units * 2 : self.units * 3])
         )
         o = self.recurrent_activation(
-            x_o + K.dot(h_tm1_o, w[:, self.units * 3 :])
+            x_o + torch.dot(h_tm1_o, w[:, self.units * 3 :])
         )
         self.compute_log[cell_num]["int_arrays"]["i"] = i
         self.compute_log[cell_num]["int_arrays"]["f"] = f
@@ -97,16 +83,16 @@ class LSTM_forward(object):
         inputs_f = inputs
         inputs_c = inputs
         inputs_o = inputs
-        k_i, k_f, k_c, k_o = tf.split(self.kernel[1], num_or_size_splits=4, axis=1)
-        x_i = K.dot(inputs_i, k_i)
-        x_f = K.dot(inputs_f, k_f)
-        x_c = K.dot(inputs_c, k_c)
-        x_o = K.dot(inputs_o, k_o)
-        b_i, b_f, b_c, b_o = tf.split(self.bias, num_or_size_splits=4, axis=0)
-        x_i = tf.add(x_i, b_i)
-        x_f = tf.add(x_f, b_f)
-        x_c = tf.add(x_c, b_c)
-        x_o = tf.add(x_o, b_o)
+        k_i, k_f, k_c, k_o = torch.split(self.kernel[1],self.kernel.size(1)//4,dim=1)
+        x_i = torch.dot(inputs_i, k_i)
+        x_f = torch.dot(inputs_f, k_f)
+        x_c = torch.dot(inputs_c, k_c)
+        x_o = torch.dot(inputs_o, k_o)
+        b_i, b_f, b_c, b_o = torch.split(self.bias,self.bias.size(1)//4,dim=0)
+        x_i = x_i + b_i
+        x_f = x_f + b_f
+        x_c = x_c + b_c
+        x_o = x_o + b_o
         h_tm1_i = h_tm1
         h_tm1_f = h_tm1
@@ -123,12 +109,12 @@ class LSTM_forward(object):
         return h, [h, c]
     def calculate_lstm_wt(self, input_data):
-        hstate = tf.convert_to_tensor(np.zeros((1, self.units)), dtype=tf.float32)
-        cstate = tf.convert_to_tensor(np.zeros((1, self.units)), dtype=tf.float32)
+        hstate = torch.tensor((1,self.units),dtype=torch.float32)
+        cstate = torch.tensor((1,self.units),dtype=torch.float32)
         output = []
         for ind in range(input_data.shape[0]):
-            inp = tf.convert_to_tensor(
-                input_data[ind, :].reshape((1, input_data.shape[1])), dtype=tf.float32
+            inp = torch.tensor(
+                input_data[ind, :].reshape((1, input_data.shape[1])), dtype=torch.float32
             )
             h, s = self.calculate_lstm_cell_wt(inp, [hstate, cstate], ind)
             hstate = s[0]
@@ -136,9 +122,6 @@ class LSTM_forward(object):
             output.append(h)
         return output
 class LSTM_backtrace(object):
     def __init__(
         self, num_cells, units, weights, return_sequence=False, go_backwards=False
@@ -270,8 +253,6 @@ class LSTM_backtrace(object):
         x_i, x_f, x_c, x_o = x
         f = self.compute_log[cell_num]["int_arrays"]["f"].numpy()[0]
         i = self.compute_log[cell_num]["int_arrays"]["i"].numpy()[0]
-        #         o = self.recurrent_activation(
-        #             x_o + np.dot(h_tm1_o, self.recurrent_kernel[:, self.units * 3:])).astype(np.float32)
         temp1 = np.dot(h_tm1_o, self.recurrent_kernel[1][:, self.units * 3 :]).astype(
             np.float32
         )
@@ -283,9 +264,6 @@ class LSTM_backtrace(object):
             [],
             {"type": None},
         )
-        #         c = f * c_tm1 + i * self.activation(x_c + np.dot(
-        #             h_tm1_c, self.recurrent_kernel[:, self.units * 2:self.units * 3])).astype(np.float32)
         temp2 = f * c_tm1
         temp3_1 = np.dot(
             h_tm1_c, self.recurrent_kernel[1][:, self.units * 2 : self.units * 3]
@@ -303,9 +281,6 @@ class LSTM_backtrace(object):
             [],
             {"type": None},
         )
-        #         f = self.recurrent_activation(x_f + np.dot(
-        #             h_tm1_f, self.recurrent_kernel[:, self.units:self.units * 2])).astype(np.float32)
         temp4 = np.dot(h_tm1_f, self.recurrent_kernel[1][:, self.units : self.units * 2])
         wt_x_f, wt_temp4 = self.calculate_wt_add(wt_f, [x_f, temp4])
         wt_h_tm1_f = self.calculate_wt_fc(
@@ -315,9 +290,6 @@ class LSTM_backtrace(object):
             [],
             {"type": None},
         )
-        #         i = self.recurrent_activation(
-        #             x_i + np.dot(h_tm1_i, self.recurrent_kernel[:, :self.units])).astype(np.float32)
         temp5 = np.dot(h_tm1_i, self.recurrent_kernel[1][:, : self.units])
         wt_x_i, wt_temp5 = self.calculate_wt_add(wt_i, [x_i, temp5])
         wt_h_tm1_i = self.calculate_wt_fc(
@@ -364,7 +336,6 @@ class LSTM_backtrace(object):
         wt_h_tm1 = wt_h_tm1_i + wt_h_tm1_f + wt_h_tm1_c + wt_h_tm1_o
         inputs = self.compute_log[cell_num]["inp"].numpy()[0]
-        #print(np.split(self.kernel[1], indices_or_sections=4, axis=1))
         k_i, k_f, k_c, k_o = np.split(self.kernel[1], indices_or_sections=4, axis=1)
         b_i, b_f, b_c, b_o = np.split(self.bias[1], indices_or_sections=4, axis=0)
@@ -395,12 +366,10 @@ class LSTM_backtrace(object):
         output.reverse()
         return np.array(output)
 def dummy_wt(wts, inp, *args):
     test_wt = np.zeros_like(inp)
     return test_wt
 def calculate_wt_fc(wts, inp, w, b, act):
     mul_mat = np.einsum("ij,i->ij", w.numpy().T, inp).T
     wt_mat = np.zeros(mul_mat.shape)
@@ -461,12 +430,10 @@ def calculate_wt_fc(wts, inp, w, b, act):
     wt_mat = wt_mat.sum(axis=0)
     return wt_mat
 def calculate_wt_rshp(wts, inp=None):
     x = np.reshape(wts, inp.shape)
     return x
 def calculate_wt_concat(wts, inp=None, axis=-1):
     wts=wts.T
     splits = [i.shape[axis] for i in inp]
@@ -476,7 +443,6 @@ def calculate_wt_concat(wts, inp=None, axis=-1):
     x = np.split(wts, indices_or_sections=splits, axis=axis)
     return x
 def calculate_wt_add(wts, inp=None):
     wts=wts.T
     wt_mat = []
@@ -523,199 +489,231 @@ def calculate_wt_add(wts, inp=None):
     wt_mat = [i.reshape(wts.shape) for i in list(wt_mat)]
     return wt_mat
-def calculate_start_wt(arg):
-    x = np.argmax(arg[0])
-    y = np.zeros(arg.shape)
-    y[0][x] = 1
+def calculate_start_wt(arg, scaler=None,thresholding=0.5,task="binary-classification"):
+    if arg.ndim == 2:
+        if task == "binary-classification" or task == "multi-class classification":
+            x = np.argmax(arg[0])
+            m = np.max(arg[0])
+            y = np.zeros(arg.shape)
+            if scaler:
+                y[0][x] = scaler
+            else:
+                y[0][x] = m
+        elif task == "bbox-regression":
+            y = np.zeros(arg.shape)
+            if scaler:
+                y[0] = scaler
+                num_non_zero_elements = np.count_nonzero(y)
+                if num_non_zero_elements > 0:
+                    y = y / num_non_zero_elements
+            else:
+                m = np.max(arg[0])
+                x = np.argmax(arg[0])
+                y[0][x] = m
+        else:
+            x = np.argmax(arg[0])
+            m = np.max(arg[0])
+            y = np.zeros(arg.shape)
+            if scaler:
+                y[0][x] = scaler
+            else:
+                y[0][x] = m
+    elif arg.ndim == 4 and task == "binary-segmentation":
+        indices = np.where(arg > thresholding)
+        y = np.zeros(arg.shape)
+        if scaler:
+            y[indices] = scaler
+            num_non_zero_elements = np.count_nonzero(y)
+            if num_non_zero_elements > 0:
+                y = y / num_non_zero_elements
+        else:
+            y[indices] = arg[indices]
+    else:
+        x = np.argmax(arg[0])
+        m = np.max(arg[0])
+        y = np.zeros(arg.shape)
+        if scaler:
+            y[0][x] = scaler
+        else:
+            y[0][x] = m
     return y[0]
 def calculate_wt_passthru(wts):
     return wts
+def calculate_wt_zero_pad(wts,inp,padding):
+    wt_mat = wts[padding[0][0]:inp.shape[0]+padding[0][0],padding[1][0]:inp.shape[1]+padding[1][0],:]
+    return wt_mat
+def calculate_padding(kernel_size, inp, padding, strides, const_val=0.0):
+    if padding=='valid':
+        return (inp, [[0,0],[0,0],[0,0]])
+    elif padding == 'same':
+        h = inp.shape[0]%strides[0]
+        if h==0:
+            pad_h = np.max([0,kernel_size[0]-strides[0]])
+        else:
+            pad_h = np.max([0,kernel_size[0]-h])
-def calculate_wt_conv_unit(wt, p_mat, n_mat, t_sum, p_sum, n_sum, act):
-    wt_mat = np.zeros_like(p_mat)
-    if act["type"] == "mono":
+        v = inp.shape[1]%strides[1]
+        if v==0:
+            pad_v = np.max([0,kernel_size[1]-strides[1]])
+        else:
+            pad_v = np.max([0,kernel_size[1]-v])
+        paddings = [np.floor([pad_h/2.0,(pad_h+1)/2.0]).astype("int32"),
+                    np.floor([pad_v/2.0,(pad_v+1)/2.0]).astype("int32"),
+                    np.zeros((2)).astype("int32")]
+        inp_pad = np.pad(inp, paddings, 'constant', constant_values=const_val)
+        return (inp_pad,paddings)
+    else:
+        if isinstance(padding, tuple) and padding != (None, None):
+            pad_h = padding[0]
+            pad_v = padding[1]
+            paddings = [np.floor([pad_h,pad_h]).astype("int32"),
+                    np.floor([pad_v,pad_v]).astype("int32"),
+                    np.zeros((2)).astype("int32")]
+            inp_pad = np.pad(inp, paddings, 'constant', constant_values=const_val)
+            return (inp_pad,paddings)
+        else:
+            return (inp, [[0,0],[0,0],[0,0]])
+def calculate_wt_conv_unit(patch, wts, w, b, act):
+    k = w.numpy()
+    bias = b.numpy()
+    b_ind = bias>0
+    bias_pos = bias*b_ind
+    b_ind = bias<0
+    bias_neg = bias*b_ind*-1.0
+    conv_out = np.einsum("ijkl,ijk->ijkl",k,patch)
+    p_ind = conv_out>0
+    p_ind = conv_out*p_ind
+    p_sum = np.einsum("ijkl->l",p_ind)
+    n_ind = conv_out<0
+    n_ind = conv_out*n_ind
+    n_sum = np.einsum("ijkl->l",n_ind)*-1.0
+    t_sum = p_sum+n_sum
+    wt_mat = np.zeros_like(k)
+    p_saturate = p_sum>0
+    n_saturate = n_sum>0
+    if act["type"]=='mono':
         if act["range"]["l"]:
-            if t_sum < act["range"]["l"]:
-                p_sum = 0
+            temp_ind = t_sum > act["range"]["l"]
+            p_saturate = temp_ind
         if act["range"]["u"]:
-            if t_sum > act["range"]["u"]:
-                n_sum = 0
-    elif act["type"] == "non_mono":
+            temp_ind = t_sum < act["range"]["u"]
+            n_saturate = temp_ind
+    elif act["type"]=='non_mono':
         t_act = act["func"](t_sum)
-        p_act = act["func"](p_sum)
-        n_act = act["func"](n_sum)
+        p_act = act["func"](p_sum + bias_pos)
+        n_act = act["func"](-1*(n_sum + bias_neg))
         if act["range"]["l"]:
-            if t_sum < act["range"]["l"]:
-                p_sum = 0
+            temp_ind = t_sum > act["range"]["l"]
+            p_saturate = p_saturate*temp_ind
         if act["range"]["u"]:
-            if t_sum > act["range"]["u"]:
-                n_sum = 0
-        if p_sum > 0 and n_sum > 0:
-            if t_act == p_act:
-                n_sum = 0
-            elif t_act == n_act:
-                p_sum = 0
-    p_agg_wt = 0.0
-    n_agg_wt = 0.0
-    if p_sum + n_sum > 0.0:
-        p_agg_wt = p_sum / (p_sum + n_sum)
-        n_agg_wt = n_sum / (p_sum + n_sum)
-    if p_sum == 0.0:
-        p_sum = 1.0
-    if n_sum == 0.0:
-        n_sum = 1.0
-    wt_mat = wt_mat + ((p_mat / p_sum) * wt * p_agg_wt)
-    wt_mat = wt_mat + ((n_mat / n_sum) * wt * n_agg_wt * -1.0)
+            temp_ind = t_sum < act["range"]["u"]
+            n_saturate = n_saturate*temp_ind
+        temp_ind = np.abs(t_act - p_act)>1e-5
+        n_saturate = n_saturate*temp_ind
+        temp_ind = np.abs(t_act - n_act)>1e-5
+        p_saturate = p_saturate*temp_ind
+    p_agg_wt = (1.0/(p_sum+n_sum+bias_pos+bias_neg))*wts*p_saturate
+    n_agg_wt = (1.0/(p_sum+n_sum+bias_pos+bias_neg))*wts*n_saturate
+    wt_mat = wt_mat+(p_ind*p_agg_wt)
+    wt_mat = wt_mat+(n_ind*n_agg_wt*-1.0)
+    wt_mat = np.sum(wt_mat,axis=-1)
     return wt_mat
-def calculate_wt_conv(wts, inp, w, b, act):
-    wts=wts.T
-    inp=inp.T
-    w=w.T
-    expanded_input = as_strided(
-        inp,
-        shape=(
-            inp.shape[0]
-            - w.numpy().shape[0]
-            + 1,  # The feature map is a few pixels smaller than the input
-            inp.shape[1] - w.numpy().shape[1] + 1,
-            inp.shape[2],
-            w.numpy().shape[0],
-            w.numpy().shape[1],
-        ),
-        strides=(
-            inp.strides[0],
-            inp.strides[1],
-            inp.strides[2],
-            inp.strides[
-                0
-            ],  # When we move one step in the 3rd dimension, we should move one step in the original data too
-            inp.strides[1],
-        ),
-        writeable=False,  # totally use this to avoid writing to memory in weird places
-    )
-    test_wt = np.einsum("mnc->cmn", np.zeros_like(inp), order="C", optimize=True)
-    for k in range(w.numpy().shape[-1]):
-        kernel = w.numpy()[:, :, :, k]
-        x = np.einsum(
-            "abcmn,mnc->abcmn", expanded_input, kernel, order="C", optimize=True
-        )
-        x_pos = x.copy()
-        x_neg = x.copy()
-        x_pos[x < 0] = 0
-        x_neg[x > 0] = 0
-        x_sum = np.einsum("abcmn->ab", x, order="C", optimize=True)
-        x_p_sum = np.einsum("abcmn->ab", x_pos, order="C", optimize=True)
-        x_n_sum = np.einsum("abcmn->ab", x_neg, order="C", optimize=True) * -1.0
-        #     print(np.sum(x),np.sum(x_pos),np.sum(x_neg),np.sum(x_n_sum))
-        for ind1 in range(expanded_input.shape[0]):
-            for ind2 in range(expanded_input.shape[1]):
-                temp_wt_mat = calculate_wt_conv_unit(
-                    wts[ind1, ind2, k],
-                    x_pos[ind1, ind2, :, :, :],
-                    x_neg[ind1, ind2, :, :, :],
-                    x_sum[ind1, ind2],
-                    x_p_sum[ind1, ind2],
-                    x_n_sum[ind1, ind2],
-                    act,
-                )
-                test_wt[
-                    :, ind1 : ind1 + kernel.shape[0], ind2 : ind2 + kernel.shape[1]
-                ] += temp_wt_mat
-    test_wt = np.einsum("cmn->mnc", test_wt, order="C", optimize=True)
-    gc.collect()
-    return test_wt
-def get_max_index(mat=None):
-    max_ind = np.argmax(mat)
-    ind = []
-    rem = max_ind
-    for i in mat.shape[:-1]:
-        ind.append(rem // i)
-        rem = rem % i
-    ind.append(rem)
-    return tuple(ind)
-def calculate_wt_maxpool(wts, inp, pool_size):
+def calculate_wt_conv(wts, inp, w, b, padding, strides, act):
+    wts = wts.T
+    inp = inp.T
+    w = w.T
+    input_padded, paddings = calculate_padding(w.shape, inp, padding, strides)
+    out_ds = np.zeros_like(input_padded)
+    for ind1 in range(wts.shape[0]):
+        for ind2 in range(wts.shape[1]):
+            indexes = [np.arange(ind1*strides[0], ind1*(strides[0])+w.shape[0]),
+                       np.arange(ind2*strides[1], ind2*(strides[1])+w.shape[1])]
+            # Take slice
+            tmp_patch = input_padded[np.ix_(indexes[0],indexes[1])]
+            updates = calculate_wt_conv_unit(tmp_patch, wts[ind1,ind2,:], w, b, act)
+            # Build tensor with "filtered" gradient
+            out_ds[np.ix_(indexes[0],indexes[1])]+=updates
+    out_ds = out_ds[paddings[0][0]:(paddings[0][0]+inp.shape[0]),
+                    paddings[1][0]:(paddings[1][0]+inp.shape[1]),:]
+    return out_ds
+def calculate_wt_max_unit(patch, wts, pool_size):
+    pmax = np.einsum("ijk,k->ijk",np.ones_like(patch),np.max(np.max(patch,axis=0),axis=0))
+    indexes = (patch-pmax)==0
+    indexes = indexes.astype(np.float32)
+    indexes_norm = 1.0/np.einsum("mnc->c",indexes)
+    indexes = np.einsum("ijk,k->ijk",indexes,indexes_norm)
+    out = np.einsum("ijk,k->ijk",indexes,wts)
+    return out
+def calculate_wt_maxpool(wts, inp, pool_size, padding, strides):
     wts=wts.T
     inp=inp.T
-    pad1 = pool_size[0]
-    pad2 = pool_size[1]
-    test_samp_pad = np.pad(inp, ((0, pad1), (0, pad2), (0, 0)), "constant")
-    dim1, dim2, _ = wts.shape
-    test_wt = np.zeros_like(test_samp_pad)
-    for k in range(inp.shape[2]):
-        wt_mat = wts[:, :, k]
-        for ind1 in range(dim1):
-            for ind2 in range(dim2):
-                temp_inp = test_samp_pad[
-                    ind1 * pool_size[0] : (ind1 + 1) * pool_size[0],
-                    ind2 * pool_size[1] : (ind2 + 1) * pool_size[1],
-                    k,
-                ]
-                max_index = get_max_index(temp_inp)
-                test_wt[
-                    ind1 * pool_size[0] : (ind1 + 1) * pool_size[0],
-                    ind2 * pool_size[1] : (ind2 + 1) * pool_size[1],
-                    k,
-                ][max_index] = wt_mat[ind1, ind2]
-    test_wt = test_wt[0 : inp.shape[0], 0 : inp.shape[1], :]
-    return test_wt
+    strides = (strides,strides)
+    padding = (padding,padding)
+    input_padded, paddings = calculate_padding(pool_size, inp, padding, strides, -np.inf)
+    out_ds = np.zeros_like(input_padded)
+    for ind1 in range(wts.shape[0]):
+        for ind2 in range(wts.shape[1]):
+            indexes = [np.arange(ind1*strides[0], ind1*(strides[0])+pool_size[0]),
+                       np.arange(ind2*strides[1], ind2*(strides[1])+pool_size[1])]
+            tmp_patch = input_padded[np.ix_(indexes[0],indexes[1])]
+            updates = calculate_wt_max_unit(tmp_patch, wts[ind1,ind2,:], pool_size)
+            out_ds[np.ix_(indexes[0],indexes[1])]+=updates
+    out_ds = out_ds[paddings[0][0]:(paddings[0][0]+inp.shape[0]),
+                    paddings[1][0]:(paddings[1][0]+inp.shape[1]),:]
+    return out_ds
+def calculate_wt_avg_unit(patch, wts, pool_size):
+    p_ind = patch>0
+    p_ind = patch*p_ind
+    p_sum = np.einsum("ijk->k",p_ind)
+    n_ind = patch<0
+    n_ind = patch*n_ind
+    n_sum = np.einsum("ijk->k",n_ind)*-1.0
+    t_sum = p_sum+n_sum
+    wt_mat = np.zeros_like(patch)
+    p_saturate = p_sum>0
+    n_saturate = n_sum>0
+    t_sum[t_sum==0] = 1.0
+    p_agg_wt = (1.0/(t_sum))*wts*p_saturate
+    n_agg_wt = (1.0/(t_sum))*wts*n_saturate
+    wt_mat = wt_mat+(p_ind*p_agg_wt)
+    wt_mat = wt_mat+(n_ind*n_agg_wt*-1.0)
+    return wt_mat
-def calculate_wt_avgpool(wts, inp, pool_size):
+def calculate_wt_avgpool(wts, inp, pool_size, padding, strides):
     wts=wts.T
     inp=inp.T
     pad1 = pool_size[0]
     pad2 = pool_size[1]
-    test_samp_pad = np.pad(inp, ((0, pad1), (0, pad2), (0, 0)), "constant")
-    dim1, dim2, _ = wts.shape
-    test_wt = np.zeros_like(test_samp_pad)
-    for k in range(inp.shape[2]):
-        wt_mat = wts[:, :, k]
-        for ind1 in range(dim1):
-            for ind2 in range(dim2):
-                temp_inp = test_samp_pad[
-                    ind1 * pool_size[0] : (ind1 + 1) * pool_size[0],
-                    ind2 * pool_size[1] : (ind2 + 1) * pool_size[1],
-                    k,
-                ]
-                wt_ind1 = test_wt[
-                    ind1 * pool_size[0] : (ind1 + 1) * pool_size[0],
-                    ind2 * pool_size[1] : (ind2 + 1) * pool_size[1],
-                    k,
-                ]
-                wt = wt_mat[ind1, ind2]
-                p_ind = temp_inp > 0
-                n_ind = temp_inp < 0
-                p_sum = np.sum(temp_inp[p_ind])
-                n_sum = np.sum(temp_inp[n_ind]) * -1
-                if p_sum > 0:
-                    p_agg_wt = p_sum / (p_sum + n_sum)
-                else:
-                    p_agg_wt = 0
-                if n_sum > 0:
-                    n_agg_wt = n_sum / (p_sum + n_sum)
-                else:
-                    n_agg_wt = 0
-                if p_sum == 0:
-                    p_sum = 1
-                if n_sum == 0:
-                    n_sum = 1
-                wt_ind1[p_ind] += (temp_inp[p_ind] / p_sum) * wt * p_agg_wt
-                wt_ind1[n_ind] += (temp_inp[n_ind] / n_sum) * wt * n_agg_wt * -1.0
-    test_wt = test_wt[0 : inp.shape[0], 0 : inp.shape[1], :]
-    return test_wt
+    strides = (strides,strides)
+    padding = (padding,padding)
+    input_padded, paddings = calculate_padding(pool_size, inp, padding, strides, -np.inf)
+    out_ds = np.zeros_like(input_padded)
+    for ind1 in range(wts.shape[0]):
+        for ind2 in range(wts.shape[1]):
+            indexes = [np.arange(ind1*strides[0], ind1*(strides[0])+pool_size[0]),
+                       np.arange(ind2*strides[1], ind2*(strides[1])+pool_size[1])]
+            # Take slice
+            tmp_patch = input_padded[np.ix_(indexes[0],indexes[1])]
+            updates = calculate_wt_avg_unit(tmp_patch, wts[ind1,ind2,:], pool_size)
+            # Build tensor with "filtered" gradient
+            out_ds[np.ix_(indexes[0],indexes[1])]+=updates
+    out_ds = out_ds[paddings[0][0]:(paddings[0][0]+inp.shape[0]),
+                    paddings[1][0]:(paddings[1][0]+inp.shape[1]),:]
+    return out_ds
 def calculate_wt_gavgpool(wts, inp):
     wts=wts.T
     inp=inp.T
@@ -745,6 +743,438 @@ def calculate_wt_gavgpool(wts, inp):
         wt_mat[..., c] = temp_wt
     return wt_mat
+def calculate_wt_gmaxpool_2d(wts, inp):
+    channels = wts.shape[0]
+    wt_mat = np.zeros_like(inp)
+    for c in range(channels):
+        wt = wts[c]
+        x = inp[..., c]
+        max_val = np.max(x)
+        max_indexes = (x == max_val).astype(np.float32)
+        max_indexes_norm = 1.0 / np.sum(max_indexes)
+        max_indexes = max_indexes * max_indexes_norm
+        wt_mat[..., c] = max_indexes * wt
+    return wt_mat
+def calculate_padding_1d(kernel_size, inp, padding, strides, const_val=0.0):
+    if padding == 'valid':
+        return inp, [[0, 0],[0,0]]
+    elif padding == 0:
+        return inp,  [[0, 0],[0,0]]
+    elif isinstance(padding, int):
+        inp_pad = np.pad(inp, ((padding, padding), (0,0)), 'constant', constant_values=const_val)
+        return inp_pad, [[padding, padding],[0,0]]
+    else:
+        remainder = inp.shape[0] % strides
+        if remainder == 0:
+            pad_total = max(0, kernel_size - strides)
+        else:
+            pad_total = max(0, kernel_size - remainder)
+        pad_left = int(np.floor(pad_total / 2.0))
+        pad_right = int(np.ceil(pad_total / 2.0))
+        inp_pad = np.pad(inp, ((pad_left, pad_right),(0,0)), 'constant', constant_values=const_val)
+        return inp_pad, [[pad_left, pad_right],[0,0]]
+def calculate_wt_conv_unit_1d(patch, wts, w, b, act):
+    k = w.numpy()
+    bias = b.numpy()
+    b_ind = bias > 0
+    bias_pos = bias * b_ind
+    b_ind = bias < 0
+    bias_neg = bias * b_ind * -1.0
+    conv_out = np.einsum("ijk,ij->ijk", k, patch)
+    p_ind = conv_out > 0
+    p_ind = conv_out * p_ind
+    p_sum = np.einsum("ijk->k",p_ind)
+    n_ind = conv_out < 0
+    n_ind = conv_out * n_ind
+    n_sum = np.einsum("ijk->k",n_ind) * -1.0
+    t_sum = p_sum + n_sum
+    wt_mat = np.zeros_like(k)
+    p_saturate = p_sum > 0
+    n_saturate = n_sum > 0
+    if act["type"] == 'mono':
+        if act["range"]["l"]:
+            temp_ind = t_sum > act["range"]["l"]
+            p_saturate = temp_ind
+        if act["range"]["u"]:
+            temp_ind = t_sum < act["range"]["u"]
+            n_saturate = temp_ind
+    elif act["type"] == 'non_mono':
+        t_act = act["func"](t_sum)
+        p_act = act["func"](p_sum + bias_pos)
+        n_act = act["func"](-1 * (n_sum + bias_neg))
+        if act["range"]["l"]:
+            temp_ind = t_sum > act["range"]["l"]
+            p_saturate = p_saturate * temp_ind
+        if act["range"]["u"]:
+            temp_ind = t_sum < act["range"]["u"]
+            n_saturate = n_saturate * temp_ind
+        temp_ind = np.abs(t_act - p_act) > 1e-5
+        n_saturate = n_saturate * temp_ind
+        temp_ind = np.abs(t_act - n_act) > 1e-5
+        p_saturate = p_saturate * temp_ind
+    p_agg_wt = (1.0 / (p_sum + n_sum + bias_pos + bias_neg)) * wts * p_saturate
+    n_agg_wt = (1.0 / (p_sum + n_sum + bias_pos + bias_neg)) * wts * n_saturate
+    wt_mat = wt_mat + (p_ind * p_agg_wt)
+    wt_mat = wt_mat + (n_ind * n_agg_wt * -1.0)
+    wt_mat = np.sum(wt_mat, axis=-1)
+    return wt_mat
+def calculate_wt_conv_1d(wts, inp, w, b, padding, stride, act):
+    wts = wts.T
+    inp = inp.T
+    w = w.T
+    stride=stride
+    input_padded, paddings = calculate_padding_1d(w.shape[0], inp, padding, stride)
+    out_ds = np.zeros_like(input_padded)
+    for ind in range(wts.shape[0]):
+        indexes = np.arange(ind * stride, ind * stride + w.shape[0])
+        tmp_patch = input_padded[indexes]
+        updates = calculate_wt_conv_unit_1d(tmp_patch, wts[ind, :], w, b, act)
+        out_ds[indexes] += updates
+    out_ds = out_ds[paddings[0][0]:(paddings[0][0] + inp.shape[0])]
+    return out_ds
+def calculate_wt_max_unit_1d(patch, wts):
+    pmax = np.max(patch, axis=0)
+    indexes = (patch - pmax) == 0
+    indexes = indexes.astype(np.float32)
+    indexes_norm = 1.0 / np.sum(indexes, axis=0)
+    indexes = np.einsum("ij,j->ij", indexes, indexes_norm)
+    out = np.einsum("ij,j->ij", indexes, wts)
+    return out
+def calculate_wt_maxpool_1d(wts, inp, pool_size, padding, stride):
+    inp = inp.T
+    wts = wts.T
+    input_padded, paddings = calculate_padding_1d(pool_size, inp, padding, stride, -np.inf)
+    out_ds = np.zeros_like(input_padded)
+    stride=stride
+    pool_size=pool_size
+    for ind in range(wts.shape[0]):
+        indexes = np.arange(ind * stride, ind * stride + pool_size)
+        tmp_patch = input_padded[indexes]
+        updates = calculate_wt_max_unit_1d(tmp_patch, wts[ind, :])
+        out_ds[indexes] += updates
+    out_ds = out_ds[paddings[0][0]:(paddings[0][0] + inp.shape[0])]
+    return out_ds
+def calculate_wt_avg_unit_1d(patch, wts):
+    p_ind = patch > 0
+    p_ind = patch * p_ind
+    p_sum = np.sum(p_ind, axis=0)
+    n_ind = patch < 0
+    n_ind = patch * n_ind
+    n_sum = np.sum(n_ind, axis=0) * -1.0
+    t_sum = p_sum + n_sum
+    wt_mat = np.zeros_like(patch)
+    p_saturate = p_sum > 0
+    n_saturate = n_sum > 0
+    t_sum[t_sum == 0] = 1.0
+    p_agg_wt = (1.0 / t_sum) * wts * p_saturate
+    n_agg_wt = (1.0 / t_sum) * wts * n_saturate
+    wt_mat = wt_mat + (p_ind * p_agg_wt)
+    wt_mat = wt_mat + (n_ind * n_agg_wt * -1.0)
+    return wt_mat
+def calculate_wt_avgpool_1d(wts, inp, pool_size, padding, stride):
+    wts = wts.T
+    inp = inp.T
+    stride=stride
+    pool_size=pool_size
+    input_padded, paddings = calculate_padding_1d(pool_size, inp, padding[0], stride[0], 0)
+    out_ds = np.zeros_like(input_padded)
+    for ind in range(wts.shape[0]):
+        indexes = np.arange(ind * stride[0], ind * stride[0] + pool_size[0])
+        tmp_patch = input_padded[indexes]
+        updates = calculate_wt_avg_unit_1d(tmp_patch, wts[ind, :])
+        out_ds[indexes] += updates
+    out_ds = out_ds[paddings[0][0]:(paddings[0][0] + inp.shape[0])]
+    return out_ds
+def calculate_wt_gavgpool_1d(wts, inp):
+    channels = wts.shape[0]
+    wt_mat = np.zeros_like(inp)
+    for c in range(channels):
+        wt = wts[c]
+        temp_wt = wt_mat[:, c]
+        x = inp[:, c]
+        p_mat = np.copy(x)
+        n_mat = np.copy(x)
+        p_mat[p_mat < 0] = 0
+        n_mat[n_mat > 0] = 0
+        p_sum = np.sum(p_mat)
+        n_sum = np.sum(n_mat) * -1
+        p_agg_wt = 0.0
+        n_agg_wt = 0.0
+        if p_sum + n_sum > 0.0:
+            p_agg_wt = p_sum / (p_sum + n_sum)
+            n_agg_wt = n_sum / (p_sum + n_sum)
+        if p_sum == 0.0:
+            p_sum = 1.0
+        if n_sum == 0.0:
+            n_sum = 1.0
+        temp_wt = temp_wt + ((p_mat / p_sum) * wt * p_agg_wt)
+        temp_wt = temp_wt + ((n_mat / n_sum) * wt * n_agg_wt * -1.0)
+        wt_mat[:, c] = temp_wt
+    return wt_mat
+def calculate_wt_gmaxpool_1d(wts, inp):
+    wts = wts.T
+    inp = inp.T
+    channels = wts.shape[0]
+    wt_mat = np.zeros_like(inp)
+    for c in range(channels):
+        wt = wts[c]
+        x = inp[:, c]
+        max_val = np.max(x)
+        max_indexes = (x == max_val).astype(np.float32)
+        max_indexes_norm = 1.0 / np.sum(max_indexes)
+        max_indexes = max_indexes * max_indexes_norm
+        wt_mat[:, c] = max_indexes * wt
+    return wt_mat
+def calculate_output_padding_conv2d_transpose(input_shape, kernel_size, padding, strides):
+    """Return ``(out_shape, paddings)`` for a 2-D transposed convolution.
+
+    Only indices 0 and 1 of ``input_shape``, ``kernel_size`` and ``strides``
+    are read.
+
+    * ``padding == 'valid'`` or ``padding == (0, 0)``: the output spans
+      ``(size - 1) * stride + kernel`` per axis and nothing is padded.
+    * any other tuple except ``(None, None)``: the output spans
+      ``size * stride`` per axis and the tuple entries are used as symmetric
+      paddings.
+    * everything else is handled as 'same' padding: the output spans
+      ``size * stride`` and the overshoot of the unpadded extent is split
+      between both sides, the odd remainder going to the trailing side.
+
+    Returns:
+        ``out_shape`` as a two-element list and ``paddings`` as three
+        ``[before, after]`` pairs (the last pair is always ``[0, 0]``).
+    """
+    def unpadded_extent(axis):
+        # Extent of the transposed convolution output when nothing is cropped.
+        return (input_shape[axis] - 1) * strides[axis] + kernel_size[axis]
+
+    if padding == 'valid' or padding == (0, 0):
+        return [unpadded_extent(0), unpadded_extent(1)], [[0, 0], [0, 0], [0, 0]]
+
+    strided_shape = [input_shape[0] * strides[0], input_shape[1] * strides[1]]
+    if isinstance(padding, tuple) and padding != (None, None):
+        pad_h = padding[0]
+        pad_v = padding[1]
+        return strided_shape, [[pad_h, pad_h], [pad_v, pad_v], [0, 0]]
+
+    # 'same' padding
+    pad_h = max(0, unpadded_extent(0) - strided_shape[0])
+    pad_v = max(0, unpadded_extent(1) - strided_shape[1])
+    half_h = pad_h // 2
+    half_v = pad_v // 2
+    return strided_shape, [[half_h, pad_h - half_h], [half_v, pad_v - half_v], [0, 0]]
+def calculate_wt_conv2d_transpose_unit(patch, wts, w, b, act):
+    """Spread the relevance ``wts`` of one input position over a transposed-conv kernel.
+
+    Args:
+        patch: array of rank 1, 2 or 3; lower ranks are reshaped to rank 3.
+            Its last axis must match axis 2 of the permuted kernel and its
+            first two axes are summed out by the einsum.
+        wts: relevance values, broadcast against the last axis of the
+            permuted kernel.
+        w: 4-D kernel exposing ``.permute(...).numpy()`` (a torch tensor;
+            presumably already detached and on CPU -- TODO confirm with caller).
+        b: bias exposing ``.numpy()``, one value per last-axis index of the
+            permuted kernel.
+        act: dict with ``"type"`` (``'mono'`` or ``'non_mono'``), ``"range"``
+            (keys ``"l"`` and ``"u"``) and, for ``'non_mono'``, ``"func"``.
+            NOTE(review): the range bounds are tested for truthiness, so a
+            bound of ``0`` is treated like "no bound" -- confirm that is intended.
+
+    Returns:
+        Array shaped like the permuted kernel without its last axis.
+
+    Raises:
+        ValueError: if ``patch`` has a rank other than 1, 2 or 3.
+    """
+    if patch.ndim == 1:
+        patch = patch.reshape(1, 1, -1)
+    elif patch.ndim == 2:
+        patch = patch.reshape(1, *patch.shape)
+    elif patch.ndim != 3:
+        raise ValueError(f"Unexpected patch shape: {patch.shape}")
+    k = w.permute(0, 1, 3, 2).numpy()
+    bias = b.numpy()
+    # Split the bias into its positive part and the magnitude of its negative part.
+    bias_pos = bias * (bias > 0)
+    bias_neg = bias * (bias < 0) * -1.0
+    conv_out = np.einsum('ijkl,mnk->ijkl', k, patch)
+    # Positive / negative contributions (zero elsewhere).
+    p_ind = conv_out * (conv_out > 0)
+    n_ind = conv_out * (conv_out < 0)
+    p_sum = np.einsum("ijkl->l", p_ind)
+    n_sum = np.einsum("ijkl->l", n_ind) * -1.0
+    t_sum = p_sum + n_sum
+    wt_mat = np.zeros_like(k)
+    p_saturate = p_sum > 0
+    n_saturate = n_sum > 0
+    if act["type"] == 'mono':
+        if act["range"]["l"]:
+            p_saturate = t_sum > act["range"]["l"]
+        if act["range"]["u"]:
+            n_saturate = t_sum < act["range"]["u"]
+    elif act["type"] == 'non_mono':
+        t_act = act["func"](t_sum)
+        p_act = act["func"](p_sum + bias_pos)
+        n_act = act["func"](-1 * (n_sum + bias_neg))
+        if act["range"]["l"]:
+            temp_ind = t_sum > act["range"]["l"]
+            p_saturate = p_saturate * temp_ind
+        if act["range"]["u"]:
+            temp_ind = t_sum < act["range"]["u"]
+            n_saturate = n_saturate * temp_ind
+        temp_ind = np.abs(t_act - p_act) > 1e-5
+        n_saturate = n_saturate * temp_ind
+        temp_ind = np.abs(t_act - n_act) > 1e-5
+        p_saturate = p_saturate * temp_ind
+    # A channel with no contribution and zero bias has a zero denominator.
+    # A plain 1.0 / denom yields inf there, inf * False becomes NaN, and the
+    # final sum over the last axis would spread that NaN into every entry.
+    # Such a channel carries nothing to distribute, so its reciprocal is 0.
+    denom = p_sum + n_sum + bias_pos + bias_neg
+    inv_denom = np.zeros_like(denom)
+    np.divide(1.0, denom, out=inv_denom, where=denom != 0)
+    p_agg_wt = inv_denom * wts * p_saturate
+    n_agg_wt = inv_denom * wts * n_saturate
+    wt_mat = wt_mat + (p_ind * p_agg_wt)
+    wt_mat = wt_mat + (n_ind * n_agg_wt * -1.0)
+    wt_mat = np.sum(wt_mat, axis=-1)
+    return wt_mat
+def calculate_wt_conv2d_transpose(wts, inp, w, b, padding, strides, act):
+    """Propagate relevance through a 2-D transposed convolution.
+
+    ``wts``, ``inp`` and ``w`` are transposed first.  For every position
+    along the first two axes of the transposed input,
+    ``calculate_wt_conv2d_transpose_unit`` produces a kernel-sized update
+    that is accumulated into an output-sized buffer at the strided offset
+    (clipped at the buffer edge).  The buffer is then brought back to the
+    input grid:
+
+    * ``padding == 'same'`` or a tuple other than ``(None, None)``: each
+      ``strides[0] x strides[1]`` tile of the buffer is summed into one
+      input position;
+    * otherwise: the buffer is cropped, starting at the leading paddings
+      reported by ``calculate_output_padding_conv2d_transpose``.
+    """
+    wts, inp, w = wts.T, inp.T, w.T
+    out_shape, paddings = calculate_output_padding_conv2d_transpose(inp.shape, w.shape, padding, strides)
+    out_ds = np.zeros(out_shape + [w.shape[3]])
+    for ind1 in range(inp.shape[0]):
+        for ind2 in range(inp.shape[1]):
+            row_start = ind1 * strides[0]
+            col_start = ind2 * strides[1]
+            updates = calculate_wt_conv2d_transpose_unit(
+                inp[ind1, ind2, :], wts[ind1, ind2, :], w, b, act
+            )
+            # Clip the kernel footprint to the accumulation buffer.
+            row_stop = min(row_start + w.shape[0], out_shape[0])
+            col_stop = min(col_start + w.shape[1], out_shape[1])
+            out_ds[row_start:row_stop, col_start:col_stop, :] += updates[
+                :row_stop - row_start, :col_stop - col_start, :
+            ]
+
+    sum_tiles = padding == 'same' or (isinstance(padding, tuple) and padding != (None, None))
+    if not sum_tiles:
+        top = paddings[0][0]
+        left = paddings[1][0]
+        return out_ds[top:(top + inp.shape[0]), left:(left + inp.shape[1]), :]
+
+    adjusted_out_ds = np.zeros(inp.shape)
+    for i in range(inp.shape[0]):
+        for j in range(inp.shape[1]):
+            start_i = max(0, i * strides[0])
+            start_j = max(0, j * strides[1])
+            end_i = min(out_ds.shape[0], (i + 1) * strides[0])
+            end_j = min(out_ds.shape[1], (j + 1) * strides[1])
+            tile = out_ds[start_i:end_i, start_j:end_j, :]
+            adjusted_out_ds[i, j, :] = np.sum(tile, axis=(0, 1))
+    return adjusted_out_ds
+def calculate_output_padding_conv1d_transpose(input_shape, kernel_size, padding, strides,dilation):
+    """Return ``(out_shape, paddings)`` for a 1-D transposed convolution.
+
+    Only ``input_shape[0]`` and ``kernel_size[0]`` are read; ``strides`` and
+    ``dilation`` are used as scalars.
+
+    * ``padding == 'valid'`` or ``padding == 0``: output length is
+      ``(length - 1) * strides + kernel`` and nothing is padded.
+    * any other ``int``: ``pad = dilation * (kernel - 1) - padding``; output
+      length is ``length * strides + pad`` with ``pad`` on both sides.
+    * everything else is handled as 'same' padding: output length is
+      ``length * strides`` and half of the overshoot (floored) is used on
+      both sides.
+
+    Returns:
+        ``out_shape`` as a one-element list and ``paddings`` as two
+        ``[before, after]`` pairs (the second pair is always ``[0, 0]``).
+    """
+    length = input_shape[0]
+    kernel = kernel_size[0]
+
+    if padding == 'valid' or padding == 0:
+        return [(length - 1) * strides + kernel], [[0, 0], [0, 0]]
+
+    if isinstance(padding, int):
+        pad_v = (dilation * (kernel - 1)) - padding
+        return [length * strides + pad_v], [[pad_v, pad_v], [0, 0]]
+
+    # 'same' padding
+    out_shape = [length * strides]
+    pad_h = max(0, (length - 1) * strides + kernel - out_shape[0])
+    half = pad_h // 2
+    return out_shape, [[half, half], [0, 0]]
+def calculate_wt_conv1d_transpose_unit(patch, wts, w, b, act):
+    """Spread the relevance ``wts`` of one input position over a 1-D transposed-conv kernel.
+
+    Args:
+        patch: array of rank 1 or 2; rank 1 is reshaped to ``(1, -1)``.  Its
+            last axis must match axis 1 of the permuted kernel and its first
+            axis is summed out by the einsum.
+        wts: relevance values, broadcast against the last axis of the
+            permuted kernel.
+        w: 3-D kernel exposing ``.permute(...).numpy()`` (a torch tensor;
+            presumably already detached and on CPU -- TODO confirm with caller).
+        b: bias exposing ``.numpy()``, one value per last-axis index of the
+            permuted kernel.
+        act: dict with ``"type"`` (``'mono'`` or ``'non_mono'``), ``"range"``
+            (keys ``"l"`` and ``"u"``) and, for ``'non_mono'``, ``"func"``.
+            NOTE(review): the range bounds are tested for truthiness, so a
+            bound of ``0`` is treated like "no bound" -- confirm that is intended.
+
+    Returns:
+        Array shaped like the permuted kernel without its last axis.
+
+    Raises:
+        ValueError: if ``patch`` has a rank other than 1 or 2.
+    """
+    if patch.ndim == 1:
+        patch = patch.reshape(1, -1)
+    elif patch.ndim != 2:
+        raise ValueError(f"Unexpected patch shape: {patch.shape}")
+    k = w.permute(0, 2, 1).numpy()
+    bias = b.numpy()
+    # Split the bias into its positive part and the magnitude of its negative part.
+    bias_pos = bias * (bias > 0)
+    bias_neg = bias * (bias < 0) * -1.0
+    conv_out = np.einsum('ijk,mj->ijk', k, patch)
+    # Positive / negative contributions (zero elsewhere).
+    p_ind = conv_out * (conv_out > 0)
+    n_ind = conv_out * (conv_out < 0)
+    p_sum = np.einsum("ijl->l", p_ind)
+    n_sum = np.einsum("ijl->l", n_ind) * -1.0
+    t_sum = p_sum + n_sum
+    wt_mat = np.zeros_like(k)
+    p_saturate = p_sum > 0
+    n_saturate = n_sum > 0
+    if act["type"] == 'mono':
+        if act["range"]["l"]:
+            p_saturate = t_sum > act["range"]["l"]
+        if act["range"]["u"]:
+            n_saturate = t_sum < act["range"]["u"]
+    elif act["type"] == 'non_mono':
+        t_act = act["func"](t_sum)
+        p_act = act["func"](p_sum + bias_pos)
+        n_act = act["func"](-1 * (n_sum + bias_neg))
+        if act["range"]["l"]:
+            temp_ind = t_sum > act["range"]["l"]
+            p_saturate = p_saturate * temp_ind
+        if act["range"]["u"]:
+            temp_ind = t_sum < act["range"]["u"]
+            n_saturate = n_saturate * temp_ind
+        temp_ind = np.abs(t_act - p_act) > 1e-5
+        n_saturate = n_saturate * temp_ind
+        temp_ind = np.abs(t_act - n_act) > 1e-5
+        p_saturate = p_saturate * temp_ind
+    # A channel with no contribution and zero bias has a zero denominator.
+    # A plain 1.0 / denom yields inf there, inf * False becomes NaN, and the
+    # final sum over the last axis would spread that NaN into every entry.
+    # Such a channel carries nothing to distribute, so its reciprocal is 0.
+    denom = p_sum + n_sum + bias_pos + bias_neg
+    inv_denom = np.zeros_like(denom)
+    np.divide(1.0, denom, out=inv_denom, where=denom != 0)
+    p_agg_wt = inv_denom * wts * p_saturate
+    n_agg_wt = inv_denom * wts * n_saturate
+    wt_mat = wt_mat + (p_ind * p_agg_wt)
+    wt_mat = wt_mat + (n_ind * n_agg_wt * -1.0)
+    wt_mat = np.sum(wt_mat, axis=-1)
+    return wt_mat
+def calculate_wt_conv1d_transpose(wts, inp, w, b, padding, strides, dilation, act):
+    """Propagate relevance through a 1-D transposed convolution.
+
+    ``wts``, ``inp`` and ``w`` are transposed first.  For every position
+    along the first axis of the transposed input,
+    ``calculate_wt_conv1d_transpose_unit`` produces a kernel-sized update
+    that is accumulated into an output-sized buffer at offset
+    ``position * strides`` (clipped at the buffer edge).  The buffer is then
+    brought back to the input grid:
+
+    * ``padding == 'same'`` or ``padding == 0``: each run of ``strides``
+      buffer rows is summed into one input position;
+    * otherwise: the buffer is cropped, starting at the leading padding
+      reported by ``calculate_output_padding_conv1d_transpose``.
+    """
+    wts, inp, w = wts.T, inp.T, w.T
+    out_shape, paddings = calculate_output_padding_conv1d_transpose(inp.shape, w.shape, padding, strides, dilation)
+    out_ds = np.zeros(out_shape + [w.shape[2]])
+    for ind in range(inp.shape[0]):
+        start = ind * strides
+        updates = calculate_wt_conv1d_transpose_unit(inp[ind, :], wts[ind, :], w, b, act)
+        # Clip the kernel footprint to the accumulation buffer.
+        stop = min(start + w.shape[0], out_shape[0])
+        out_ds[start:stop, :] += updates[:stop - start, :]
+
+    if not (padding == 'same' or padding == 0):
+        lead = paddings[0][0]
+        return out_ds[lead:(lead + inp.shape[0]), :]
+
+    adjusted_out_ds = np.zeros(inp.shape)
+    for i in range(inp.shape[0]):
+        start_i = max(0, i * strides)
+        end_i = min(out_ds.shape[0], (i + 1) * strides)
+        adjusted_out_ds[i, :] = np.sum(out_ds[start_i:end_i, :], axis=0)
+    return adjusted_out_ds
 ####################################################################
 ###################    Encoder Model    ####################
@@ -753,27 +1183,85 @@ def stabilize(matrix, epsilon=1e-6):
     return matrix + epsilon * np.sign(matrix)
-def calculate_relevance_V(wts, value_output):
-    # Initialize wt_mat with zeros
-    wt_mat_V = np.zeros((wts.shape[0], wts.shape[1], *value_output.shape))
+def calculate_wt_residual(wts, inp=None):
+    """Split relevance ``wts`` between the branches feeding an element-wise merge.
+
+    For every element position the branch values are separated into positive
+    and negative groups.  The relevance at that position is divided between
+    the two groups in proportion to their absolute sums and, inside a group,
+    between the branches in proportion to their values.  Positions where all
+    branches are zero receive no relevance.
+
+    Args:
+        wts: relevance array.
+        inp: iterable of branch arrays, each holding as many elements as
+            ``wts``.
+
+    Returns:
+        A list with one array per branch, each reshaped to ``wts.shape`` and
+        using the common dtype of the stacked branches.  An empty ``inp``
+        yields an empty list.
+
+    Raises:
+        TypeError: if ``inp`` is ``None``.
+    """
+    if inp is None:
+        raise TypeError("calculate_wt_residual() requires `inp`: an iterable of branch arrays")
+    wts = np.asarray(wts)
+    # np.ravel flattens in logical (C) order for any memory layout; rebuilding
+    # a 1-D view from the last stride alone is only valid for C-contiguous
+    # arrays and reads the wrong memory for transposed or sliced inputs.
+    branches = [np.ravel(np.asarray(x)) for x in inp]
+    if not branches:
+        return []
+    stacked = np.stack(branches)
+    flat_wts = np.ravel(wts)
+
+    pos = stacked > 0
+    neg = stacked < 0
+    p_sum = np.sum(stacked, axis=0, where=pos)
+    n_sum = np.sum(stacked, axis=0, where=neg) * -1
+
+    # Share of the relevance assigned to the positive / negative group.
+    total = p_sum + n_sum
+    has_signal = total > 0
+    safe_total = total.copy()
+    safe_total[~has_signal] = 1
+    p_agg_wt = (p_sum / safe_total) * has_signal
+    n_agg_wt = (n_sum / safe_total) * has_signal
+
+    # Avoid 0/0 when a group is empty; those entries are never selected below.
+    p_div = p_sum.copy()
+    p_div[p_div == 0] = 1
+    n_div = n_sum.copy()
+    n_div[n_div == 0] = 1
+
+    pos_part = (stacked / p_div) * flat_wts * p_agg_wt
+    neg_part = (stacked / n_div) * flat_wts * n_agg_wt * -1.0
+    wt_mat = np.zeros_like(stacked)
+    wt_mat[pos] = pos_part[pos]
+    wt_mat[neg] = neg_part[neg]
+    return [row.reshape(wts.shape) for row in wt_mat]
+def calculate_relevance_V(wts, value_output, w):
+    wt_mat_V = np.zeros(value_output.shape)
+    if 'b_v' in w:
+        bias_v = w['b_v']
+    else:
+        bias_v = 0
     for i in range(wts.shape[0]):
         for j in range(wts.shape[1]):
             l1_ind1 = value_output
-            wt_ind1 = wt_mat_V[i, j]
             wt = wts[i, j]
             p_ind = l1_ind1 > 0
             n_ind = l1_ind1 < 0
             p_sum = np.sum(l1_ind1[p_ind])
             n_sum = np.sum(l1_ind1[n_ind]) * -1
+            if bias_v[i] > 0:
+                pbias = bias_v[i]
+                nbias = 0
+            else:
+                pbias = 0
+                nbias = bias_v[i] * -1
             if p_sum > 0:
-                p_agg_wt = p_sum / (p_sum + n_sum)
+                p_agg_wt = (p_sum + pbias) / (p_sum + n_sum + pbias + nbias)
+                p_agg_wt = p_agg_wt * (p_sum / (p_sum + pbias))
             else:
                 p_agg_wt = 0
             if n_sum > 0:
-                n_agg_wt = n_sum / (p_sum + n_sum)
+                n_agg_wt = (n_sum + nbias) / (p_sum + n_sum + pbias + nbias)
+                n_agg_wt = n_agg_wt * (n_sum / (n_sum + nbias))
             else:
                 n_agg_wt = 0
@@ -782,21 +1270,22 @@ def calculate_relevance_V(wts, value_output):
             if n_sum == 0:
                 n_sum = 1
-            wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
-            wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+            wt_mat_V[p_ind] += (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+            wt_mat_V[n_ind] += (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
-    wt_mat_V = np.sum(wt_mat_V, axis=(0,1))
     return wt_mat_V
-def calculate_relevance_QK(wts, QK_output):
-    # Initialize wt_mat with zeros
-    wt_mat_QK = np.zeros((wts.shape[0], wts.shape[1], *QK_output.shape))
+def calculate_relevance_QK(wts, QK_output, w):
+    wt_mat_QK = np.zeros(QK_output.shape)
+    # Check if 'b_q' and 'b_k' exist in the weights, default to 0 if not
+    b_q = w['b_q'] if 'b_q' in w else 0
+    b_k = w['b_k'] if 'b_k' in w else 0
     for i in range(wts.shape[0]):
         for j in range(wts.shape[1]):
             l1_ind1 = QK_output
-            wt_ind1 = wt_mat_QK[i, j]
             wt = wts[i, j]
             p_ind = l1_ind1 > 0
@@ -804,7 +1293,21 @@ def calculate_relevance_QK(wts, QK_output):
             p_sum = np.sum(l1_ind1[p_ind])
             n_sum = np.sum(l1_ind1[n_ind]) * -1
-            t_sum = p_sum - n_sum
+            if b_q[i] > 0 and b_k[i] > 0:
+                pbias = b_q[i] + b_k[i]
+                nbias = 0
+            elif b_q[i] > 0 and b_k[i] < 0:
+                pbias = b_q[i]
+                nbias = b_k[i] * -1
+            elif b_q[i] < 0 and b_k[i] > 0:
+                pbias = b_k[i]
+                nbias = b_q[i] * -1
+            else:
+                pbias = 0
+                nbias = b_q[i] + b_k[i]
+                nbias *= -1
+            t_sum = p_sum + pbias - n_sum - nbias
             # This layer has a softmax activation function
             act = {
@@ -823,12 +1326,13 @@ def calculate_relevance_QK(wts, QK_output):
                         n_sum = 0
             if p_sum > 0:
-                p_agg_wt = p_sum / (p_sum + n_sum)
+                p_agg_wt = (p_sum + pbias) / (p_sum + n_sum + pbias + nbias)
+                p_agg_wt = p_agg_wt * (p_sum / (p_sum + pbias))
             else:
                 p_agg_wt = 0
             if n_sum > 0:
-                n_agg_wt = n_sum / (p_sum + n_sum)
+                n_agg_wt = (n_sum + nbias) / (p_sum + n_sum + pbias + nbias)
+                n_agg_wt = n_agg_wt * (n_sum / (n_sum + nbias))
             else:
                 n_agg_wt = 0
@@ -837,14 +1341,60 @@ def calculate_relevance_QK(wts, QK_output):
             if n_sum == 0:
                 n_sum = 1
-            wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
-            wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+            wt_mat_QK[p_ind] += (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+            wt_mat_QK[n_ind] += (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
-    wt_mat_QK = np.sum(wt_mat_QK, axis=(0, 1))
     return  wt_mat_QK
-def calculate_wt_self_attention(wts, inp, w):
+def calculate_wt_attention_output_projection(wts, proj_output, w):
+    """Distribute the relevance ``wts`` over the entries of ``proj_output``.
+
+    Every entry ``wts[i, j]`` is spread over all of ``proj_output``: the
+    positive and negative entries form two groups that receive a share based
+    on their absolute sums and the bias value ``w['b_d'][i]``, and inside a
+    group each entry gets a part proportional to its value.
+
+    Args:
+        wts: 2-D relevance array.
+        proj_output: array whose entries receive the relevance.
+        w: mapping of weights; the optional key ``'b_d'`` holds the bias and
+            is indexed with the row index of ``wts``.  A missing key is
+            treated as a zero bias.
+            NOTE(review): the bias is indexed by row ``i`` rather than
+            column ``j`` -- confirm against the caller.
+
+    Returns:
+        Float array shaped like ``proj_output`` with the accumulated
+        relevance.
+    """
+    wt_mat_proj_output = np.zeros(proj_output.shape)
+    # ``None`` marks "no bias"; indexing a scalar 0 placeholder would raise
+    # TypeError on the first loop iteration.
+    bias_d = w['b_d'] if 'b_d' in w else None
+
+    # Everything derived from proj_output alone is the same for all (i, j).
+    p_ind = proj_output > 0
+    n_ind = proj_output < 0
+    p_sum = np.sum(proj_output[p_ind])
+    n_sum = np.sum(proj_output[n_ind]) * -1
+    # Normalised entries of each group; an empty group divides by 1 instead of 0.
+    p_share = proj_output[p_ind] / (1 if p_sum == 0 else p_sum)
+    n_share = proj_output[n_ind] / (1 if n_sum == 0 else n_sum)
+
+    for i in range(wts.shape[0]):
+        bias_i = 0 if bias_d is None else bias_d[i]
+        if bias_i > 0:
+            pbias = bias_i
+            nbias = 0
+        else:
+            pbias = 0
+            nbias = bias_i * -1
+
+        if p_sum > 0:
+            p_agg_wt = (p_sum + pbias) / (p_sum + n_sum + pbias + nbias)
+            p_agg_wt = p_agg_wt * (p_sum / (p_sum + pbias))
+        else:
+            p_agg_wt = 0
+        if n_sum > 0:
+            n_agg_wt = (n_sum + nbias) / (p_sum + n_sum + pbias + nbias)
+            n_agg_wt = n_agg_wt * (n_sum / (n_sum + nbias))
+        else:
+            n_agg_wt = 0
+
+        for j in range(wts.shape[1]):
+            wt = wts[i, j]
+            wt_mat_proj_output[p_ind] += p_share * wt * p_agg_wt
+            wt_mat_proj_output[n_ind] += n_share * wt * n_agg_wt * -1.0
+    return wt_mat_proj_output
+def calculate_wt_self_attention(wts, inp, w, config):
     '''
     Input:
         wts:  relevance score of the layer
@@ -856,28 +1406,82 @@ def calculate_wt_self_attention(wts, inp, w):
         Step-2: outputs = F.softmax(inputs, dim=dim, dtype=dtype)
         Step-3: outputs = input_a * input_b
     '''
+    # print(f"inp: {inp.shape}, wts: {wts.shape}")   # (1, 512)
+    # print(f"w['W_q']: {w['W_q'].shape}, w['W_k']: {w['W_k'].shape}, w['W_v']: {w['W_v'].shape}")
     query_output = np.einsum('ij,kj->ik', inp, w['W_q'])
     key_output = np.einsum('ij,kj->ik', inp, w['W_k'])
     value_output = np.einsum('ij,kj->ik', inp, w['W_v'])
+    # --------------- Reshape for Multi-Head Attention ----------------------
+    num_heads = getattr(config, 'num_attention_heads', getattr(config, 'num_heads', None))     # will work for BERT as well as T5/ Llama
+    hidden_size = getattr(config, 'hidden_size', getattr(config, 'd_model', None))             # will work for BERT as well as T5/Llama
+    if hasattr(config, 'num_key_value_heads'):
+        num_key_value_heads = config.num_key_value_heads
+    else:
+        num_key_value_heads = num_heads
+    head_dim = hidden_size // num_heads  # dimension of each attention head
+    query_states = np.einsum('thd->htd', query_output.reshape(query_output.shape[0], num_heads, head_dim))  # (num_heads, num_tokens, head_dim)
+    key_states = np.einsum('thd->htd', key_output.reshape(key_output.shape[0], num_key_value_heads, head_dim))  # (num_key_value_heads, num_tokens, head_dim)
+    value_states = np.einsum('thd->htd', value_output.reshape(value_output.shape[0], num_key_value_heads, head_dim))  # (num_key_value_heads, num_tokens, head_dim)
+    # calculate how many times we need to repeat the key/value heads
+    n_rep = num_heads // num_key_value_heads
+    key_states = np.repeat(key_states, n_rep, axis=0)
+    value_states = np.repeat(value_states, n_rep, axis=0)
+    QK_output = np.einsum('hqd,hkd->hqk', query_states, key_states)    # (num_heads, num_tokens, num_tokens)
+    attn_weights = QK_output / np.sqrt(head_dim)
+    # Apply softmax along the last dimension (softmax over key dimension)
+    attn_weights = np.exp(attn_weights - np.max(attn_weights, axis=-1, keepdims=True))  # Numerically stable softmax
+    attn_weights = attn_weights / np.sum(attn_weights, axis=-1, keepdims=True)
+    # Weighted sum of values (num_heads, num_tokens, head_dim)
+    attn_output = np.einsum('hqk,hkl->hql', attn_weights, value_states)
+    transposed_attn_output = np.einsum('hqd->qhd', attn_output)
+    reshaped_attn_output = transposed_attn_output.reshape(transposed_attn_output.shape[0], num_heads * head_dim)
+    # Perform final linear projection (num_tokens, hidden_size)
+    final_output = np.einsum('qd,dh->qh', reshaped_attn_output, w['W_d'])
+    # ------------- Relevance calculation for Final Linear Projection -------------
+    wt_mat_attn_proj = calculate_wt_attention_output_projection(wts, final_output, w)
     # --------------- Relevance Calculation for Step-3 -----------------------
-    relevance_V = wts / 2
-    relevance_QK = wts / 2
+    # divide the relevance among `attn_weights` and `value_states`
+    wt_mat_attn_proj = wt_mat_attn_proj.reshape(-1, num_heads, head_dim)
+    wt_mat_attn_proj = np.einsum('qhd->hqd', wt_mat_attn_proj)
+    stabilized_attn_output = stabilize(attn_output * 2)
+    norm_wt_mat_attn_proj = wt_mat_attn_proj / stabilized_attn_output
+    relevance_QK = np.einsum('htd,hbd->htb', norm_wt_mat_attn_proj, value_states) * attn_weights
+    relevance_V = np.einsum('hdt,hdb->htb', attn_weights, norm_wt_mat_attn_proj)  * value_states
     # --------------- Relevance Calculation for V --------------------------------
-    wt_mat_V = calculate_relevance_V(relevance_V, value_output)
+    relevance_V = np.einsum('hqd->qhd', relevance_V)
+    relevance_V = relevance_V.reshape(-1, num_heads * head_dim)
+    wt_mat_V = calculate_relevance_V(relevance_V, value_states, w)
     # --------------- Transformed Relevance QK ----------------------------------
-    QK_output = np.einsum('ij,kj->ik', query_output, key_output)
-    wt_mat_QK = calculate_relevance_QK(relevance_QK, QK_output)
+    relevance_QK = np.einsum('hqd->qhd', relevance_QK)
+    relevance_QK = relevance_QK.reshape(-1, relevance_QK.shape[1] * relevance_QK.shape[2])
+    wt_mat_QK = calculate_relevance_QK(relevance_QK, QK_output, w)
     # --------------- Relevance Calculation for K and Q --------------------------------
     stabilized_QK_output = stabilize(QK_output * 2)
     norm_wt_mat_QK = wt_mat_QK / stabilized_QK_output
-    wt_mat_Q = np.einsum('ij,jk->ik', norm_wt_mat_QK, key_output) * query_output
-    wt_mat_K = np.einsum('ij,ik->kj', query_output, norm_wt_mat_QK) * key_output
+    wt_mat_Q = np.einsum('htd,hdb->htb', norm_wt_mat_QK, key_states) * query_states
+    wt_mat_K = np.einsum('htd,htb->hbd', query_states, norm_wt_mat_QK) * key_states
     wt_mat = wt_mat_V + wt_mat_K + wt_mat_Q
+    # Reshape wt_mat
+    wt_mat = np.einsum('htd->thd', wt_mat)
+    wt_mat = wt_mat.reshape(wt_mat.shape[0], wt_mat.shape[1] * wt_mat.shape[2])  # reshaped_array = array.reshape(8, 32 * 128)
     return wt_mat
@@ -893,7 +1497,9 @@ def calculate_wt_feed_forward(wts, inp, w):
         R2 = wts[i]
         contribution_matrix2 = np.einsum('ij,j->ij', w['W_out'], intermediate_output[i])
         wt_mat2 = np.zeros(contribution_matrix2.shape)
+        bias_out = w['b_out'] if 'b_out' in w else 0
         for j in range(contribution_matrix2.shape[0]):
             l1_ind1 = contribution_matrix2[j]
             wt_ind1 = wt_mat2[j]
@@ -903,14 +1509,23 @@ def calculate_wt_feed_forward(wts, inp, w):
             n_ind = l1_ind1 < 0
             p_sum = np.sum(l1_ind1[p_ind])
             n_sum = np.sum(l1_ind1[n_ind]) * -1
+            # Handle positive and negative bias contributions
+            if bias_out[i] > 0:
+                pbias = bias_out[i]
+                nbias = 0
+            else:
+                pbias = 0
+                nbias = -bias_out[i]
             if p_sum > 0:
-                p_agg_wt = p_sum / (p_sum + n_sum)
+                p_agg_wt = (p_sum + pbias) / (p_sum + n_sum + pbias + nbias)
+                p_agg_wt = p_agg_wt * (p_sum / (p_sum + pbias))
             else:
                 p_agg_wt = 0
             if n_sum > 0:
-                n_agg_wt = n_sum / (p_sum + n_sum)
+                n_agg_wt = (n_sum + nbias) / (p_sum + n_sum + pbias + nbias)
+                n_agg_wt = n_agg_wt * (n_sum / (n_sum + nbias))
             else:
                 n_agg_wt = 0
@@ -929,6 +1544,9 @@ def calculate_wt_feed_forward(wts, inp, w):
         R1 = relevance_out[i]
         contribution_matrix1 = np.einsum('ij,j->ij', w['W_int'], inp[i])
         wt_mat1 = np.zeros(contribution_matrix1.shape)
+        # Check if bias 'b_int' exists, default to 0 if not
+        bias_int = w['b_int'] if 'b_int' in w else 0
         for j in range(contribution_matrix1.shape[0]):
             l1_ind1 = contribution_matrix1[j]
@@ -940,7 +1558,15 @@ def calculate_wt_feed_forward(wts, inp, w):
             p_sum = np.sum(l1_ind1[p_ind])
             n_sum = np.sum(l1_ind1[n_ind]) * -1
-            t_sum = p_sum - n_sum
+            # Handle positive and negative bias
+            if bias_int[i] > 0:
+                pbias = bias_int[i]
+                nbias = 0
+            else:
+                pbias = 0
+                nbias = -bias_int[i]
+            t_sum = p_sum + pbias - n_sum - nbias
             # This layer has a ReLU activation function
             act = {
@@ -959,12 +1585,13 @@ def calculate_wt_feed_forward(wts, inp, w):
                         n_sum = 0
             if p_sum > 0:
-                p_agg_wt = p_sum / (p_sum + n_sum)
+                p_agg_wt = (p_sum + pbias) / (p_sum + n_sum + pbias + nbias)
+                p_agg_wt = p_agg_wt * (p_sum / (p_sum + pbias))
             else:
                 p_agg_wt = 0
             if n_sum > 0:
-                n_agg_wt = n_sum / (p_sum + n_sum)
+                n_agg_wt = (n_sum + nbias) / (p_sum + n_sum + pbias + nbias)
+                n_agg_wt = n_agg_wt * (n_sum / (n_sum + nbias))
             else:
                 n_agg_wt = 0
@@ -1121,7 +1748,7 @@ def calculate_wt_pooler(wts, inp, w):
         # Calculate relevance for each token
         relevance_inp[i] = wt_mat.sum(axis=0)
-    relevance_inp *= (100 / np.sum(relevance_inp))
+    relevance_inp *= (np.sum(wts) / np.sum(relevance_inp))
     return relevance_inp