dl-backtrace 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dl-backtrace might be problematic.
- dl_backtrace/pytorch_backtrace/backtrace/backtrace.py +173 -44
- dl_backtrace/pytorch_backtrace/backtrace/utils/__init__.py +3 -0
- dl_backtrace/pytorch_backtrace/backtrace/utils/encoder.py +183 -0
- dl_backtrace/pytorch_backtrace/backtrace/utils/encoder_decoder.py +489 -0
- dl_backtrace/pytorch_backtrace/backtrace/utils/helper.py +95 -0
- dl_backtrace/pytorch_backtrace/backtrace/utils/prop.py +481 -0
- dl_backtrace/tf_backtrace/backtrace/__init__.py +1 -2
- dl_backtrace/tf_backtrace/backtrace/activation_info.py +33 -0
- dl_backtrace/tf_backtrace/backtrace/backtrace.py +506 -279
- dl_backtrace/tf_backtrace/backtrace/models.py +25 -0
- dl_backtrace/tf_backtrace/backtrace/server.py +27 -0
- dl_backtrace/tf_backtrace/backtrace/utils/__init__.py +5 -2
- dl_backtrace/tf_backtrace/backtrace/utils/encoder.py +206 -0
- dl_backtrace/tf_backtrace/backtrace/utils/encoder_decoder.py +501 -0
- dl_backtrace/tf_backtrace/backtrace/utils/helper.py +99 -0
- dl_backtrace/tf_backtrace/backtrace/utils/utils_contrast.py +1132 -0
- dl_backtrace/tf_backtrace/backtrace/utils/utils_prop.py +1582 -0
- dl_backtrace/version.py +2 -2
- {dl_backtrace-0.0.14.dist-info → dl_backtrace-0.0.16.dist-info}/METADATA +2 -2
- dl_backtrace-0.0.16.dist-info/RECORD +29 -0
- {dl_backtrace-0.0.14.dist-info → dl_backtrace-0.0.16.dist-info}/WHEEL +1 -1
- dl_backtrace/tf_backtrace/backtrace/config.py +0 -41
- dl_backtrace/tf_backtrace/backtrace/utils/contrast.py +0 -834
- dl_backtrace/tf_backtrace/backtrace/utils/prop.py +0 -725
- dl_backtrace-0.0.14.dist-info/RECORD +0 -21
- {dl_backtrace-0.0.14.dist-info → dl_backtrace-0.0.16.dist-info}/LICENSE +0 -0
- {dl_backtrace-0.0.14.dist-info → dl_backtrace-0.0.16.dist-info}/top_level.txt +0 -0
@@ -744,3 +744,484 @@ def calculate_wt_gavgpool(wts, inp):
             temp_wt = temp_wt + ((n_mat / n_sum) * wt * n_agg_wt * -1.0)
         wt_mat[..., c] = temp_wt
     return wt_mat
+
+
+####################################################################
+################### Encoder Model ####################
+####################################################################
+def stabilize(matrix, epsilon=1e-6):
+    return matrix + epsilon * np.sign(matrix)
+
+
+def calculate_relevance_V(wts, value_output):
+    # Initialize wt_mat with zeros
+    wt_mat_V = np.zeros((wts.shape[0], wts.shape[1], *value_output.shape))
+
+    for i in range(wts.shape[0]):
+        for j in range(wts.shape[1]):
+            l1_ind1 = value_output
+            wt_ind1 = wt_mat_V[i, j]
+            wt = wts[i, j]
+
+            p_ind = l1_ind1 > 0
+            n_ind = l1_ind1 < 0
+            p_sum = np.sum(l1_ind1[p_ind])
+            n_sum = np.sum(l1_ind1[n_ind]) * -1
+
+            if p_sum > 0:
+                p_agg_wt = p_sum / (p_sum + n_sum)
+            else:
+                p_agg_wt = 0
+            if n_sum > 0:
+                n_agg_wt = n_sum / (p_sum + n_sum)
+            else:
+                n_agg_wt = 0
+
+            if p_sum == 0:
+                p_sum = 1
+            if n_sum == 0:
+                n_sum = 1
+
+            wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+            wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+
+    wt_mat_V = np.sum(wt_mat_V, axis=(0,1))
+    return wt_mat_V
+
+
+def calculate_relevance_QK(wts, QK_output):
+    # Initialize wt_mat with zeros
+    wt_mat_QK = np.zeros((wts.shape[0], wts.shape[1], *QK_output.shape))
+
+    for i in range(wts.shape[0]):
+        for j in range(wts.shape[1]):
+            l1_ind1 = QK_output
+            wt_ind1 = wt_mat_QK[i, j]
+            wt = wts[i, j]
+
+            p_ind = l1_ind1 > 0
+            n_ind = l1_ind1 < 0
+            p_sum = np.sum(l1_ind1[p_ind])
+            n_sum = np.sum(l1_ind1[n_ind]) * -1
+
+            t_sum = p_sum - n_sum
+
+            # This layer has a softmax activation function
+            act = {
+                "name": "softmax",
+                "range": {"l": -1, "u": 2},
+                "type": "mono",
+                "func": None,
+            }
+
+            if act["type"] == "mono":
+                if act["range"]["l"]:
+                    if t_sum < act["range"]["l"]:
+                        p_sum = 0
+                if act["range"]["u"]:
+                    if t_sum > act["range"]["u"]:
+                        n_sum = 0
+
+            if p_sum > 0:
+                p_agg_wt = p_sum / (p_sum + n_sum)
+            else:
+                p_agg_wt = 0
+
+            if n_sum > 0:
+                n_agg_wt = n_sum / (p_sum + n_sum)
+            else:
+                n_agg_wt = 0
+
+            if p_sum == 0:
+                p_sum = 1
+            if n_sum == 0:
+                n_sum = 1
+
+            wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+            wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+
+    wt_mat_QK = np.sum(wt_mat_QK, axis=(0, 1))
+    return wt_mat_QK
+
+
+def calculate_wt_self_attention(wts, inp, w):
+    '''
+    Input:
+        wts: relevance score of the layer
+        inp: input to the layer
+        w: weights of the layer- ['W_q', 'W_k', 'W_v', 'W_o']
+
+    Outputs:
+        Step-1: outputs = torch.matmul(input_a, input_b)
+        Step-2: outputs = F.softmax(inputs, dim=dim, dtype=dtype)
+        Step-3: outputs = input_a * input_b
+    '''
+    query_output = np.einsum('ij,kj->ik', inp, w['W_q'])
+    key_output = np.einsum('ij,kj->ik', inp, w['W_k'])
+    value_output = np.einsum('ij,kj->ik', inp, w['W_v'])
+
+    # --------------- Relevance Calculation for Step-3 -----------------------
+    relevance_V = wts / 2
+    relevance_QK = wts / 2
+
+    # --------------- Relevance Calculation for V --------------------------------
+    wt_mat_V = calculate_relevance_V(relevance_V, value_output)
+
+    # --------------- Transformed Relevance QK ----------------------------------
+    QK_output = np.einsum('ij,kj->ik', query_output, key_output)
+    wt_mat_QK = calculate_relevance_QK(relevance_QK, QK_output)
+
+    # --------------- Relevance Calculation for K and Q --------------------------------
+    stabilized_QK_output = stabilize(QK_output * 2)
+    norm_wt_mat_QK = wt_mat_QK / stabilized_QK_output
+    wt_mat_Q = np.einsum('ij,jk->ik', norm_wt_mat_QK, key_output) * query_output
+    wt_mat_K = np.einsum('ij,ik->kj', query_output, norm_wt_mat_QK) * key_output
+
+    wt_mat = wt_mat_V + wt_mat_K + wt_mat_Q
+    return wt_mat
+
+
+def calculate_wt_feed_forward(wts, inp, w):
+    intermediate_output = np.einsum('ij,jk->ik', inp, w['W_int'].T)
+    feed_forward_output = np.einsum('ij,jk->ik', intermediate_output, w['W_out'].T)
+
+    relevance_input = np.zeros(inp.shape)
+    relevance_out = np.zeros(intermediate_output.shape)
+
+    # Relevance propagation for 2nd layer
+    for i in range(wts.shape[0]):
+        R2 = wts[i]
+        contribution_matrix2 = np.einsum('ij,j->ij', w['W_out'], intermediate_output[i])
+        wt_mat2 = np.zeros(contribution_matrix2.shape)
+
+        for j in range(contribution_matrix2.shape[0]):
+            l1_ind1 = contribution_matrix2[j]
+            wt_ind1 = wt_mat2[j]
+            wt = R2[j]
+
+            p_ind = l1_ind1 > 0
+            n_ind = l1_ind1 < 0
+            p_sum = np.sum(l1_ind1[p_ind])
+            n_sum = np.sum(l1_ind1[n_ind]) * -1
+
+            if p_sum > 0:
+                p_agg_wt = p_sum / (p_sum + n_sum)
+            else:
+                p_agg_wt = 0
+
+            if n_sum > 0:
+                n_agg_wt = n_sum / (p_sum + n_sum)
+            else:
+                n_agg_wt = 0
+
+            if p_sum == 0:
+                p_sum = 1
+            if n_sum == 0:
+                n_sum = 1
+
+            wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+            wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+
+        relevance_out[i] = wt_mat2.sum(axis=0)
+
+    # Relevance propagation for 1st layer
+    for i in range(relevance_out.shape[0]):
+        R1 = relevance_out[i]
+        contribution_matrix1 = np.einsum('ij,j->ij', w['W_int'], inp[i])
+        wt_mat1 = np.zeros(contribution_matrix1.shape)
+
+        for j in range(contribution_matrix1.shape[0]):
+            l1_ind1 = contribution_matrix1[j]
+            wt_ind1 = wt_mat1[j]
+            wt = R1[j]
+
+            p_ind = l1_ind1 > 0
+            n_ind = l1_ind1 < 0
+            p_sum = np.sum(l1_ind1[p_ind])
+            n_sum = np.sum(l1_ind1[n_ind]) * -1
+
+            t_sum = p_sum - n_sum
+
+            # This layer has a ReLU activation function
+            act = {
+                "name": "relu",
+                "range": {"l": 0, "u": None},
+                "type": "mono",
+                "func": None,
+            }
+
+            if act["type"] == "mono":
+                if act["range"]["l"]:
+                    if t_sum < act["range"]["l"]:
+                        p_sum = 0
+                if act["range"]["u"]:
+                    if t_sum > act["range"]["u"]:
+                        n_sum = 0
+
+            if p_sum > 0:
+                p_agg_wt = p_sum / (p_sum + n_sum)
+            else:
+                p_agg_wt = 0
+
+            if n_sum > 0:
+                n_agg_wt = n_sum / (p_sum + n_sum)
+            else:
+                n_agg_wt = 0
+
+            if p_sum == 0:
+                p_sum = 1
+            if n_sum == 0:
+                n_sum = 1
+
+            wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+            wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+
+        relevance_input[i] = wt_mat1.sum(axis=0)
+
+    return relevance_input
+
+
+def calculate_wt_classifier(wts, inp, w):
+    '''
+    Input:
+        wts: relevance score of the layer
+        inp: input to the layer
+        w: weights of the layer- ['W_cls', 'b_cls']
+    '''
+    mul_mat = np.einsum("ij, i->ij", w['W_cls'].T, inp).T
+    wt_mat = np.zeros(mul_mat.shape)
+
+    for i in range(mul_mat.shape[0]):
+        l1_ind1 = mul_mat[i]
+        wt_ind1 = wt_mat[i]
+        wt = wts[i]
+
+        p_ind = l1_ind1 > 0
+        n_ind = l1_ind1 < 0
+        p_sum = np.sum(l1_ind1[p_ind])
+        n_sum = np.sum(l1_ind1[n_ind]) * -1
+
+        if w['b_cls'][i] > 0:
+            pbias = w['b_cls'][i]
+            nbias = 0
+        else:
+            pbias = 0
+            nbias = w['b_cls'][i]
+
+        t_sum = p_sum + pbias - n_sum - nbias
+
+        # This layer has a softmax activation function
+        act = {
+            "name": "softmax",
+            "range": {"l": -1, "u": 2},
+            "type": "mono",
+            "func": None,
+        }
+
+        if act["type"] == "mono":
+            if act["range"]["l"]:
+                if t_sum < act["range"]["l"]:
+                    p_sum = 0
+            if act["range"]["u"]:
+                if t_sum > act["range"]["u"]:
+                    n_sum = 0
+
+        if p_sum > 0:
+            p_agg_wt = (p_sum + pbias) / (p_sum + n_sum + pbias + nbias)
+            p_agg_wt = p_agg_wt * (p_sum / (p_sum + pbias))
+        else:
+            p_agg_wt = 0
+        if n_sum > 0:
+            n_agg_wt = (n_sum + nbias) / (p_sum + n_sum + pbias + nbias)
+            n_agg_wt = n_agg_wt * (n_sum / (n_sum + nbias))
+        else:
+            n_agg_wt = 0
+
+        if p_sum == 0:
+            p_sum = 1
+        if n_sum == 0:
+            n_sum = 1
+
+        wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+        wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+
+    wt_mat = wt_mat.sum(axis=0)
+    return wt_mat
+
+
+def calculate_wt_pooler(wts, inp, w):
+    '''
+    Input:
+        wts: relevance score of the layer
+        inp: input to the layer
+        w: weights of the layer- ['W_p', 'b_p']
+    '''
+    relevance_inp = np.zeros(inp.shape)
+
+    for i in range(inp.shape[0]):
+        # Compute contribution matrix
+        contribution_matrix = np.einsum('ij,j->ij', w['W_p'], inp[i])
+        wt_mat = np.zeros(contribution_matrix.shape)
+
+        # Iterate over each unit
+        for j in range(contribution_matrix.shape[0]):
+            l1_ind1 = contribution_matrix[j]
+            wt_ind1 = wt_mat[j]
+            wt = wts[j]
+
+            p_ind = l1_ind1 > 0
+            n_ind = l1_ind1 < 0
+            p_sum = np.sum(l1_ind1[p_ind])
+            n_sum = np.sum(l1_ind1[n_ind]) * -1
+
+            # Calculate biases
+            pbias = max(w['b_p'][j], 0)
+            nbias = min(w['b_p'][j], 0) * -1
+
+            t_sum = p_sum + pbias - n_sum - nbias
+
+            # This layer has a tanh activation function
+            act = {
+                "name": "tanh",
+                "range": {"l": -2, "u": 2},
+                "type": "mono",
+                "func": None
+            }
+
+            # Apply activation function constraints
+            if act["type"] == "mono":
+                if act["range"]["l"]:
+                    if t_sum < act["range"]["l"]:
+                        p_sum = 0
+                if act["range"]["u"]:
+                    if t_sum > act["range"]["u"]:
+                        n_sum = 0
+
+            # Aggregate weights based on positive and negative contributions
+            p_agg_wt = 0
+            n_agg_wt = 0
+            if p_sum > 0:
+                p_agg_wt = (p_sum + pbias) / (p_sum + n_sum + pbias + nbias)
+                p_agg_wt *= (p_sum / (p_sum + pbias))
+
+            if n_sum > 0:
+                n_agg_wt = (n_sum + nbias) / (p_sum + n_sum + pbias + nbias)
+                n_agg_wt *= (n_sum / (n_sum + nbias))
+
+            # Prevent division by zero
+            if p_sum == 0:
+                p_sum = 1
+            if n_sum == 0:
+                n_sum = 1
+
+            # Update weight matrix
+            wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+            wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+
+        # Calculate relevance for each token
+        relevance_inp[i] = wt_mat.sum(axis=0)
+
+    relevance_inp *= (100 / np.sum(relevance_inp))
+    return relevance_inp
+
+
+####################################################################
+################### Encoder-Decoder Model ####################
+####################################################################
+
+def calculate_enc_dec_start_wt(arg, indices):
+    y = np.zeros(arg.shape, dtype=np.float64)
+    value = 1 / arg.shape[0]
+
+    for i in range(arg.shape[0]):
+        y[i][indices[i]] = value
+
+    return y
+
+
+def calculate_wt_lm_head(wts, inp, w):
+    '''
+    Input:
+        wts: relevance score of the layer
+        inp: input to the layer
+        w: weights of the layer- ['W_lm_head']
+    '''
+    relevance_input = np.zeros(inp.shape)
+
+    for i in range(wts.shape[0]):
+        R = wts[i]
+        contribution_matrix = np.einsum('ij,j->ij', w['W_lm_head'], inp[i])
+        wt_mat = np.zeros(contribution_matrix.shape)
+
+        for j in range(contribution_matrix.shape[0]):
+            l1_ind1 = contribution_matrix[j]
+            wt_ind1 = wt_mat[j]
+            wt = R[j]
+
+            p_ind = l1_ind1 > 0
+            n_ind = l1_ind1 < 0
+
+            p_sum = np.sum(l1_ind1[p_ind])
+            n_sum = np.sum(l1_ind1[n_ind]) * -1
+
+            if p_sum > 0:
+                p_agg_wt = p_sum / (p_sum + n_sum)
+            else:
+                p_agg_wt = 0
+
+            if n_sum > 0:
+                n_agg_wt = n_sum / (p_sum + n_sum)
+            else:
+                n_agg_wt = 0
+
+            if p_sum == 0:
+                p_sum = 1
+            if n_sum == 0:
+                n_sum = 1
+
+            wt_ind1[p_ind] = (l1_ind1[p_ind] / p_sum) * wt * p_agg_wt
+            wt_ind1[n_ind] = (l1_ind1[n_ind] / n_sum) * wt * n_agg_wt * -1.0
+
+        relevance_input[i] = wt_mat.sum(axis=0)
+
+    return relevance_input
+
+
+def calculate_wt_cross_attention(wts, inp, w):
+    '''
+    Input:
+        wts: relevance score of the layer
+        inp: input to the layer
+        w: weights of the layer- ['W_q', 'W_k', 'W_v', 'W_o']
+        inputs: dict_keys(['query', 'key', 'value'])
+
+    Outputs:
+        Step-1: outputs = torch.matmul(input_a, input_b)
+        Step-2: outputs = F.softmax(inputs, dim=dim, dtype=dtype)
+        Step-3: outputs = input_a * input_b
+    '''
+    k_v_inp, q_inp = inp
+    query_output = np.einsum('ij,kj->ik', q_inp, w['W_q'])
+    key_output = np.einsum('ij,kj->ik', k_v_inp, w['W_k'])
+    value_output = np.einsum('ij,kj->ik', k_v_inp, w['W_v'])
+
+    # --------------- Relevance Calculation for Step-3 -----------------------
+    relevance_V = wts / 2
+    relevance_QK = wts / 2
+
+    # --------------- Relevance Calculation for V --------------------------------
+    wt_mat_V = calculate_relevance_V(relevance_V, value_output)
+
+    # --------------- Transformed Relevance QK ----------------------------------
+    QK_output = np.einsum('ij,kj->ik', query_output, key_output)
+    wt_mat_QK = calculate_relevance_QK(relevance_QK, QK_output)
+
+    # --------------- Relevance Calculation for K and Q --------------------------------
+    stabilized_QK_output = stabilize(QK_output * 2)
+    norm_wt_mat_QK = wt_mat_QK / stabilized_QK_output
+    wt_mat_Q = np.einsum('ij,jk->ik', norm_wt_mat_QK, key_output) * query_output
+    wt_mat_K = np.einsum('ij,ik->kj', query_output, norm_wt_mat_QK) * key_output
+
+    wt_mat_KV = wt_mat_V + wt_mat_K
+    wt_mat = [wt_mat_KV, wt_mat_Q]
+    return wt_mat
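For orientation, here is a minimal sketch (not part of the wheel) that exercises the newly added self-attention relevance helpers on random data. It assumes the functions from the hunk above (stabilize, calculate_relevance_V, calculate_relevance_QK, calculate_wt_self_attention) are in scope; the import path in the comment is an assumption inferred from the file list.

import numpy as np

# Hypothetical import path, inferred from the +481-line utils/prop.py entry above;
# adjust if the helpers live elsewhere in the package:
# from dl_backtrace.pytorch_backtrace.backtrace.utils.prop import calculate_wt_self_attention

rng = np.random.default_rng(0)
seq_len, d_model = 4, 8

inp = rng.standard_normal((seq_len, d_model))        # layer input, one row per token
w = {name: rng.standard_normal((d_model, d_model))   # square projection weights
     for name in ('W_q', 'W_k', 'W_v', 'W_o')}
wts = rng.standard_normal((seq_len, d_model))        # relevance arriving at the attention output

# calculate_wt_self_attention splits wts 50/50 between the V path and the QK path,
# redistributes each half with calculate_relevance_V / calculate_relevance_QK, then
# maps the QK share back onto Q and K through the stabilized QK product.
relevance = calculate_wt_self_attention(wts, inp, w)
print(relevance.shape)                               # (4, 8): same shape as inp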
@@ -0,0 +1,33 @@
+import numpy as np
+def np_swish(x, beta=0.75):
+    z = 1 / (1 + np.exp(-(beta * x)))
+    return x * z
+
+activation_master = {'None': {'name': None,
+                              'range': {'l': None, 'u': None},
+                              'type': 'null',
+                              'func': None},
+                     'linear': {'name': None,
+                                'range': {'l': None, 'u': None},
+                                'type': 'mono',
+                                'func': None},
+                     'tanh': {'name': 'tanh',
+                              'range': {'l': -2, 'u': 2},
+                              'type': 'mono',
+                              'func': None},
+                     'sigmoid': {'name': 'sigmoid',
+                                 'range': {'l': -4, 'u': 4},
+                                 'type': 'mono',
+                                 'func': None},
+                     'relu': {'name': 'relu',
+                              'range': {'l': 0, 'u': None},
+                              'type': 'mono',
+                              'func': None},
+                     'swish': {'name': 'swish',
+                               'range': {'l': -6, 'u': None},
+                               'type': 'non_mono',
+                               'func': np_swish},
+                     'softmax': {'name': 'softmax',
+                                 'range': {'l': -1, 'u': 2},
+                                 'type': 'mono',
+                                 'func': None}}
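A quick illustrative look-up against the activation_master table added in this hunk (not part of the package; it assumes activation_master and np_swish from the lines above are in scope, for example imported from the new activation_info.py module):

import numpy as np

act = activation_master['swish']   # {'name': 'swish', 'range': {'l': -6, 'u': None}, 'type': 'non_mono', 'func': np_swish}

x = np.linspace(-3.0, 3.0, 5)
if act['type'] == 'non_mono' and act['func'] is not None:
    # Only the non-monotonic entry carries a callable; np_swish(x) = x * sigmoid(0.75 * x).
    print(act['func'](x))

# The 'l'/'u' bounds are the same saturation limits the propagation rules in the
# previous hunk compare t_sum against before zeroing p_sum / n_sum.
print(act['range']['l'], act['range']['u'])   # -6 None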