dl-backtrace 0.0.14__py3-none-any.whl → 0.0.16.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dl-backtrace might be problematic.
- dl_backtrace/pytorch_backtrace/backtrace/backtrace.py +173 -44
- dl_backtrace/pytorch_backtrace/backtrace/utils/__init__.py +3 -0
- dl_backtrace/pytorch_backtrace/backtrace/utils/encoder.py +183 -0
- dl_backtrace/pytorch_backtrace/backtrace/utils/encoder_decoder.py +489 -0
- dl_backtrace/pytorch_backtrace/backtrace/utils/helper.py +95 -0
- dl_backtrace/pytorch_backtrace/backtrace/utils/prop.py +481 -0
- dl_backtrace/tf_backtrace/backtrace/__init__.py +1 -2
- dl_backtrace/tf_backtrace/backtrace/activation_info.py +33 -0
- dl_backtrace/tf_backtrace/backtrace/backtrace.py +506 -279
- dl_backtrace/tf_backtrace/backtrace/models.py +25 -0
- dl_backtrace/tf_backtrace/backtrace/server.py +27 -0
- dl_backtrace/tf_backtrace/backtrace/utils/__init__.py +5 -2
- dl_backtrace/tf_backtrace/backtrace/utils/encoder.py +206 -0
- dl_backtrace/tf_backtrace/backtrace/utils/encoder_decoder.py +501 -0
- dl_backtrace/tf_backtrace/backtrace/utils/helper.py +99 -0
- dl_backtrace/tf_backtrace/backtrace/utils/utils_contrast.py +1132 -0
- dl_backtrace/tf_backtrace/backtrace/utils/utils_prop.py +1582 -0
- dl_backtrace/version.py +2 -2
- {dl_backtrace-0.0.14.dist-info → dl_backtrace-0.0.16.dev4.dist-info}/METADATA +2 -2
- dl_backtrace-0.0.16.dev4.dist-info/RECORD +29 -0
- {dl_backtrace-0.0.14.dist-info → dl_backtrace-0.0.16.dev4.dist-info}/WHEEL +1 -1
- dl_backtrace/tf_backtrace/backtrace/config.py +0 -41
- dl_backtrace/tf_backtrace/backtrace/utils/contrast.py +0 -834
- dl_backtrace/tf_backtrace/backtrace/utils/prop.py +0 -725
- dl_backtrace-0.0.14.dist-info/RECORD +0 -21
- {dl_backtrace-0.0.14.dist-info → dl_backtrace-0.0.16.dev4.dist-info}/LICENSE +0 -0
- {dl_backtrace-0.0.14.dist-info → dl_backtrace-0.0.16.dev4.dist-info}/top_level.txt +0 -0
dl_backtrace/pytorch_backtrace/backtrace/backtrace.py

@@ -4,6 +4,9 @@ import torch.nn as nn
 from dl_backtrace.pytorch_backtrace.backtrace.utils import contrast as UC
 from dl_backtrace.pytorch_backtrace.backtrace.utils import prop as UP
 from dl_backtrace.pytorch_backtrace.backtrace.config import activation_master
+from dl_backtrace.pytorch_backtrace.backtrace.utils import helper as HP
+from dl_backtrace.pytorch_backtrace.backtrace.utils import encoder as EN
+from dl_backtrace.pytorch_backtrace.backtrace.utils import encoder_decoder as ED

 class Backtrace(object):
     """
@@ -11,47 +14,78 @@ class Backtrace(object):
     It takes two optional parameters: model (a neural network model) and activation_dict (a dictionary that maps layer names to activation functions).
     """

-    def __init__(self, model=None, activation_dict={}):
-    [... 26 deleted lines (old 15-40), collapsed in the source view ...]
+    def __init__(self, model=None, activation_dict={}, model_type=None):
+        if model_type == 'encoder':
+            self.model = model
+            self.model_type = model_type
+
+            # create a tree-like structure for encoder model
+            self.model_resource = EN.build_encoder_tree(model)
+
+            # create a layer stack for encoder model
+            self.create_layer_stack()
+
+            # extract the encoder model weights
+            self.model_weights = EN.extract_encoder_weights(model)
+
+            # # calculate the output of each submodule of the encoder model
+            # self.all_out_model = EN.create_encoder_output(model)
+
+        elif model_type == 'encoder_decoder':
+            self.model = model
+            self.model_type = model_type
+
+            # create a tree-like structure and layer_stack for encoder-decoder model
+            self.model_resource, self.layer_stack = ED.build_enc_dec_tree(model)
+
+            # extract the encoder-decoder model weights
+            self.model_weights = ED.extract_encoder_decoder_weights(model)
+
+            # # calculate the output of each submodule of the encoder-decoder model
+            # self.all_out_model = ED.calculate_encoder_decoder_output(model)
+
+
+        else:
+            self.model_type = model_type
+            # create a tree-like structure that represents the layers of the neural network model
+            self.create_tree(model)
+
+            # create a new model (an instance of tf.keras.Model) that produces the output of each layer in the neural network.
+            self.create_model_output(model)
+
+            # create a new model (an instance of tf.keras.Model) that produces the output of each layer in the neural network.
+            self.create_every_model_output(model)
+
+            # create a layer stack that defines the order in which layers should be processed during backpropagation.
+            self.create_layer_stack()
+
+            # checks if the model is sequential or not. If it's sequential, it adds the input layer to the layer stack.
+            # identity
+
+            inp_name = 'identity'
+            self.layer_stack.append(inp_name)
+            self.model_resource[1][inp_name] = {}
+            self.model_resource[1][inp_name]["name"] = inp_name
+            self.model_resource[1][inp_name]["type"] = "input"
+            self.model_resource[1][inp_name]["parent"] = []
+            self.model_resource[1][inp_name]["child"] = None
+            self.model_resource[3].append(inp_name)
+            self.sequential = True
+        try:

-    [... 6 deleted lines (old 42-47), collapsed in the source view ...]
+            # calls the build_activation_dict method to build a dictionary that maps layer names to activation functions.
+            # If that fails, it creates a temporary dictionary with default activation functions.
+            if len(activation_dict) == 0:
+                self.build_activation_dict(model)
+            else:
+                self.activation_dict = activation_dict

-    [... 6 deleted lines (old 49-54), collapsed in the source view ...]
+        except Exception as e:
+            print(e)
+            temp_dict = {}
+            for l in model.layers:
+                temp_dict[l.name] = activation_master["None"]
+            self.activation_dict = temp_dict

     def build_activation_dict(self, model):
         model_resource = self.model_resource
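The reworked constructor is the entry point for the new transformer support: `model_type='encoder'` routes through `EN.build_encoder_tree` / `EN.extract_encoder_weights`, `'encoder_decoder'` through `ED.build_enc_dec_tree`, and anything else falls back to the original layer-graph path (whose comments still mention `tf.keras.Model`, and whose exception fallback iterates Keras-style `model.layers`, both apparently carried over from the TensorFlow implementation). A minimal construction sketch, assuming a Hugging Face `BertForSequenceClassification` (an assumption based on the `model.bert.*` accesses in `utils/encoder.py` below; the model name is illustrative, not from the diff):

```python
# A sketch, not the package's documented API: model name and tokenizer use are
# illustrative; only the Backtrace(..., model_type=...) call is from the diff.
from transformers import BertForSequenceClassification
from dl_backtrace.pytorch_backtrace.backtrace.backtrace import Backtrace

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

bt = Backtrace(model=model, model_type="encoder")
# bt.model_resource -> (layer_tree, ltree, outputs, inputs) from build_encoder_tree
# bt.model_weights  -> per-component weight dict from extract_encoder_weights
```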
@@ -321,6 +355,7 @@
         multiplier=100.0,
         scaler=0,
         max_unit=0,
+        predicted_token=None,
     ):
         # This method is used for evaluating layer-wise relevance based on different modes.
         if mode == "default":
@@ -330,6 +365,7 @@
                 multiplier=multiplier,
                 scaler=0,
                 max_unit=0,
+                predicted_token=predicted_token,
             )
             return output
         elif mode == "contrast":
@@ -342,7 +378,7 @@
             return output

     def proportional_eval(
-        self, all_out, start_wt=[], multiplier=100.0, scaler=0, max_unit=0
+        self, all_out, start_wt=[], multiplier=100.0, scaler=0, max_unit=0, predicted_token=None
     ):
         model_resource = self.model_resource
         activation_dict = self.activation_dict
@@ -350,10 +386,21 @@
         out_layer = model_resource[2][0]
         all_wt = {}
         if len(start_wt) == 0:
-            [... 4 deleted lines (old 353-356), collapsed in the source view ...]
+            if self.model_type == 'encoder':
+                start_wt = UP.calculate_start_wt(all_out[out_layer].detach().numpy())
+                all_wt[out_layer] = start_wt * multiplier
+                layer_stack = self.layer_stack
+                all_wts = self.model_weights
+            if self.model_type == 'encoder_decoder':
+                start_wt = UP.calculate_enc_dec_start_wt(all_out[out_layer][0].detach().numpy(), predicted_token)
+                all_wt[out_layer] = start_wt * multiplier
+                layer_stack = self.layer_stack
+                all_wts = self.model_weights
+            else:
+                start_wt = UP.calculate_start_wt(all_out[out_layer])
+                all_wt[out_layer] = start_wt * multiplier
+                layer_stack = self.layer_stack
+
         for start_layer in layer_stack:
             if model_resource[1][start_layer]["child"]:
                 child_nodes = model_resource[1][start_layer]["child"]
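Note the branch structure in this hunk: the `encoder_decoder` test is a plain `if`, not `elif`, so the trailing `else` pairs with it alone. For an `'encoder'` model the first branch runs and then the `else` runs as well, recomputing `start_wt` on the raw tensor without `.detach().numpy()`. This looks unintended; a self-contained sketch of the control-flow pitfall (my illustration, not code from the package):

```python
def seed_relevance(model_type):
    # Mirrors the published control flow: the second test is `if`, not `elif`,
    # so its `else` also fires when model_type == 'encoder'.
    taken = []
    if model_type == 'encoder':
        taken.append('encoder branch')
    if model_type == 'encoder_decoder':
        taken.append('encoder_decoder branch')
    else:
        taken.append('default branch')
    return taken

print(seed_relevance('encoder'))  # ['encoder branch', 'default branch']
```

Presumably `elif self.model_type == 'encoder_decoder':` was intended, which would make the three seeding paths mutually exclusive. `predicted_token`, threaded down from the evaluation entry point above, selects which generated position seeds the relevance in the encoder-decoder case.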
@@ -453,6 +500,88 @@
                         all_wt[start_layer], lstm_obj_f.compute_log
                     )
                     all_wt[child_nodes[0]] += temp_wt
+
+                elif model_resource[1][start_layer]["class"] == "Self_Attention":
+                    weights = all_wts[start_layer]
+                    self_attention_weights = HP.rename_self_attention_keys(weights)
+
+                    temp_wt = UP.calculate_wt_self_attention(
+                        all_wt[start_layer],
+                        all_out[child_nodes[0]][0].detach().numpy(),
+                        self_attention_weights,
+                    )
+                    all_wt[child_nodes[0]] += temp_wt
+
+                elif model_resource[1][start_layer]["class"] == 'Residual':
+                    temp_wt = UP.calculate_wt_add(
+                        all_wt[start_layer],
+                        [all_out[ch].detach().numpy() for ch in child_nodes],
+                    )
+
+                    for ind, ch in enumerate(child_nodes):
+                        all_wt[ch] += temp_wt[ind]
+
+                elif model_resource[1][start_layer]["class"] == 'Feed_Forward':
+                    weights = all_wts[start_layer]
+                    feed_forward_weights = HP.rename_feed_forward_keys(weights)
+
+                    temp_wt = UP.calculate_wt_feed_forward(
+                        all_wt[start_layer],
+                        all_out[child_nodes[0]][0].detach().numpy(),
+                        feed_forward_weights
+                    )
+                    all_wt[child_nodes[0]] += temp_wt
+
+                elif model_resource[1][start_layer]["class"] == "Pooler":
+                    weights = all_wts[start_layer]
+                    pooler_weights = HP.rename_pooler_keys(weights)
+
+                    temp_wt = UP.calculate_wt_pooler(
+                        all_wt[start_layer],
+                        all_out[child_nodes[0]][0].detach().numpy(),
+                        pooler_weights
+                    )
+                    all_wt[child_nodes[0]] += temp_wt
+
+                elif model_resource[1][start_layer]["class"] == "Classifier":
+                    weights = all_wts[start_layer]
+                    classifier_weights = HP.rename_classifier_keys(weights)
+
+                    temp_wt = UP.calculate_wt_classifier(
+                        all_wt[start_layer],
+                        all_out[child_nodes[0]][0].detach().numpy(),
+                        classifier_weights
+                    )
+                    all_wt[child_nodes[0]] += temp_wt
+
+                elif model_resource[1][start_layer]["class"] == "LM_Head":
+                    weights = all_wts[start_layer]
+                    lm_head_weights = HP.rename_decoder_lm_head(weights)
+
+                    temp_wt = UP.calculate_wt_lm_head(
+                        all_wt[start_layer],
+                        all_out[child_nodes[0]][0].detach().numpy(),
+                        lm_head_weights
+                    )
+                    all_wt[child_nodes[0]] += temp_wt
+
+                elif model_resource[1][start_layer]["class"] == 'Layer_Norm':
+                    temp_wt = all_wt[start_layer]
+                    all_wt[child_nodes[0]] += temp_wt
+
+                elif model_resource[1][start_layer]["class"] == 'Cross_Attention':
+                    weights = all_wts[start_layer]
+                    cross_attention_weights = HP.rename_cross_attention_keys(weights)
+
+                    temp_wt = UP.calculate_wt_cross_attention(
+                        all_wt[start_layer],
+                        [all_out[ch][0].detach().numpy() for ch in child_nodes],
+                        cross_attention_weights,
+                    )
+
+                    for ind, ch in enumerate(child_nodes):
+                        all_wt[ch] += temp_wt[ind]
+
                 else:
                     temp_wt = all_wt[start_layer]
                     all_wt[child_nodes[0]] += temp_wt
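Each new `elif` above dispatches on the `"class"` field of the node record stored in `model_resource[1]`, and forwards relevance to the node's `"child"` entries. For orientation, this is roughly what one record built by `EN.build_encoder_tree` (shown below) looks like; the concrete values are derived from that function's construction, not copied from the diff:

```python
# Illustrative node record from model_resource[1]; the "class" string picks
# the UP.* propagation rule, and relevance flows toward the "child" nodes.
node = {
    'name': 'attention_0',
    'class': 'Self_Attention',        # -> UP.calculate_wt_self_attention
    'type': "<class 'str'>",          # components are registered as strings
    'parent': ['add_and_layer_norm_0_0'],
    'child': ['embeddings'],
}
```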
dl_backtrace/pytorch_backtrace/backtrace/utils/encoder.py (new file, @@ -0,0 +1,183 @@; shown without '+' markers since every line is an addition)

import torch


def build_encoder_tree(model, root='bert'):
    # Initialize the tree structure
    ltree = {}
    layer_tree = {}
    inputs = []
    outputs = []
    intermediates = []

    # Base component setup
    def add_component(tree, name, component, child=None):
        tree[name] = {
            'name': name,
            'class': component if type(component).__name__ == 'str' else type(component).__name__,
            'type': str(type(component)),
            'parent': None,
            'child': None
        }

        if isinstance(child, list):
            tree[name]['child'] = child
        elif isinstance(child, str):
            tree[name]['child'] = [child]

        if tree[name]['class'] == 'list':
            tree[name]['class'] = [type(item).__name__ for item in component]
            tree[name]['type'] = [str(type(item)) for item in component]

        # Keep track of component type in a separate dictionary
        layer_tree[name] = component if type(component).__name__ == 'str' else tree[name]['type']

        # Link the parent to its children
        if isinstance(child, list):
            for ch in child:
                if ch in tree:
                    tree[ch]['parent'] = [name]

        elif isinstance(child, str):
            if child in tree:
                tree[child]['parent'] = [name]

        return tree[name]

    # Add root and embeddings component
    embeddings = add_component(ltree, 'embeddings', 'Embeddings', child=None)

    # Add encoder layers dynamically
    current_child = 'embeddings'
    for i, layer in enumerate(model.bert.encoder.layer):
        attention = add_component(ltree, f'attention_{i}', 'Self_Attention', child=current_child)
        add_and_layer_norm_0 = add_component(ltree, f'add_and_layer_norm_{i}_0', 'Residual', child=[f'attention_{i}', current_child])
        feed_forward = add_component(ltree, f'feed_forward_{i}', 'Feed_Forward', child=f'add_and_layer_norm_{i}_0')
        add_and_layer_norm_1 = add_component(ltree, f'add_and_layer_norm_{i}_1', 'Residual', child=[f'feed_forward_{i}', f'add_and_layer_norm_{i}_0'])
        current_child = f'add_and_layer_norm_{i}_1'  # Update current_child to link this layer's output to the next layer's input

    # Optionally add pooler layer if present
    if hasattr(model.bert, 'pooler'):
        pooler = add_component(ltree, 'pooler', 'Pooler', child=current_child)
        current_child = 'pooler'

    if hasattr(model, 'classifier'):
        classifier = add_component(ltree, 'classifier', 'Classifier', child=current_child)
        current_child = 'classifier'

    # Classify components
    for name, component in ltree.items():
        if component['parent'] is None:
            outputs.append(component['name'])
        elif component['child'] is None:
            inputs.append(component['name'])
        else:
            intermediates.append(component['name'])

    model_resource = (layer_tree, ltree, outputs, inputs)
    return model_resource
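`build_encoder_tree` accepts a `root` argument but never uses it; the helpers hard-code `model.bert`, so the function effectively targets BERT-style classifiers. A small inspection sketch (model name illustrative; the printed values follow from the construction above for a BERT classifier with a pooler and classifier head):

```python
# Sketch: inspect the traced tree for a BERT classifier (names illustrative).
from transformers import BertForSequenceClassification
from dl_backtrace.pytorch_backtrace.backtrace.utils.encoder import build_encoder_tree

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
layer_tree, ltree, outputs, inputs = build_encoder_tree(model)

print(inputs)   # ['embeddings'] -- the only node with no child
print(outputs)  # ['classifier'] -- the only node with no parent; this is
                # model_resource[2][0], where proportional_eval seeds relevance
print(ltree['attention_0']['child'])   # ['embeddings']
print(ltree['attention_0']['parent'])  # ['add_and_layer_norm_0_0']
```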
def extract_encoder_weights(model):
    # Initialize a dictionary to hold the weights
    weights_dict = {
        'embeddings': {},
        'pooler': {},
        'dropout': {},
        'classifier': {}
    }

    for i in range(model.config.num_hidden_layers):
        weights_dict[f'attention_{i}'] = {}
        weights_dict[f'add_and_layer_norm_{i}_0'] = {}
        weights_dict[f'feed_forward_{i}'] = {}
        weights_dict[f'add_and_layer_norm_{i}_1'] = {}

    # Extract the model's parameters and organize them into the dictionary
    for name, param in model.bert.named_parameters():
        if 'embeddings' in name:
            weights_dict['embeddings'][name] = param.data.numpy()
        elif 'encoder.layer' in name:
            layer = name.split('.')[2]
            submodule = name.split('.')[3]
            if 'attention' in submodule and 'LayerNorm' not in name:
                weights_dict[f'attention_{layer}'][name] = param.data.numpy()
            elif 'attention.output.LayerNorm' in name:
                weights_dict[f'add_and_layer_norm_{layer}_0'][name] = param.data.numpy()
            elif 'intermediate' in submodule:
                weights_dict[f'feed_forward_{layer}'][name] = param.data.numpy()
            elif 'output' in submodule and 'LayerNorm' not in name:
                weights_dict[f'feed_forward_{layer}'][name] = param.data.numpy()
            elif 'output.LayerNorm' in name:
                weights_dict[f'add_and_layer_norm_{layer}_1'][name] = param.data.numpy()
        elif 'pooler' in name:
            weights_dict['pooler'][name] = param.data.numpy()

    for name, param in model.named_parameters():
        if 'dropout' in name:
            weights_dict['dropout'][name] = param.data.numpy()
        elif 'classifier' in name:
            weights_dict['classifier'][name] = param.data.numpy()

    return weights_dict
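The routing above keys off position in the dotted parameter name: because the names come from `model.bert.named_parameters()`, index 2 is the encoder block index and index 3 the submodule. A worked example of one name (the name itself is illustrative but has the shape Hugging Face BERT produces; note also that `param.data.numpy()` assumes CPU tensors, so a CUDA-resident model would need `.cpu()` first):

```python
# Walk-through of the name-based routing in extract_encoder_weights.
name = "encoder.layer.0.attention.self.query.weight"
layer = name.split('.')[2]       # '0'  -- block index, kept as a string
submodule = name.split('.')[3]   # 'attention'
# 'attention' in submodule and 'LayerNorm' not in name, so this parameter
# lands in weights_dict['attention_0']
print(f"attention_{layer}")      # attention_0
```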
def create_encoder_output(model, input_ids=None, attention_mask=None, token_type_ids=None):
    all_layer_outputs = {}

    # Embeddings
    embedding_output = model.bert.embeddings(input_ids=input_ids, token_type_ids=token_type_ids)
    all_layer_outputs['embeddings'] = embedding_output

    # iterate over each layer
    hidden_states = embedding_output

    for i, layer_module in enumerate(model.bert.encoder.layer):
        with torch.no_grad():
            # Self-Attention and attention output
            attention_output = layer_module.attention.self(
                hidden_states,
                attention_mask=attention_mask,
            )[0]

            # Add + Layer Norm after attention
            attention_output = layer_module.attention.output.dense(attention_output)
            attention_output = layer_module.attention.output.dropout(attention_output)
            residual_attention_output = attention_output + hidden_states
            attention_output_norm = layer_module.attention.output.LayerNorm(residual_attention_output)

            # Feed Forward (Intermediate)
            intermediate_output = layer_module.intermediate(attention_output_norm)

            # Feed Forward Output
            feed_forward_output = layer_module.output.dense(intermediate_output)
            feed_forward_output = layer_module.output.dropout(feed_forward_output)
            residual_feed_forward_output = feed_forward_output + attention_output_norm
            feed_forward_output_norm = layer_module.output.LayerNorm(residual_feed_forward_output)

        # Save outputs
        all_layer_outputs[f'attention_{i}'] = attention_output
        all_layer_outputs[f'add_and_layer_norm_{i}_0'] = attention_output_norm
        all_layer_outputs[f'feed_forward_{i}'] = feed_forward_output
        all_layer_outputs[f'add_and_layer_norm_{i}_1'] = feed_forward_output_norm

        # Update hidden states for the next layer
        hidden_states = feed_forward_output_norm

    # Pooler
    if hasattr(model.bert, 'pooler'):
        pooled_output = model.bert.pooler(hidden_states)
        all_layer_outputs['pooler'] = pooled_output

    if hasattr(model, 'dropout'):
        dropout_output = model.dropout(pooled_output)
        all_layer_outputs['dropout'] = dropout_output

    # Classifier
    if hasattr(model, 'classifier'):
        classifier = model.classifier(dropout_output)
        softmax_output = torch.nn.functional.softmax(classifier)
        all_layer_outputs['classifier'] = softmax_output

    return all_layer_outputs
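`create_encoder_output` re-runs a BERT classifier block by block and records every intermediate under the same keys the relevance tree uses, although the `__init__` call to it ships commented out, so the caller builds the dict. A usage sketch (model and input are illustrative; `attention_mask` is left as `None` because Hugging Face's `BertSelfAttention` adds the mask directly to the attention scores, i.e. it expects the extended additive form rather than the tokenizer's 0/1 mask; note also that `softmax` is called without an explicit `dim`, which PyTorch warns about):

```python
# Sketch only: wire the traced outputs into proportional_eval.
from transformers import BertForSequenceClassification, BertTokenizer
from dl_backtrace.pytorch_backtrace.backtrace.backtrace import Backtrace
from dl_backtrace.pytorch_backtrace.backtrace.utils.encoder import create_encoder_output

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model.eval()

enc = tokenizer("dl-backtrace attributes relevance", return_tensors="pt")
all_out = create_encoder_output(model,
                                input_ids=enc["input_ids"],
                                token_type_ids=enc.get("token_type_ids"))

print(all_out["classifier"].shape)  # torch.Size([1, num_labels])

bt = Backtrace(model=model, model_type="encoder")
result = bt.proportional_eval(all_out)  # relevance propagated from 'classifier'
```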