tpcav 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tpcav-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 seqcode
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
tpcav-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.4
2
+ Name: tpcav
3
+ Version: 0.1.0
4
+ Summary: Testing with PCA projected Concept Activation Vectors
5
+ Author-email: Jianyu Yang <yztxwd@gmail.com>
6
+ License-Expression: MIT AND (Apache-2.0 OR BSD-2-Clause)
7
+ Project-URL: Homepage, https://github.com/seqcode/TPCAV
8
+ Keywords: interpretation,attribution,concept,genomics,deep learning
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: torch
13
+ Requires-Dist: pandas
14
+ Requires-Dist: numpy
15
+ Requires-Dist: seqchromloader
16
+ Requires-Dist: deeplift
17
+ Requires-Dist: pyfaidx
18
+ Requires-Dist: pybedtools
19
+ Requires-Dist: captum
20
+ Requires-Dist: scikit-learn
21
+ Requires-Dist: biopython
22
+ Requires-Dist: seaborn
23
+ Requires-Dist: matplotlib
24
+ Dynamic: license-file
25
+
26
+ # TPCAV (Testing with PCA projected Concept Activation Vectors)
27
+
28
+ Analysis pipeline for TPCAV
29
+
30
+ ## Dependencies
31
+
32
+ You can use your own environment for the model; in addition, you need to install the following packages:
33
+
34
+ - captum 0.7
35
+ - seqchromloader 0.8.5
36
+ - scikit-learn 1.5.2
37
+
38
+ ## Workflow
39
+
40
+ 1. Since not every saved pytorch model stores the computation graph, you need to manually add functions to let the script know how to get the activations of the intermediate layer and how to proceed from there.
41
+
42
+ There are 3 places you need to insert your own code.
43
+
44
+ - Model class definition in models.py
45
+ - Please first copy your class definition into `Model_Class` in the script, it already has several pre-defined class functions, you need to fill in the following two functions:
46
+ - `forward_until_select_layer`: this is the function that takes your model input and forwards it until the layer you want to compute the TPCAV score on
47
+ - `resume_forward_from_select_layer`: this is the function that starts from the activations of your selected layer and forwards all the way until the end
48
+ - There are also functions necessary for TPCAV computation, don't change them:
49
+ - `forward_from_start`: this function calls `forward_until_select_layer` and `resume_forward_from_select_layer` to do a full forward pass
50
+ - `forward_from_projected_and_residual`: this function takes the PCA projected activations and unexplained residual to do the forward pass
51
+ - `project_avs_to_pca`: this function takes care of the PCA projection
52
+
53
+ > NOTE: you can modify your final output tensor to specifically explain certain transformation of your output, for example, you can take weighted sum of base pair resolution signal prediction to emphasize high signal region.
54
+
55
+ - Function `load_model` in utils.py
56
+ - Take care of the model initialization and load saved parameters in `load_model`, return the model instance.
57
+ > NOTE: you need to use your own model class definition in models.py, as we need the functions defined in step 1.
58
+
59
+ - Function `seq_transform_fn` in utils.py
60
+ - By default the dataloader provides one hot coded DNA array of shape (batch_size, 4, len), coded in the order [A, C, G, T], if your model takes a different kind of input, modify `seq_transform_fn` to transform the input
61
+
62
+ - Function `chrom_transform_fn` in utils.py
63
+ - By default the dataloader provides signal array from bigwig files of shape (batch_size, # bigwigs, len), if your model takes a different kind of chromatin input, modify `chrom_transform_fn` to transform the input, if your model is sequence only, leave it to return None.
64
+
65
+
66
+ 2. Compute CAVs on your model, example command:
67
+
68
+ ```bash
69
+ srun -n1 -c8 --gres=gpu:1 --mem=128G python scripts/run_tcav_sgd_pca.py \
70
+ cavs_test 1024 data/hg19.fa data/hg19.fa.fai \
71
+ --meme-motifs data/motif-clustering-v2.1beta_consensus_pwms.test.meme \
72
+ --bed-chrom-concepts data/ENCODE_DNase_peaks.bed
73
+ ```
74
+
75
+ 3. Then compute the layer attributions, example command:
76
+
77
+ ```bash
78
+ srun -n1 -c8 --gres=gpu:1 --mem=128G \
79
+ python scripts/compute_layer_attrs_only.py cavs_test/tpcav_model.pt \
80
+ data/ChIPseq.H1-hESC.MAX.conservative.all.shuf1k.narrowPeak \
81
+ 1024 data/hg19.fa data/hg19.fa.fai cavs_test/test
82
+ ```
83
+
84
+ 4. Run the jupyter notebook to generate a summary of your results
85
+
86
+ ```bash
87
+ papermill -f scripts/compute_tcav_v2_pwm.example.yaml scripts/compute_tcav_v2_pwm.py.ipynb cavs_test/tcav_report.py.ipynb
88
+ ```
89
+
tpcav-0.1.0/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # TPCAV (Testing with PCA projected Concept Activation Vectors)
2
+
3
+ Analysis pipeline for TPCAV
4
+
5
+ ## Dependencies
6
+
7
+ You can use your own environment for the model; in addition, you need to install the following packages:
8
+
9
+ - captum 0.7
10
+ - seqchromloader 0.8.5
11
+ - scikit-learn 1.5.2
12
+
13
+ ## Workflow
14
+
15
+ 1. Since not every saved pytorch model stores the computation graph, you need to manually add functions to let the script know how to get the activations of the intermediate layer and how to proceed from there.
16
+
17
+ There are 3 places you need to insert your own code.
18
+
19
+ - Model class definition in models.py
20
+ - Please first copy your class definition into `Model_Class` in the script, it already has several pre-defined class functions, you need to fill in the following two functions:
21
+ - `forward_until_select_layer`: this is the function that takes your model input and forwards it until the layer you want to compute the TPCAV score on
22
+ - `resume_forward_from_select_layer`: this is the function that starts from the activations of your selected layer and forwards all the way until the end
23
+ - There are also functions necessary for TPCAV computation, don't change them:
24
+ - `forward_from_start`: this function calls `forward_until_select_layer` and `resume_forward_from_select_layer` to do a full forward pass
25
+ - `forward_from_projected_and_residual`: this function takes the PCA projected activations and unexplained residual to do the forward pass
26
+ - `project_avs_to_pca`: this function takes care of the PCA projection
27
+
28
+ > NOTE: you can modify your final output tensor to specifically explain certain transformation of your output, for example, you can take weighted sum of base pair resolution signal prediction to emphasize high signal region.
29
+
30
+ - Function `load_model` in utils.py
31
+ - Take care of the model initialization and load saved parameters in `load_model`, return the model instance.
32
+ > NOTE: you need to use your own model class definition in models.py, as we need the functions defined in step 1.
33
+
34
+ - Function `seq_transform_fn` in utils.py
35
+ - By default the dataloader provides one hot coded DNA array of shape (batch_size, 4, len), coded in the order [A, C, G, T], if your model takes a different kind of input, modify `seq_transform_fn` to transform the input
36
+
37
+ - Function `chrom_transform_fn` in utils.py
38
+ - By default the dataloader provides signal array from bigwig files of shape (batch_size, # bigwigs, len), if your model takes a different kind of chromatin input, modify `chrom_transform_fn` to transform the input, if your model is sequence only, leave it to return None.
39
+
40
+
41
+ 2. Compute CAVs on your model, example command:
42
+
43
+ ```bash
44
+ srun -n1 -c8 --gres=gpu:1 --mem=128G python scripts/run_tcav_sgd_pca.py \
45
+ cavs_test 1024 data/hg19.fa data/hg19.fa.fai \
46
+ --meme-motifs data/motif-clustering-v2.1beta_consensus_pwms.test.meme \
47
+ --bed-chrom-concepts data/ENCODE_DNase_peaks.bed
48
+ ```
49
+
50
+ 3. Then compute the layer attributions, example command:
51
+
52
+ ```bash
53
+ srun -n1 -c8 --gres=gpu:1 --mem=128G \
54
+ python scripts/compute_layer_attrs_only.py cavs_test/tpcav_model.pt \
55
+ data/ChIPseq.H1-hESC.MAX.conservative.all.shuf1k.narrowPeak \
56
+ 1024 data/hg19.fa data/hg19.fa.fai cavs_test/test
57
+ ```
58
+
59
+ 4. Run the jupyter notebook to generate a summary of your results
60
+
61
+ ```bash
62
+ papermill -f scripts/compute_tcav_v2_pwm.example.yaml scripts/compute_tcav_v2_pwm.py.ipynb cavs_test/tcav_report.py.ipynb
63
+ ```
64
+
@@ -0,0 +1,33 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tpcav"
7
+ version = "0.1.0"
8
+ description = "Testing with PCA projected Concept Activation Vectors"
9
+ authors = [{name = "Jianyu Yang", email = "yztxwd@gmail.com"},]
10
+ readme = "README.md"
11
+ requires-python = ">=3.8"
12
+ dependencies = [
13
+ "torch",
14
+ "pandas",
15
+ "numpy",
16
+ "seqchromloader",
17
+ "deeplift",
18
+ "pyfaidx",
19
+ "pybedtools",
20
+ "captum",
21
+ "scikit-learn",
22
+ "biopython",
23
+ "seaborn",
24
+ "matplotlib",
25
+ ]
26
+ license = "MIT AND (Apache-2.0 OR BSD-2-Clause)"
27
+ keywords = ["interpretation", "attribution", "concept", "genomics", "deep learning"]
28
+
29
+ [project.urls]
30
+ Homepage = "https://github.com/seqcode/TPCAV"
31
+
32
+ [tool.setuptools.packages.find]
33
+ exclude = ["data", "model", "node_modules", "test", "scripts"]
tpcav-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,229 @@
1
+ import unittest
2
+ from functools import partial
3
+ from pathlib import Path
4
+
5
+ import torch
6
+ from Bio import motifs as Bio_motifs
7
+ from captum.attr import DeepLift
8
+
9
+ from tpcav import helper
10
+ from tpcav.cavs import CavTrainer
11
+ from tpcav.concepts import ConceptBuilder
12
+ from tpcav.tpcav_model import TPCAV, _abs_attribution_func
13
+
14
+
15
+ class DummyModelSeq(torch.nn.Module):
16
+ def __init__(self):
17
+ super().__init__()
18
+ self.layer1 = torch.nn.Linear(1024, 1)
19
+ self.layer2 = torch.nn.Linear(4, 1)
20
+
21
+ def forward(self, seq):
22
+ y_hat = self.layer1(seq)
23
+ y_hat = y_hat.squeeze(-1)
24
+ y_hat = self.layer2(y_hat)
25
+ return y_hat
26
+
27
+ def foward_from_layer1(self, y_hat):
28
+ y_hat = y_hat.squeeze(-1)
29
+ y_hat = self.layer2(y_hat)
30
+ return y_hat
31
+
32
+
33
+ class DummyModelSeqChrom(torch.nn.Module):
34
+ def __init__(self):
35
+ super().__init__()
36
+ self.layer1 = torch.nn.Linear(1024, 1)
37
+ self.layer2 = torch.nn.Linear(4, 1)
38
+
39
+ def forward(self, seq, chrom):
40
+ y_hat = self.layer1(seq)
41
+ y_hat = y_hat.squeeze(-1)
42
+ y_hat = self.layer2(y_hat)
43
+ return y_hat
44
+
45
+
46
+ def transform_fasta_to_one_hot_seq(seq, chrom):
47
+ return (helper.fasta_to_one_hot_sequences(seq),)
48
+
49
+
50
+ class CavTrainerIntegrationTest(unittest.TestCase):
51
+
52
+ def test_motif_concepts(self):
53
+ motif_path = Path("data") / "motif-clustering-v2.1beta_consensus_pwms.test.meme"
54
+ self.assertTrue(motif_path.exists(), "Motif file is missing")
55
+
56
+ builder = ConceptBuilder(
57
+ genome_fasta="data/hg38.analysisSet.fa",
58
+ genome_size_file="data/hg38.analysisSet.fa.fai",
59
+ input_window_length=1024,
60
+ bws=None,
61
+ num_motifs=16,
62
+ include_reverse_complement=True,
63
+ min_samples=1000,
64
+ batch_size=8,
65
+ )
66
+
67
+ builder.build_control()
68
+
69
+ builder.add_meme_motif_concepts(str(motif_path))
70
+
71
+ # load motifs
72
+ motifs = Bio_motifs.parse(open(motif_path), fmt="minimal")
73
+
74
+ for motif in motifs:
75
+ motif_name = motif.name.replace("/", "-")
76
+
77
+ concept = None
78
+ for c in builder.concepts:
79
+ if c.name == motif_name:
80
+ concept = c
81
+ break
82
+
83
+ self.assertIsNotNone(concept)
84
+
85
+ seq, chrom = next(iter(concept.data_iter))
86
+
87
+ matches = list(motif.pssm.search(seq[0], threshold=2.0))
88
+
89
+ self.assertGreaterEqual(
90
+ len(matches),
91
+ 16,
92
+ f"Motif concept {motif_name} has insufficient matches {matches}",
93
+ )
94
+
95
+ control_seq, _ = next(iter(builder.control_concepts[0].data_iter))
96
+
97
+ control_matches = list(motif.pssm.search(control_seq[0], threshold=2.0))
98
+
99
+ self.assertGreater(
100
+ len(matches),
101
+ len(control_matches),
102
+ f"Control concept has more motif matches than Motif concept, motif concept: {len(matches)}, control concept: {len(control_matches)}",
103
+ )
104
+
105
+ def test_all(self):
106
+
107
+ motif_path = Path("data") / "motif-clustering-v2.1beta_consensus_pwms.test.meme"
108
+ self.assertTrue(motif_path.exists(), "Motif file is missing")
109
+
110
+ builder = ConceptBuilder(
111
+ genome_fasta="data/hg38.analysisSet.fa",
112
+ genome_size_file="data/hg38.analysisSet.fa.fai",
113
+ input_window_length=1024,
114
+ bws=None,
115
+ num_motifs=12,
116
+ include_reverse_complement=True,
117
+ min_samples=1000,
118
+ batch_size=8,
119
+ )
120
+
121
+ builder.build_control()
122
+
123
+ builder.add_meme_motif_concepts(str(motif_path))
124
+
125
+ builder.apply_transform(transform_fasta_to_one_hot_seq)
126
+
127
+ batch = next(iter(builder.all_concepts()[0].data_iter))
128
+
129
+ self.assertTupleEqual(batch[0].shape, (builder.batch_size, 4, 1024))
130
+
131
+ tpcav_model = TPCAV(DummyModelSeq(), layer_name="layer1")
132
+ tpcav_model.fit_pca(
133
+ concepts=builder.all_concepts(),
134
+ num_samples_per_concept=10,
135
+ num_pc="full",
136
+ )
137
+ torch.save(tpcav_model, "data/tmp_tpcav_model.pt")
138
+
139
+ cav_trainer = CavTrainer(tpcav_model, penalty="l2")
140
+ cav_trainer.set_control(builder.control_concepts[0], num_samples=100)
141
+
142
+ cav_trainer.train_concepts(
143
+ builder.concepts, 100, output_dir="data/cavs/", num_processes=2
144
+ )
145
+ torch.save(cav_trainer, "data/tmp_cav_trainer.pt")
146
+
147
+ random_regions_1 = helper.random_regions_dataframe(
148
+ "data/hg38.analysisSet.fa.fai", 1024, 100, seed=1
149
+ )
150
+ random_regions_2 = helper.random_regions_dataframe(
151
+ "data/hg38.analysisSet.fa.fai", 1024, 100, seed=2
152
+ )
153
+
154
+ def pack_data_iters(df):
155
+ seq_fasta_iter = helper.dataframe_to_fasta_iter(
156
+ df, "data/hg38.analysisSet.fa", batch_size=8
157
+ )
158
+ seq_one_hot_iter = (
159
+ helper.fasta_to_one_hot_sequences(seq_fasta)
160
+ for seq_fasta in seq_fasta_iter
161
+ )
162
+ chrom_iter = helper.dataframe_to_chrom_tracks_iter(df, None, batch_size=8)
163
+ return zip(
164
+ seq_one_hot_iter,
165
+ )
166
+
167
+ attributions = tpcav_model.layer_attributions(
168
+ pack_data_iters(random_regions_1), pack_data_iters(random_regions_2)
169
+ )["attributions"]
170
+
171
+ cav_trainer.tpcav_score("AC0001:GATA-PROP:GATA", attributions)
172
+
173
+ cav_trainer.plot_cavs_similaritiy_heatmap(attributions)
174
+
175
+ input_attrs = tpcav_model.input_attributions(
176
+ pack_data_iters(random_regions_1),
177
+ pack_data_iters(random_regions_2),
178
+ multiply_by_inputs=True,
179
+ cavs_list=[
180
+ cav_trainer.cav_weights["AC0001:GATA-PROP:GATA"],
181
+ ],
182
+ )
183
+
184
+ # compute layer attributions using the old way
185
+ random1_avs = []
186
+ random2_avs = []
187
+ for inputs in pack_data_iters(random_regions_1):
188
+ av = tpcav_model._layer_output(*[i.to(tpcav_model.device) for i in inputs])
189
+ random1_avs.append(av.detach().cpu())
190
+ for inputs in pack_data_iters(random_regions_2):
191
+ av = tpcav_model._layer_output(*[i.to(tpcav_model.device) for i in inputs])
192
+ random2_avs.append(av.detach().cpu())
193
+ random1_avs = torch.cat(random1_avs, dim=0)
194
+ random2_avs = torch.cat(random2_avs, dim=0)
195
+
196
+ random1_avs_residual, random1_avs_projected = tpcav_model.project_activations(
197
+ random1_avs
198
+ )
199
+ random2_avs_residual, random2_avs_projected = tpcav_model.project_activations(
200
+ random2_avs
201
+ )
202
+
203
+ def forward_from_layer_1_embeddings(tm, avs_residual, avs_projected):
204
+ y_hat = tm.embedding_to_layer_activation(avs_residual, avs_projected)
205
+ y_hat = tm.model.foward_from_layer1(y_hat)
206
+ return y_hat
207
+
208
+ tpcav_model.forward = partial(forward_from_layer_1_embeddings, tpcav_model)
209
+
210
+ dl = DeepLift(tpcav_model)
211
+ attributions_old = dl.attribute(
212
+ (
213
+ random1_avs_residual.to(tpcav_model.device),
214
+ random1_avs_projected.to(tpcav_model.device),
215
+ ),
216
+ baselines=(
217
+ random2_avs_residual.to(tpcav_model.device),
218
+ random2_avs_projected.to(tpcav_model.device),
219
+ ),
220
+ custom_attribution_func=_abs_attribution_func,
221
+ )
222
+ attr_residual, attr_projected = attributions_old
223
+ attributions_old = torch.cat((attr_projected, attr_residual), dim=1)
224
+
225
+ self.assertTrue(torch.allclose(attributions.cpu(), attributions_old.cpu()))
226
+
227
+
228
+ if __name__ == "__main__":
229
+ unittest.main()
@@ -0,0 +1,39 @@
1
+ """
2
+ Lightweight, reusable TCAV utilities built from the repository scripts.
3
+
4
+ The package keeps existing scripts untouched while offering programmatic
5
+ access to concept construction and PCA/attribution workflows.
6
+ """
7
+
8
+ import logging
9
+
10
+ # Set the logging level to INFO
11
+ logging.basicConfig(level=logging.INFO)
12
+
13
+ from .cavs import CavTrainer
14
+ from .concepts import ConceptBuilder
15
+ from .helper import (
16
+ bed_to_chrom_tracks_iter,
17
+ bed_to_fasta_iter,
18
+ dataframe_to_chrom_tracks_iter,
19
+ dataframe_to_fasta_iter,
20
+ dinuc_shuffle_sequences,
21
+ fasta_to_one_hot_sequences,
22
+ random_regions_dataframe,
23
+ )
24
+ from .logging_utils import set_verbose
25
+ from .tpcav_model import TPCAV
26
+
27
+ __all__ = [
28
+ "ConceptBuilder",
29
+ "CavTrainer",
30
+ "TPCAV",
31
+ "bed_to_fasta_iter",
32
+ "dataframe_to_fasta_iter",
33
+ "bed_to_chrom_tracks_iter",
34
+ "dataframe_to_chrom_tracks_iter",
35
+ "fasta_to_one_hot_sequences",
36
+ "random_regions_dataframe",
37
+ "dinuc_shuffle_sequences",
38
+ "set_verbose",
39
+ ]