tpcav 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tpcav-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 seqcode
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
tpcav-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.4
2
+ Name: tpcav
3
+ Version: 0.1.0
4
+ Summary: Testing with PCA projected Concept Activation Vectors
5
+ Author-email: Jianyu Yang <yztxwd@gmail.com>
6
+ License-Expression: MIT AND (Apache-2.0 OR BSD-2-Clause)
7
+ Project-URL: Homepage, https://github.com/seqcode/TPCAV
8
+ Keywords: interpretation,attribution,concept,genomics,deep learning
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: torch
13
+ Requires-Dist: pandas
14
+ Requires-Dist: numpy
15
+ Requires-Dist: seqchromloader
16
+ Requires-Dist: deeplift
17
+ Requires-Dist: pyfaidx
18
+ Requires-Dist: pybedtools
19
+ Requires-Dist: captum
20
+ Requires-Dist: scikit-learn
21
+ Requires-Dist: biopython
22
+ Requires-Dist: seaborn
23
+ Requires-Dist: matplotlib
24
+ Dynamic: license-file
25
+
26
+ # TPCAV (Testing with PCA projected Concept Activation Vectors)
27
+
28
+ Analysis pipeline for TPCAV
29
+
30
+ ## Dependencies
31
+
32
+ You can use your own environment for the model; in addition, you need to install the following packages:
33
+
34
+ - captum 0.7
35
+ - seqchromloader 0.8.5
36
+ - scikit-learn 1.5.2
37
+
38
+ ## Workflow
39
+
40
+ 1. Since not every saved pytorch model stores the computation graph, you need to manually add functions to let the script know how to get the activations of the intermediate layer and how to proceed from there.
41
+
42
+ There are 3 places you need to insert your own code.
43
+
44
+ - Model class definition in models.py
45
+ - Please first copy your class definition into `Model_Class` in the script, it already has several pre-defined class functions, you need to fill in the following two functions:
46
+ - `forward_until_select_layer`: this is the function that takes your model input and forwards it until the layer you want to compute the TPCAV score on
47
+ - `resume_forward_from_select_layer`: this is the function that starts from the activations of your selected layer and forwards all the way until the end
48
+ - There are also functions necessary for TPCAV computation, don't change them:
49
+ - `forward_from_start`: this function calls `forward_until_select_layer` and `resume_forward_from_select_layer` to do a full forward pass
50
+ - `forward_from_projected_and_residual`: this function takes the PCA projected activations and unexplained residual to do the forward pass
51
+ - `project_avs_to_pca`: this function takes care of the PCA projection
52
+
53
+ > NOTE: you can modify your final output tensor to specifically explain certain transformation of your output, for example, you can take weighted sum of base pair resolution signal prediction to emphasize high signal region.
54
+
55
+ - Function `load_model` in utils.py
56
+ - Take care of the model initialization and load saved parameters in `load_model`, return the model instance.
57
+ > NOTE: you need to use your own model class definition in models.py, as we need the functions defined in step 1.
58
+
59
+ - Function `seq_transform_fn` in utils.py
60
+ - By default the dataloader provides one hot coded DNA array of shape (batch_size, 4, len), coded in the order [A, C, G, T], if your model takes a different kind of input, modify `seq_transform_fn` to transform the input
61
+
62
+ - Function `chrom_transform_fn` in utils.py
63
+ - By default the dataloader provides signal array from bigwig files of shape (batch_size, # bigwigs, len), if your model takes a different kind of chromatin input, modify `chrom_transform_fn` to transform the input, if your model is sequence only, leave it to return None.
64
+
65
+
66
+ 2. Compute CAVs on your model, example command:
67
+
68
+ ```bash
69
+ srun -n1 -c8 --gres=gpu:1 --mem=128G python scripts/run_tcav_sgd_pca.py \
70
+ cavs_test 1024 data/hg19.fa data/hg19.fa.fai \
71
+ --meme-motifs data/motif-clustering-v2.1beta_consensus_pwms.test.meme \
72
+ --bed-chrom-concepts data/ENCODE_DNase_peaks.bed
73
+ ```
74
+
75
+ 3. Then compute the layer attributions, example command:
76
+
77
+ ```bash
78
+ srun -n1 -c8 --gres=gpu:1 --mem=128G \
79
+ python scripts/compute_layer_attrs_only.py cavs_test/tpcav_model.pt \
80
+ data/ChIPseq.H1-hESC.MAX.conservative.all.shuf1k.narrowPeak \
81
+ 1024 data/hg19.fa data/hg19.fa.fai cavs_test/test
82
+ ```
83
+
84
+ 4. Run the jupyter notebook to generate a summary of your results
85
+
86
+ ```bash
87
+ papermill -f scripts/compute_tcav_v2_pwm.example.yaml scripts/compute_tcav_v2_pwm.py.ipynb cavs_test/tcav_report.py.ipynb
88
+ ```
89
+
tpcav-0.1.0/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # TPCAV (Testing with PCA projected Concept Activation Vectors)
2
+
3
+ Analysis pipeline for TPCAV
4
+
5
+ ## Dependencies
6
+
7
+ You can use your own environment for the model; in addition, you need to install the following packages:
8
+
9
+ - captum 0.7
10
+ - seqchromloader 0.8.5
11
+ - scikit-learn 1.5.2
12
+
13
+ ## Workflow
14
+
15
+ 1. Since not every saved pytorch model stores the computation graph, you need to manually add functions to let the script know how to get the activations of the intermediate layer and how to proceed from there.
16
+
17
+ There are 3 places you need to insert your own code.
18
+
19
+ - Model class definition in models.py
20
+ - Please first copy your class definition into `Model_Class` in the script, it already has several pre-defined class functions, you need to fill in the following two functions:
21
+ - `forward_until_select_layer`: this is the function that takes your model input and forwards it until the layer you want to compute the TPCAV score on
22
+ - `resume_forward_from_select_layer`: this is the function that starts from the activations of your selected layer and forwards all the way until the end
23
+ - There are also functions necessary for TPCAV computation, don't change them:
24
+ - `forward_from_start`: this function calls `forward_until_select_layer` and `resume_forward_from_select_layer` to do a full forward pass
25
+ - `forward_from_projected_and_residual`: this function takes the PCA projected activations and unexplained residual to do the forward pass
26
+ - `project_avs_to_pca`: this function takes care of the PCA projection
27
+
28
+ > NOTE: you can modify your final output tensor to specifically explain certain transformation of your output, for example, you can take weighted sum of base pair resolution signal prediction to emphasize high signal region.
29
+
30
+ - Function `load_model` in utils.py
31
+ - Take care of the model initialization and load saved parameters in `load_model`, return the model instance.
32
+ > NOTE: you need to use your own model class definition in models.py, as we need the functions defined in step 1.
33
+
34
+ - Function `seq_transform_fn` in utils.py
35
+ - By default the dataloader provides one hot coded DNA array of shape (batch_size, 4, len), coded in the order [A, C, G, T], if your model takes a different kind of input, modify `seq_transform_fn` to transform the input
36
+
37
+ - Function `chrom_transform_fn` in utils.py
38
+ - By default the dataloader provides signal array from bigwig files of shape (batch_size, # bigwigs, len), if your model takes a different kind of chromatin input, modify `chrom_transform_fn` to transform the input, if your model is sequence only, leave it to return None.
39
+
40
+
41
+ 2. Compute CAVs on your model, example command:
42
+
43
+ ```bash
44
+ srun -n1 -c8 --gres=gpu:1 --mem=128G python scripts/run_tcav_sgd_pca.py \
45
+ cavs_test 1024 data/hg19.fa data/hg19.fa.fai \
46
+ --meme-motifs data/motif-clustering-v2.1beta_consensus_pwms.test.meme \
47
+ --bed-chrom-concepts data/ENCODE_DNase_peaks.bed
48
+ ```
49
+
50
+ 3. Then compute the layer attributions, example command:
51
+
52
+ ```bash
53
+ srun -n1 -c8 --gres=gpu:1 --mem=128G \
54
+ python scripts/compute_layer_attrs_only.py cavs_test/tpcav_model.pt \
55
+ data/ChIPseq.H1-hESC.MAX.conservative.all.shuf1k.narrowPeak \
56
+ 1024 data/hg19.fa data/hg19.fa.fai cavs_test/test
57
+ ```
58
+
59
+ 4. Run the jupyter notebook to generate a summary of your results
60
+
61
+ ```bash
62
+ papermill -f scripts/compute_tcav_v2_pwm.example.yaml scripts/compute_tcav_v2_pwm.py.ipynb cavs_test/tcav_report.py.ipynb
63
+ ```
64
+
@@ -0,0 +1,33 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tpcav"
7
+ version = "0.1.0"
8
+ description = "Testing with PCA projected Concept Activation Vectors"
9
+ authors = [{name = "Jianyu Yang", email = "yztxwd@gmail.com"},]
10
+ readme = "README.md"
11
+ requires-python = ">=3.8"
12
+ dependencies = [
13
+ "torch",
14
+ "pandas",
15
+ "numpy",
16
+ "seqchromloader",
17
+ "deeplift",
18
+ "pyfaidx",
19
+ "pybedtools",
20
+ "captum",
21
+ "scikit-learn",
22
+ "biopython",
23
+ "seaborn",
24
+ "matplotlib",
25
+ ]
26
+ license = "MIT AND (Apache-2.0 OR BSD-2-Clause)"
27
+ keywords = ["interpretation", "attribution", "concept", "genomics", "deep learning"]
28
+
29
+ [project.urls]
30
+ Homepage = "https://github.com/seqcode/TPCAV"
31
+
32
+ [tool.setuptools.packages.find]
33
+ exclude = ["data", "model", "node_modules", "test", "scripts"]
tpcav-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,229 @@
1
+ import unittest
2
+ from functools import partial
3
+ from pathlib import Path
4
+
5
+ import torch
6
+ from Bio import motifs as Bio_motifs
7
+ from captum.attr import DeepLift
8
+
9
+ from tpcav import helper
10
+ from tpcav.cavs import CavTrainer
11
+ from tpcav.concepts import ConceptBuilder
12
+ from tpcav.tpcav_model import TPCAV, _abs_attribution_func
13
+
14
+
15
+ class DummyModelSeq(torch.nn.Module):
16
+ def __init__(self):
17
+ super().__init__()
18
+ self.layer1 = torch.nn.Linear(1024, 1)
19
+ self.layer2 = torch.nn.Linear(4, 1)
20
+
21
+ def forward(self, seq):
22
+ y_hat = self.layer1(seq)
23
+ y_hat = y_hat.squeeze(-1)
24
+ y_hat = self.layer2(y_hat)
25
+ return y_hat
26
+
27
+ def foward_from_layer1(self, y_hat):
28
+ y_hat = y_hat.squeeze(-1)
29
+ y_hat = self.layer2(y_hat)
30
+ return y_hat
31
+
32
+
33
+ class DummyModelSeqChrom(torch.nn.Module):
34
+ def __init__(self):
35
+ super().__init__()
36
+ self.layer1 = torch.nn.Linear(1024, 1)
37
+ self.layer2 = torch.nn.Linear(4, 1)
38
+
39
+ def forward(self, seq, chrom):
40
+ y_hat = self.layer1(seq)
41
+ y_hat = y_hat.squeeze(-1)
42
+ y_hat = self.layer2(y_hat)
43
+ return y_hat
44
+
45
+
46
+ def transform_fasta_to_one_hot_seq(seq, chrom):
47
+ return (helper.fasta_to_one_hot_sequences(seq),)
48
+
49
+
50
+ class CavTrainerIntegrationTest(unittest.TestCase):
51
+
52
+ def test_motif_concepts(self):
53
+ motif_path = Path("data") / "motif-clustering-v2.1beta_consensus_pwms.test.meme"
54
+ self.assertTrue(motif_path.exists(), "Motif file is missing")
55
+
56
+ builder = ConceptBuilder(
57
+ genome_fasta="data/hg38.analysisSet.fa",
58
+ genome_size_file="data/hg38.analysisSet.fa.fai",
59
+ input_window_length=1024,
60
+ bws=None,
61
+ num_motifs=16,
62
+ include_reverse_complement=True,
63
+ min_samples=1000,
64
+ batch_size=8,
65
+ )
66
+
67
+ builder.build_control()
68
+
69
+ builder.add_meme_motif_concepts(str(motif_path))
70
+
71
+ # load motifs
72
+ motifs = Bio_motifs.parse(open(motif_path), fmt="minimal")
73
+
74
+ for motif in motifs:
75
+ motif_name = motif.name.replace("/", "-")
76
+
77
+ concept = None
78
+ for c in builder.concepts:
79
+ if c.name == motif_name:
80
+ concept = c
81
+ break
82
+
83
+ self.assertIsNotNone(concept)
84
+
85
+ seq, chrom = next(iter(concept.data_iter))
86
+
87
+ matches = list(motif.pssm.search(seq[0], threshold=2.0))
88
+
89
+ self.assertGreaterEqual(
90
+ len(matches),
91
+ 16,
92
+ f"Motif concept {motif_name} has insufficient matches {matches}",
93
+ )
94
+
95
+ control_seq, _ = next(iter(builder.control_concepts[0].data_iter))
96
+
97
+ control_matches = list(motif.pssm.search(control_seq[0], threshold=2.0))
98
+
99
+ self.assertGreater(
100
+ len(matches),
101
+ len(control_matches),
102
+ f"Control concept has more motif matches than Motif concept, motif concept: {len(matches)}, control concept: {len(control_matches)}",
103
+ )
104
+
105
+ def test_all(self):
106
+
107
+ motif_path = Path("data") / "motif-clustering-v2.1beta_consensus_pwms.test.meme"
108
+ self.assertTrue(motif_path.exists(), "Motif file is missing")
109
+
110
+ builder = ConceptBuilder(
111
+ genome_fasta="data/hg38.analysisSet.fa",
112
+ genome_size_file="data/hg38.analysisSet.fa.fai",
113
+ input_window_length=1024,
114
+ bws=None,
115
+ num_motifs=12,
116
+ include_reverse_complement=True,
117
+ min_samples=1000,
118
+ batch_size=8,
119
+ )
120
+
121
+ builder.build_control()
122
+
123
+ builder.add_meme_motif_concepts(str(motif_path))
124
+
125
+ builder.apply_transform(transform_fasta_to_one_hot_seq)
126
+
127
+ batch = next(iter(builder.all_concepts()[0].data_iter))
128
+
129
+ self.assertTupleEqual(batch[0].shape, (builder.batch_size, 4, 1024))
130
+
131
+ tpcav_model = TPCAV(DummyModelSeq(), layer_name="layer1")
132
+ tpcav_model.fit_pca(
133
+ concepts=builder.all_concepts(),
134
+ num_samples_per_concept=10,
135
+ num_pc="full",
136
+ )
137
+ torch.save(tpcav_model, "data/tmp_tpcav_model.pt")
138
+
139
+ cav_trainer = CavTrainer(tpcav_model, penalty="l2")
140
+ cav_trainer.set_control(builder.control_concepts[0], num_samples=100)
141
+
142
+ cav_trainer.train_concepts(
143
+ builder.concepts, 100, output_dir="data/cavs/", num_processes=2
144
+ )
145
+ torch.save(cav_trainer, "data/tmp_cav_trainer.pt")
146
+
147
+ random_regions_1 = helper.random_regions_dataframe(
148
+ "data/hg38.analysisSet.fa.fai", 1024, 100, seed=1
149
+ )
150
+ random_regions_2 = helper.random_regions_dataframe(
151
+ "data/hg38.analysisSet.fa.fai", 1024, 100, seed=2
152
+ )
153
+
154
+ def pack_data_iters(df):
155
+ seq_fasta_iter = helper.dataframe_to_fasta_iter(
156
+ df, "data/hg38.analysisSet.fa", batch_size=8
157
+ )
158
+ seq_one_hot_iter = (
159
+ helper.fasta_to_one_hot_sequences(seq_fasta)
160
+ for seq_fasta in seq_fasta_iter
161
+ )
162
+ chrom_iter = helper.dataframe_to_chrom_tracks_iter(df, None, batch_size=8)
163
+ return zip(
164
+ seq_one_hot_iter,
165
+ )
166
+
167
+ attributions = tpcav_model.layer_attributions(
168
+ pack_data_iters(random_regions_1), pack_data_iters(random_regions_2)
169
+ )["attributions"]
170
+
171
+ cav_trainer.tpcav_score("AC0001:GATA-PROP:GATA", attributions)
172
+
173
+ cav_trainer.plot_cavs_similaritiy_heatmap(attributions)
174
+
175
+ input_attrs = tpcav_model.input_attributions(
176
+ pack_data_iters(random_regions_1),
177
+ pack_data_iters(random_regions_2),
178
+ multiply_by_inputs=True,
179
+ cavs_list=[
180
+ cav_trainer.cav_weights["AC0001:GATA-PROP:GATA"],
181
+ ],
182
+ )
183
+
184
+ # compute layer attributions using the old way
185
+ random1_avs = []
186
+ random2_avs = []
187
+ for inputs in pack_data_iters(random_regions_1):
188
+ av = tpcav_model._layer_output(*[i.to(tpcav_model.device) for i in inputs])
189
+ random1_avs.append(av.detach().cpu())
190
+ for inputs in pack_data_iters(random_regions_2):
191
+ av = tpcav_model._layer_output(*[i.to(tpcav_model.device) for i in inputs])
192
+ random2_avs.append(av.detach().cpu())
193
+ random1_avs = torch.cat(random1_avs, dim=0)
194
+ random2_avs = torch.cat(random2_avs, dim=0)
195
+
196
+ random1_avs_residual, random1_avs_projected = tpcav_model.project_activations(
197
+ random1_avs
198
+ )
199
+ random2_avs_residual, random2_avs_projected = tpcav_model.project_activations(
200
+ random2_avs
201
+ )
202
+
203
+ def forward_from_layer_1_embeddings(tm, avs_residual, avs_projected):
204
+ y_hat = tm.embedding_to_layer_activation(avs_residual, avs_projected)
205
+ y_hat = tm.model.foward_from_layer1(y_hat)
206
+ return y_hat
207
+
208
+ tpcav_model.forward = partial(forward_from_layer_1_embeddings, tpcav_model)
209
+
210
+ dl = DeepLift(tpcav_model)
211
+ attributions_old = dl.attribute(
212
+ (
213
+ random1_avs_residual.to(tpcav_model.device),
214
+ random1_avs_projected.to(tpcav_model.device),
215
+ ),
216
+ baselines=(
217
+ random2_avs_residual.to(tpcav_model.device),
218
+ random2_avs_projected.to(tpcav_model.device),
219
+ ),
220
+ custom_attribution_func=_abs_attribution_func,
221
+ )
222
+ attr_residual, attr_projected = attributions_old
223
+ attributions_old = torch.cat((attr_projected, attr_residual), dim=1)
224
+
225
+ self.assertTrue(torch.allclose(attributions.cpu(), attributions_old.cpu()))
226
+
227
+
228
+ if __name__ == "__main__":
229
+ unittest.main()
@@ -0,0 +1,39 @@
1
+ """
2
+ Lightweight, reusable TCAV utilities built from the repository scripts.
3
+
4
+ The package keeps existing scripts untouched while offering programmatic
5
+ access to concept construction and PCA/attribution workflows.
6
+ """
7
+
8
+ import logging
9
+
10
+ # Set the logging level to INFO
11
+ logging.basicConfig(level=logging.INFO)
12
+
13
+ from .cavs import CavTrainer
14
+ from .concepts import ConceptBuilder
15
+ from .helper import (
16
+ bed_to_chrom_tracks_iter,
17
+ bed_to_fasta_iter,
18
+ dataframe_to_chrom_tracks_iter,
19
+ dataframe_to_fasta_iter,
20
+ dinuc_shuffle_sequences,
21
+ fasta_to_one_hot_sequences,
22
+ random_regions_dataframe,
23
+ )
24
+ from .logging_utils import set_verbose
25
+ from .tpcav_model import TPCAV
26
+
27
+ __all__ = [
28
+ "ConceptBuilder",
29
+ "CavTrainer",
30
+ "TPCAV",
31
+ "bed_to_fasta_iter",
32
+ "dataframe_to_fasta_iter",
33
+ "bed_to_chrom_tracks_iter",
34
+ "dataframe_to_chrom_tracks_iter",
35
+ "fasta_to_one_hot_sequences",
36
+ "random_regions_dataframe",
37
+ "dinuc_shuffle_sequences",
38
+ "set_verbose",
39
+ ]