tpcav 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tpcav/__init__.py +1 -1
- tpcav/cavs.py +190 -3
- tpcav/concepts.py +129 -116
- tpcav/helper.py +142 -5
- tpcav/tpcav_model.py +26 -21
- tpcav/utils.py +93 -0
- tpcav-0.2.1.dist-info/METADATA +91 -0
- tpcav-0.2.1.dist-info/RECORD +12 -0
- {tpcav-0.1.0.dist-info → tpcav-0.2.1.dist-info}/WHEEL +1 -1
- tpcav-0.1.0.dist-info/METADATA +0 -89
- tpcav-0.1.0.dist-info/RECORD +0 -12
- {tpcav-0.1.0.dist-info → tpcav-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {tpcav-0.1.0.dist-info → tpcav-0.2.1.dist-info}/top_level.txt +0 -0
tpcav/__init__.py
CHANGED
@@ -10,7 +10,7 @@ import logging
 # Set the logging level to INFO
 logging.basicConfig(level=logging.INFO)
 
-from .cavs import CavTrainer
+from .cavs import CavTrainer, run_tpcav
 from .concepts import ConceptBuilder
 from .helper import (
     bed_to_chrom_tracks_iter,
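With this release, `run_tpcav` is exported from the package root alongside the existing entry points:

```python
# The new one-stop entry point is importable directly from the package root.
from tpcav import CavTrainer, ConceptBuilder, run_tpcav
```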
tpcav/cavs.py
CHANGED
@@ -5,11 +5,15 @@ CAV training and attribution utilities built on TPCAV.
 
 import logging
 import multiprocessing
+from collections import defaultdict
+import os
 from pathlib import Path
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple, Dict
 
+from Bio import motifs
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 import seaborn as sns
 import torch
 from sklearn.linear_model import SGDClassifier
@@ -17,8 +21,11 @@ from sklearn.metrics import precision_recall_fscore_support
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.model_selection import GridSearchCV
 from torch.utils.data import DataLoader, TensorDataset, random_split
+from sklearn.linear_model import LinearRegression
 
-from
+from . import helper, utils
+from .concepts import ConceptBuilder
+from .tpcav_model import TPCAV
 
 logger = logging.getLogger(__name__)
 
@@ -246,6 +253,16 @@ class CavTrainer:
 
         return scores
 
+    def tpcav_score_all_concepts(self, attributions: torch.Tensor) -> dict:
+        """
+        Compute TCAV scores for all trained concepts.
+        """
+        scores_dict = {}
+        for concept_name in self.cav_weights.keys():
+            scores = self.tpcav_score(concept_name, attributions)
+            scores_dict[concept_name] = scores
+        return scores_dict
+
     def tpcav_score_binary_log_ratio(
         self, concept_name: str, attributions: torch.Tensor, pseudocount: float = 1.0
     ) -> float:
@@ -259,6 +276,20 @@ class CavTrainer:
 
         return np.log((pos_count + pseudocount) / (neg_count + pseudocount))
 
+    def tpcav_score_all_concepts_log_ratio(
+        self, attributions: torch.Tensor, pseudocount: float = 1.0
+    ) -> dict:
+        """
+        Compute TCAV log ratio scores for all trained concepts.
+        """
+        log_ratio_dict = {}
+        for concept_name in self.cav_weights.keys():
+            log_ratio = self.tpcav_score_binary_log_ratio(
+                concept_name, attributions, pseudocount
+            )
+            log_ratio_dict[concept_name] = log_ratio
+        return log_ratio_dict
+
     def plot_cavs_similaritiy_heatmap(
         self,
         attributions: torch.Tensor,
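A minimal usage sketch for the two batch-scoring helpers added above, assuming `trainer` is a `CavTrainer` with CAVs already fit and `attributions` comes from an earlier `TPCAV.layer_attributions` call (both names are illustrative):

```python
# `trainer` and `attributions` are assumed to exist from an earlier
# train_concepts(...) / layer_attributions(...) run.
scores = trainer.tpcav_score_all_concepts(attributions)
log_ratios = trainer.tpcav_score_all_concepts_log_ratio(attributions, pseudocount=1.0)

# Rank concepts by log ratio; positive values mean attributions align with
# the concept's CAV more often than they oppose it.
for name, log_ratio in sorted(log_ratios.items(), key=lambda kv: kv[1], reverse=True):
    print(name, log_ratio, scores[name])
```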
@@ -274,7 +305,7 @@ class CavTrainer:
         cavs_names_pass = []
         for cname in cavs_names:
             if self.cavs_fscores[cname] >= fscore_thresh:
-                cavs_pass.append(self.cav_weights[cname])
+                cavs_pass.append(self.cav_weights[cname].cpu().numpy())
                 cavs_names_pass.append(cname)
             else:
                 logger.info(
@@ -332,3 +363,159 @@ class CavTrainer:
         ax_log.set_title("TCAV log ratio")
 
         plt.savefig(output_path, dpi=300, bbox_inches="tight")
+
+def load_motifs_from_meme(motif_meme_file):
+    return {utils.clean_motif_name(m.name): m for m in motifs.parse(open(motif_meme_file), fmt="MINIMAL")}
+
+def compute_motif_auc_fscore(num_motif_insertions: List[int], cav_trainers: List[CavTrainer], meme_motif_file: str | None = None):
+    cavs_fscores_df = pd.DataFrame({nm: cav_trainer.cavs_fscores for nm, cav_trainer in zip(num_motif_insertions, cav_trainers)})
+    cavs_fscores_df['concept'] = list(cav_trainers[0].cavs_fscores.keys())
+
+    def compute_auc_fscore(row):
+        y = [row[nm] for nm in num_motif_insertions]
+        return np.trapz(y, num_motif_insertions) / (
+            num_motif_insertions[-1] - num_motif_insertions[0]
+        )
+
+    cavs_fscores_df["AUC_fscores"] = cavs_fscores_df.apply(compute_auc_fscore, axis=1)
+
+    # if a motif file is provided, fit a linear regression to remove the dependency of f-scores on information content and motif length
+    if meme_motif_file is not None:
+        motifs_dict = load_motifs_from_meme(meme_motif_file)
+        cavs_fscores_df['information_content'] = cavs_fscores_df.apply(lambda x: motifs_dict[x['concept']].relative_entropy.sum(), axis=1)
+        cavs_fscores_df['motif_len'] = cavs_fscores_df.apply(lambda x: len(motifs_dict[x['concept']].consensus), axis=1)
+
+        model = LinearRegression()
+        model.fit(cavs_fscores_df[['information_content', 'motif_len']].to_numpy(), cavs_fscores_df['AUC_fscores'].to_numpy()[:, np.newaxis])
+
+        y_pred = model.predict(cavs_fscores_df[['information_content', 'motif_len']].to_numpy())
+        residuals = cavs_fscores_df['AUC_fscores'].to_numpy() - y_pred.flatten()
+        cavs_fscores_df['AUC_fscores_residual'] = residuals
+
+        cavs_fscores_df.sort_values("AUC_fscores_residual", ascending=False, inplace=True)
+    else:
+        cavs_fscores_df.sort_values("AUC_fscores", ascending=False, inplace=True)
+
+    return cavs_fscores_df
+
+def run_tpcav(
+    model,
+    layer_name: str,
+    meme_motif_file: str,
+    genome_fasta: str,
+    num_motif_insertions: List[int] = [4, 8, 16],
+    bed_seq_file: Optional[str] = None,
+    bed_chrom_file: Optional[str] = None,
+    output_dir: str = "tpcav/",
+    num_samples_for_pca=10,
+    num_samples_for_cav=1000,
+    input_window_length=1024,
+    batch_size=8,
+    bws=None,
+    input_transform_func=helper.fasta_chrom_to_one_hot_seq,
+    p=4
+):
+    """
+    One-stop function to compute CAVs on motif concepts and bed concepts, and compute the AUC of motif-concept f-scores after correction.
+    """
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    output_path = Path(output_dir)
+    # create concept builders to generate concepts
+    ## motif concepts
+    motif_concepts_pairs = {}
+    motif_concept_builders = []
+    num_motif_insertions.sort()
+    for nm in num_motif_insertions:
+        builder = ConceptBuilder(
+            genome_fasta=genome_fasta,
+            input_window_length=input_window_length,
+            bws=bws,
+            num_motifs=nm,
+            include_reverse_complement=True,
+            min_samples=num_samples_for_cav,
+            batch_size=batch_size,
+        )
+        # use random regions as control
+        builder.build_control()
+        # use meme motif PWMs to build motif concepts, one concept per motif
+        concepts_pairs = builder.add_meme_motif_concepts(str(meme_motif_file))
+
+        # apply transform to convert fasta sequences to one-hot encoded sequences
+        builder.apply_transform(input_transform_func)
+
+        motif_concepts_pairs[nm] = concepts_pairs
+        motif_concept_builders.append(builder)
+
+    ## bed concepts (optional)
+    if bed_seq_file is not None or bed_chrom_file is not None:
+        bed_builder = ConceptBuilder(
+            genome_fasta=genome_fasta,
+            input_window_length=input_window_length,
+            bws=bws,
+            num_motifs=0,
+            include_reverse_complement=True,
+            min_samples=num_samples_for_cav,
+            batch_size=batch_size,
+        )
+        # use random regions as control
+        bed_builder.build_control()
+        if bed_seq_file is not None:
+            # build concepts from fasta sequences in the bed file
+            bed_builder.add_bed_sequence_concepts(bed_seq_file)
+        if bed_chrom_file is not None:
+            # build concepts from chromatin tracks in the bed file
+            bed_builder.add_bed_chrom_concepts(bed_chrom_file)
+        # apply transform to convert fasta sequences to one-hot encoded sequences
+        bed_builder.apply_transform(input_transform_func)
+    else:
+        bed_builder = None
+
+    # create the TPCAV model on top of the given model
+    tpcav_model = TPCAV(model, layer_name=layer_name)
+    # fit PCA on sampled activations of all concepts from the last builder (it has the most motif insertions)
+    tpcav_model.fit_pca(
+        concepts=motif_concept_builders[-1].all_concepts() + bed_builder.concepts if bed_builder is not None else motif_concept_builders[-1].all_concepts(),
+        num_samples_per_concept=num_samples_for_pca,
+        num_pc="full",
+    )
+    #torch.save(tpcav_model, output_path / "tpcav_model.pt")
+
+    # create trainers for computing CAVs
+    motif_cav_trainers = {}
+    for nm in num_motif_insertions:
+        cav_trainer = CavTrainer(tpcav_model, penalty="l2")
+        for motif_concept, permuted_concept in motif_concepts_pairs[nm]:
+            # set the control concept for CAV training
+            cav_trainer.set_control(
+                permuted_concept, num_samples=num_samples_for_cav
+            )
+            # train CAVs for all concepts
+            cav_trainer.train_concepts(
+                [motif_concept,],
+                num_samples_for_cav,
+                output_dir=str(output_path / f"cavs_{nm}_motifs/"),
+                num_processes=p,
+            )
+        motif_cav_trainers[nm] = cav_trainer
+    if bed_builder is not None:
+        bed_cav_trainer = CavTrainer(tpcav_model, penalty="l2")
+        bed_cav_trainer.set_control(
+            bed_builder.control_concepts[0], num_samples=num_samples_for_cav
+        )
+        bed_cav_trainer.train_concepts(
+            bed_builder.concepts,
+            num_samples_for_cav,
+            output_dir=str(output_path / "cavs_bed_concepts/"),
+            num_processes=p,
+        )
+    else:
+        bed_cav_trainer = None
+
+    if len(num_motif_insertions) > 1:
+        cavs_fscores_df = compute_motif_auc_fscore(num_motif_insertions, list(motif_cav_trainers.values()), meme_motif_file=meme_motif_file)
+    else:
+        cavs_fscores_df = None
+
+    return cavs_fscores_df, motif_cav_trainers, bed_cav_trainer
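The AUC in `compute_motif_auc_fscore` is a trapezoidal integral of f-scores over insertion counts, normalized by the x-range so a concept scoring 1.0 at every count gets an AUC of 1.0. A small worked example with made-up numbers:

```python
import numpy as np

num_motif_insertions = [4, 8, 16]   # x-axis: motif insertions per window
fscores = [0.2, 0.6, 0.9]           # hypothetical f-scores for one concept

# trapezoid area = (0.2+0.6)/2*4 + (0.6+0.9)/2*8 = 7.6; x-range = 16 - 4 = 12
auc = np.trapz(fscores, num_motif_insertions) / (
    num_motif_insertions[-1] - num_motif_insertions[0]
)
print(auc)  # ~0.633
```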
tpcav/concepts.py
CHANGED
@@ -1,9 +1,9 @@
 import logging
-from copy import deepcopy
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple
 
 import numpy as np
 import pandas as pd
+import pyfaidx
 import seqchromloader as scl
 import webdataset as wds
 from Bio import motifs as Bio_motifs
@@ -21,15 +21,15 @@ class _PairedLoader:
     def __init__(self, seq_dl: Iterable, chrom_dl: Iterable) -> None:
         self.seq_dl = seq_dl
         self.chrom_dl = chrom_dl
-        self.
+        self.apply_func_list = []
 
     def apply(self, apply_func):
-        self.apply_func
+        self.apply_func_list.append(apply_func)
 
     def __iter__(self):
         for inputs in zip(self.seq_dl, self.chrom_dl):
-
-            inputs =
+            for apply_func in self.apply_func_list:
+                inputs = apply_func(*inputs)
             yield inputs
 
 
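`_PairedLoader.apply` now accumulates transforms in a list and applies them in registration order on every batch; each transform receives the unpacked inputs and must return a tuple. A toy sketch with stand-in loaders, using the module-private class directly purely for illustration:

```python
seq_dl = [["ACGT", "TTGC"]]   # one batch of fasta strings
chrom_dl = [None]             # sequence-only model: no chromatin input

loader = _PairedLoader(seq_dl, chrom_dl)
# Each registered transform must return a tuple, since its result is
# re-unpacked into the next transform in the chain.
loader.apply(lambda seq, chrom: ([s.lower() for s in seq], chrom))

for batch in loader:
    print(batch)  # (['acgt', 'ttgc'], None)
```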
@@ -74,7 +74,6 @@ class ConceptBuilder:
     def __init__(
         self,
         genome_fasta: str,
-        genome_size_file: str,
         input_window_length: int = 1024,
         bws: Optional[List[str]] = None,
         batch_size: int = 8,
@@ -83,9 +82,13 @@ class ConceptBuilder:
         include_reverse_complement: bool = False,
         min_samples: int = 5000,
         rng_seed: int = 1001,
+        concept_name_suffix: str = "",
     ) -> None:
         self.genome_fasta = genome_fasta
-
+        pyfaidx.Fasta(
+            genome_fasta, build_index=True
+        )  # validate the genome fasta file and build an index if needed
+        self.genome_size_file = self.genome_fasta + ".fai"
         self.input_window_length = input_window_length
         self.bws = bws or []
         self.batch_size = batch_size
@@ -94,6 +97,7 @@ class ConceptBuilder:
         self.include_reverse_complement = include_reverse_complement
         self.min_samples = min_samples
         self.rng_seed = rng_seed
+        self.concept_name_suffix = concept_name_suffix
 
         self.control_regions: pd.DataFrame | None = None
         self.control_concepts: List[Concept] = []
@@ -116,7 +120,7 @@ class ConceptBuilder:
 
         concept = Concept(
             id=self._reserve_id(is_control=True),
-            name=name,
+            name=name + self.concept_name_suffix,
             data_iter=_PairedLoader(self._control_seq_dl(), self._control_chrom_dl()),
         )
         self.control_concepts = [concept]
@@ -142,144 +146,153 @@ class ConceptBuilder:
         return chrom_iter
 
     def add_custom_motif_concepts(
-        self, motif_table: str, control_regions: Optional[pd.DataFrame] = None
-    ) -> List[Concept]:
+        self, motif_table: str, control_regions: Optional[pd.DataFrame] = None, build_permute_control=True
+    ) -> List[Concept] | List[Tuple[Concept]]:
         """Add concepts from a tab-delimited motif table: motif_name<TAB>consensus."""
-        if control_regions is None:
-            if not self.control_concepts:
-                raise ValueError("Call build_control or pass control_regions first.")
-            control_regions = self.metadata.get("control_regions")
-        assert control_regions is not None
         df = pd.read_table(motif_table, names=["motif_name", "consensus_seq"])
-        added
+        added = []
         for motif_name in np.unique(df.motif_name):
+            motif_name = utils.clean_motif_name(motif_name)
             consensus = df.loc[df.motif_name == motif_name, "consensus_seq"].tolist()
             motifs = []
            for idx, cons in enumerate(consensus):
                 motif = utils.CustomMotif(f"{motif_name}_{idx}", cons)
                 motifs.append(motif)
-
-                motifs.append(motif.reverse_complement())
-            seq_dl = _construct_motif_concept_dataloader_from_control(
-                control_regions,
-                self.genome_fasta,
-                motifs=motifs,
-                num_motifs=self.num_motifs,
-                motif_mode="consensus",
-                batch_size=self.batch_size,
-                num_workers=self.num_workers,
-            )
-            concept = Concept(
-                id=self._reserve_id(),
-                name=motif_name,
-                data_iter=_PairedLoader(seq_dl, self._control_chrom_dl()),
-            )
+            concept = self.build_motif_concept(motifs, motif_name, control_regions=control_regions, motif_mode="consensus")
             self.concepts.append(concept)
-
+            # build the permuted control if specified
+            if build_permute_control:
+                motifs_permuted = [m.permute() for m in motifs]
+                concept_permuted = self.build_motif_concept(motifs_permuted, motif_name + "_perm", control_regions=control_regions, motif_mode="consensus")
+                self.control_concepts.append(concept_permuted)
+                added.append((concept, concept_permuted))
+            else:
+                added.append(concept)
         return added
 
     def add_meme_motif_concepts(
-        self, meme_file: str, control_regions: Optional[pd.DataFrame] = None
-    ) -> List[Concept]:
+        self, meme_file: str, control_regions: Optional[pd.DataFrame] = None, build_permute_control=True) -> List[Concept] | List[Tuple[Concept]]:
         """Add concepts from a MEME minimal-format motif file."""
+
+        added = []
+        with open(meme_file) as handle:
+            for motif in Bio_motifs.parse(handle, fmt="MINIMAL"):
+                motif_name = utils.clean_motif_name(motif.name)
+                concept = self.build_motif_concept([motif,], motif_name, control_regions=control_regions, motif_mode="pwm")
+                self.concepts.append(concept)
+                # build the permuted control if specified
+                if build_permute_control:
+                    motif_permuted = utils.PermutedPWMMotif(motif)
+                    concept_permuted = self.build_motif_concept([motif_permuted,], motif_name + "_perm", control_regions=control_regions, motif_mode="pwm")
+                    self.control_concepts.append(concept_permuted)
+                    added.append((concept, concept_permuted))
+                else:
+                    added.append(concept)
+        return added
+
+    def build_motif_concept(self, motifs, concept_name, control_regions=None, motif_mode="pwm"):
         if control_regions is None:
             if not self.control_concepts:
                 raise ValueError("Call build_control or pass control_regions first.")
             control_regions = self.metadata.get("control_regions")
         assert control_regions is not None
 
+        if self.include_reverse_complement:
+            motifs.extend([m.reverse_complement() for m in motifs])
+        seq_dl = _construct_motif_concept_dataloader_from_control(
+            control_regions,
+            self.genome_fasta,
+            motifs=motifs,
+            num_motifs=self.num_motifs,
+            motif_mode=motif_mode,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+        )
+        concept = Concept(
+            id=self._reserve_id(),
+            name=concept_name + self.concept_name_suffix,
+            data_iter=_PairedLoader(seq_dl, self._control_chrom_dl()),
+        )
+        return concept
+
+    def add_bed_sequence_concepts(self, bed_path: str) -> List[Concept]:
+        """Add concepts backed by BED sequences with concept_name in column 5."""
         added: List[Concept] = []
-
-
-
-
-
-
-
-            control_regions,
-            self.genome_fasta,
-            motifs=motifs,
-            num_motifs=self.num_motifs,
-            motif_mode="pwm",
-            batch_size=self.batch_size,
-            num_workers=self.num_workers,
-        )
-        concept = Concept(
-            id=self._reserve_id(),
-            name=motif_name,
-            data_iter=_PairedLoader(seq_dl, self._control_chrom_dl()),
-        )
-        self.concepts.append(concept)
-        added.append(concept)
+        bed_df = pd.read_table(
+            bed_path,
+            header=None,
+            usecols=[0, 1, 2, 3, 4],
+            names=["chrom", "start", "end", "strand", "concept_name"],
+        )
+        added.extend(self.add_dataframe_sequence_concepts(bed_df))
         return added
 
-    def
+    def add_dataframe_sequence_concepts(self, dataframe: pd.DataFrame) -> List[Concept]:
         """Add concepts backed by BED sequences with concept_name in column 5."""
+        dataframe = helper.center_dataframe_regions(dataframe, self.input_window_length)
         added: List[Concept] = []
-        for
-
-
-
-
-
-
-
-            concept_df = bed_df.loc[bed_df.concept_name == concept_name]
-            if len(concept_df) < self.min_samples:
-                logger.warning(
-                    "Concept %s has %s samples, fewer than min_samples=%s; skipping",
-                    concept_name,
-                    len(concept_df),
-                    self.min_samples,
-                )
-                continue
-            seq_fasta_iter = helper.dataframe_to_fasta_iter(
-                concept_df.sample(n=self.min_samples, random_state=self.rng_seed),
-                self.genome_fasta,
-                batch_size=self.batch_size,
+        for concept_name in dataframe.concept_name.unique():
+            concept_df = dataframe.loc[dataframe.concept_name == concept_name]
+            if len(concept_df) < self.min_samples:
+                logger.warning(
+                    "Concept %s has %s samples, fewer than min_samples=%s; skipping",
+                    concept_name,
+                    len(concept_df),
+                    self.min_samples,
                 )
-
-
-
-
-
-
-
+                continue
+            seq_fasta_iter = helper.dataframe_to_fasta_iter(
+                concept_df.sample(n=self.min_samples, random_state=self.rng_seed),
+                self.genome_fasta,
+                batch_size=self.batch_size,
+            )
+            concept = Concept(
+                id=self._reserve_id(),
+                name=concept_name + self.concept_name_suffix,
+                data_iter=_PairedLoader(seq_fasta_iter, self._control_chrom_dl()),
+            )
+            self.concepts.append(concept)
+            added.append(concept)
         return added
 
-    def add_bed_chrom_concepts(self,
+    def add_bed_chrom_concepts(self, bed_path: str) -> List[Concept]:
         """Add concepts backed by chromatin signal bigwigs and BED coordinates."""
         added: List[Concept] = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            self.
-            batch_size=self.batch_size,
-        )
-        concept = Concept(
-            id=self._reserve_id(),
-            name=concept_name,
-            data_iter=_PairedLoader(self._control_seq_dl(), chrom_dl),
+        bed_df = pd.read_table(
+            bed_path,
+            header=None,
+            usecols=[0, 1, 2, 3, 4],
+            names=["chrom", "start", "end", "strand", "concept_name"],
+        )
+        added.extend(self.add_dataframe_chrom_concepts(bed_df))
+        return added
+
+    def add_dataframe_chrom_concepts(self, dataframe) -> List[Concept]:
+        """Add concepts backed by chromatin signal bigwigs and BED coordinates."""
+        dataframe = helper.center_dataframe_regions(dataframe, self.input_window_length)
+        added: List[Concept] = []
+        for concept_name in dataframe.concept_name.unique():
+            concept_df = dataframe.loc[dataframe.concept_name == concept_name]
+            if len(concept_df) < self.min_samples:
+                logger.warning(
+                    "Concept %s has %s samples, fewer than min_samples=%s; skipping",
+                    concept_name,
+                    len(concept_df),
+                    self.min_samples,
                 )
-
-
+                continue
+            chrom_dl = helper.dataframe_to_chrom_tracks_iter(
+                concept_df.sample(n=self.min_samples, random_state=self.rng_seed),
+                self.bws,
+                batch_size=self.batch_size,
+            )
+            concept = Concept(
+                id=self._reserve_id(),
+                name=concept_name + self.concept_name_suffix,
+                data_iter=_PairedLoader(self._control_seq_dl(), chrom_dl),
+            )
+            self.concepts.append(concept)
+            added.append(concept)
         return added
 
     def all_concepts(self) -> List[Concept]:
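With the refactor above, the motif-concept builders return `(concept, permuted_control)` pairs by default. A hedged sketch, assuming `builder` is a `ConceptBuilder` on which `build_control()` has already been run and the motif path is illustrative:

```python
pairs = builder.add_meme_motif_concepts("data/motifs.meme")
for concept, permuted in pairs:
    # e.g. a motif concept "CTCF" paired with its shuffled-PWM control "CTCF_perm"
    print(concept.name, "vs", permuted.name)

# Pass build_permute_control=False to get plain concepts instead of pairs.
concepts = builder.add_meme_motif_concepts(
    "data/motifs.meme", build_permute_control=False
)
```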
tpcav/helper.py
CHANGED
@@ -5,6 +5,11 @@ Lightweight data loading helpers for sequences and chromatin tracks.
 
 from typing import Iterable, List, Optional
 
+import itertools
+import logging
+import pyBigWig
+import re
+import sys
 import numpy as np
 import pandas as pd
 import seqchromloader as scl
@@ -13,17 +18,31 @@ from deeplift.dinuc_shuffle import dinuc_shuffle
 from pyfaidx import Fasta
 from seqchromloader.utils import dna2OneHot, extract_bw
 
+logger = logging.getLogger(__name__)
+
 
 def load_bed_and_center(bed_file: str, window: int) -> pd.DataFrame:
     """
     Load a BED file and center the regions to a fixed window size.
     """
     bed_df = pd.read_table(bed_file, usecols=[0, 1, 2], names=["chrom", "start", "end"])
-
-
-
-
-
+    return center_dataframe_regions(bed_df, window)
+
+
+def center_dataframe_regions(df: pd.DataFrame, window: int) -> pd.DataFrame:
+    """
+    Center the regions in a DataFrame to a fixed window size, keeping the other columns. Puts chrom, start, end as the first 3 columns.
+    """
+    df = df.copy()
+    df["center"] = ((df["start"] + df["end"]) // 2).astype(int)
+    df["start"] = df["center"] - (window // 2)
+    df["end"] = df["start"] + window
+    df = df.drop(columns=["center"])
+    cols = ["chrom", "start", "end"] + [
+        col for col in df.columns if col not in ["chrom", "start", "end"]
+    ]
+    df = df[cols]
+    return df
 
 
 def bed_to_fasta_iter(
@@ -46,6 +65,10 @@ def dataframe_to_fasta_iter(
     fasta_seqs = []
     for row in df.itertuples(index=False):
         seq = str(genome[row.chrom][row.start : row.end]).upper()
+        if len(seq) != (row.end - row.start):
+            raise ValueError(
+                f"Extracted fasta sequence length mismatch with region coordinate length {row.chrom}:{row.start}-{row.end}"
+            )
         fasta_seqs.append(seq)
         if len(fasta_seqs) == batch_size:
             yield fasta_seqs
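`center_dataframe_regions` recenters every interval on its midpoint while keeping extra columns; a worked example with values computed by hand:

```python
import pandas as pd

df = pd.DataFrame(
    {"chrom": ["chr1"], "start": [100], "end": [300], "concept_name": ["demo"]}
)
centered = center_dataframe_regions(df, window=1024)
# midpoint (100 + 300) // 2 = 200, so start = 200 - 512 = -312, end = -312 + 1024 = 712
print(centered[["start", "end"]].iloc[0].tolist())  # [-312, 712]
# Regions near chromosome edges can go negative and should be filtered upstream.
```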
@@ -163,3 +186,117 @@ def dinuc_shuffle_sequences(
         )
         results.append(shuffles)
     return results
+
+
+def fasta_chrom_to_one_hot_seq(seq, chrom):
+    return (fasta_to_one_hot_sequences(seq),)
+
+def write_attrs_to_bw(arrs, regions, genome_info, bigwig_fn, smooth=False):
+    """
+    Write the attributions into bigwig files.
+    The shape of arrs should be (# samples, length).
+    Note: if regions overlap each other, only base pairs not covered by previous regions are assigned the current region's attribution score.
+    """
+    # write the header into the bigwig
+    bw = pyBigWig.open(bigwig_fn, "w")
+    heads = []
+    with open(genome_info, "r") as f:
+        for line in f:
+            chrom, length = line.strip().split("\t")[:2]
+            heads.append((chrom, int(length)))
+    heads = sorted(heads, key=lambda x: x[0])
+    bw.addHeader(heads)
+
+    # sort regions and arrs
+    assert len(regions) == len(arrs)
+
+    def get_key(x):
+        chrom, start, end = re.split("[:-]", regions[x])
+        start = int(start)
+        return chrom, start
+
+    idx_sort = sorted(range(len(regions)), key=get_key)
+    regions = [regions[i] for i in idx_sort]
+    arrs = arrs[idx_sort]
+    # construct iterables
+    it = zip(arrs, regions)
+    it = itertools.chain(
+        it, zip([np.array([-1000])], ["chrNone:10-100"])
+    )  # add a pseudo region to make sure the last entry is added to the bw file
+    arr, lastRegion = next(it)
+    lastChrom, start, end = re.split(r"[:-]", lastRegion)
+
+    start = int(start)
+    end = int(end)
+    # extend coordinates if the attribution arr is longer than the interval
+    if end - start < len(arr):
+        logger.warning(
+            "Interval length is smaller than attribution array length, expanding it!"
+        )
+        diff = len(arr) - (end - start)
+        if diff % 2 != 0:
+            raise Exception(
+                "The difference between attribution array length and interval length is not even! Can't do symmetric extension in this case, exiting..."
+            )
+        start -= int(diff / 2)
+        end += int(diff / 2)
+    elif end - start == len(arr):
+        diff = 0
+    else:
+        raise Exception(
+            "Interval length is larger than attribution array length, this is not an expected situation, exiting..."
+        )
+    arr_store_tmp = arr
+    for arr, region in it:
+        rchrom, rstart, rend = re.split(r"[:-]", region)
+        rstart = int(rstart)
+        rend = int(rend)
+        # extend coordinates if the attribution arr is longer than the interval
+        rstart -= int(diff / 2)
+        rend += int(diff / 2)
+        if rstart < 0:
+            break
+        if end <= rstart or rchrom != lastChrom:
+            arr_store_tmp = (
+                np.convolve(arr_store_tmp, np.ones(10) / 10, mode="same")
+                if smooth
+                else arr_store_tmp
+            )
+            try:
+                bw.addEntries(
+                    lastChrom,
+                    np.arange(start, end, dtype=np.int64),
+                    values=arr_store_tmp.astype(np.float64),
+                    span=1,
+                )
+            except:
+                print(lastChrom)
+                print(start)
+                print(end)
+                print(arr_store_tmp.shape, arr_store_tmp.dtype)
+                print(rchrom)
+                print(rstart)
+                print(rend)
+                raise Exception(
+                    "Runtime error when adding entries to bigwig file, see above messages for region info"
+                )
+            lastChrom = rchrom
+            start = rstart
+            end = rend
+            arr_store_tmp = arr
+        # get the uncovered interval (defined by start coordinate `start` and relative start coordinate `start_idx`)
+        else:
+            assert (
+                end > rstart and rchrom == lastChrom
+            )  # double check that the two intervals overlap
+            start_idx = end - rstart
+            end = rend
+            try:
+                arr_store_tmp = np.concatenate([arr_store_tmp, arr[start_idx:]])
+            except TypeError:
+                print(start_idx)
+                print(rstart, rend, rchrom, start, end, lastChrom)
+                print(arr_store_tmp.shape, print(arr.shape))
+                sys.exit(1)
+    bw.close()
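A hedged sketch of calling `write_attrs_to_bw`; the file paths are placeholders, and `regions` must be `"chrom:start-end"` strings matching the rows of the attribution array:

```python
import numpy as np

attrs = np.random.rand(2, 1024)                 # (# samples, length)
regions = ["chr1:1000-2024", "chr1:1500-2524"]  # deliberately overlapping

write_attrs_to_bw(
    attrs,
    regions,
    genome_info="hg38.fa.fai",   # tab-separated chrom sizes
    bigwig_fn="attributions.bw",
    smooth=True,                 # 10-bp moving average before writing
)
# Per the docstring, overlapping base pairs keep the earlier region's scores.
```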
tpcav/tpcav_model.py
CHANGED
@@ -2,6 +2,7 @@ import logging
 from functools import partial
 from typing import Dict, Iterable, List, Optional, Tuple
 
+import numpy as np
 import torch
 from captum.attr import DeepLift
 from scipy.linalg import svd
@@ -11,10 +12,6 @@ logger = logging.getLogger(__name__)
 
 def _abs_attribution_func(multipliers, inputs, baselines):
     "Multiplier x abs(inputs - baselines) to avoid double-sign effects."
-    # print(f"inputs: {inputs[1][:5]}")
-    # print(f"baselines: {baselines[1][:5]}")
-    # print(f"multipliers: {multipliers[0][:5]}")
-    # print(f"multipliers: {multipliers[1][:5]}")
     return tuple(
         (input_ - baseline).abs() * multiplier
         for input_, baseline, multiplier in zip(inputs, baselines, multipliers)
@@ -52,7 +49,7 @@ class TPCAV(torch.nn.Module):
             "layer_name": self.layer_name,
             "zscore_mean": getattr(self, "zscore_mean", None),
             "zscore_std": getattr(self, "zscore_std", None),
-            "
+            "Vh": getattr(self, "Vh", None),
             "orig_shape": getattr(self, "orig_shape", None),
         }
 
@@ -61,7 +58,7 @@ class TPCAV(torch.nn.Module):
         self.layer_name = tpcav_state_dict["layer_name"]
         self._set_buffer("zscore_mean", tpcav_state_dict["zscore_mean"])
         self._set_buffer("zscore_std", tpcav_state_dict["zscore_std"])
-        self._set_buffer("
+        self._set_buffer("Vh", tpcav_state_dict["Vh"])
         self._set_buffer("orig_shape", tpcav_state_dict["orig_shape"])
         self.fitted = True
         logger.warning(
@@ -91,20 +88,22 @@ class TPCAV(torch.nn.Module):
         std[std == 0] = -1
         standardized = (flat - mean) / std
 
-        v_inverse = None
         if num_pc is None or num_pc == "full":
-            _,
-
+            _, S, Vh = svd(standardized, lapack_driver="gesvd", full_matrices=False)
+            Vh = torch.tensor(Vh)
         elif int(num_pc) == 0:
-
+            S = None
+            Vh = None
         else:
-            _,
-
+            _, S, Vh = svd(standardized, lapack_driver="gesvd", full_matrices=False)
+            Vh = torch.tensor(Vh[: int(num_pc)])
+
+        self.eigen_values = np.square(S) if S is not None else None
 
         self._set_buffer("zscore_mean", mean.to(self.device))
         self._set_buffer("zscore_std", std.to(self.device))
         self._set_buffer(
-            "
+            "Vh", Vh.to(self.device) if Vh is not None else None
         )
         self._set_buffer("orig_shape", torch.tensor(orig_shape).to(self.device))
         self.fitted = True
@@ -112,7 +111,7 @@ class TPCAV(torch.nn.Module):
         return {
             "zscore_mean": mean,
             "zscore_std": std,
-            "
+            "Vh": Vh,
             "orig_shape": torch.tensor(orig_shape),
         }
 
@@ -124,13 +123,13 @@ class TPCAV(torch.nn.Module):
             raise RuntimeError("Call fit_pca before projecting activations.")
 
         y = activations.flatten(start_dim=1).to(self.device)
-        if self.
-            V = self.
+        if self.Vh is not None:
+            V = self.Vh.T
             zscore_mean = getattr(self, "zscore_mean", 0.0)
             zscore_std = getattr(self, "zscore_std", 1.0)
             y_standardized = (y - zscore_mean) / zscore_std
             y_projected = torch.matmul(y_standardized, V)
-            y_residual = y_standardized - torch.matmul(y_projected, self.
+            y_residual = y_standardized - torch.matmul(y_projected, self.Vh)
             return y_residual, y_projected
         else:
             return y, None
@@ -174,8 +173,12 @@ class TPCAV(torch.nn.Module):
         target_batches: Iterable,
         baseline_batches: Iterable,
         multiply_by_inputs: bool = True,
+        abs_inputs_diff: bool = True,
     ) -> Dict[str, torch.Tensor]:
-        """
+        """
+        Compute DeepLift attributions in the PCA embedding space.
+
+        By default, computes (input - baseline).abs() * multiplier to avoid double-sign effects (abs_inputs_diff=True).
 
         target_batches and baseline_batches should yield (seq, chrom) pairs of matching length.
         """
@@ -184,6 +187,8 @@ class TPCAV(torch.nn.Module):
         self.forward = self.forward_from_embeddings_at_layer
         deeplift = DeepLift(self, multiply_by_inputs=multiply_by_inputs)
 
+        custom_attr_func = _abs_attribution_func if abs_inputs_diff else None
+
         attributions = []
         for inputs, binputs in zip(target_batches, baseline_batches):
             avs = self._layer_output(*[i.to(self.device) for i in inputs])
@@ -205,7 +210,7 @@ class TPCAV(torch.nn.Module):
                 ),
                 additional_forward_args=(inputs,),
                 custom_attribution_func=(
-                    None if not multiply_by_inputs else
+                    None if not multiply_by_inputs else custom_attr_func
                 ),
             )
             attr_residual, attr_projected = attribution
@@ -219,7 +224,7 @@ class TPCAV(torch.nn.Module):
                     inputs,
                 ),
                 custom_attribution_func=(
-                    None if not multiply_by_inputs else
+                    None if not multiply_by_inputs else custom_attr_func
                 ),
             )[0]
 
@@ -324,7 +329,7 @@ class TPCAV(torch.nn.Module):
         Combine projected/residual embeddings into the layer activation space,
         mirroring scripts/models.py merge logic.
         """
-        y_hat = torch.matmul(avs_projected, self.
+        y_hat = torch.matmul(avs_projected, self.Vh) + avs_residual
         y_hat = y_hat * self.zscore_std + self.zscore_mean
 
         return y_hat.reshape((-1, *self.orig_shape[1:]))
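The new `abs_inputs_diff` flag selects between the custom rule above and Captum's default scaling; a sketch assuming `tpcav_model`, `target_batches`, and `baseline_batches` were prepared as in `run_tpcav`:

```python
# Default: |input - baseline| * multiplier, avoiding sign cancellation
# between an input and its baseline.
attrs_abs = tpcav_model.layer_attributions(
    target_batches, baseline_batches, multiply_by_inputs=True, abs_inputs_diff=True
)

# Plain DeepLift scaling: (input - baseline) * multiplier.
attrs_signed = tpcav_model.layer_attributions(
    target_batches, baseline_batches, multiply_by_inputs=True, abs_inputs_diff=False
)
```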
tpcav/utils.py
CHANGED
@@ -18,6 +18,8 @@ from torch.utils.data import default_collate, get_worker_info
 
 logger = logging.getLogger(__name__)
 
+def clean_motif_name(motif_name):
+    return motif_name.replace("/", "-")
 
 def center_windows(df, window_len=1024):
     "Get center window_len bp region of the given coordinate dataframe."
@@ -568,11 +570,102 @@ class CustomMotif:
     def __len__(self):
         return len(self.consensus)
 
+    def permute(self, seed=None, min_shift=0.3, name_suffix="_perm", max_attempts=100):
+        """
+        Permute the consensus sequence and return the new object.
+        """
+        permuted = deepcopy(self)
+
+        rng = np.random.default_rng(seed)
+        L = len(self.consensus)
+
+        count = 0
+        while True:
+            perm = rng.permutation(L)
+            frac_moved = np.mean(perm != np.arange(L))
+            if frac_moved >= min_shift:
+                break
+            else:
+                count += 1
+                if count > max_attempts:
+                    raise ValueError(
+                        f"Could not generate a permutation with min_shift={min_shift} for motif {self.name}"
+                    )
+        permuted_consensus = ''.join([self.consensus[i] for i in perm])
+        permuted.consensus = permuted_consensus
+        permuted.name = self.name + name_suffix
+        permuted.matrix_id = self.name + name_suffix
+
+        return permuted
+
     def reverse_complement(self):
         self.consensus = Bio.Seq.reverse_complement(self.consensus)
         self.name = self.name + "_rc"
         return self
 
+class PermutedPWMMotif:
+    BASES = ("A", "C", "G", "T")
+    RC_MAP = {"A": "T", "C": "G", "G": "C", "T": "A"}
+
+    def __init__(self, motif, seed=None, min_shift=0.3, name_suffix="_perm"):
+        """
+        motif: Bio.motifs.Motif
+        seed: RNG seed
+        min_shift: fraction of positions that must move
+        """
+        self.original_motif = motif
+        self.name = motif.name + name_suffix if motif.name else "permuted_motif"
+        self.length = motif.length
+        self.alphabet = motif.alphabet
+
+        # extract the PWM as a dict of lists
+        pwm = {b: list(motif.pwm[b]) for b in self.BASES}
+
+        self.pwm, self.permutation = self._permute_pwm_positions(
+            pwm, seed=seed, min_shift=min_shift
+        )
+
+    def __len__(self):
+        return self.length
+
+    def _permute_pwm_positions(self, pwm, seed=None, min_shift=0.3):
+        rng = np.random.default_rng(seed)
+        L = len(pwm["A"])
+
+        count = 0
+        while True:
+            perm = rng.permutation(L)
+            frac_moved = np.mean(perm != np.arange(L))
+            if frac_moved >= min_shift:
+                break
+            else:
+                count += 1
+                if count > 100:
+                    raise ValueError(
+                        f"Could not generate a permutation with min_shift={min_shift} for motif {self.original_motif.name}"
+                    )
+
+        permuted_pwm = {b: [pwm[b][i] for i in perm] for b in self.BASES}
+
+        return permuted_pwm, perm
+
+    def reverse_complement(self):
+        """
+        Return a NEW PermutedPWMMotif with a reverse-complemented PWM.
+        """
+        rc_pwm = {b: [] for b in self.BASES}
+        L = len(self.pwm["A"])
+
+        for i in reversed(range(L)):
+            for b in self.BASES:
+                rc_base = self.RC_MAP[b]
+                rc_pwm[rc_base].append(self.pwm[b][i])
+
+        rc = deepcopy(self)
+        rc.pwm = rc_pwm
+        rc.name = self.name + "_rc"
+        return rc
+
 
 class PairedMotif:
     def __init__(self, motif1, motif2, spacing=0):
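A short sketch of the permuted-PWM controls added above: column order is reshuffled until at least `min_shift` of positions move, preserving base composition while destroying the positional pattern. The motif file path is illustrative:

```python
from Bio import motifs

with open("data/motifs.meme") as fh:
    motif = next(iter(motifs.parse(fh, fmt="MINIMAL")))

perm = PermutedPWMMotif(motif, seed=0, min_shift=0.3)
print(perm.name, len(perm))     # e.g. "CTCF_perm" with the original length
print(perm.permutation)         # the column order actually used
rc = perm.reverse_complement()  # new object named "<name>_perm_rc"
```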
tpcav-0.2.1.dist-info/METADATA
ADDED
@@ -0,0 +1,91 @@
+Metadata-Version: 2.4
+Name: tpcav
+Version: 0.2.1
+Summary: Testing with PCA projected Concept Activation Vectors
+Author-email: Jianyu Yang <yztxwd@gmail.com>
+License-Expression: MIT AND (Apache-2.0 OR BSD-2-Clause)
+Project-URL: Homepage, https://github.com/seqcode/TPCAV
+Keywords: interpretation,attribution,concept,genomics,deep learning
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: torch
+Requires-Dist: pandas
+Requires-Dist: numpy
+Requires-Dist: seqchromloader
+Requires-Dist: deeplift
+Requires-Dist: pyfaidx
+Requires-Dist: pybedtools
+Requires-Dist: captum
+Requires-Dist: scikit-learn
+Requires-Dist: biopython
+Requires-Dist: seaborn
+Requires-Dist: matplotlib
+Dynamic: license-file
+
+# TPCAV (Testing with PCA projected Concept Activation Vectors)
+
+This repository contains code to compute TPCAV (Testing with PCA projected Concept Activation Vectors) on deep learning models. TPCAV extends the original TCAV method by using PCA to reduce the dimensionality of the activations at a selected intermediate layer before computing Concept Activation Vectors (CAVs), which improves the consistency of the results.
+
+## Installation
+
+`pip install tpcav`
+
+## Quick start
+
+> `tpcav` only works with PyTorch models. If your model is built with another library, port it to PyTorch first; for TensorFlow models, you can use [tf2onnx](https://github.com/onnx/tensorflow-onnx) and [onnx2pytorch](https://github.com/Talmaj/onnx2pytorch) for the conversion.
+
+```python
+import torch
+from tpcav import run_tpcav, helper
+
+class DummyModelSeq(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layer1 = torch.nn.Linear(1024, 1)
+        self.layer2 = torch.nn.Linear(4, 1)
+
+    def forward(self, seq):
+        y_hat = self.layer1(seq)
+        y_hat = y_hat.squeeze(-1)
+        y_hat = self.layer2(y_hat)
+        return y_hat
+
+# transformation function to obtain one-hot encoded sequences
+def transform_fasta_to_one_hot_seq(seq, chrom):
+    # `seq` is a list of fasta sequences
+    # `chrom` is a numpy array of bigwig signals of shape [-1, # bigwigs, len]
+    return (helper.fasta_to_one_hot_sequences(seq),)  # it must return a tuple of inputs, even if there is only one input
+
+motif_path = "data/motif-clustering-v2.1beta_consensus_pwms.test.meme"
+bed_seq_concept = "data/hg38_rmsk.head500k.bed"
+genome_fasta = "data/hg38.analysisSet.fa"
+model = DummyModelSeq()  # load the model
+layer_name = "layer1"  # name of the layer to be interpreted
+
+# concept_fscores_dataframe: f-scores of each concept
+# motif_cav_trainers: each trainer contains the CAV weights for motifs inserted a different number of times
+# bed_cav_trainer: the trainer containing the CAV weights of the sequence concepts provided in the bed file
+concept_fscores_dataframe, motif_cav_trainers, bed_cav_trainer = run_tpcav(
+    model=model,
+    layer_name=layer_name,
+    meme_motif_file=motif_path,
+    genome_fasta=genome_fasta,
+    num_motif_insertions=[4, 8],
+    bed_seq_file=bed_seq_concept,
+    output_dir="test_run_tpcav_output/",
+    input_transform_func=transform_fasta_to_one_hot_seq
+)
+
+# check each trainer for detailed weights
+print(bed_cav_trainer.cav_weights)
+```
+
+## Detailed Usage
+
+For detailed usage, please refer to this [Jupyter notebook](https://github.com/seqcode/TPCAV/tree/main/examples/tpcav_detailed_usage.ipynb)
+
+If you find any issue, feel free to open an issue (strongly suggested) or contact [Jianyu Yang](mailto:jmy5455@psu.edu).
tpcav-0.2.1.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
+tpcav/__init__.py,sha256=CpHijSyE1HMy8dlvdSaYrwN9gYMGDEJGDdsneNWnqdA,996
+tpcav/cavs.py,sha256=qXeNiTqlrCPb824ivVvZNrhHSZ6YRx2xmjdZ9JTlAgM,19299
+tpcav/concepts.py,sha256=_ht4UTu2EVJh52JGnKT3PEgDHk4Q-JCpNuHfFOVmzCw,12884
+tpcav/helper.py,sha256=CcNFJEFG00pujUrthBoMInpIBz1mWIG3y5fztaiHO-c,9917
+tpcav/logging_utils.py,sha256=wug7O_5IjxjhOpQr-aq90qKMEUp1EgcPkrv26d8li6Q,281
+tpcav/tpcav_model.py,sha256=XgNLPXr6_B-Dyb7RdgsUsFnrSK6oNjqqFPOjpz1wXmM,16564
+tpcav/utils.py,sha256=s2TfC-YoH_xa73WuMqvtpuqzx6g3ne12hE90Yg9hToY,21502
+tpcav-0.2.1.dist-info/licenses/LICENSE,sha256=uC-2s0ObLnQzWFKH5aokHXo6CzxlJgeI0P3bIUCZgfU,1064
+tpcav-0.2.1.dist-info/METADATA,sha256=XaYcUWr6humOfiUhwgKrccufSqDl_XiAutlO_wCf4lo,3502
+tpcav-0.2.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+tpcav-0.2.1.dist-info/top_level.txt,sha256=I9veSE_WsuFYrXlcfRevqtatDyWWZNsWA3dV0CeBXVg,6
+tpcav-0.2.1.dist-info/RECORD,,
tpcav-0.1.0.dist-info/METADATA
DELETED
@@ -1,89 +0,0 @@
-Metadata-Version: 2.4
-Name: tpcav
-Version: 0.1.0
-Summary: Testing with PCA projected Concept Activation Vectors
-Author-email: Jianyu Yang <yztxwd@gmail.com>
-License-Expression: MIT AND (Apache-2.0 OR BSD-2-Clause)
-Project-URL: Homepage, https://github.com/seqcode/TPCAV
-Keywords: interpretation,attribution,concept,genomics,deep learning
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: torch
-Requires-Dist: pandas
-Requires-Dist: numpy
-Requires-Dist: seqchromloader
-Requires-Dist: deeplift
-Requires-Dist: pyfaidx
-Requires-Dist: pybedtools
-Requires-Dist: captum
-Requires-Dist: scikit-learn
-Requires-Dist: biopython
-Requires-Dist: seaborn
-Requires-Dist: matplotlib
-Dynamic: license-file
-
-# TPCAV (Testing with PCA projected Concept Activation Vectors)
-
-Analysis pipeline for TPCAV
-
-## Dependencies
-
-You can use your own environment for the model, in addition, you need to install the following packages:
-
-- captum 0.7
-- seqchromloader 0.8.5
-- scikit-learn 1.5.2
-
-## Workflow
-
-1. Since not every saved pytorch model stores the computation graph, you need to manually add functions to let the script know how to get the activations of the intermediate layer and how to proceed from there.
-
-   There are 3 places you need to insert your own code.
-
-   - Model class definition in models.py
-     - Please first copy your class definition into `Model_Class` in the script, it already has several pre-defined class functions, you need to fill in the following two functions:
-       - `forward_until_select_layer`: this is the function that takes your model input and forward until the layer you want to compute TPCAV score on
-       - `resume_forward_from_select_layer`: this is the function that starts from the activations of your select layer and forward all the way until the end
-     - There are also functions necessary for TPCAV computation, don't change them:
-       - `forward_from_start`: this function calls `forward_until_select_layer` and `resume_forward_from_select_layer` to do a full forward pass
-       - `forward_from_projected_and_residual`: this function takes the PCA projected activations and unexplained residual to do the forward pass
-       - `project_avs_to_pca`: this function takes care of the PCA projection
-
-   > NOTE: you can modify your final output tensor to specifically explain certain transformation of your output, for example, you can take weighted sum of base pair resolution signal prediction to emphasize high signal region.
-
-   - Function `load_model` in utils.py
-     - Take care of the model initialization and load saved parameters in `load_model`, return the model instance.
-     > NOTE: you need to use your own model class definition in models.py, as we need the functions defined in step 1.
-
-   - Function `seq_transform_fn` in utils.py
-     - By default the dataloader provides one hot coded DNA array of shape (batch_size, 4, len), coded in the order [A, C, G, T], if your model takes a different kind of input, modify `seq_transform_fn` to transform the input
-
-   - Function `chrom_transform_fn` in utils.py
-     - By default the dataloader provides signal array from bigwig files of shape (batch_size, # bigwigs, len), if your model takes a different kind of chromatin input, modify `chrom_transform_fn` to transform the input, if your model is sequence only, leave it to return None.
-
-
-2. Compute CAVs on your model, example command:
-
-   ```bash
-   srun -n1 -c8 --gres=gpu:1 --mem=128G python scripts/run_tcav_sgd_pca.py \
-       cavs_test 1024 data/hg19.fa data/hg19.fa.fai \
-       --meme-motifs data/motif-clustering-v2.1beta_consensus_pwms.test.meme \
-       --bed-chrom-concepts data/ENCODE_DNase_peaks.bed
-   ```
-
-3. Then compute the layer attributions, example command:
-
-   ```bash
-   srun -n1 -c8 --gres=gpu:1 --mem=128G \
-       python scripts/compute_layer_attrs_only.py cavs_test/tpcav_model.pt \
-       data/ChIPseq.H1-hESC.MAX.conservative.all.shuf1k.narrowPeak \
-       1024 data/hg19.fa data/hg19.fa.fai cavs_test/test
-   ```
-
-4. run the jupyer notebook to generate summary of your results
-
-   ```bash
-   papermill -f scripts/compute_tcav_v2_pwm.example.yaml scripts/compute_tcav_v2_pwm.py.ipynb cavs_test/tcav_report.py.ipynb
-   ```
-
tpcav-0.1.0.dist-info/RECORD
DELETED
@@ -1,12 +0,0 @@
-tpcav/__init__.py,sha256=GbO0qDy-VJjnBMZAl5TXh27znwnwEHLIadsPoWH-gY8,985
-tpcav/cavs.py,sha256=DDe7vAUdewosU6wur5qDUp2OsR0Bg-k_8R4VXFjcheI,11587
-tpcav/concepts.py,sha256=3HIybk5xrAru7OiOb3tBPKyWtfcfnA8DGa3DDCJXBxc,11775
-tpcav/helper.py,sha256=qvEmvIwm-qMKa8_8z_uhWdlYotwzMFx-8EPUPSKoveg,5014
-tpcav/logging_utils.py,sha256=wug7O_5IjxjhOpQr-aq90qKMEUp1EgcPkrv26d8li6Q,281
-tpcav/tpcav_model.py,sha256=gnM2RkBsv6mSFS2SYonziVBjHqdXoRX4cuYFmi9ITr0,16514
-tpcav/utils.py,sha256=sftnhLqeY5ExZIvXnICY0pP27jjowSRCqtPyDi0t5Yg,18509
-tpcav-0.1.0.dist-info/licenses/LICENSE,sha256=uC-2s0ObLnQzWFKH5aokHXo6CzxlJgeI0P3bIUCZgfU,1064
-tpcav-0.1.0.dist-info/METADATA,sha256=EW5LGdtqL6x6jge-oob_qgYDBuhDznxyKMkq-_YrMVA,4260
-tpcav-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-tpcav-0.1.0.dist-info/top_level.txt,sha256=I9veSE_WsuFYrXlcfRevqtatDyWWZNsWA3dV0CeBXVg,6
-tpcav-0.1.0.dist-info/RECORD,,
{tpcav-0.1.0.dist-info → tpcav-0.2.1.dist-info}/licenses/LICENSE
File without changes

{tpcav-0.1.0.dist-info → tpcav-0.2.1.dist-info}/top_level.txt
File without changes