PyPI - topiary - Versions diffs - 4.0.0__py3-none-any.whl - Mend

topiary 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

tests/__init__.py +0 -0
tests/common.py +13 -0
tests/data.py +56 -0
tests/test_args_outputs.py +38 -0
tests/test_cli_protein_changes.py +64 -0
tests/test_contains_mutant_residues.py +95 -0
tests/test_dataframe.py +52 -0
tests/test_effect_expression_filters.py +100 -0
tests/test_epitopes_from_commandline_args.py +35 -0
tests/test_load_cufflinks_fpkm.py +107 -0
tests/test_load_stringtie_gtf_fpkm.py +20 -0
tests/test_mutant_epitope_predictions_class1.py +73 -0
tests/test_mutant_epitope_predictions_class2.py +60 -0
tests/test_padding.py +14 -0
tests/test_peptide_mutation_interval.py +57 -0
tests/test_rna_helpers.py +29 -0
tests/test_variant_expression_filters.py +62 -0
topiary/__init__.py +17 -0
topiary/cli/__init__.py +11 -0
topiary/cli/args.py +96 -0
topiary/cli/errors.py +30 -0
topiary/cli/filtering.py +64 -0
topiary/cli/outputs.py +101 -0
topiary/cli/protein_changes.py +132 -0
topiary/cli/rna.py +94 -0
topiary/cli/script.py +52 -0
topiary/cli/sequence.py +38 -0
topiary/filters.py +160 -0
topiary/predictor.py +414 -0
topiary/rna/__init__.py +13 -0
topiary/rna/common.py +57 -0
topiary/rna/cufflinks.py +244 -0
topiary/rna/gtf.py +63 -0
topiary/sequence_helpers.py +120 -0
topiary-4.0.0.dist-info/METADATA +162 -0
topiary-4.0.0.dist-info/RECORD +40 -0
topiary-4.0.0.dist-info/WHEEL +5 -0
topiary-4.0.0.dist-info/entry_points.txt +2 -0
topiary-4.0.0.dist-info/licenses/LICENSE +201 -0
topiary-4.0.0.dist-info/top_level.txt +2 -0

tests/__init__.py ADDED Viewed

File without changes

tests/common.py ADDED Viewed

@@ -0,0 +1,13 @@
+from contextlib import contextmanager
+import pytest
+def eq_(x, y):
+    """Assert that ``x == y``; the failure message shows both operands."""
+    assert x == y, (
+        "Expected %s == %s" % (x, y)
+    )
+@contextmanager
+def assert_raises(e_expected):
+    """Context manager wrapping the body in ``pytest.raises(e_expected)``."""
+    expectation = pytest.raises(e_expected)
+    with expectation:
+        yield

tests/data.py ADDED Viewed

@@ -0,0 +1,56 @@
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Helper functions and shared datasets for tests
+"""
+from __future__ import print_function, division, absolute_import
+import os
+from varcode import Variant, VariantCollection
+from pyensembl import ensembl_grch38
+def data_path(name):
+    """
+    Return the path to a file in the "data" directory located next to this
+    module. The name specified should be relative to that directory.
+    """
+    # Resolve relative to this file so the result does not depend on the
+    # current working directory of the test run.
+    module_dir = os.path.dirname(__file__)
+    return os.path.join(module_dir, "data", name)
+# BRAF V600E, coordinates taken from the COSMIC entry:
+# http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=476
+braf_V600E_variant = Variant(7, 140753336, "A", "T", ensembl_grch38)
+# TP53 R248W, coordinates taken from the COSMIC entry:
+# http://cancer.sanger.ac.uk/cosmic/mutation/overview?id=10656
+tp53_R248W_variant = Variant(17, 7674221, "G", "A", ensembl_grch38)
+# Both variants bundled together for tests that need a collection
+cancer_test_variants = VariantCollection(
+    [braf_V600E_variant, tp53_R248W_variant]
+)
+# Union of the gene IDs reported by each test variant
+cancer_test_variant_gene_ids = {
+    gene_id
+    for variant in cancer_test_variants
+    for gene_id in variant.gene_ids
+}
+# Union of the transcript IDs reported by each test variant
+cancer_test_variant_transcript_ids = {
+    transcript_id
+    for variant in cancer_test_variants
+    for transcript_id in variant.transcript_ids
+}

tests/test_args_outputs.py ADDED Viewed

@@ -0,0 +1,38 @@
+from topiary.cli.args import arg_parser
+from topiary.cli.outputs import write_outputs
+import tempfile
+import pandas as pd
+from .common import eq_
+def test_write_outputs():
+    """Write a DataFrame via the CLI output options and read it back.
+    Only column "x" is requested and it is renamed to "X", so the CSV is
+    expected to contain a single "X" column with the original values.
+    """
+    import os
+    # Reserve a path and close the handle right away: the file is written and
+    # re-read by name, which is not possible on every platform while the
+    # temporary file is still open. delete=False keeps the path alive until
+    # the explicit removal in the finally clause below.
+    with tempfile.NamedTemporaryFile(mode="r+", delete=False) as f:
+        output_path = f.name
+    try:
+        df = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
+        args = arg_parser.parse_args(
+            [
+                "--output-csv",
+                output_path,
+                "--subset-output-columns",
+                "x",
+                "--rename-output-column",
+                "x",
+                "X",
+                "--mhc-predictor",
+                "random",
+                "--mhc-alleles",
+                "A0201",
+            ]
+        )
+        write_outputs(
+            df, args, print_df_before_filtering=True, print_df_after_filtering=True
+        )
+        print("File: %s" % output_path)
+        df_from_file = pd.read_csv(output_path, index_col="#")
+        df_expected = pd.DataFrame({"X": [1, 2, 3]})
+        print(df_from_file)
+        eq_(len(df_expected), len(df_from_file))
+        assert (df_expected == df_from_file).all().all()
+    finally:
+        # Previously the temporary file was left behind after every run.
+        os.remove(output_path)

tests/test_cli_protein_changes.py ADDED Viewed

@@ -0,0 +1,64 @@
+from topiary.cli.protein_changes import protein_change_effects_from_args
+from topiary.cli.args import create_arg_parser
+from .common import eq_
+# Shared parser for this module, built with the mhc, rna and output flags
+# turned off (presumably omitting those argument groups -- confirm against
+# create_arg_parser).
+arg_parser = create_arg_parser(mhc=False, rna=False, output=False)
+def test_protein_change_effects_from_args_substitutions():
+    """A well-formed substitution (EGFR T790M) yields exactly one effect."""
+    cli_tokens = ["--protein-change", "EGFR", "T790M", "--genome", "grch37"]
+    args = arg_parser.parse_args(cli_tokens)
+    effects = protein_change_effects_from_args(args)
+    eq_(len(effects), 1)
+    effect = effects[0]
+    eq_(effect.aa_ref, "T")
+    # position 790 in the protein change is expected at offset 789
+    eq_(effect.aa_mutation_start_offset, 789)
+    eq_(effect.aa_alt, "M")
+    eq_(effect.transcript.name, "EGFR-001")
+def test_protein_change_effects_from_args_malformed_missing_ref():
+    """A protein change without a reference residue ("790M") gives no effects."""
+    cli_tokens = ["--protein-change", "EGFR", "790M", "--genome", "grch37"]
+    parsed = arg_parser.parse_args(cli_tokens)
+    effects = protein_change_effects_from_args(parsed)
+    eq_(len(effects), 0)
+def test_protein_change_effects_from_args_malformed_missing_alt():
+    """A protein change without an alternate residue ("T790") gives no effects."""
+    cli_tokens = ["--protein-change", "EGFR", "T790", "--genome", "grch37"]
+    parsed = arg_parser.parse_args(cli_tokens)
+    effects = protein_change_effects_from_args(parsed)
+    eq_(len(effects), 0)
+def test_protein_change_effects_from_args_multiple_effects():
+    """Two --protein-change options produce two effects."""
+    cli_tokens = []
+    for gene_name, change in [("EGFR", "T790M"), ("KRAS", "G10D")]:
+        cli_tokens += ["--protein-change", gene_name, change]
+    cli_tokens += ["--genome", "grch37"]
+    args = arg_parser.parse_args(cli_tokens)
+    effects = protein_change_effects_from_args(args)
+    print(effects)
+    eq_(len(effects), 2)

tests/test_contains_mutant_residues.py ADDED Viewed

@@ -0,0 +1,95 @@
+from topiary import contains_mutant_residues
+from .common import eq_
+def test_contains_mutant_residues_before():
+    """Mutation (5 to 6) lies before a 9-residue peptide starting at 10."""
+    result = contains_mutant_residues(
+        peptide_start_in_protein=10,
+        peptide_length=9,
+        mutation_start_in_protein=5,
+        mutation_end_in_protein=6,
+    )
+    eq_(result, False)
+def test_contains_mutant_residues_after():
+    """Mutation (25 to 26) lies after a 9-residue peptide starting at 10."""
+    result = contains_mutant_residues(
+        peptide_start_in_protein=10,
+        peptide_length=9,
+        mutation_start_in_protein=25,
+        mutation_end_in_protein=26,
+    )
+    eq_(result, False)
+def test_contains_mutant_residues_inside():
+    """Mutation (12 to 13) falls within a 9-residue peptide starting at 10."""
+    result = contains_mutant_residues(
+        peptide_start_in_protein=10,
+        peptide_length=9,
+        mutation_start_in_protein=12,
+        mutation_end_in_protein=13,
+    )
+    eq_(result, True)
+def test_contains_mutant_residues_deletion_before_beginning():
+    """Empty mutation interval at the peptide's first position is not mutant."""
+    # the peptide only contains the residue *after* the mutation,
+    # so it still looks like wildtype sequence
+    result = contains_mutant_residues(
+        peptide_start_in_protein=10,
+        peptide_length=9,
+        mutation_start_in_protein=10,
+        mutation_end_in_protein=10,
+    )
+    eq_(result, False)
+def test_contains_mutant_residues_deletion_at_beginning():
+    """Empty mutation interval one residue into the peptide counts as mutant."""
+    # the peptide contains residues before *and* after the mutation, so
+    # it should count as having a mutant juxtaposition of residues
+    result = contains_mutant_residues(
+        peptide_start_in_protein=10,
+        peptide_length=9,
+        mutation_start_in_protein=11,
+        mutation_end_in_protein=11,
+    )
+    eq_(result, True)
+def test_contains_mutant_residues_deletion_after_end():
+    """Empty mutation interval just past the peptide's end is not mutant."""
+    # the peptide only contains the residue *before* the mutation,
+    # so it still looks like wildtype sequence
+    result = contains_mutant_residues(
+        peptide_start_in_protein=10,
+        peptide_length=9,
+        mutation_start_in_protein=19,
+        mutation_end_in_protein=19,
+    )
+    eq_(result, False)
+def test_contains_mutant_residues_deletion_at_end():
+    """Empty mutation interval before the peptide's last residue counts as mutant."""
+    # the peptide contains residues before *and* after the mutation, so
+    # it should count as having a mutant juxtaposition of residues
+    result = contains_mutant_residues(
+        peptide_start_in_protein=10,
+        peptide_length=9,
+        mutation_start_in_protein=18,
+        mutation_end_in_protein=18,
+    )
+    eq_(result, True)

tests/test_dataframe.py ADDED Viewed

@@ -0,0 +1,52 @@
+from mhctools import NetMHC
+from topiary import TopiaryPredictor
+from .data import cancer_test_variants
+# Alleles are spelled in three different naming styles
+alleles = ["A02:01", "B*07:02", "HLA-C*07:02"]
+mhc_model = NetMHC(alleles=alleles, default_peptide_lengths=[8, 9, 10])
+# Expression value assigned to every gene/transcript in these tests
+DEFAULT_FPKM = 1.0
+def test_epitopes_to_dataframe_transcript_expression():
+    """Transcript expression passed to the predictor shows up in the result."""
+    transcript_fpkms = {
+        transcript_id: DEFAULT_FPKM
+        for variant in cancer_test_variants
+        for transcript_id in variant.transcript_ids
+    }
+    predictor = TopiaryPredictor(mhc_model=mhc_model, only_novel_epitopes=False)
+    df = predictor.predict_from_variants(
+        variants=cancer_test_variants,
+        transcript_expression_dict=transcript_fpkms,
+    )
+    assert "transcript_expression" in df.columns, (
+        "transcript_expression missing from %s" % (df.columns,)
+    )
+    matches_default = df["transcript_expression"] == DEFAULT_FPKM
+    assert matches_default.all(), (
+        "Invalid FPKM values in DataFrame transcript_expression column"
+    )
+def test_epitopes_to_dataframe_gene_expression():
+    """Gene expression passed to the predictor shows up in the result."""
+    gene_fpkms = {
+        gene_id: DEFAULT_FPKM
+        for variant in cancer_test_variants
+        for gene_id in variant.gene_ids
+    }
+    predictor = TopiaryPredictor(mhc_model=mhc_model, only_novel_epitopes=False)
+    df = predictor.predict_from_variants(
+        variants=cancer_test_variants,
+        gene_expression_dict=gene_fpkms,
+    )
+    assert "gene_expression" in df.columns, (
+        "gene_expression missing from %s" % (df.columns,)
+    )
+    matches_default = df["gene_expression"] == DEFAULT_FPKM
+    assert matches_default.all(), (
+        "Invalid FPKM values in DataFrame gene_expression column"
+    )

tests/test_effect_expression_filters.py ADDED Viewed

@@ -0,0 +1,100 @@
+from .data import (
+    cancer_test_variants,
+    cancer_test_variant_gene_ids,
+    cancer_test_variant_transcript_ids,
+)
+from topiary.filters import apply_effect_expression_filters
+cancer_test_effects = cancer_test_variants.effects()
+DEFAULT_FPKM = 1.0
+# every gene ID is given an expression of DEFAULT_FPKM
+gene_expression_dict = dict.fromkeys(cancer_test_variant_gene_ids, DEFAULT_FPKM)
+# every transcript ID is given an expression of DEFAULT_FPKM
+transcript_expression_dict = dict.fromkeys(
+    cancer_test_variant_transcript_ids, DEFAULT_FPKM
+)
+def test_apply_effect_gene_expression_below_threshold():
+    """Every effect is dropped when the gene threshold is twice the expression."""
+    threshold = 2 * DEFAULT_FPKM
+    filtered = apply_effect_expression_filters(
+        cancer_test_effects,
+        gene_expression_dict=gene_expression_dict,
+        gene_expression_threshold=threshold,
+        transcript_expression_dict=None,
+        transcript_expression_threshold=None,
+    )
+    n_remaining = len(filtered)
+    assert n_remaining == 0, (
+        "All variants should have been filtered out but got: %s" % (filtered,)
+    )
+def test_apply_effect_gene_expression_above_threshold():
+    """No effect is dropped when the gene threshold is half the expression."""
+    threshold = 0.5 * DEFAULT_FPKM
+    filtered = apply_effect_expression_filters(
+        cancer_test_effects,
+        gene_expression_dict=gene_expression_dict,
+        gene_expression_threshold=threshold,
+        transcript_expression_dict=None,
+        transcript_expression_threshold=None,
+    )
+    n_expected = len(cancer_test_effects)
+    n_filtered = len(filtered)
+    assert n_filtered == n_expected, (
+        "Expected %s effects but got %s" % (n_expected, n_filtered)
+    )
+def test_apply_effect_gene_expression_equal_threshold():
+    """Genes expressed exactly at the threshold are kept, not filtered."""
+    filtered = apply_effect_expression_filters(
+        cancer_test_effects,
+        gene_expression_dict=gene_expression_dict,
+        gene_expression_threshold=DEFAULT_FPKM,
+        transcript_expression_dict=None,
+        transcript_expression_threshold=None,
+    )
+    n_expected = len(cancer_test_effects)
+    n_filtered = len(filtered)
+    assert n_filtered == n_expected, (
+        "Expected %s effects but got %s" % (n_expected, n_filtered)
+    )
+def test_apply_effect_transcript_expression_below_threshold():
+    """Every effect is dropped when the transcript threshold is twice the expression."""
+    threshold = 2 * DEFAULT_FPKM
+    filtered = apply_effect_expression_filters(
+        cancer_test_effects,
+        gene_expression_dict=None,
+        gene_expression_threshold=None,
+        transcript_expression_dict=transcript_expression_dict,
+        transcript_expression_threshold=threshold,
+    )
+    n_remaining = len(filtered)
+    assert n_remaining == 0, (
+        "All effects should have been filtered out but got: %s" % (filtered,)
+    )
+def test_apply_effect_transcript_expression_above_threshold():
+    """No effect is dropped when the transcript threshold is half the expression."""
+    threshold = 0.5 * DEFAULT_FPKM
+    filtered = apply_effect_expression_filters(
+        cancer_test_effects,
+        gene_expression_dict=None,
+        gene_expression_threshold=None,
+        transcript_expression_dict=transcript_expression_dict,
+        transcript_expression_threshold=threshold,
+    )
+    n_expected = len(cancer_test_effects)
+    n_filtered = len(filtered)
+    assert n_filtered == n_expected, (
+        "Expected %s effects but got %s" % (n_expected, n_filtered)
+    )
+def test_apply_effect_transcript_expression_equal_threshold():
+    """Transcripts expressed exactly at the threshold are kept, not filtered."""
+    filtered = apply_effect_expression_filters(
+        cancer_test_effects,
+        gene_expression_dict=None,
+        gene_expression_threshold=None,
+        transcript_expression_dict=transcript_expression_dict,
+        transcript_expression_threshold=DEFAULT_FPKM,
+    )
+    n_expected = len(cancer_test_effects)
+    n_filtered = len(filtered)
+    assert n_filtered == n_expected, (
+        "Expected %s effects but got %s" % (n_expected, n_filtered)
+    )

tests/test_epitopes_from_commandline_args.py ADDED Viewed

@@ -0,0 +1,35 @@
+from topiary.cli.args import arg_parser, predict_epitopes_from_args
+from .common import eq_
+from .data import cancer_test_variants
+def test_cancer_epitopes_from_args():
+    """Predict epitopes for the cancer test variants through the CLI parser."""
+    epitope_lengths = [9, 10]
+    alleles = ["HLA-A*02:01", "C0701"]
+    args_list = [
+        "--mhc-predictor",
+        "netmhc",
+        "--mhc-epitope-lengths",
+        ",".join(map(str, epitope_lengths)),
+        "--mhc-alleles",
+        ",".join(alleles),
+        "--genome",
+        "GRCh38",
+        "--only-novel-epitopes",
+    ]
+    # one "--variant CONTIG START REF ALT" group per test variant
+    for variant in cancer_test_variants:
+        args_list.extend(
+            [
+                "--variant",
+                str(variant.contig),
+                str(variant.start),
+                variant.ref,
+                variant.alt,
+            ]
+        )
+    parsed_args = arg_parser.parse_args(args_list)
+    epitope_predictions = predict_epitopes_from_args(parsed_args)
+    # for each length L the test expects L predictions per variant per allele
+    expected_number_of_epitopes = sum(
+        epitope_length * len(cancer_test_variants) * len(alleles)
+        for epitope_length in epitope_lengths
+    )
+    eq_(len(epitope_predictions), expected_number_of_epitopes)

tests/test_load_cufflinks_fpkm.py ADDED Viewed

@@ -0,0 +1,107 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+test_cufflinks : Test that we can correctly load Cufflinks tracking files which
+contain the estimated expression levels of genes and isoforms (computed from
+RNA-Seq reads).
+"""
+from __future__ import print_function, division, absolute_import
+from topiary.rna import load_cufflinks_dataframe
+from .common import eq_
+from .data import data_path
+def test_load_cufflinks_genes():
+    """Loading the gene tracking file with drop_novel=False keeps CUFF.* IDs."""
+    tracking_path = data_path("genes.fpkm_tracking")
+    genes_df = load_cufflinks_dataframe(
+        tracking_path,
+        drop_lowdata=True,
+        drop_hidata=True,
+        drop_failed=True,
+        drop_novel=False,
+    )
+    ensembl_gene_ids = {
+        "ENSG00000240361",
+        "ENSG00000268020",
+        "ENSG00000186092",
+        "ENSG00000269308",
+    }
+    # CUFF.1 through CUFF.5
+    cuff_gene_ids = {"CUFF.%d" % i for i in range(1, 6)}
+    eq_(set(genes_df.id), ensembl_gene_ids | cuff_gene_ids)
+def test_load_cufflinks_genes_drop_novel():
+    """Loading the gene tracking file with drop_novel=True leaves only ENSG IDs."""
+    tracking_path = data_path("genes.fpkm_tracking")
+    genes_df = load_cufflinks_dataframe(
+        tracking_path,
+        drop_lowdata=True,
+        drop_hidata=True,
+        drop_failed=True,
+        drop_novel=True,
+    )
+    expected_gene_ids = {
+        "ENSG00000240361",
+        "ENSG00000268020",
+        "ENSG00000186092",
+        "ENSG00000269308",
+    }
+    eq_(set(genes_df.id), expected_gene_ids)
+def test_load_cufflinks_isoforms():
+    """Loading the isoform tracking file with drop_novel=False keeps CUFF.7604.1."""
+    tracking_path = data_path("isoforms.fpkm_tracking")
+    transcripts_df = load_cufflinks_dataframe(
+        tracking_path,
+        drop_lowdata=True,
+        drop_hidata=True,
+        drop_failed=True,
+        drop_novel=False,
+    )
+    ensembl_transcript_ids = {
+        "ENST00000492842",
+        "ENST00000594647",
+        "ENST00000335137",
+        "ENST00000417324",
+        "ENST00000461467",
+        "ENST00000518655",
+    }
+    cuff_transcript_ids = {"CUFF.7604.1"}
+    eq_(set(transcripts_df.id), ensembl_transcript_ids | cuff_transcript_ids)
+def test_load_cufflinks_isoforms_drop_novel():
+    """Loading the isoform tracking file with drop_novel=True leaves only ENST IDs."""
+    tracking_path = data_path("isoforms.fpkm_tracking")
+    transcripts_df = load_cufflinks_dataframe(
+        tracking_path,
+        drop_lowdata=True,
+        drop_hidata=True,
+        drop_failed=True,
+        drop_novel=True,
+    )
+    expected_transcript_ids = {
+        "ENST00000492842",
+        "ENST00000594647",
+        "ENST00000335137",
+        "ENST00000417324",
+        "ENST00000461467",
+        "ENST00000518655",
+    }
+    eq_(set(transcripts_df.id), expected_transcript_ids)

tests/test_load_stringtie_gtf_fpkm.py ADDED Viewed

@@ -0,0 +1,20 @@
+from topiary.rna import load_transcript_fpkm_dict_from_gtf
+from .common import eq_
+from .data import data_path
+def test_load_stringtie_gtf_transcripts():
+    """The StringTie GTF subset yields the expected transcript IDs and FPKMs."""
+    expected_fpkms_dict = {
+        "ENSMUST00000192505": 0.125126,
+        "ENSMUST00000191939": 0.680062,
+        "ENSMUST00000182774": 0.054028,
+    }
+    gtf_path = data_path("B16-StringTie-chr1-subset.gtf")
+    transcript_fpkms = load_transcript_fpkm_dict_from_gtf(gtf_path)
+    # first the key sets must agree, then each individual value
+    eq_(set(expected_fpkms_dict.keys()), set(transcript_fpkms.keys()))
+    for transcript_id in expected_fpkms_dict:
+        eq_(expected_fpkms_dict[transcript_id], transcript_fpkms[transcript_id])

tests/test_mutant_epitope_predictions_class1.py ADDED Viewed

@@ -0,0 +1,73 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import pytest
+from pyensembl import ensembl_grch37
+from topiary import TopiaryPredictor
+from varcode import Variant, VariantCollection
+from .common import eq_
+# Probe for NetMHCpan once at import time. Any exception raised while
+# importing or constructing the predictor leaves mhc_model as None and sets
+# HAS_NETMHC to False, which makes pytest skip every test in this module.
+try:
+    from mhctools import NetMHCpan
+    mhc_model = NetMHCpan(
+        alleles=["A02:01", "a0204", "B*07:02", "HLA-B14:02", "HLA-C*07:02", "hla-c07:01"],
+        default_peptide_lengths=[9],
+    )
+    HAS_NETMHC = True
+except Exception:
+    mhc_model = None
+    HAS_NETMHC = False
+# Module-wide skip marker driven by the probe above
+pytestmark = pytest.mark.skipif(not HAS_NETMHC, reason="NetMHCpan not installed")
+# TODO: find out about these variants,
+# what do we expect from them?
+variants = VariantCollection(
+    [
+        Variant(
+            contig=10,
+            start=100018900,
+            ref="C",
+            alt="T",
+            ensembl=ensembl_grch37,
+        ),
+        Variant(
+            contig=11,
+            start=32861682,
+            ref="G",
+            alt="A",
+            ensembl=ensembl_grch37,
+        ),
+    ]
+)
+def test_epitope_prediction_without_padding():
+    """With no explicit padding, five predictions have affinity <= 500."""
+    predictor = TopiaryPredictor(mhc_model=mhc_model, only_novel_epitopes=True)
+    output_without_padding = predictor.predict_from_variants(variants=variants)
+    # keep only the rows whose predicted affinity is at most 500
+    below_cutoff = output_without_padding.affinity <= 500
+    strong_binders = output_without_padding[below_cutoff]
+    eq_(len(strong_binders), 5)
+def test_epitope_prediction_with_invalid_padding():
+    """Padding of 7 with 9-residue peptides is expected to raise ValueError."""
+    with pytest.raises(ValueError):
+        predictor = TopiaryPredictor(mhc_model=mhc_model, padding_around_mutation=7)
+        predictor.predict_from_variants(variants=variants)
+def test_epitope_prediction_with_invalid_zero_padding():
+    """Padding too small for 9-residue peptides is expected to raise ValueError."""
+    # NOTE(review): despite the name, this passes padding_around_mutation=7,
+    # not 0, so it duplicates test_epitope_prediction_with_invalid_padding.
+    # Confirm how the predictor treats a padding of 0 before changing the
+    # value -- it may be handled as "not specified" rather than rejected.
+    with pytest.raises(ValueError):
+        predictor = TopiaryPredictor(mhc_model=mhc_model, padding_around_mutation=7)
+        predictor.predict_from_variants(variants=variants)
+def test_epitope_prediction_with_valid_padding():
+    """Padding of 8 is accepted and yields one row per allele/mutation/window."""
+    predictor = TopiaryPredictor(
+        mhc_model=mhc_model,
+        padding_around_mutation=8,
+        only_novel_epitopes=True,
+    )
+    output_with_padding = predictor.predict_from_variants(variants=variants)
+    # 6 alleles * 2 mutations * 9 distinct windows = 108
+    n_predictions = len(output_with_padding)
+    eq_(n_predictions, 108)