PyPI - pheval - Versions diffs - 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl - Mend

pheval 0.3.1-py3-none-any.whl → 0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pheval might be problematic. Click here for more details.

Files changed (15)

pheval/analyse/binary_classification_stats.py +29 -1
pheval/analyse/disease_prioritisation_analysis.py +6 -2
pheval/analyse/gene_prioritisation_analysis.py +6 -2
pheval/analyse/generate_plots.py +83 -1
pheval/analyse/generate_summary_outputs.py +14 -1
pheval/analyse/variant_prioritisation_analysis.py +6 -2
pheval/cli_pheval_utils.py +20 -14
pheval/prepare/create_spiked_vcf.py +132 -95
pheval/utils/file_utils.py +21 -12
pheval/utils/phenopacket_utils.py +3 -1
{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/METADATA +1 -1
{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/RECORD +15 -15
{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/WHEEL +1 -1
{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/LICENSE +0 -0
{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/entry_points.txt +0 -0

pheval/analyse/binary_classification_stats.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from math import sqrt
 from typing import List, Union
@@ -29,6 +29,8 @@ class BinaryClassificationStats:
     true_negatives: int = 0
     false_positives: int = 0
     false_negatives: int = 0
+    labels: List = field(default_factory=list)
+    scores: List = field(default_factory=list)
     @staticmethod
     def remove_relevant_ranks(
@@ -84,6 +86,31 @@ class BinaryClassificationStats:
             elif rank != 1:
                 self.true_negatives += 1
+    def add_labels_and_scores(
+        self,
+        pheval_results: Union[
+            List[RankedPhEvalGeneResult],
+            List[RankedPhEvalVariantResult],
+            List[RankedPhEvalDiseaseResult],
+        ],
+        relevant_ranks: List[int],
+    ):
+        """
+        Adds scores and labels from the PhEval results.
+        Args:
+            pheval_results (Union[List[RankedPhEvalGeneResult], List[RankedPhEvalVariantResult],
+                                  List[RankedPhEvalDiseaseResult]]):
+                List of all PhEval results
+            relevant_ranks (List[int]): A list of the ranks associated with the known entities.
+        """
+        relevant_ranks_copy = relevant_ranks.copy()
+        for result in pheval_results:
+            self.scores.append(result.score)
+            label = 1 if result.rank in relevant_ranks_copy else 0
+            self.labels.append(label)
+            relevant_ranks_copy.remove(result.rank) if label == 1 else None
     def add_classification(
         self,
         pheval_results: Union[
@@ -105,6 +132,7 @@ class BinaryClassificationStats:
         self.add_classification_for_other_entities(
             self.remove_relevant_ranks(pheval_results, relevant_ranks)
         )
+        self.add_labels_and_scores(pheval_results, relevant_ranks)
     def sensitivity(self) -> float:
         """

pheval/analyse/disease_prioritisation_analysis.py CHANGED Viewed

@@ -10,7 +10,11 @@ from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResu
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
-from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name
+from pheval.utils.file_utils import (
+    all_files,
+    files_with_suffix,
+    obtain_phenopacket_path_from_pheval_result,
+)
 from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader
@@ -234,7 +238,7 @@ def assess_phenopacket_disease_prioritisation(
         disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
         disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_closest_file_name(
+    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
         standardised_disease_result, all_files(results_dir_and_input.phenopacket_dir)
     )
     pheval_disease_result = read_standardised_result(standardised_disease_result)

pheval/analyse/gene_prioritisation_analysis.py CHANGED Viewed

@@ -10,7 +10,11 @@ from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalGeneResult
-from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name
+from pheval.utils.file_utils import (
+    all_files,
+    files_with_suffix,
+    obtain_phenopacket_path_from_pheval_result,
+)
 from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
@@ -226,7 +230,7 @@ def assess_phenopacket_gene_prioritisation(
         gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
         gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_closest_file_name(
+    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
         standardised_gene_result, all_files(results_dir_and_input.phenopacket_dir)
     )
     pheval_gene_result = read_standardised_result(standardised_gene_result)

pheval/analyse/generate_plots.py CHANGED Viewed

@@ -5,6 +5,7 @@ import matplotlib
 import pandas as pd
 import seaborn as sns
 from matplotlib import pyplot as plt
+from sklearn.metrics import auc, precision_recall_curve, roc_curve
 from pheval.analyse.benchmark_generator import (
     BenchmarkRunOutputGenerator,
@@ -357,6 +358,82 @@ class PlotGenerator:
             ]
         )
+    def generate_roc_curve(
+        self,
+        benchmarking_results: List[BenchmarkRunResults],
+        benchmark_generator: BenchmarkRunOutputGenerator,
+    ):
+        """
+        Generate and plot Receiver Operating Characteristic (ROC) curves for binary classification benchmark results.
+        Args:
+            benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs.
+            benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details.
+        """
+        for i, benchmark_result in enumerate(benchmarking_results):
+            fpr, tpr, thresh = roc_curve(
+                benchmark_result.binary_classification_stats.labels,
+                benchmark_result.binary_classification_stats.scores,
+                pos_label=1,
+            )
+            roc_auc = auc(fpr, tpr)
+            plt.plot(
+                fpr,
+                tpr,
+                label=f"{self.return_benchmark_name(benchmark_result)} ROC Curve (AUC = {roc_auc:.2f})",
+                color=self.palette_hex_codes[i],
+            )
+        plt.plot(linestyle="--", color="gray")
+        plt.xlabel("False Positive Rate")
+        plt.ylabel("True Positive Rate")
+        plt.title("Receiver Operating Characteristic (ROC) Curve")
+        plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.15))
+        plt.savefig(
+            f"{benchmark_generator.prioritisation_type_file_prefix}_roc_curve.svg",
+            format="svg",
+            bbox_inches="tight",
+        )
+    def generate_precision_recall(
+        self,
+        benchmarking_results: List[BenchmarkRunResults],
+        benchmark_generator: BenchmarkRunOutputGenerator,
+    ):
+        """
+        Generate and plot Precision-Recall curves for binary classification benchmark results.
+        Args:
+            benchmarking_results (List[BenchmarkRunResults]): List of benchmarking results for multiple runs.
+            benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details.
+        """
+        plt.figure()
+        for i, benchmark_result in enumerate(benchmarking_results):
+            precision, recall, thresh = precision_recall_curve(
+                benchmark_result.binary_classification_stats.labels,
+                benchmark_result.binary_classification_stats.scores,
+            )
+            precision_recall_auc = auc(recall, precision)
+            plt.plot(
+                recall,
+                precision,
+                label=f"{self.return_benchmark_name(benchmark_result)} Precision-Recall Curve "
+                f"(AUC = {precision_recall_auc:.2f})",
+                color=self.palette_hex_codes[i],
+            )
+        plt.plot(linestyle="--", color="gray")
+        plt.xlabel("Recall")
+        plt.ylabel("Precision")
+        plt.title("Precision-Recall Curve")
+        plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.15))
+        plt.savefig(
+            f"{benchmark_generator.prioritisation_type_file_prefix}_precision_recall_curve.svg",
+            format="svg",
+            bbox_inches="tight",
+        )
     def generate_non_cumulative_bar(
         self,
         benchmarking_results: List[BenchmarkRunResults],
@@ -405,6 +482,7 @@ def generate_plots(
     benchmark_generator: BenchmarkRunOutputGenerator,
     plot_type: str,
     title: str = None,
+    generate_from_tsv: bool = False,
 ) -> None:
     """
     Generate summary statistics bar plots for prioritisation.
@@ -416,8 +494,12 @@ def generate_plots(
         benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details.
         plot_type (str): Type of plot to be generated ("bar_stacked", "bar_cumulative", "bar_non_cumulative").
         title (str, optional): Title for the generated plot. Defaults to None.
+        generate_from_tsv (bool): Specify whether to generate plots from the TSV file. Defaults to False.
     """
     plot_generator = PlotGenerator()
+    if not generate_from_tsv:
+        plot_generator.generate_roc_curve(benchmarking_results, benchmark_generator)
+        plot_generator.generate_precision_recall(benchmarking_results, benchmark_generator)
     if plot_type == "bar_stacked":
         plot_generator.generate_stacked_bar_plot(benchmarking_results, benchmark_generator, title)
     elif plot_type == "bar_cumulative":
@@ -462,4 +544,4 @@ def generate_plots_from_benchmark_summary_tsv(
         raise ValueError(
             "Specify one analysis type (gene_analysis, variant_analysis, or disease_analysis)"
         )
-    generate_plots(benchmarking_results, benchmark_generator, plot_type, title)
+    generate_plots(benchmarking_results, benchmark_generator, plot_type, title, True)

pheval/analyse/generate_summary_outputs.py CHANGED Viewed

@@ -3,6 +3,7 @@ from collections import defaultdict
 from copy import deepcopy
 from typing import List
+import numpy as np
 import pandas as pd
 from pheval.analyse.benchmark_generator import BenchmarkRunOutputGenerator
@@ -40,7 +41,19 @@ class RankComparisonGenerator:
             pd.DataFrame: DataFrame containing the calculated rank differences.
         """
         comparison_df = self._generate_dataframe()
-        comparison_df["rank_decrease"] = comparison_df.iloc[:, 3] - comparison_df.iloc[:, 2]
+        comparison_df["rank_change"] = comparison_df.iloc[:, 2] - comparison_df.iloc[:, 3]
+        comparison_df["rank_change"] = np.where(
+            (comparison_df.iloc[:, 2] == 0) & (comparison_df.iloc[:, 3] != 0),
+            "GAINED",
+            np.where(
+                (comparison_df.iloc[:, 3] == 0) & (comparison_df.iloc[:, 2] != 0),
+                "LOST",
+                comparison_df["rank_change"],
+            ),
+        )
+        comparison_df["rank_change"] = comparison_df["rank_change"].apply(
+            lambda x: int(x) if str(x).lstrip("-").isdigit() else x
+        )
         return comparison_df
     def generate_output(self, prefix: str, suffix: str) -> None:

pheval/analyse/variant_prioritisation_analysis.py CHANGED Viewed

@@ -10,7 +10,11 @@ from pheval.analyse.prioritisation_result_types import VariantPrioritisationResu
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalVariantResult
-from pheval.utils.file_utils import all_files, files_with_suffix, obtain_closest_file_name
+from pheval.utils.file_utils import (
+    all_files,
+    files_with_suffix,
+    obtain_phenopacket_path_from_pheval_result,
+)
 from pheval.utils.phenopacket_utils import GenomicVariant, PhenopacketUtil, phenopacket_reader
@@ -228,7 +232,7 @@ def assess_phenopacket_variant_prioritisation(
         variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
         variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_closest_file_name(
+    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
         standardised_variant_result, all_files(results_dir_and_input.phenopacket_dir)
     )
     proband_causative_variants = _obtain_causative_variants(phenopacket_path)

pheval/cli_pheval_utils.py CHANGED Viewed

@@ -253,22 +253,19 @@ def update_phenopackets_command(
     mutually_exclusive=["phenopacket_path"],
 )
 @click.option(
-    "--template-vcf-path",
-    "-t",
-    cls=MutuallyExclusiveOptionError,
+    "--hg19-template-vcf",
+    "-hg19",
     metavar="PATH",
     required=False,
-    help="Template VCF file",
-    mutually_exclusive=["vcf_dir"],
+    help="Template hg19 VCF file",
     type=Path,
 )
 @click.option(
-    "--vcf-dir",
-    "-v",
-    cls=MutuallyExclusiveOptionError,
+    "--hg38-template-vcf",
+    "-hg38",
     metavar="PATH",
-    help="Directory containing template VCF files",
-    mutually_exclusive=["template_vcf"],
+    required=False,
+    help="Template hg38 VCF file",
     type=Path,
 )
 @click.option(
@@ -284,13 +281,22 @@ def create_spiked_vcfs_command(
     phenopacket_path: Path,
     phenopacket_dir: Path,
     output_dir: Path,
-    template_vcf_path: Path = None,
-    vcf_dir: Path = None,
+    hg19_template_vcf: Path = None,
+    hg38_template_vcf: Path = None,
 ):
-    """Spikes variants into a template VCF file for a directory of phenopackets."""
+    """
+    Create spiked VCF from either a Phenopacket or a Phenopacket directory.
+    Args:
+        phenopacket_path (Path): Path to a single Phenopacket file (optional).
+        phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
+        output_dir (Path): The directory to store the generated spiked VCF file(s).
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
+    """
     if phenopacket_path is None and phenopacket_dir is None:
         raise InputError("Either a phenopacket or phenopacket directory must be specified")
-    spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, template_vcf_path, vcf_dir)
+    spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)
 @click.command()

pheval/prepare/create_spiked_vcf.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import gzip
 import logging
 import re
-import secrets
 import urllib.parse
 from copy import copy
 from dataclasses import dataclass
@@ -10,6 +9,8 @@ from typing import List, Union
 from phenopackets import Family, File, Phenopacket
+from pheval.prepare.custom_exceptions import InputError
+from pheval.utils.file_utils import files_with_suffix, is_gzipped
 from pheval.utils.phenopacket_utils import (
     IncompatibleGenomeAssemblyError,
     PhenopacketRebuilder,
@@ -19,9 +20,6 @@ from pheval.utils.phenopacket_utils import (
     write_phenopacket,
 )
-from .custom_exceptions import InputError
-from ..utils.file_utils import all_files, files_with_suffix, is_gzipped
 info_log = logging.getLogger("info")
 genome_assemblies = {
@@ -91,39 +89,6 @@ class VcfHeader:
     chr_status: bool
-class VcfPicker:
-    """Choose a VCF file randomly from a directory if provided, otherwise selects the single template."""
-    def __init__(self, template_vcf: Path or None, vcf_dir: Path or None):
-        """
-        Initialise the VcfPicker.
-        Args:
-            template_vcf (Path or None): The path to a template VCF file, or None if not provided.
-            vcf_dir (Path or None): The directory containing VCF files, or None if not provided.
-        """
-        self.template_vcf = template_vcf
-        self.vcf_dir = vcf_dir
-    def pick_file_from_dir(self) -> Path:
-        """
-        Selects a VCF file from a directory at random.
-        Returns:
-            Path: The randomly selected VCF file path from the directory.
-        """
-        return secrets.choice(all_files(self.vcf_dir))
-    def pick_file(self) -> Path:
-        """
-        Select a VCF file randomly when given a directory; if not, the template VCF is assigned.
-        Returns:
-            Path: The selected VCF file path.
-        """
-        return self.pick_file_from_dir() if self.vcf_dir is not None else self.template_vcf
 def read_vcf(vcf_file: Path) -> List[str]:
     """
     Read the contents of a VCF file into memory, handling both uncompressed and gzipped files.
@@ -206,6 +171,72 @@ class VcfHeaderParser:
         return VcfHeader(sample_id, assembly, chr_status)
+@dataclass
+class VcfFile:
+    """
+    Represents a VCF file with its name, contents, and header information.
+    Attributes:
+        vcf_file_name (str): The name of the VCF file.
+        vcf_contents (List[str]): The contents of the VCF file.
+        vcf_header (VcfHeader): The parsed header information of the VCF file.
+    """
+    vcf_file_name: str = None
+    vcf_contents: List[str] = None
+    vcf_header: VcfHeader = None
+    @staticmethod
+    def populate_fields(template_vcf: Path):
+        """
+        Populate the fields of the VcfFile instance using the contents of a template VCF file.
+        Args:
+            template_vcf (Path): The path to the template VCF file.
+        Returns:
+            VcfFile: An instance of VcfFile with populated fields.
+        """
+        contents = read_vcf(template_vcf)
+        return VcfFile(template_vcf.name, contents, VcfHeaderParser(contents).parse_vcf_header())
+def select_vcf_template(
+    phenopacket_path: Path,
+    proband_causative_variants: List[ProbandCausativeVariant],
+    hg19_vcf_info: VcfFile,
+    hg38_vcf_info: VcfFile,
+) -> VcfFile:
+    """
+    Select the appropriate VCF template based on the assembly information of the proband causative variants.
+    Args:
+        phenopacket_path (Path): The path to the Phenopacket file.
+        proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
+        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
+        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
+    Returns:
+        VcfFile: The selected VCF template file based on the assembly information of the proband causative variants.
+    """
+    if proband_causative_variants[0].assembly in ["hg19", "GRCh37"]:
+        if hg19_vcf_info:
+            return hg19_vcf_info
+        else:
+            raise InputError("Must specify hg19 template VCF!")
+    elif proband_causative_variants[0].assembly in ["hg38", "GRCh38"]:
+        if hg38_vcf_info:
+            return hg38_vcf_info
+        else:
+            raise InputError("Must specify hg38 template VCF!")
+    else:
+        raise IncompatibleGenomeAssemblyError(
+            proband_causative_variants[0].assembly, phenopacket_path
+        )
 def check_variant_assembly(
     proband_causative_variants: list[ProbandCausativeVariant],
     vcf_header: VcfHeader,
@@ -229,7 +260,13 @@ def check_variant_assembly(
         raise ValueError("Too many genome assemblies!")
     if phenopacket_assembly[0] not in compatible_genome_assembly:
         raise IncompatibleGenomeAssemblyError(phenopacket_assembly, phenopacket_path)
-    if phenopacket_assembly[0] != vcf_header.assembly:
+    if (
+        phenopacket_assembly[0] in {"hg19", "GRCh37"}
+        and vcf_header.assembly not in {"hg19", "GRCh37"}
+    ) or (
+        phenopacket_assembly[0] in {"hg38", "GRCh38"}
+        and vcf_header.assembly not in {"hg38", "GRCh38"}
+    ):
         raise IncompatibleGenomeAssemblyError(
             assembly=phenopacket_assembly, phenopacket=phenopacket_path
         )
@@ -387,7 +424,8 @@ class VcfWriter:
 def spike_vcf_contents(
     phenopacket: Union[Phenopacket, Family],
     phenopacket_path: Path,
-    chosen_template_vcf: Path,
+    hg19_vcf_info: VcfFile,
+    hg38_vcf_info: VcfFile,
 ) -> tuple[str, List[str]]:
     """
     Spike VCF records with variants obtained from a Phenopacket or Family.
@@ -395,22 +433,28 @@ def spike_vcf_contents(
     Args:
         phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants.
         phenopacket_path (Path): Path to the Phenopacket file.
-        chosen_template_vcf (Path): Path to the chosen template VCF file.
+        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
+        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
     Returns:
         A tuple containing:
             assembly (str): The genome assembly information extracted from VCF header.
             modified_vcf_contents (List[str]): Modified VCF records with spiked variants.
     """
-    # this is a separate function to a click command as it will fail if annotated with click annotations
-    # and referenced from another click command
     phenopacket_causative_variants = PhenopacketUtil(phenopacket).causative_variants()
-    vcf_contents = read_vcf(chosen_template_vcf)
-    vcf_header = VcfHeaderParser(vcf_contents).parse_vcf_header()
-    check_variant_assembly(phenopacket_causative_variants, vcf_header, phenopacket_path)
+    chosen_template_vcf = select_vcf_template(
+        phenopacket_path, phenopacket_causative_variants, hg19_vcf_info, hg38_vcf_info
+    )
+    check_variant_assembly(
+        phenopacket_causative_variants, chosen_template_vcf.vcf_header, phenopacket_path
+    )
     return (
-        vcf_header.assembly,
-        VcfSpiker(vcf_contents, phenopacket_causative_variants, vcf_header).construct_vcf(),
+        chosen_template_vcf.vcf_header.assembly,
+        VcfSpiker(
+            chosen_template_vcf.vcf_contents,
+            phenopacket_causative_variants,
+            chosen_template_vcf.vcf_header,
+        ).construct_vcf(),
     )
@@ -418,7 +462,8 @@ def generate_spiked_vcf_file(
     output_dir: Path,
     phenopacket: Union[Phenopacket, Family],
     phenopacket_path: Path,
-    chosen_template_vcf: Path,
+    hg19_vcf_info: VcfFile,
+    hg38_vcf_info: VcfFile,
 ) -> File:
     """
     Write spiked VCF contents to a new file.
@@ -427,21 +472,17 @@ def generate_spiked_vcf_file(
         output_dir (Path): Path to the directory to store the generated file.
         phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants.
         phenopacket_path (Path): Path to the Phenopacket file.
-        chosen_template_vcf (Path): Path to the chosen template VCF file.
+        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
+        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
     Returns:
         File: The generated File object representing the newly created spiked VCF file.
     """
     output_dir.mkdir(exist_ok=True)
     info_log.info(f" Created a directory {output_dir}")
     vcf_assembly, spiked_vcf = spike_vcf_contents(
-        phenopacket, phenopacket_path, chosen_template_vcf
-    )
-    spiked_vcf_path = (
-        output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
-        if is_gzipped(chosen_template_vcf)
-        else output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf"))
+        phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
     )
+    spiked_vcf_path = output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
     VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
     return File(
         uri=urllib.parse.unquote(spiked_vcf_path.as_uri()),
@@ -449,8 +490,19 @@ def generate_spiked_vcf_file(
     )
+def spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path):
+    phenopacket = phenopacket_reader(phenopacket_path)
+    spiked_vcf_file_message = generate_spiked_vcf_file(
+        output_dir, phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
+    )
+    updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
+        spiked_vcf_file_message
+    )
+    write_phenopacket(updated_phenopacket, phenopacket_path)
 def create_spiked_vcf(
-    output_dir: Path, phenopacket_path: Path, template_vcf_path: Path, vcf_dir: Path
+    output_dir: Path, phenopacket_path: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
 ) -> None:
     """
     Create a spiked VCF for a Phenopacket.
@@ -458,27 +510,21 @@ def create_spiked_vcf(
     Args:
         output_dir (Path): The directory to store the generated spiked VCF file.
         phenopacket_path (Path): Path to the Phenopacket file.
-        template_vcf_path (Path): Path to the template VCF file (optional).
-        vcf_dir (Path): Path to the directory containing VCF files (optional).
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
     Raises:
-        InputError: If both template_vcf_path and vcf_dir are None.
+        InputError: If both hg19_template_vcf and hg38_template_vcf are None.
     """
-    if template_vcf_path is None and vcf_dir is None:
-        raise InputError("Either a template_vcf or vcf_dir must be specified")
-    vcf_file_path = VcfPicker(template_vcf_path, vcf_dir).pick_file()
-    phenopacket = phenopacket_reader(phenopacket_path)
-    spiked_vcf_file_message = generate_spiked_vcf_file(
-        output_dir, phenopacket, phenopacket_path, vcf_file_path
-    )
-    updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
-        spiked_vcf_file_message
-    )
-    write_phenopacket(updated_phenopacket, phenopacket_path)
+    if hg19_template_vcf is None and hg38_template_vcf is None:
+        raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
+    hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
+    hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
+    spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
 def create_spiked_vcfs(
-    output_dir: Path, phenopacket_dir: Path, template_vcf_path: Path, vcf_dir: Path
+    output_dir: Path, phenopacket_dir: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
 ) -> None:
     """
     Create a spiked VCF for a directory of Phenopackets.
@@ -486,35 +532,26 @@ def create_spiked_vcfs(
     Args:
         output_dir (Path): The directory to store the generated spiked VCF file.
         phenopacket_dir (Path): Path to the Phenopacket directory.
-        template_vcf_path (Path): Path to the template VCF file (optional).
-        vcf_dir (Path): Path to the directory containing VCF files (optional).
+        hg19_template_vcf (Path): Path to the template hg19 VCF file (optional).
+        hg38_template_vcf (Path): Path to the template hg38 VCF file (optional).
     Raises:
-        InputError: If both template_vcf_path and vcf_dir are None.
+        InputError: If both hg19_template_vcf and hg38_template_vcf are None.
     """
-    if template_vcf_path is None and vcf_dir is None:
-        raise InputError("Either a template_vcf or vcf_dir must be specified")
+    if hg19_template_vcf is None and hg38_template_vcf is None:
+        raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
+    hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
+    hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
     for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
-        vcf_file_path = VcfPicker(template_vcf_path, vcf_dir).pick_file()
-        phenopacket = phenopacket_reader(phenopacket_path)
-        spiked_vcf_file_message = generate_spiked_vcf_file(
-            output_dir, phenopacket, phenopacket_path, vcf_file_path
-        )
-        updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
-            spiked_vcf_file_message
-        )
-        write_phenopacket(updated_phenopacket, phenopacket_path)
-    # or made a lambda one-liner for maximum wtf...
-    # [spike_vcf(path, output_dir, template_vcf, vcf_dir) for path in phenopacket_dir.iterdir() if path.suffix ==
-    # ".json"]
+        spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
 def spike_vcfs(
     output_dir: Path,
     phenopacket_path: Path,
     phenopacket_dir: Path,
-    template_vcf_path: Path,
-    vcf_dir: Path,
+    hg19_template_vcf: Path,
+    hg38_template_vcf: Path,
 ) -> None:
     """
     Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -523,10 +560,10 @@ def spike_vcfs(
         output_dir (Path): The directory to store the generated spiked VCF file(s).
         phenopacket_path (Path): Path to a single Phenopacket file (optional).
         phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
-        template_vcf_path (Path): Path to the template VCF file (optional).
-        vcf_dir (Path): Path to the directory containing VCF files (optional).
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
     """
     if phenopacket_path is not None:
-        create_spiked_vcf(output_dir, phenopacket_path, template_vcf_path, vcf_dir)
+        create_spiked_vcf(output_dir, phenopacket_path, hg19_template_vcf, hg38_template_vcf)
     elif phenopacket_dir is not None:
-        create_spiked_vcfs(output_dir, phenopacket_dir, template_vcf_path, vcf_dir)
+        create_spiked_vcfs(output_dir, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)

pheval/utils/file_utils.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import difflib
 import itertools
 import re
 import unicodedata
@@ -71,23 +70,33 @@ def normalise_file_name(file_path: Path) -> str:
     return re.sub("[\u0300-\u036f]", "", normalised_file_name)
-def obtain_closest_file_name(file_to_be_queried: Path, file_paths: list[Path]) -> Path:
+def obtain_phenopacket_path_from_pheval_result(
+    pheval_result_path: Path, phenopacket_paths: list[Path]
+) -> Path:
     """
-    Obtains the closest file name when given a template file name
-    and a list of full paths of files to be queried.
+    Obtains the phenopacket file name when given a pheval result file name
+    and a list of full paths of phenopackets to be queried.
     Args:
-        file_to_be_queried (Path): The template file name to find the closest match.
-        file_paths (list[Path]): List of full paths of files to be queried.
+        pheval_result_path (Path): The PhEval result.
+        phenopacket_paths (list[Path]): List of full paths of phenopackets to be queried.
     Returns:
-        Path: The closest matching file path from the provided list.
+        Path: The matching phenopacket file path from the provided list.
     """
-    stems = [Path(file_path).stem for file_path in file_paths]
-    closest_file_match = difflib.get_close_matches(
-        str(Path(file_to_be_queried).stem), stems, cutoff=0.1, n=1
-    )[0]
-    return [file_path for file_path in file_paths if closest_file_match == str(file_path.stem)][0]
+    pheval_result_path_stem_stripped = pheval_result_path.stem.split("-pheval_")[0]
+    matching_phenopacket_paths = [
+        phenopacket_path
+        for phenopacket_path in phenopacket_paths
+        if phenopacket_path.stem == pheval_result_path_stem_stripped
+    ]
+    if matching_phenopacket_paths:
+        return matching_phenopacket_paths[0]
+    else:
+        raise FileNotFoundError(
+            f"Unable to find matching phenopacket file named "
+            f"{pheval_result_path_stem_stripped}.json for {pheval_result_path.name}"
+        )
 def ensure_file_exists(*files: str):

pheval/utils/phenopacket_utils.py CHANGED Viewed

@@ -467,7 +467,9 @@ class PhenopacketUtil:
         for i in pheno_interpretation:
             for g in i.diagnosis.genomic_interpretations:
                 variant = GenomicVariant(
-                    chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom,
+                    chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
+                        "chr", ""
+                    ),
                     pos=g.variant_interpretation.variation_descriptor.vcf_record.pos,
                     ref=g.variant_interpretation.variation_descriptor.vcf_record.ref,
                     alt=g.variant_interpretation.variation_descriptor.vcf_record.alt,

{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pheval
-Version: 0.3.1
+Version: 0.3.3
 Summary:
 Author: Yasemin Bridges
 Author-email: y.bridges@qmul.ac.uk

{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/RECORD RENAMED Viewed

@@ -3,21 +3,21 @@ pheval/analyse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pheval/analyse/analysis.py,sha256=ponm3P8nvzJNmcrNZ2_KudEhWSaWshd_Gd30D-aau8s,7743
 pheval/analyse/benchmark_generator.py,sha256=AeuwbaPb4j_dyBGPRgEBxQk2NahDb5u4xHyFiqp5Fes,5943
 pheval/analyse/benchmarking_data.py,sha256=aNZkWdmWemlnC1Tg35MtR60S9YC71QWS2rMuzkUc3w0,768
-pheval/analyse/binary_classification_stats.py,sha256=ZBAvhMVPYSFg3asONUG1w24JhYTjG03RG_C9uohQntI,11373
-pheval/analyse/disease_prioritisation_analysis.py,sha256=ttdgUX5ZKT74gKgsRrnyH8zKFxhcJxVOtZTsAdheGxU,12596
-pheval/analyse/gene_prioritisation_analysis.py,sha256=raEjzJFvAvS3wE0yrYcSIQzBe6s_lOgJMqe_p_AFgZY,12320
-pheval/analyse/generate_plots.py,sha256=gU7NYr1zgnXEXAZR-nHLql3farQEaUN5gkgu2ywTJho,17779
-pheval/analyse/generate_summary_outputs.py,sha256=tpHjbyme3FlkflGcTIgQ4H4xyN6FZ5Jmm-ImjAbSpYU,6071
+pheval/analyse/binary_classification_stats.py,sha256=E35YjvGM-zFnuEt8M3pgN03vBab4MH6ih726QKvuogg,12519
+pheval/analyse/disease_prioritisation_analysis.py,sha256=qadEVhBMtBgtjGCJLhNQA510F8Pd0Ll4NAQXoT23BYs,12649
+pheval/analyse/gene_prioritisation_analysis.py,sha256=lAN171xfXqweK8ie6191s_6WPPGjZKJXL1Z0dIqp54k,12373
+pheval/analyse/generate_plots.py,sha256=MFORnFTgoelYAahFlu3Dc3Rul4cwCg8Bloxe62vONSc,21350
+pheval/analyse/generate_summary_outputs.py,sha256=s9pXMSW6xm4ZBe1aCd0UJSaFiKBvpUfPwJ2BI4qfTas,6591
 pheval/analyse/parse_benchmark_summary.py,sha256=Y8uPTlHTEiaeVBOqxMcdOqjY3ZBtOS3DoRycL78Dzxg,2384
 pheval/analyse/parse_pheval_result.py,sha256=j8YFVA0YXfySOkm8gMwrfIuV45DI9AX3ETn7h-r8ayE,1211
 pheval/analyse/prioritisation_rank_recorder.py,sha256=EVe8DoEvvp0_WMAcjfVxmDGGRFPEELi7hEVjH3sIpLY,3223
 pheval/analyse/prioritisation_result_types.py,sha256=qJoB6O-lFYmzAMcTQeDJZQNLJ6hleoKDYATTkhvFF98,1228
 pheval/analyse/rank_stats.py,sha256=knj1tsKrly17QgtOUVpqA14UjbO99N3ydkWN4xU6c2k,15785
 pheval/analyse/run_data_parser.py,sha256=HzBKsJL2skjmrRZdrF3VYzswtKNgbX6U5qhY_kqq9mA,1552
-pheval/analyse/variant_prioritisation_analysis.py,sha256=_yYgknFHqL0_nlpBeQdo9D1Jnd99BcUkA733uxTPpcg,12331
+pheval/analyse/variant_prioritisation_analysis.py,sha256=ApmUeTW0cl_BPh7LusbApxtgjEXEkhuNFyh0DxKKpgU,12384
 pheval/cli.py,sha256=4l9xZfxBfLCcm7PDdhMWgTvTKbQt5sJ2bYHf7kU1dO4,1493
 pheval/cli_pheval.py,sha256=fWbKUcPTZZSa1EJEtH_lNn1XE6qRApRHihqUZS5owrA,2424
-pheval/cli_pheval_utils.py,sha256=wVLH0Bk2WrvTBkH-G5wC3Xgo6KftX9zSwonC2DVBpP8,16929
+pheval/cli_pheval_utils.py,sha256=i5rSLR_moti1VM0of4uOLdX7wbwUcM7spLO9zw6bHrc,17257
 pheval/config_parser.py,sha256=lh-Dy_FflXJUnRC3HYaEdSvPAsNZWQZlEr1hHQigrTM,1227
 pheval/constants.py,sha256=TWBgWOc05FGXFu63fs-hEHS2IJkLLAPHtMppiWBfBOg,349
 pheval/implementations/__init__.py,sha256=BMUTotjTdgy5j5xubWCIQgRXrSQ1ZIcjooer7r299Zo,1228
@@ -27,7 +27,7 @@ pheval/post_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 pheval/post_processing/post_processing.py,sha256=2srdlw2D3qMh2B3PUSDvA6COYlbXINC08Wt4eccMZp8,16030
 pheval/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pheval/prepare/create_noisy_phenopackets.py,sha256=UbBRWDD95BFHPv03VYx04v35AGwJ9ynLltYKqQJHbZ0,11236
-pheval/prepare/create_spiked_vcf.py,sha256=jd_OxcSVh25qBcrDNkD1d586jNuNyOjjxArwgCSCKN0,18755
+pheval/prepare/create_spiked_vcf.py,sha256=KZIyjtDDTqJj3hxL3u4YP6P0toA4RN1oPeDrzLMB2z4,20235
 pheval/prepare/custom_exceptions.py,sha256=_G3_95dPtHIs1SviYBV1j7cYc-hxlhuw8hhnYdzByYY,1719
 pheval/prepare/update_phenopacket.py,sha256=36dLIUSO_4EakGkjVwlecu-he-lOPXMhoWoOkeRYMV4,4753
 pheval/resources/alternate_ouputs/CADA_results.txt,sha256=Rinn2TtfwFNsx0aEWegKJOkjKnBm-Mf54gdaT3bWP0k,547
@@ -45,12 +45,12 @@ pheval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pheval/utils/docs_gen.py,sha256=6FGtHicBC0rZKi0tdL3Epsg8d4osE44I9f1Ga0j4JLA,3193
 pheval/utils/docs_gen.sh,sha256=LyKLKjaZuf4UJ962CWfM-XqkxtvM8O2N9wHZS5mcb9A,477
 pheval/utils/exomiser.py,sha256=m2u0PH2z9lFPaB3LVkZCmPmH5e55q1NoTzNl46zRRP8,683
-pheval/utils/file_utils.py,sha256=ESAXWtfpCAZX6T6nU6vb1x0of5S-eYhu639geJBu1es,4361
-pheval/utils/phenopacket_utils.py,sha256=hBEWl9mOP9D7odSaL6lIY__dbXn7Sc3TZX0Si-nPYaE,24379
+pheval/utils/file_utils.py,sha256=9HoCmtF73D3wY6bBhFLefMBI5uhvCe_meZeHXQzF_ts,4640
+pheval/utils/phenopacket_utils.py,sha256=iWYWfneaI47lx5w56-ILwvyLRaxHVoHnJ3EbVrja8-k,24444
 pheval/utils/semsim_utils.py,sha256=s7ZCR2VfPYnOh7ApX6rv66eGoVSm9QJaVYOWBEhlXpo,6151
 pheval/utils/utils.py,sha256=9V6vCT8l1g4O2-ZATYqsVyd7AYZdWGd-Ksy7_oIC3eE,2343
-pheval-0.3.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-pheval-0.3.1.dist-info/METADATA,sha256=wVyoDa-Xs4ztciDaO56ogC3rjhukYhCe3HFqmqEtClA,1810
-pheval-0.3.1.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-pheval-0.3.1.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
-pheval-0.3.1.dist-info/RECORD,,
+pheval-0.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+pheval-0.3.3.dist-info/METADATA,sha256=I0Njs6iBWs0Ag1bf81pNaRKYPcw8dkRKaG454m-6bJA,1810
+pheval-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pheval-0.3.3.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
+pheval-0.3.3.dist-info/RECORD,,

{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/WHEEL RENAMED Viewed

@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.8.1
+Generator: poetry-core 1.9.0
 Root-Is-Purelib: true
 Tag: py3-none-any

{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/LICENSE RENAMED Viewed

File without changes

{pheval-0.3.1.dist-info → pheval-0.3.3.dist-info}/entry_points.txt RENAMED Viewed

File without changes

pheval 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

Potentially problematic release.

pheval 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl