PyPI - pheval - Versions diffs - 0.3.2__tar.gz → 0.3.4__tar.gz - Mend

pheval 0.3.2 (tar.gz) → 0.3.4 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pheval might be problematic. Click here for more details.

Files changed (56)

{pheval-0.3.2 → pheval-0.3.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pheval
-Version: 0.3.2
+Version: 0.3.4
 Summary:
 Author: Yasemin Bridges
 Author-email: y.bridges@qmul.ac.uk

{pheval-0.3.2 → pheval-0.3.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pheval"
-version = "0.3.2"
+version = "0.3.4"
 description = ""
 authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
   "Julius Jacobsen <j.jacobsen@qmul.ac.uk>",

{pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/generate_plots.py RENAMED Viewed

@@ -482,6 +482,7 @@ def generate_plots(
     benchmark_generator: BenchmarkRunOutputGenerator,
     plot_type: str,
     title: str = None,
+    generate_from_tsv: bool = False,
 ) -> None:
     """
     Generate summary statistics bar plots for prioritisation.
@@ -493,10 +494,12 @@ def generate_plots(
         benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details.
         plot_type (str): Type of plot to be generated ("bar_stacked", "bar_cumulative", "bar_non_cumulative").
         title (str, optional): Title for the generated plot. Defaults to None.
+        generate_from_tsv (bool): Specify whether to generate plots from the TSV file. Defaults to False.
     """
     plot_generator = PlotGenerator()
-    plot_generator.generate_roc_curve(benchmarking_results, benchmark_generator)
-    plot_generator.generate_precision_recall(benchmarking_results, benchmark_generator)
+    if not generate_from_tsv:
+        plot_generator.generate_roc_curve(benchmarking_results, benchmark_generator)
+        plot_generator.generate_precision_recall(benchmarking_results, benchmark_generator)
     if plot_type == "bar_stacked":
         plot_generator.generate_stacked_bar_plot(benchmarking_results, benchmark_generator, title)
     elif plot_type == "bar_cumulative":
@@ -541,4 +544,4 @@ def generate_plots_from_benchmark_summary_tsv(
         raise ValueError(
             "Specify one analysis type (gene_analysis, variant_analysis, or disease_analysis)"
         )
-    generate_plots(benchmarking_results, benchmark_generator, plot_type, title)
+    generate_plots(benchmarking_results, benchmark_generator, plot_type, title, True)

{pheval-0.3.2 → pheval-0.3.4}/src/pheval/cli.py RENAMED Viewed

@@ -10,6 +10,7 @@ from .cli_pheval_utils import (
     benchmark_comparison,
     create_spiked_vcfs_command,
     generate_stats_plot,
+    prepare_corpus_command,
     scramble_phenopackets_command,
     semsim_scramble_command,
     semsim_to_exomiserdb_command,
@@ -60,6 +61,7 @@ pheval_utils.add_command(benchmark)
 pheval_utils.add_command(benchmark_comparison)
 pheval_utils.add_command(semsim_to_exomiserdb_command)
 pheval_utils.add_command(generate_stats_plot)
+pheval_utils.add_command(prepare_corpus_command)
 if __name__ == "__main__":
     main()

{pheval-0.3.2 → pheval-0.3.4}/src/pheval/cli_pheval_utils.py RENAMED Viewed

@@ -15,6 +15,7 @@ from pheval.analyse.run_data_parser import parse_run_data_text_file
 from pheval.prepare.create_noisy_phenopackets import scramble_phenopackets
 from pheval.prepare.create_spiked_vcf import spike_vcfs
 from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
+from pheval.prepare.prepare_corpus import prepare_corpus
 from pheval.prepare.update_phenopacket import update_phenopackets
 from pheval.utils.exomiser import semsim_to_exomiserdb
 from pheval.utils.semsim_utils import percentage_diff, semsim_heatmap_plot
@@ -253,22 +254,19 @@ def update_phenopackets_command(
     mutually_exclusive=["phenopacket_path"],
 )
 @click.option(
-    "--template-vcf-path",
-    "-t",
-    cls=MutuallyExclusiveOptionError,
+    "--hg19-template-vcf",
+    "-hg19",
     metavar="PATH",
     required=False,
-    help="Template VCF file",
-    mutually_exclusive=["vcf_dir"],
+    help="Template hg19 VCF file",
     type=Path,
 )
 @click.option(
-    "--vcf-dir",
-    "-v",
-    cls=MutuallyExclusiveOptionError,
+    "--hg38-template-vcf",
+    "-hg38",
     metavar="PATH",
-    help="Directory containing template VCF files",
-    mutually_exclusive=["template_vcf"],
+    required=False,
+    help="Template hg38 VCF file",
     type=Path,
 )
 @click.option(
@@ -284,13 +282,22 @@ def create_spiked_vcfs_command(
     phenopacket_path: Path,
     phenopacket_dir: Path,
     output_dir: Path,
-    template_vcf_path: Path = None,
-    vcf_dir: Path = None,
+    hg19_template_vcf: Path = None,
+    hg38_template_vcf: Path = None,
 ):
-    """Spikes variants into a template VCF file for a directory of phenopackets."""
+    """
+    Create spiked VCF from either a Phenopacket or a Phenopacket directory.
+    Args:
+        phenopacket_path (Path): Path to a single Phenopacket file (optional).
+        phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
+        output_dir (Path): The directory to store the generated spiked VCF file(s).
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
+    """
     if phenopacket_path is None and phenopacket_dir is None:
         raise InputError("Either a phenopacket or phenopacket directory must be specified")
-    spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, template_vcf_path, vcf_dir)
+    spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)
 @click.command()
@@ -600,3 +607,106 @@ def generate_stats_plot(
     generate_plots_from_benchmark_summary_tsv(
         benchmarking_tsv, gene_analysis, variant_analysis, disease_analysis, plot_type, title
     )
+@click.command("prepare-corpus")
+@click.option(
+    "--phenopacket-dir",
+    "-p",
+    required=True,
+    metavar="PATH",
+    help="Path to phenopacket corpus directory..",
+    type=Path,
+)
+@click.option(
+    "--variant-analysis/--no-variant-analysis",
+    default=False,
+    required=False,
+    type=bool,
+    show_default=True,
+    help="Specify whether to check for complete variant records in the phenopackets.",
+)
+@click.option(
+    "--gene-analysis/--no-gene-analysis",
+    default=False,
+    required=False,
+    type=bool,
+    show_default=True,
+    help="Specify whether to check for complete gene records in the phenopackets.",
+)
+@click.option(
+    "--disease-analysis/--no-disease-analysis",
+    default=False,
+    required=False,
+    type=bool,
+    show_default=True,
+    help="Specify whether to check for complete disease records in the phenopackets.",
+)
+@click.option(
+    "--gene-identifier",
+    "-g",
+    required=False,
+    help="Gene identifier to update in phenopacket",
+    type=click.Choice(["ensembl_id", "entrez_id", "hgnc_id"]),
+)
+@click.option(
+    "--hg19-template-vcf",
+    "-hg19",
+    metavar="PATH",
+    required=False,
+    help="Template hg19 VCF file",
+    type=Path,
+)
+@click.option(
+    "--hg38-template-vcf",
+    "-hg38",
+    metavar="PATH",
+    required=False,
+    help="Template hg38 VCF file",
+    type=Path,
+)
+@click.option(
+    "--output-dir",
+    "-o",
+    metavar="PATH",
+    required=True,
+    help="Path to output prepared corpus.",
+    default="prepared_corpus",
+    type=Path,
+)
+def prepare_corpus_command(
+    phenopacket_dir: Path,
+    variant_analysis: bool,
+    gene_analysis: bool,
+    disease_analysis: bool,
+    gene_identifier: str,
+    hg19_template_vcf: Path,
+    hg38_template_vcf: Path,
+    output_dir: Path,
+):
+    """
+    Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
+    gene identifiers.
+        Args:
+            phenopacket_dir (Path): The path to the directory containing Phenopackets.
+            variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
+            gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
+            disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
+            gene_identifier (str): Identifier for updating gene identifiers, if applicable.
+            hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into
+            VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+            hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
+            VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+            output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
+    """
+    prepare_corpus(
+        phenopacket_dir,
+        variant_analysis,
+        gene_analysis,
+        disease_analysis,
+        gene_identifier,
+        hg19_template_vcf,
+        hg38_template_vcf,
+        output_dir,
+    )

{pheval-0.3.2 → pheval-0.3.4}/src/pheval/prepare/create_spiked_vcf.py RENAMED Viewed

@@ -1,7 +1,6 @@
 import gzip
 import logging
 import re
-import secrets
 import urllib.parse
 from copy import copy
 from dataclasses import dataclass
@@ -10,6 +9,8 @@ from typing import List, Union
 from phenopackets import Family, File, Phenopacket
+from pheval.prepare.custom_exceptions import InputError
+from pheval.utils.file_utils import files_with_suffix, is_gzipped
 from pheval.utils.phenopacket_utils import (
     IncompatibleGenomeAssemblyError,
     PhenopacketRebuilder,
@@ -19,9 +20,6 @@ from pheval.utils.phenopacket_utils import (
     write_phenopacket,
 )
-from .custom_exceptions import InputError
-from ..utils.file_utils import all_files, files_with_suffix, is_gzipped
 info_log = logging.getLogger("info")
 genome_assemblies = {
@@ -91,39 +89,6 @@ class VcfHeader:
     chr_status: bool
-class VcfPicker:
-    """Choose a VCF file randomly from a directory if provided, otherwise selects the single template."""
-    def __init__(self, template_vcf: Path or None, vcf_dir: Path or None):
-        """
-        Initialise the VcfPicker.
-        Args:
-            template_vcf (Path or None): The path to a template VCF file, or None if not provided.
-            vcf_dir (Path or None): The directory containing VCF files, or None if not provided.
-        """
-        self.template_vcf = template_vcf
-        self.vcf_dir = vcf_dir
-    def pick_file_from_dir(self) -> Path:
-        """
-        Selects a VCF file from a directory at random.
-        Returns:
-            Path: The randomly selected VCF file path from the directory.
-        """
-        return secrets.choice(all_files(self.vcf_dir))
-    def pick_file(self) -> Path:
-        """
-        Select a VCF file randomly when given a directory; if not, the template VCF is assigned.
-        Returns:
-            Path: The selected VCF file path.
-        """
-        return self.pick_file_from_dir() if self.vcf_dir is not None else self.template_vcf
 def read_vcf(vcf_file: Path) -> List[str]:
     """
     Read the contents of a VCF file into memory, handling both uncompressed and gzipped files.
@@ -206,6 +171,72 @@ class VcfHeaderParser:
         return VcfHeader(sample_id, assembly, chr_status)
+@dataclass
+class VcfFile:
+    """
+    Represents a VCF file with its name, contents, and header information.
+    Attributes:
+        vcf_file_name (str): The name of the VCF file.
+        vcf_contents (List[str]): The contents of the VCF file.
+        vcf_header (VcfHeader): The parsed header information of the VCF file.
+    """
+    vcf_file_name: str = None
+    vcf_contents: List[str] = None
+    vcf_header: VcfHeader = None
+    @staticmethod
+    def populate_fields(template_vcf: Path):
+        """
+        Populate the fields of the VcfFile instance using the contents of a template VCF file.
+        Args:
+            template_vcf (Path): The path to the template VCF file.
+        Returns:
+            VcfFile: An instance of VcfFile with populated fields.
+        """
+        contents = read_vcf(template_vcf)
+        return VcfFile(template_vcf.name, contents, VcfHeaderParser(contents).parse_vcf_header())
+def select_vcf_template(
+    phenopacket_path: Path,
+    proband_causative_variants: List[ProbandCausativeVariant],
+    hg19_vcf_info: VcfFile,
+    hg38_vcf_info: VcfFile,
+) -> VcfFile:
+    """
+    Select the appropriate VCF template based on the assembly information of the proband causative variants.
+    Args:
+        phenopacket_path (Path): The path to the Phenopacket file.
+        proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
+        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
+        hg38_vcf_info (VcfFile): CF file info for hg38 template vcf.
+    Returns:
+        VcfFile: The selected VCF template file based on the assembly information of the proband causative variants.
+    """
+    if proband_causative_variants[0].assembly in ["hg19", "GRCh37"]:
+        if hg19_vcf_info:
+            return hg19_vcf_info
+        else:
+            raise InputError("Must specify hg19 template VCF!")
+    elif proband_causative_variants[0].assembly in ["hg38", "GRCh38"]:
+        if hg38_vcf_info:
+            return hg38_vcf_info
+        else:
+            raise InputError("Must specify hg38 template VCF!")
+    else:
+        raise IncompatibleGenomeAssemblyError(
+            proband_causative_variants[0].assembly, phenopacket_path
+        )
 def check_variant_assembly(
     proband_causative_variants: list[ProbandCausativeVariant],
     vcf_header: VcfHeader,
@@ -229,7 +260,13 @@ def check_variant_assembly(
         raise ValueError("Too many genome assemblies!")
     if phenopacket_assembly[0] not in compatible_genome_assembly:
         raise IncompatibleGenomeAssemblyError(phenopacket_assembly, phenopacket_path)
-    if phenopacket_assembly[0] != vcf_header.assembly:
+    if (
+        phenopacket_assembly[0] in {"hg19", "GRCh37"}
+        and vcf_header.assembly not in {"hg19", "GRCh37"}
+    ) or (
+        phenopacket_assembly[0] in {"hg38", "GRCh38"}
+        and vcf_header.assembly not in {"hg38", "GRCh38"}
+    ):
         raise IncompatibleGenomeAssemblyError(
             assembly=phenopacket_assembly, phenopacket=phenopacket_path
         )
@@ -387,7 +424,8 @@ class VcfWriter:
 def spike_vcf_contents(
     phenopacket: Union[Phenopacket, Family],
     phenopacket_path: Path,
-    chosen_template_vcf: Path,
+    hg19_vcf_info: VcfFile,
+    hg38_vcf_info: VcfFile,
 ) -> tuple[str, List[str]]:
     """
     Spike VCF records with variants obtained from a Phenopacket or Family.
@@ -395,22 +433,28 @@ def spike_vcf_contents(
     Args:
         phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants.
         phenopacket_path (Path): Path to the Phenopacket file.
-        chosen_template_vcf (Path): Path to the chosen template VCF file.
+        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
+        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
     Returns:
         A tuple containing:
             assembly (str): The genome assembly information extracted from VCF header.
             modified_vcf_contents (List[str]): Modified VCF records with spiked variants.
     """
-    # this is a separate function to a click command as it will fail if annotated with click annotations
-    # and referenced from another click command
     phenopacket_causative_variants = PhenopacketUtil(phenopacket).causative_variants()
-    vcf_contents = read_vcf(chosen_template_vcf)
-    vcf_header = VcfHeaderParser(vcf_contents).parse_vcf_header()
-    check_variant_assembly(phenopacket_causative_variants, vcf_header, phenopacket_path)
+    chosen_template_vcf = select_vcf_template(
+        phenopacket_path, phenopacket_causative_variants, hg19_vcf_info, hg38_vcf_info
+    )
+    check_variant_assembly(
+        phenopacket_causative_variants, chosen_template_vcf.vcf_header, phenopacket_path
+    )
     return (
-        vcf_header.assembly,
-        VcfSpiker(vcf_contents, phenopacket_causative_variants, vcf_header).construct_vcf(),
+        chosen_template_vcf.vcf_header.assembly,
+        VcfSpiker(
+            chosen_template_vcf.vcf_contents,
+            phenopacket_causative_variants,
+            chosen_template_vcf.vcf_header,
+        ).construct_vcf(),
     )
@@ -418,7 +462,8 @@ def generate_spiked_vcf_file(
     output_dir: Path,
     phenopacket: Union[Phenopacket, Family],
     phenopacket_path: Path,
-    chosen_template_vcf: Path,
+    hg19_vcf_info: VcfFile,
+    hg38_vcf_info: VcfFile,
 ) -> File:
     """
     Write spiked VCF contents to a new file.
@@ -427,21 +472,17 @@ def generate_spiked_vcf_file(
         output_dir (Path): Path to the directory to store the generated file.
         phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants.
         phenopacket_path (Path): Path to the Phenopacket file.
-        chosen_template_vcf (Path): Path to the chosen template VCF file.
+        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
+        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
     Returns:
         File: The generated File object representing the newly created spiked VCF file.
     """
     output_dir.mkdir(exist_ok=True)
     info_log.info(f" Created a directory {output_dir}")
     vcf_assembly, spiked_vcf = spike_vcf_contents(
-        phenopacket, phenopacket_path, chosen_template_vcf
-    )
-    spiked_vcf_path = (
-        output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
-        if is_gzipped(chosen_template_vcf)
-        else output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf"))
+        phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
     )
+    spiked_vcf_path = output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
     VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
     return File(
         uri=urllib.parse.unquote(spiked_vcf_path.as_uri()),
@@ -449,8 +490,19 @@ def generate_spiked_vcf_file(
     )
+def spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path):
+    phenopacket = phenopacket_reader(phenopacket_path)
+    spiked_vcf_file_message = generate_spiked_vcf_file(
+        output_dir, phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
+    )
+    updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
+        spiked_vcf_file_message
+    )
+    write_phenopacket(updated_phenopacket, phenopacket_path)
 def create_spiked_vcf(
-    output_dir: Path, phenopacket_path: Path, template_vcf_path: Path, vcf_dir: Path
+    output_dir: Path, phenopacket_path: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
 ) -> None:
     """
     Create a spiked VCF for a Phenopacket.
@@ -458,27 +510,21 @@ def create_spiked_vcf(
     Args:
         output_dir (Path): The directory to store the generated spiked VCF file.
         phenopacket_path (Path): Path to the Phenopacket file.
-        template_vcf_path (Path): Path to the template VCF file (optional).
-        vcf_dir (Path): Path to the directory containing VCF files (optional).
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
     Raises:
-        InputError: If both template_vcf_path and vcf_dir are None.
+        InputError: If both hg19_template_vcf and hg38_template_vcf are None.
     """
-    if template_vcf_path is None and vcf_dir is None:
-        raise InputError("Either a template_vcf or vcf_dir must be specified")
-    vcf_file_path = VcfPicker(template_vcf_path, vcf_dir).pick_file()
-    phenopacket = phenopacket_reader(phenopacket_path)
-    spiked_vcf_file_message = generate_spiked_vcf_file(
-        output_dir, phenopacket, phenopacket_path, vcf_file_path
-    )
-    updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
-        spiked_vcf_file_message
-    )
-    write_phenopacket(updated_phenopacket, phenopacket_path)
+    if hg19_template_vcf is None and hg38_template_vcf is None:
+        raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
+    hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
+    hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
+    spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
 def create_spiked_vcfs(
-    output_dir: Path, phenopacket_dir: Path, template_vcf_path: Path, vcf_dir: Path
+    output_dir: Path, phenopacket_dir: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
 ) -> None:
     """
     Create a spiked VCF for a directory of Phenopackets.
@@ -486,35 +532,26 @@ def create_spiked_vcfs(
     Args:
         output_dir (Path): The directory to store the generated spiked VCF file.
         phenopacket_dir (Path): Path to the Phenopacket directory.
-        template_vcf_path (Path): Path to the template VCF file (optional).
-        vcf_dir (Path): Path to the directory containing VCF files (optional).
+        hg19_template_vcf (Path): Path to the template hg19 VCF file (optional).
+        hg38_template_vcf (Path): Path to the template hg19 VCF file (optional).
     Raises:
-        InputError: If both template_vcf_path and vcf_dir are None.
+        InputError: If both hg19_template_vcf and hg38_template_vcf are None.
     """
-    if template_vcf_path is None and vcf_dir is None:
-        raise InputError("Either a template_vcf or vcf_dir must be specified")
+    if hg19_template_vcf is None and hg38_template_vcf is None:
+        raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
+    hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
+    hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
     for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
-        vcf_file_path = VcfPicker(template_vcf_path, vcf_dir).pick_file()
-        phenopacket = phenopacket_reader(phenopacket_path)
-        spiked_vcf_file_message = generate_spiked_vcf_file(
-            output_dir, phenopacket, phenopacket_path, vcf_file_path
-        )
-        updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
-            spiked_vcf_file_message
-        )
-        write_phenopacket(updated_phenopacket, phenopacket_path)
-    # or made a lambda one-liner for maximum wtf...
-    # [spike_vcf(path, output_dir, template_vcf, vcf_dir) for path in phenopacket_dir.iterdir() if path.suffix ==
-    # ".json"]
+        spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
 def spike_vcfs(
     output_dir: Path,
     phenopacket_path: Path,
     phenopacket_dir: Path,
-    template_vcf_path: Path,
-    vcf_dir: Path,
+    hg19_template_vcf: Path,
+    hg38_template_vcf: Path,
 ) -> None:
     """
     Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -523,10 +560,10 @@ def spike_vcfs(
         output_dir (Path): The directory to store the generated spiked VCF file(s).
         phenopacket_path (Path): Path to a single Phenopacket file (optional).
         phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
-        template_vcf_path (Path): Path to the template VCF file (optional).
-        vcf_dir (Path): Path to the directory containing VCF files (optional).
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
     """
     if phenopacket_path is not None:
-        create_spiked_vcf(output_dir, phenopacket_path, template_vcf_path, vcf_dir)
+        create_spiked_vcf(output_dir, phenopacket_path, hg19_template_vcf, hg38_template_vcf)
     elif phenopacket_dir is not None:
-        create_spiked_vcfs(output_dir, phenopacket_dir, template_vcf_path, vcf_dir)
+        create_spiked_vcfs(output_dir, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)

pheval-0.3.4/src/pheval/prepare/prepare_corpus.py ADDED Viewed

@@ -0,0 +1,67 @@
+import logging
+from pathlib import Path
+from pheval.prepare.create_spiked_vcf import create_spiked_vcf
+from pheval.prepare.update_phenopacket import create_updated_phenopacket
+from pheval.utils.file_utils import all_files
+from pheval.utils.phenopacket_utils import PhenopacketUtil, phenopacket_reader
+info_log = logging.getLogger("info")
+def prepare_corpus(
+    phenopacket_dir: Path,
+    variant_analysis: bool,
+    gene_analysis: bool,
+    disease_analysis: bool,
+    gene_identifier: str,
+    hg19_template_vcf: Path,
+    hg38_template_vcf: Path,
+    output_dir: Path,
+) -> None:
+    """
+    Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
+    gene identifiers.
+    Args:
+        phenopacket_dir (Path): The path to the directory containing Phenopackets.
+        variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
+        gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
+        disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
+        gene_identifier (str): Identifier for updating gene identifiers, if applicable.
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into
+        VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
+        VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+        output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
+    """
+    output_dir.joinpath("phenopackets").mkdir(exist_ok=True, parents=True)
+    for phenopacket_path in all_files(phenopacket_dir):
+        phenopacket_util = PhenopacketUtil(phenopacket_reader(phenopacket_path))
+        if variant_analysis:
+            if phenopacket_util.check_incomplete_variant_record():
+                info_log.warning(
+                    f"Removed {phenopacket_path.name} from the corpus due to missing variant fields."
+                )
+                continue
+        if gene_analysis:
+            if phenopacket_util.check_incomplete_gene_record():
+                info_log.warning(
+                    f"Removed {phenopacket_path.name} from the corpus due to missing gene fields."
+                )
+                continue
+        if disease_analysis:
+            if phenopacket_util.check_incomplete_disease_record():
+                info_log.warning(
+                    f"Removed {phenopacket_path.name} from the corpus due to missing disease fields."
+                )
+                continue
+        if gene_identifier:
+            create_updated_phenopacket(
+                gene_identifier, phenopacket_path, output_dir.joinpath("phenopackets")
+            )
+        if hg19_template_vcf or hg38_template_vcf:
+            output_dir.joinpath("vcf").mkdir(exist_ok=True)
+            create_spiked_vcf(
+                output_dir.joinpath("vcf"), phenopacket_path, hg19_template_vcf, hg38_template_vcf
+            )

{pheval-0.3.2 → pheval-0.3.4}/src/pheval/prepare/update_phenopacket.py RENAMED Viewed

@@ -38,8 +38,7 @@ def update_outdated_gene_context(
     interpretations = PhenopacketUtil(phenopacket).interpretations()
     updated_interpretations = GeneIdentifierUpdater(
         hgnc_data=hgnc_data, gene_identifier=gene_identifier
-    ).update_genomic_interpretations_gene_identifier(interpretations)
+    ).update_genomic_interpretations_gene_identifier(interpretations, phenopacket_path)
     return PhenopacketRebuilder(phenopacket).update_interpretations(updated_interpretations)

{pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/phenopacket_utils.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import json
-# import logging
+import logging
 import os
 from collections import defaultdict
 from copy import copy
@@ -22,6 +21,8 @@ from phenopackets import (
 from pheval.prepare.custom_exceptions import IncorrectFileFormatError
+info_log = logging.getLogger("info")
 class IncompatibleGenomeAssemblyError(Exception):
     """Exception raised for incompatible genome assembly."""
@@ -467,7 +468,9 @@ class PhenopacketUtil:
         for i in pheno_interpretation:
             for g in i.diagnosis.genomic_interpretations:
                 variant = GenomicVariant(
-                    chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom,
+                    chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
+                        "chr", ""
+                    ),
                     pos=g.variant_interpretation.variation_descriptor.vcf_record.pos,
                     ref=g.variant_interpretation.variation_descriptor.vcf_record.ref,
                     alt=g.variant_interpretation.variation_descriptor.vcf_record.alt,
@@ -475,6 +478,59 @@ class PhenopacketUtil:
                 variants.append(variant)
         return variants
+    def check_incomplete_variant_record(self) -> bool:
+        """
+        Check if any variant record in the phenopacket has incomplete information.
+        This method iterates through the diagnosed variant records and checks if any of them
+        have missing or incomplete information such as empty chromosome, position, reference,
+        or alternate allele.
+        Returns:
+            bool: True if any variant record is incomplete, False otherwise.
+        """
+        variants = self.diagnosed_variants()
+        for variant in variants:
+            if (
+                variant.chrom == ""
+                or variant.pos == 0
+                or variant.pos == ""
+                or variant.ref == ""
+                or variant.alt == ""
+            ):
+                return True
+        return False
+    def check_incomplete_gene_record(self) -> bool:
+        """
+        Check if any gene record in the phenopacket has incomplete information.
+        This method iterates through the diagnosed gene records and checks if any of them
+        have missing or incomplete information such as gene name, or gene identifier.
+        Returns:
+            bool: True if any gene record is incomplete, False otherwise.
+        """
+        genes = self.diagnosed_genes()
+        for gene in genes:
+            if gene.gene_symbol == "" or gene.gene_identifier == "":
+                return True
+        return False
+    def check_incomplete_disease_record(self) -> bool:
+        """
+        Check if any disease record in the phenopacket has incomplete information.
+        This method iterates through the diagnosed disease records and checks if any of them
+        have missing or incomplete information such as empty disease name, or disease identifier.
+        Returns:
+            bool: True if any disease record is incomplete, False otherwise.
+        """
+        if len(self.diagnoses()) == 0:
+            return True
+        return False
 class PhenopacketRebuilder:
     """Class for rebuilding a Phenopacket"""
@@ -653,7 +709,7 @@ class GeneIdentifierUpdater:
                         ]
     def update_genomic_interpretations_gene_identifier(
-        self, interpretations: List[Interpretation]
+        self, interpretations: List[Interpretation], phenopacket_path: Path
     ) -> List[Interpretation]:
         """
         Update the genomic interpretations of a Phenopacket.
@@ -667,10 +723,16 @@ class GeneIdentifierUpdater:
         updated_interpretations = copy(list(interpretations))
         for updated_interpretation in updated_interpretations:
             for g in updated_interpretation.diagnosis.genomic_interpretations:
+                updated_gene_identifier = self.find_identifier(
+                    g.variant_interpretation.variation_descriptor.gene_context.symbol
+                )
+                info_log.info(
+                    f"Updating gene identifier in {phenopacket_path} from "
+                    f"{g.variant_interpretation.variation_descriptor.gene_context.value_id}"
+                    f"to {updated_gene_identifier}"
+                )
                 g.variant_interpretation.variation_descriptor.gene_context.value_id = (
-                    self.find_identifier(
-                        g.variant_interpretation.variation_descriptor.gene_context.symbol
-                    )
+                    updated_gene_identifier
                 )
                 del g.variant_interpretation.variation_descriptor.gene_context.alternate_ids[:]
                 g.variant_interpretation.variation_descriptor.gene_context.alternate_ids.extend(