PyPI - pheval - Versions diffs - 0.3.6__tar.gz → 0.3.8__tar.gz - Mend

pheval 0.3.6.tar.gz → 0.3.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pheval might be problematic. Click here for more details.

Files changed (56)

{pheval-0.3.6 → pheval-0.3.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pheval
-Version: 0.3.6
+Version: 0.3.8
 Summary:
 Author: Yasemin Bridges
 Author-email: y.bridges@qmul.ac.uk

{pheval-0.3.6 → pheval-0.3.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pheval"
-version = "0.3.6"
+version = "0.3.8"
 description = ""
 authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
   "Julius Jacobsen <j.jacobsen@qmul.ac.uk>",

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/disease_prioritisation_analysis.py RENAMED Viewed

@@ -10,11 +10,7 @@ from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResu
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
-from pheval.utils.file_utils import (
-    all_files,
-    files_with_suffix,
-    obtain_phenopacket_path_from_pheval_result,
-)
+from pheval.utils.file_utils import all_files
 from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader
@@ -217,7 +213,7 @@ def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
 def assess_phenopacket_disease_prioritisation(
-    standardised_disease_result: Path,
+    phenopacket_path: Path,
     score_order: str,
     results_dir_and_input: TrackInputOutputDirectories,
     threshold: float,
@@ -230,7 +226,7 @@ def assess_phenopacket_disease_prioritisation(
     against the recorded causative diseases for a proband in the Phenopacket.
     Args:
-        standardised_disease_result (Path): Path to the PhEval standardised disease result file.
+        phenopacket_path (Path): Path to the Phenopacket.
         score_order (str): The order in which scores are arranged, either ascending or descending.
         results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
         threshold (float): Threshold for assessment.
@@ -238,8 +234,8 @@ def assess_phenopacket_disease_prioritisation(
         disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
         disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
-        standardised_disease_result, all_files(results_dir_and_input.phenopacket_dir)
+    standardised_disease_result = results_dir_and_input.results_dir.joinpath(
+        f"pheval_disease_results/{phenopacket_path.stem}-pheval_disease_result.tsv"
     )
     pheval_disease_result = read_standardised_result(standardised_disease_result)
     proband_diseases = _obtain_causative_diseases(phenopacket_path)
@@ -276,12 +272,9 @@ def benchmark_disease_prioritisation(
     """
     disease_rank_stats = RankStats()
     disease_binary_classification_stats = BinaryClassificationStats()
-    for standardised_result in files_with_suffix(
-        results_directory_and_input.results_dir.joinpath("pheval_disease_results/"),
-        ".tsv",
-    ):
+    for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
         assess_phenopacket_disease_prioritisation(
-            standardised_result,
+            phenopacket_path,
             score_order,
             results_directory_and_input,
             threshold,

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/gene_prioritisation_analysis.py RENAMED Viewed

@@ -1,6 +1,8 @@
+import ast
+import re
 from collections import defaultdict
 from pathlib import Path
-from typing import List
+from typing import List, Union
 from pheval.analyse.benchmarking_data import BenchmarkRunResults
 from pheval.analyse.binary_classification_stats import BinaryClassificationStats
@@ -10,11 +12,7 @@ from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalGeneResult
-from pheval.utils.file_utils import (
-    all_files,
-    files_with_suffix,
-    obtain_phenopacket_path_from_pheval_result,
-)
+from pheval.utils.file_utils import all_files
 from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
@@ -144,6 +142,24 @@ class AssessGenePrioritisation:
                 )
             )
+    @staticmethod
+    def _check_string_representation(entity: str) -> Union[List[str], str]:
+        """
+        Check if the input string is a representation of a list and return the list if true, otherwise the string.
+        Args:
+            entity (str): The input entity to check.
+        Returns:
+            Union[List[str], str]: A list if the input string is a list representation, otherwise
+            the original string.
+        """
+        list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*\]$")
+        if list_pattern.match(entity):
+            return ast.literal_eval(entity)
+        else:
+            return entity
     def assess_gene_prioritisation(
         self,
         rank_stats: RankStats,
@@ -165,9 +181,21 @@ class AssessGenePrioritisation:
             rank_stats.total += 1
             gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
             for standardised_gene_result in self.standardised_gene_results:
+                gene_identifier = self._check_string_representation(
+                    standardised_gene_result.gene_identifier
+                )
+                gene_symbol = self._check_string_representation(
+                    standardised_gene_result.gene_symbol
+                )
                 if (
-                    gene.gene_identifier == standardised_gene_result.gene_identifier
-                    or gene.gene_symbol == standardised_gene_result.gene_symbol
+                    isinstance(gene_identifier, list)
+                    and gene.gene_identifier in gene_identifier
+                    or isinstance(gene_identifier, str)
+                    and gene.gene_identifier == str
+                    or isinstance(gene_symbol, list)
+                    and gene.gene_symbol in gene_symbol
+                    or isinstance(gene_symbol, str)
+                    and gene.gene_symbol == gene_symbol
                 ):
                     gene_match = self._record_matched_gene(
                         gene, rank_stats, standardised_gene_result
@@ -209,7 +237,7 @@ def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene
 def assess_phenopacket_gene_prioritisation(
-    standardised_gene_result: Path,
+    phenopacket_path: Path,
     score_order: str,
     results_dir_and_input: TrackInputOutputDirectories,
     threshold: float,
@@ -222,7 +250,7 @@ def assess_phenopacket_gene_prioritisation(
     against the recorded causative genes for a proband in the Phenopacket.
     Args:
-        standardised_gene_result (Path): Path to the PhEval standardised gene result file.
+        phenopacket_path (Path): Path to the Phenopacket.
         score_order (str): The order in which scores are arranged, either ascending or descending.
         results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
         threshold (float): Threshold for assessment.
@@ -230,8 +258,8 @@ def assess_phenopacket_gene_prioritisation(
         gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
         gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
-        standardised_gene_result, all_files(results_dir_and_input.phenopacket_dir)
+    standardised_gene_result = results_dir_and_input.results_dir.joinpath(
+        f"pheval_gene_results/{phenopacket_path.stem}-pheval_gene_result.tsv"
     )
     pheval_gene_result = read_standardised_result(standardised_gene_result)
     proband_causative_genes = _obtain_causative_genes(phenopacket_path)
@@ -266,11 +294,9 @@ def benchmark_gene_prioritisation(
     """
     gene_rank_stats = RankStats()
     gene_binary_classification_stats = BinaryClassificationStats()
-    for standardised_result in files_with_suffix(
-        results_directory_and_input.results_dir.joinpath("pheval_gene_results/"), ".tsv"
-    ):
+    for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
         assess_phenopacket_gene_prioritisation(
-            standardised_result,
+            phenopacket_path,
             score_order,
             results_directory_and_input,
             threshold,

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/parse_pheval_result.py RENAMED Viewed

@@ -1,3 +1,4 @@
+import logging
 from pathlib import Path
 from typing import List
@@ -5,6 +6,8 @@ import pandas as pd
 from pheval.post_processing.post_processing import PhEvalResult
+info_log = logging.getLogger("info")
 def read_standardised_result(standardised_result_path: Path) -> List[dict]:
     """
@@ -16,7 +19,11 @@ def read_standardised_result(standardised_result_path: Path) -> List[dict]:
     Returns:
         List[dict]: A list of dictionaries representing the content of the standardised result file.
     """
-    return pd.read_csv(standardised_result_path, delimiter="\t").to_dict("records")
+    if standardised_result_path.is_file():
+        return pd.read_csv(standardised_result_path, delimiter="\t").to_dict("records")
+    else:
+        info_log.info(f"Could not find {standardised_result_path}")
+        return pd.DataFrame().to_dict("records")
 def parse_pheval_result(

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/analyse/variant_prioritisation_analysis.py RENAMED Viewed

@@ -10,11 +10,7 @@ from pheval.analyse.prioritisation_result_types import VariantPrioritisationResu
 from pheval.analyse.rank_stats import RankStats
 from pheval.analyse.run_data_parser import TrackInputOutputDirectories
 from pheval.post_processing.post_processing import RankedPhEvalVariantResult
-from pheval.utils.file_utils import (
-    all_files,
-    files_with_suffix,
-    obtain_phenopacket_path_from_pheval_result,
-)
+from pheval.utils.file_utils import all_files
 from pheval.utils.phenopacket_utils import GenomicVariant, PhenopacketUtil, phenopacket_reader
@@ -166,8 +162,8 @@ class AssessVariantPrioritisation:
             variant_match = VariantPrioritisationResult(self.phenopacket_path, variant)
             for result in self.standardised_variant_results:
                 result_variant = GenomicVariant(
-                    chrom=result.chromosome,
-                    pos=result.start,
+                    chrom=str(result.chromosome),
+                    pos=int(result.start),
                     ref=result.ref,
                     alt=result.alt,
                 )
@@ -211,7 +207,7 @@ def _obtain_causative_variants(phenopacket_path: Path) -> List[GenomicVariant]:
 def assess_phenopacket_variant_prioritisation(
-    standardised_variant_result: Path,
+    phenopacket_path: Path,
     score_order: str,
     results_dir_and_input: TrackInputOutputDirectories,
     threshold: float,
@@ -224,7 +220,7 @@ def assess_phenopacket_variant_prioritisation(
     against the recorded causative variants for a proband in the Phenopacket.
     Args:
-        standardised_variant_result (Path): Path to the PhEval standardised variant result file.
+        phenopacket_path (Path): Path to the Phenopacket.
         score_order (str): The order in which scores are arranged, either ascending or descending.
         results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
         threshold (float): Threshold for assessment.
@@ -232,10 +228,10 @@ def assess_phenopacket_variant_prioritisation(
         variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
         variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
     """
-    phenopacket_path = obtain_phenopacket_path_from_pheval_result(
-        standardised_variant_result, all_files(results_dir_and_input.phenopacket_dir)
-    )
     proband_causative_variants = _obtain_causative_variants(phenopacket_path)
+    standardised_variant_result = results_dir_and_input.results_dir.joinpath(
+        f"pheval_variant_results/{phenopacket_path.stem}-pheval_variant_result.tsv"
+    )
     pheval_variant_result = read_standardised_result(standardised_variant_result)
     AssessVariantPrioritisation(
         phenopacket_path,
@@ -270,12 +266,9 @@ def benchmark_variant_prioritisation(
     """
     variant_rank_stats = RankStats()
     variant_binary_classification_stats = BinaryClassificationStats()
-    for standardised_result in files_with_suffix(
-        results_directory_and_input.results_dir.joinpath("pheval_variant_results/"),
-        ".tsv",
-    ):
+    for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
         assess_phenopacket_variant_prioritisation(
-            standardised_result,
+            phenopacket_path,
             score_order,
             results_directory_and_input,
             threshold,

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/cli_pheval_utils.py RENAMED Viewed

@@ -260,6 +260,8 @@ def update_phenopackets_command(
     required=False,
     help="Template hg19 VCF file",
     type=Path,
+    cls=MutuallyExclusiveOptionError,
+    mutually_exclusive=["hg19_vcf_dir"],
 )
 @click.option(
     "--hg38-template-vcf",
@@ -268,6 +270,28 @@ def update_phenopackets_command(
     required=False,
     help="Template hg38 VCF file",
     type=Path,
+    cls=MutuallyExclusiveOptionError,
+    mutually_exclusive=["hg38_vcf_dir"],
+)
+@click.option(
+    "--hg19-vcf-dir",
+    "-hg19-dir",
+    metavar="PATH",
+    required=False,
+    help="Path to directory containing hg19 VCF templates.",
+    type=Path,
+    cls=MutuallyExclusiveOptionError,
+    mutually_exclusive=["hg19_template_vcf"],
+)
+@click.option(
+    "--hg38-vcf-dir",
+    "-hg38-dir",
+    metavar="PATH",
+    required=False,
+    help="Path to directory containing hg38 VCF templates.",
+    type=Path,
+    cls=MutuallyExclusiveOptionError,
+    mutually_exclusive=["hg38_template_vcf"],
 )
 @click.option(
     "--output-dir",
@@ -284,6 +308,8 @@ def create_spiked_vcfs_command(
     output_dir: Path,
     hg19_template_vcf: Path = None,
     hg38_template_vcf: Path = None,
+    hg19_vcf_dir: Path = None,
+    hg38_vcf_dir: Path = None,
 ):
     """
     Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -294,10 +320,20 @@ def create_spiked_vcfs_command(
         output_dir (Path): The directory to store the generated spiked VCF file(s).
         hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
         hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
+        hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional).
+        hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional).
     """
     if phenopacket_path is None and phenopacket_dir is None:
         raise InputError("Either a phenopacket or phenopacket directory must be specified")
-    spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)
+    spike_vcfs(
+        output_dir,
+        phenopacket_path,
+        phenopacket_dir,
+        hg19_template_vcf,
+        hg38_template_vcf,
+        hg19_vcf_dir,
+        hg38_vcf_dir,
+    )
 @click.command()
@@ -656,6 +692,8 @@ def generate_stats_plot(
     required=False,
     help="Template hg19 VCF file",
     type=Path,
+    cls=MutuallyExclusiveOptionError,
+    mutually_exclusive=["hg19_vcf_dir"],
 )
 @click.option(
     "--hg38-template-vcf",
@@ -664,6 +702,28 @@ def generate_stats_plot(
     required=False,
     help="Template hg38 VCF file",
     type=Path,
+    cls=MutuallyExclusiveOptionError,
+    mutually_exclusive=["hg38_vcf_dir"],
+)
+@click.option(
+    "--hg19-vcf-dir",
+    "-hg19-dir",
+    metavar="PATH",
+    required=False,
+    help="Path to directory containing hg19 VCF templates.",
+    type=Path,
+    cls=MutuallyExclusiveOptionError,
+    mutually_exclusive=["hg19_template_vcf"],
+)
+@click.option(
+    "--hg38-vcf-dir",
+    "-hg38-dir",
+    metavar="PATH",
+    required=False,
+    help="Path to directory containing hg38 VCF templates.",
+    type=Path,
+    cls=MutuallyExclusiveOptionError,
+    mutually_exclusive=["hg38_template_vcf"],
 )
 @click.option(
     "--output-dir",
@@ -682,23 +742,28 @@ def prepare_corpus_command(
     gene_identifier: str,
     hg19_template_vcf: Path,
     hg38_template_vcf: Path,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
     output_dir: Path,
 ):
     """
     Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
     gene identifiers.
-        Args:
-            phenopacket_dir (Path): The path to the directory containing Phenopackets.
-            variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
-            gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
-            disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
-            gene_identifier (str): Identifier for updating gene identifiers, if applicable.
-            hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into
-            VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
-            hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
-            VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
-            output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
+    Args:
+        phenopacket_dir (Path): The path to the directory containing Phenopackets.
+        variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
+        gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
+        disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
+        gene_identifier (str): Identifier for updating gene identifiers, if applicable.
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
+        hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional).
+        hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional).
+        output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
+    Notes:
+        To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf,
+        hg19_vcf_dir or hg38_vcf_dir is required.
     """
     prepare_corpus(
         phenopacket_dir,
@@ -708,5 +773,7 @@ def prepare_corpus_command(
         gene_identifier,
         hg19_template_vcf,
         hg38_template_vcf,
+        hg19_vcf_dir,
+        hg38_vcf_dir,
         output_dir,
     )

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/post_processing/post_processing.py RENAMED Viewed

@@ -3,6 +3,7 @@ import operator
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
+from typing import List, Union
 import pandas as pd
@@ -30,8 +31,8 @@ class PhEvalResult:
 class PhEvalGeneResult(PhEvalResult):
     """Minimal data required from tool-specific output for gene prioritisation result
     Args:
-        gene_symbol (str): The gene symbol for the result entry
-        gene_identifier (str): The ENSEMBL gene identifier for the result entry
+        gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry
+        gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry
         score (float): The score for the gene result entry
     Notes:
         While we recommend providing the gene identifier in the ENSEMBL namespace,
@@ -39,8 +40,8 @@ class PhEvalGeneResult(PhEvalResult):
         in the analysis.
     """
-    gene_symbol: str
-    gene_identifier: str
+    gene_symbol: Union[List[str], str]
+    gene_identifier: Union[List[str], str]
     score: float
@@ -375,11 +376,11 @@ def generate_pheval_result(
         info_log.warning(f"No results found for {tool_result_path.name}")
         return
     ranked_pheval_result = _create_pheval_result(pheval_result, sort_order_str)
-    if all(isinstance(result, RankedPhEvalGeneResult) for result in ranked_pheval_result):
+    if all(isinstance(result, PhEvalGeneResult) for result in pheval_result):
         _write_pheval_gene_result(ranked_pheval_result, output_dir, tool_result_path)
-    elif all(isinstance(result, RankedPhEvalVariantResult) for result in ranked_pheval_result):
+    elif all(isinstance(result, PhEvalVariantResult) for result in pheval_result):
         _write_pheval_variant_result(ranked_pheval_result, output_dir, tool_result_path)
-    elif all(isinstance(result, RankedPhEvalDiseaseResult) for result in ranked_pheval_result):
+    elif all(isinstance(result, PhEvalDiseaseResult) for result in pheval_result):
         _write_pheval_disease_result(ranked_pheval_result, output_dir, tool_result_path)
     else:
         raise ValueError("Results are not all of the same type.")

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/prepare/create_spiked_vcf.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import gzip
 import logging
+import random
 import re
 import urllib.parse
 from copy import copy
@@ -10,7 +11,7 @@ from typing import List, Union
 from phenopackets import Family, File, Phenopacket
 from pheval.prepare.custom_exceptions import InputError
-from pheval.utils.file_utils import files_with_suffix, is_gzipped
+from pheval.utils.file_utils import all_files, files_with_suffix, is_gzipped
 from pheval.utils.phenopacket_utils import (
     IncompatibleGenomeAssemblyError,
     PhenopacketRebuilder,
@@ -207,6 +208,8 @@ def select_vcf_template(
     proband_causative_variants: List[ProbandCausativeVariant],
     hg19_vcf_info: VcfFile,
     hg38_vcf_info: VcfFile,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
 ) -> VcfFile:
     """
     Select the appropriate VCF template based on the assembly information of the proband causative variants.
@@ -216,6 +219,8 @@ def select_vcf_template(
         proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
         hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
         hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
+        hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
+        hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
     Returns:
         VcfFile: The selected VCF template file based on the assembly information of the proband causative variants.
@@ -224,11 +229,15 @@ def select_vcf_template(
     if proband_causative_variants[0].assembly in ["hg19", "GRCh37"]:
         if hg19_vcf_info:
             return hg19_vcf_info
+        elif hg19_vcf_dir:
+            return VcfFile.populate_fields(random.choice(all_files(hg19_vcf_dir)))
         else:
             raise InputError("Must specify hg19 template VCF!")
     elif proband_causative_variants[0].assembly in ["hg38", "GRCh38"]:
         if hg38_vcf_info:
             return hg38_vcf_info
+        elif hg38_vcf_dir:
+            return VcfFile.populate_fields(random.choice(all_files(hg38_vcf_dir)))
         else:
             raise InputError("Must specify hg38 template VCF!")
     else:
@@ -445,6 +454,8 @@ def spike_vcf_contents(
     phenopacket_path: Path,
     hg19_vcf_info: VcfFile,
     hg38_vcf_info: VcfFile,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
 ) -> tuple[str, List[str]]:
     """
     Spike VCF records with variants obtained from a Phenopacket or Family.
@@ -454,6 +465,8 @@ def spike_vcf_contents(
         phenopacket_path (Path): Path to the Phenopacket file.
         hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
         hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
+        hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
+        hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
     Returns:
         A tuple containing:
@@ -462,7 +475,12 @@ def spike_vcf_contents(
     """
     phenopacket_causative_variants = PhenopacketUtil(phenopacket).causative_variants()
     chosen_template_vcf = select_vcf_template(
-        phenopacket_path, phenopacket_causative_variants, hg19_vcf_info, hg38_vcf_info
+        phenopacket_path,
+        phenopacket_causative_variants,
+        hg19_vcf_info,
+        hg38_vcf_info,
+        hg19_vcf_dir,
+        hg38_vcf_dir,
     )
     check_variant_assembly(
         phenopacket_causative_variants, chosen_template_vcf.vcf_header, phenopacket_path
@@ -483,6 +501,8 @@ def generate_spiked_vcf_file(
     phenopacket_path: Path,
     hg19_vcf_info: VcfFile,
     hg38_vcf_info: VcfFile,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
 ) -> File:
     """
     Write spiked VCF contents to a new file.
@@ -493,13 +513,15 @@ def generate_spiked_vcf_file(
         phenopacket_path (Path): Path to the Phenopacket file.
         hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
         hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
+        hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
+        hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
     Returns:
         File: The generated File object representing the newly created spiked VCF file.
     """
     output_dir.mkdir(exist_ok=True)
     info_log.info(f" Created a directory {output_dir}")
     vcf_assembly, spiked_vcf = spike_vcf_contents(
-        phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
+        phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir
     )
     spiked_vcf_path = output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
     VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
@@ -509,10 +531,38 @@ def generate_spiked_vcf_file(
     )
-def spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path):
+def spike_and_update_phenopacket(
+    hg19_vcf_info: VcfFile,
+    hg38_vcf_info: VcfFile,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
+    output_dir: Path,
+    phenopacket_path: Path,
+) -> None:
+    """
+    Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket
+    accordingly, and write the updated Phenopacket to the specified output directory.
+    Args:
+        hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
+        hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
+        hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
+        hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
+        output_dir (Path): Directory where the updated Phenopacket will be saved.
+        phenopacket_path (Path): Path to the original Phenopacket file.
+    Returns:
+        None
+    """
     phenopacket = phenopacket_reader(phenopacket_path)
     spiked_vcf_file_message = generate_spiked_vcf_file(
-        output_dir, phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
+        output_dir,
+        phenopacket,
+        phenopacket_path,
+        hg19_vcf_info,
+        hg38_vcf_info,
+        hg19_vcf_dir,
+        hg38_vcf_dir,
     )
     updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
         spiked_vcf_file_message
@@ -521,7 +571,12 @@ def spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, pheno
 def create_spiked_vcf(
-    output_dir: Path, phenopacket_path: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
+    output_dir: Path,
+    phenopacket_path: Path,
+    hg19_template_vcf: Path,
+    hg38_template_vcf: Path,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
 ) -> None:
     """
     Create a spiked VCF for a Phenopacket.
@@ -531,6 +586,8 @@ def create_spiked_vcf(
         phenopacket_path (Path): Path to the Phenopacket file.
         hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
         hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
+        hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
+        hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
     Raises:
         InputError: If both hg19_template_vcf and hg38_template_vcf are None.
@@ -539,11 +596,18 @@ def create_spiked_vcf(
         raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
     hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
     hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
-    spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
+    spike_and_update_phenopacket(
+        hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path
+    )
 def create_spiked_vcfs(
-    output_dir: Path, phenopacket_dir: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
+    output_dir: Path,
+    phenopacket_dir: Path,
+    hg19_template_vcf: Path,
+    hg38_template_vcf: Path,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
 ) -> None:
     """
     Create a spiked VCF for a directory of Phenopackets.
@@ -553,16 +617,25 @@ def create_spiked_vcfs(
         phenopacket_dir (Path): Path to the Phenopacket directory.
         hg19_template_vcf (Path): Path to the template hg19 VCF file (optional).
         hg38_template_vcf (Path): Path to the template hg38 VCF file (optional).
+        hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
+        hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
     Raises:
         InputError: If none of hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir or hg38_vcf_dir is specified.
     """
-    if hg19_template_vcf is None and hg38_template_vcf is None:
-        raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
+    if (
+        hg19_template_vcf is None
+        and hg38_template_vcf is None
+        and hg19_vcf_dir is None
+        and hg38_vcf_dir is None
+    ):
+        raise InputError("Need to specify a VCF!")
     hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
     hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
     for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
-        spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
+        spike_and_update_phenopacket(
+            hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path
+        )
 def spike_vcfs(
@@ -571,6 +644,8 @@ def spike_vcfs(
     phenopacket_dir: Path,
     hg19_template_vcf: Path,
     hg38_template_vcf: Path,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
 ) -> None:
     """
     Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -581,8 +656,24 @@ def spike_vcfs(
         phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
         hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
         hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
+        hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
+        hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
     """
     if phenopacket_path is not None:
-        create_spiked_vcf(output_dir, phenopacket_path, hg19_template_vcf, hg38_template_vcf)
+        create_spiked_vcf(
+            output_dir,
+            phenopacket_path,
+            hg19_template_vcf,
+            hg38_template_vcf,
+            hg19_vcf_dir,
+            hg38_vcf_dir,
+        )
     elif phenopacket_dir is not None:
-        create_spiked_vcfs(output_dir, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)
+        create_spiked_vcfs(
+            output_dir,
+            phenopacket_dir,
+            hg19_template_vcf,
+            hg38_template_vcf,
+            hg19_vcf_dir,
+            hg38_vcf_dir,
+        )

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/prepare/prepare_corpus.py RENAMED Viewed

@@ -18,6 +18,8 @@ def prepare_corpus(
     gene_identifier: str,
     hg19_template_vcf: Path,
     hg38_template_vcf: Path,
+    hg19_vcf_dir: Path,
+    hg38_vcf_dir: Path,
     output_dir: Path,
 ) -> None:
     """
@@ -34,7 +36,12 @@ def prepare_corpus(
         VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
         hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
         VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+        hg19_vcf_dir (Path): Path to the directory containing hg19 template VCF files (optional).
+        hg38_vcf_dir (Path): Path to the directory containing hg38 template VCF files (optional).
         output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
+    Notes:
+        To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf,
+        hg19_vcf_dir or hg38_vcf_dir is required.
     """
     output_dir.joinpath("phenopackets").mkdir(exist_ok=True, parents=True)
     for phenopacket_path in all_files(phenopacket_dir):
@@ -65,7 +72,12 @@ def prepare_corpus(
         if hg19_template_vcf or hg38_template_vcf:
             output_dir.joinpath("vcf").mkdir(exist_ok=True)
             create_spiked_vcf(
-                output_dir.joinpath("vcf"), phenopacket_path, hg19_template_vcf, hg38_template_vcf
+                output_dir.joinpath("vcf"),
+                phenopacket_path,
+                hg19_template_vcf,
+                hg38_template_vcf,
+                hg19_vcf_dir,
+                hg38_vcf_dir,
             )
         if gene_identifier:
             create_updated_phenopacket(

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/file_utils.py RENAMED Viewed

@@ -70,35 +70,6 @@ def normalise_file_name(file_path: Path) -> str:
     return re.sub("[\u0300-\u036f]", "", normalised_file_name)
-def obtain_phenopacket_path_from_pheval_result(
-    pheval_result_path: Path, phenopacket_paths: list[Path]
-) -> Path:
-    """
-    Obtains the phenopacket file name when given a pheval result file name
-    and a list of full paths of phenopackets to be queried.
-    Args:
-        pheval_result_path (Path): The PhEval result.
-        phenopacket_paths (list[Path]): List of full paths of phenopackets to be queried.
-    Returns:
-        Path: The matching phenopacket file path from the provided list.
-    """
-    pheval_result_path_stem_stripped = pheval_result_path.stem.split("-pheval_")[0]
-    matching_phenopacket_paths = [
-        phenopacket_path
-        for phenopacket_path in phenopacket_paths
-        if phenopacket_path.stem == pheval_result_path_stem_stripped
-    ]
-    if matching_phenopacket_paths:
-        return matching_phenopacket_paths[0]
-    else:
-        raise FileNotFoundError(
-            f"Unable to find matching phenopacket file named "
-            f"{pheval_result_path_stem_stripped}.json for {pheval_result_path.name}"
-        )
 def ensure_file_exists(*files: str):
     """Ensures the existence of files passed as parameter
     Raises:

{pheval-0.3.6 → pheval-0.3.8}/src/pheval/utils/phenopacket_utils.py RENAMED Viewed

@@ -468,10 +468,12 @@ class PhenopacketUtil:
         for i in pheno_interpretation:
             for g in i.diagnosis.genomic_interpretations:
                 variant = GenomicVariant(
-                    chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
-                        "chr", ""
+                    chrom=str(
+                        g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
+                            "chr", ""
+                        )
                     ),
-                    pos=g.variant_interpretation.variation_descriptor.vcf_record.pos,
+                    pos=int(g.variant_interpretation.variation_descriptor.vcf_record.pos),
                     ref=g.variant_interpretation.variation_descriptor.vcf_record.ref,
                     alt=g.variant_interpretation.variation_descriptor.vcf_record.alt,
                 )