PyPI - pheval - Versions diffs - 0.3.3__tar.gz → 0.3.5__tar.gz - Mend

pheval 0.3.3.tar.gz → 0.3.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pheval might be problematic. Click here for more details.

Files changed (56)

{pheval-0.3.3 → pheval-0.3.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pheval
-Version: 0.3.3
+Version: 0.3.5
 Summary:
 Author: Yasemin Bridges
 Author-email: y.bridges@qmul.ac.uk

{pheval-0.3.3 → pheval-0.3.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pheval"
-version = "0.3.3"
+version = "0.3.5"
 description = ""
 authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
   "Julius Jacobsen <j.jacobsen@qmul.ac.uk>",

{pheval-0.3.3 → pheval-0.3.5}/src/pheval/cli.py RENAMED Viewed

@@ -10,6 +10,7 @@ from .cli_pheval_utils import (
     benchmark_comparison,
     create_spiked_vcfs_command,
     generate_stats_plot,
+    prepare_corpus_command,
     scramble_phenopackets_command,
     semsim_scramble_command,
     semsim_to_exomiserdb_command,
@@ -60,6 +61,7 @@ pheval_utils.add_command(benchmark)
 pheval_utils.add_command(benchmark_comparison)
 pheval_utils.add_command(semsim_to_exomiserdb_command)
 pheval_utils.add_command(generate_stats_plot)
+pheval_utils.add_command(prepare_corpus_command)
 if __name__ == "__main__":
     main()

{pheval-0.3.3 → pheval-0.3.5}/src/pheval/cli_pheval_utils.py RENAMED Viewed

@@ -15,6 +15,7 @@ from pheval.analyse.run_data_parser import parse_run_data_text_file
 from pheval.prepare.create_noisy_phenopackets import scramble_phenopackets
 from pheval.prepare.create_spiked_vcf import spike_vcfs
 from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
+from pheval.prepare.prepare_corpus import prepare_corpus
 from pheval.prepare.update_phenopacket import update_phenopackets
 from pheval.utils.exomiser import semsim_to_exomiserdb
 from pheval.utils.semsim_utils import percentage_diff, semsim_heatmap_plot
@@ -606,3 +607,106 @@ def generate_stats_plot(
     generate_plots_from_benchmark_summary_tsv(
         benchmarking_tsv, gene_analysis, variant_analysis, disease_analysis, plot_type, title
     )
+@click.command("prepare-corpus")
+@click.option(
+    "--phenopacket-dir",
+    "-p",
+    required=True,
+    metavar="PATH",
+    help="Path to phenopacket corpus directory..",
+    type=Path,
+)
+@click.option(
+    "--variant-analysis/--no-variant-analysis",
+    default=False,
+    required=False,
+    type=bool,
+    show_default=True,
+    help="Specify whether to check for complete variant records in the phenopackets.",
+)
+@click.option(
+    "--gene-analysis/--no-gene-analysis",
+    default=False,
+    required=False,
+    type=bool,
+    show_default=True,
+    help="Specify whether to check for complete gene records in the phenopackets.",
+)
+@click.option(
+    "--disease-analysis/--no-disease-analysis",
+    default=False,
+    required=False,
+    type=bool,
+    show_default=True,
+    help="Specify whether to check for complete disease records in the phenopackets.",
+)
+@click.option(
+    "--gene-identifier",
+    "-g",
+    required=False,
+    help="Gene identifier to update in phenopacket",
+    type=click.Choice(["ensembl_id", "entrez_id", "hgnc_id"]),
+)
+@click.option(
+    "--hg19-template-vcf",
+    "-hg19",
+    metavar="PATH",
+    required=False,
+    help="Template hg19 VCF file",
+    type=Path,
+)
+@click.option(
+    "--hg38-template-vcf",
+    "-hg38",
+    metavar="PATH",
+    required=False,
+    help="Template hg38 VCF file",
+    type=Path,
+)
+@click.option(
+    "--output-dir",
+    "-o",
+    metavar="PATH",
+    required=True,
+    help="Path to output prepared corpus.",
+    default="prepared_corpus",
+    type=Path,
+)
+def prepare_corpus_command(
+    phenopacket_dir: Path,
+    variant_analysis: bool,
+    gene_analysis: bool,
+    disease_analysis: bool,
+    gene_identifier: str,
+    hg19_template_vcf: Path,
+    hg38_template_vcf: Path,
+    output_dir: Path,
+):
+    """
+    Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
+    gene identifiers.
+        Args:
+            phenopacket_dir (Path): The path to the directory containing Phenopackets.
+            variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
+            gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
+            disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
+            gene_identifier (str): Identifier for updating gene identifiers, if applicable.
+            hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into
+            VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+            hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
+            VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+            output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
+    """
+    prepare_corpus(
+        phenopacket_dir,
+        variant_analysis,
+        gene_analysis,
+        disease_analysis,
+        gene_identifier,
+        hg19_template_vcf,
+        hg38_template_vcf,
+        output_dir,
+    )

pheval-0.3.5/src/pheval/prepare/prepare_corpus.py ADDED Viewed

@@ -0,0 +1,73 @@
+import logging
+import shutil
+from pathlib import Path
+from pheval.prepare.create_spiked_vcf import create_spiked_vcf
+from pheval.prepare.update_phenopacket import create_updated_phenopacket
+from pheval.utils.file_utils import all_files
+from pheval.utils.phenopacket_utils import PhenopacketUtil, phenopacket_reader
+info_log = logging.getLogger("info")
+def prepare_corpus(
+    phenopacket_dir: Path,
+    variant_analysis: bool,
+    gene_analysis: bool,
+    disease_analysis: bool,
+    gene_identifier: str,
+    hg19_template_vcf: Path,
+    hg38_template_vcf: Path,
+    output_dir: Path,
+) -> None:
+    """
+    Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
+    gene identifiers.
+    Args:
+        phenopacket_dir (Path): The path to the directory containing Phenopackets.
+        variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
+        gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
+        disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
+        gene_identifier (str): Identifier for updating gene identifiers, if applicable.
+        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into
+        VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
+        VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
+        output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
+    """
+    output_dir.joinpath("phenopackets").mkdir(exist_ok=True, parents=True)
+    for phenopacket_path in all_files(phenopacket_dir):
+        phenopacket_util = PhenopacketUtil(phenopacket_reader(phenopacket_path))
+        if variant_analysis:
+            if phenopacket_util.check_incomplete_variant_record():
+                info_log.warning(
+                    f"Removed {phenopacket_path.name} from the corpus due to missing variant fields."
+                )
+                continue
+        if gene_analysis:
+            if phenopacket_util.check_incomplete_gene_record():
+                info_log.warning(
+                    f"Removed {phenopacket_path.name} from the corpus due to missing gene fields."
+                )
+                continue
+        if disease_analysis:
+            if phenopacket_util.check_incomplete_disease_record():
+                info_log.warning(
+                    f"Removed {phenopacket_path.name} from the corpus due to missing disease fields."
+                )
+                continue
+        if hg19_template_vcf or hg38_template_vcf:
+            output_dir.joinpath("vcf").mkdir(exist_ok=True)
+            create_spiked_vcf(
+                output_dir.joinpath("vcf"), phenopacket_path, hg19_template_vcf, hg38_template_vcf
+            )
+        if gene_identifier:
+            create_updated_phenopacket(
+                gene_identifier, phenopacket_path, output_dir.joinpath("phenopackets")
+            )
+        else:
+            # if not updating phenopacket gene identifiers then copy phenopacket as is to output directory
+            shutil.copy(
+                phenopacket_path, output_dir.joinpath(f"phenopackets/{phenopacket_path.name}")
+            )

{pheval-0.3.3 → pheval-0.3.5}/src/pheval/prepare/update_phenopacket.py RENAMED Viewed

@@ -38,8 +38,7 @@ def update_outdated_gene_context(
     interpretations = PhenopacketUtil(phenopacket).interpretations()
     updated_interpretations = GeneIdentifierUpdater(
         hgnc_data=hgnc_data, gene_identifier=gene_identifier
-    ).update_genomic_interpretations_gene_identifier(interpretations)
+    ).update_genomic_interpretations_gene_identifier(interpretations, phenopacket_path)
     return PhenopacketRebuilder(phenopacket).update_interpretations(updated_interpretations)

{pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/phenopacket_utils.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import json
-# import logging
+import logging
 import os
 from collections import defaultdict
 from copy import copy
@@ -22,6 +21,8 @@ from phenopackets import (
 from pheval.prepare.custom_exceptions import IncorrectFileFormatError
+info_log = logging.getLogger("info")
 class IncompatibleGenomeAssemblyError(Exception):
     """Exception raised for incompatible genome assembly."""
@@ -477,6 +478,59 @@ class PhenopacketUtil:
                 variants.append(variant)
         return variants
+    def check_incomplete_variant_record(self) -> bool:
+        """
+        Check if any variant record in the phenopacket has incomplete information.
+        This method iterates through the diagnosed variant records and checks if any of them
+        have missing or incomplete information such as empty chromosome, position, reference,
+        or alternate allele.
+        Returns:
+            bool: True if any variant record is incomplete, False otherwise.
+        """
+        variants = self.diagnosed_variants()
+        for variant in variants:
+            if (
+                variant.chrom == ""
+                or variant.pos == 0
+                or variant.pos == ""
+                or variant.ref == ""
+                or variant.alt == ""
+            ):
+                return True
+        return False
+    def check_incomplete_gene_record(self) -> bool:
+        """
+        Check if any gene record in the phenopacket has incomplete information.
+        This method iterates through the diagnosed gene records and checks if any of them
+        have missing or incomplete information such as gene name, or gene identifier.
+        Returns:
+            bool: True if any gene record is incomplete, False otherwise.
+        """
+        genes = self.diagnosed_genes()
+        for gene in genes:
+            if gene.gene_symbol == "" or gene.gene_identifier == "":
+                return True
+        return False
+    def check_incomplete_disease_record(self) -> bool:
+        """
+        Check if any disease record in the phenopacket has incomplete information.
+        This method iterates through the diagnosed disease records and checks if any of them
+        have missing or incomplete information such as empty disease name, or disease identifier.
+        Returns:
+            bool: True if any disease record is incomplete, False otherwise.
+        """
+        if len(self.diagnoses()) == 0:
+            return True
+        return False
 class PhenopacketRebuilder:
     """Class for rebuilding a Phenopacket"""
@@ -655,7 +709,7 @@ class GeneIdentifierUpdater:
                         ]
     def update_genomic_interpretations_gene_identifier(
-        self, interpretations: List[Interpretation]
+        self, interpretations: List[Interpretation], phenopacket_path: Path
     ) -> List[Interpretation]:
         """
         Update the genomic interpretations of a Phenopacket.
@@ -669,10 +723,16 @@ class GeneIdentifierUpdater:
         updated_interpretations = copy(list(interpretations))
         for updated_interpretation in updated_interpretations:
             for g in updated_interpretation.diagnosis.genomic_interpretations:
+                updated_gene_identifier = self.find_identifier(
+                    g.variant_interpretation.variation_descriptor.gene_context.symbol
+                )
+                info_log.info(
+                    f"Updating gene identifier in {phenopacket_path} from "
+                    f"{g.variant_interpretation.variation_descriptor.gene_context.value_id}"
+                    f"to {updated_gene_identifier}"
+                )
                 g.variant_interpretation.variation_descriptor.gene_context.value_id = (
-                    self.find_identifier(
-                        g.variant_interpretation.variation_descriptor.gene_context.symbol
-                    )
+                    updated_gene_identifier
                 )
                 del g.variant_interpretation.variation_descriptor.gene_context.alternate_ids[:]
                 g.variant_interpretation.variation_descriptor.gene_context.alternate_ids.extend(