pheval 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (56) hide show
  1. {pheval-0.3.3 → pheval-0.3.5}/PKG-INFO +1 -1
  2. {pheval-0.3.3 → pheval-0.3.5}/pyproject.toml +1 -1
  3. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/cli.py +2 -0
  4. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/cli_pheval_utils.py +104 -0
  5. pheval-0.3.5/src/pheval/prepare/prepare_corpus.py +73 -0
  6. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/prepare/update_phenopacket.py +1 -2
  7. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/phenopacket_utils.py +66 -6
  8. {pheval-0.3.3 → pheval-0.3.5}/LICENSE +0 -0
  9. {pheval-0.3.3 → pheval-0.3.5}/README.md +0 -0
  10. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/__init__.py +0 -0
  11. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/__init__.py +0 -0
  12. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/analysis.py +0 -0
  13. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/benchmark_generator.py +0 -0
  14. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/benchmarking_data.py +0 -0
  15. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/binary_classification_stats.py +0 -0
  16. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/disease_prioritisation_analysis.py +0 -0
  17. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/gene_prioritisation_analysis.py +0 -0
  18. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/generate_plots.py +0 -0
  19. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/generate_summary_outputs.py +0 -0
  20. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/parse_benchmark_summary.py +0 -0
  21. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/parse_pheval_result.py +0 -0
  22. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/prioritisation_rank_recorder.py +0 -0
  23. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/prioritisation_result_types.py +0 -0
  24. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/rank_stats.py +0 -0
  25. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/run_data_parser.py +0 -0
  26. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/analyse/variant_prioritisation_analysis.py +0 -0
  27. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/cli_pheval.py +0 -0
  28. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/config_parser.py +0 -0
  29. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/constants.py +0 -0
  30. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/implementations/__init__.py +0 -0
  31. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/infra/__init__.py +0 -0
  32. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/infra/exomiserdb.py +0 -0
  33. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/post_processing/__init__.py +0 -0
  34. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/post_processing/post_processing.py +0 -0
  35. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/prepare/__init__.py +0 -0
  36. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/prepare/create_noisy_phenopackets.py +0 -0
  37. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/prepare/create_spiked_vcf.py +0 -0
  38. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/prepare/custom_exceptions.py +0 -0
  39. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/resources/alternate_ouputs/CADA_results.txt +0 -0
  40. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/resources/alternate_ouputs/DeepPVP_results.txt +0 -0
  41. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/resources/alternate_ouputs/OVA_results.txt +0 -0
  42. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/resources/alternate_ouputs/Phen2Gene_results.json +0 -0
  43. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/resources/alternate_ouputs/Phenolyzer_results.txt +0 -0
  44. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/resources/alternate_ouputs/lirical_results.tsv +0 -0
  45. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/resources/alternate_ouputs/svanna_results.tsv +0 -0
  46. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/resources/hgnc_complete_set.txt +0 -0
  47. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/run_metadata.py +0 -0
  48. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/runners/__init__.py +0 -0
  49. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/runners/runner.py +0 -0
  50. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/__init__.py +0 -0
  51. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/docs_gen.py +0 -0
  52. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/docs_gen.sh +0 -0
  53. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/exomiser.py +0 -0
  54. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/file_utils.py +0 -0
  55. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/semsim_utils.py +0 -0
  56. {pheval-0.3.3 → pheval-0.3.5}/src/pheval/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pheval
3
- Version: 0.3.3
3
+ Version: 0.3.5
4
4
  Summary:
5
5
  Author: Yasemin Bridges
6
6
  Author-email: y.bridges@qmul.ac.uk
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pheval"
3
- version = "0.3.3"
3
+ version = "0.3.5"
4
4
  description = ""
5
5
  authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
6
6
  "Julius Jacobsen <j.jacobsen@qmul.ac.uk>",
@@ -10,6 +10,7 @@ from .cli_pheval_utils import (
10
10
  benchmark_comparison,
11
11
  create_spiked_vcfs_command,
12
12
  generate_stats_plot,
13
+ prepare_corpus_command,
13
14
  scramble_phenopackets_command,
14
15
  semsim_scramble_command,
15
16
  semsim_to_exomiserdb_command,
@@ -60,6 +61,7 @@ pheval_utils.add_command(benchmark)
60
61
  pheval_utils.add_command(benchmark_comparison)
61
62
  pheval_utils.add_command(semsim_to_exomiserdb_command)
62
63
  pheval_utils.add_command(generate_stats_plot)
64
+ pheval_utils.add_command(prepare_corpus_command)
63
65
 
64
66
  if __name__ == "__main__":
65
67
  main()
@@ -15,6 +15,7 @@ from pheval.analyse.run_data_parser import parse_run_data_text_file
15
15
  from pheval.prepare.create_noisy_phenopackets import scramble_phenopackets
16
16
  from pheval.prepare.create_spiked_vcf import spike_vcfs
17
17
  from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
18
+ from pheval.prepare.prepare_corpus import prepare_corpus
18
19
  from pheval.prepare.update_phenopacket import update_phenopackets
19
20
  from pheval.utils.exomiser import semsim_to_exomiserdb
20
21
  from pheval.utils.semsim_utils import percentage_diff, semsim_heatmap_plot
@@ -606,3 +607,106 @@ def generate_stats_plot(
606
607
  generate_plots_from_benchmark_summary_tsv(
607
608
  benchmarking_tsv, gene_analysis, variant_analysis, disease_analysis, plot_type, title
608
609
  )
610
+
611
+
612
@click.command("prepare-corpus")
@click.option(
    "--phenopacket-dir",
    "-p",
    required=True,
    metavar="PATH",
    help="Path to phenopacket corpus directory.",
    type=Path,
)
@click.option(
    "--variant-analysis/--no-variant-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify whether to check for complete variant records in the phenopackets.",
)
@click.option(
    "--gene-analysis/--no-gene-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify whether to check for complete gene records in the phenopackets.",
)
@click.option(
    "--disease-analysis/--no-disease-analysis",
    default=False,
    required=False,
    type=bool,
    show_default=True,
    help="Specify whether to check for complete disease records in the phenopackets.",
)
@click.option(
    "--gene-identifier",
    "-g",
    required=False,
    help="Gene identifier to update in phenopacket",
    type=click.Choice(["ensembl_id", "entrez_id", "hgnc_id"]),
)
@click.option(
    "--hg19-template-vcf",
    "-hg19",
    metavar="PATH",
    required=False,
    help="Template hg19 VCF file",
    type=Path,
)
@click.option(
    "--hg38-template-vcf",
    "-hg38",
    metavar="PATH",
    required=False,
    help="Template hg38 VCF file",
    type=Path,
)
@click.option(
    "--output-dir",
    "-o",
    metavar="PATH",
    # The option has a default, so it can never be "missing"; declaring it
    # required as well was contradictory and mislabelled it in --help.
    required=False,
    help="Path to output prepared corpus.",
    default=Path("prepared_corpus"),
    show_default=True,
    type=Path,
)
def prepare_corpus_command(
    phenopacket_dir: Path,
    variant_analysis: bool,
    gene_analysis: bool,
    disease_analysis: bool,
    gene_identifier: str,
    hg19_template_vcf: Path,
    hg38_template_vcf: Path,
    output_dir: Path,
):
    """
    Prepare a corpus of Phenopackets for analysis, optionally checking for complete records, spiking VCFs
    and updating gene identifiers.

    This command only forwards its options, unchanged, to prepare_corpus.

    Args:
        phenopacket_dir (Path): The path to the directory containing Phenopackets.
        variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
        gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
        disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
        gene_identifier (str): Identifier for updating gene identifiers, if applicable (optional).
        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional). Variants are spiked
        into VCFs only when at least one of hg19_template_vcf or hg38_template_vcf is given.
        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional). Variants are spiked
        into VCFs only when at least one of hg19_template_vcf or hg38_template_vcf is given.
        output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
    """
    prepare_corpus(
        phenopacket_dir,
        variant_analysis,
        gene_analysis,
        disease_analysis,
        gene_identifier,
        hg19_template_vcf,
        hg38_template_vcf,
        output_dir,
    )
@@ -0,0 +1,73 @@
1
+ import logging
2
+ import shutil
3
+ from pathlib import Path
4
+
5
+ from pheval.prepare.create_spiked_vcf import create_spiked_vcf
6
+ from pheval.prepare.update_phenopacket import create_updated_phenopacket
7
+ from pheval.utils.file_utils import all_files
8
+ from pheval.utils.phenopacket_utils import PhenopacketUtil, phenopacket_reader
9
+
10
+ info_log = logging.getLogger("info")
11
+
12
+
13
def prepare_corpus(
    phenopacket_dir: Path,
    variant_analysis: bool,
    gene_analysis: bool,
    disease_analysis: bool,
    gene_identifier: str,
    hg19_template_vcf: Path,
    hg38_template_vcf: Path,
    output_dir: Path,
) -> None:
    """
    Build a prepared corpus from a directory of Phenopackets.

    Every file found in phenopacket_dir is read and, for each enabled analysis type, tested for
    incomplete records; a file failing an enabled test is skipped with a warning. For the remaining
    files a spiked VCF is written when a template VCF was supplied, and the Phenopacket is written to
    the output directory either with updated gene identifiers or as an unmodified copy.

    Args:
        phenopacket_dir (Path): The path to the directory containing Phenopackets.
        variant_analysis (bool): If True, skip Phenopackets with incomplete variant records.
        gene_analysis (bool): If True, skip Phenopackets with incomplete gene records.
        disease_analysis (bool): If True, skip Phenopackets with incomplete disease records.
        gene_identifier (str): Identifier for updating gene identifiers; if empty, Phenopackets are copied.
        hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
        hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
        output_dir (Path): The directory receiving the "phenopackets" and, optionally, "vcf" subdirectories.
    """
    phenopacket_output_dir = output_dir.joinpath("phenopackets")
    phenopacket_output_dir.mkdir(exist_ok=True, parents=True)
    for phenopacket_path in all_files(phenopacket_dir):
        phenopacket_util = PhenopacketUtil(phenopacket_reader(phenopacket_path))
        # Each completeness check only runs when its analysis type was requested.
        if variant_analysis and phenopacket_util.check_incomplete_variant_record():
            info_log.warning(
                f"Removed {phenopacket_path.name} from the corpus due to missing variant fields."
            )
            continue
        if gene_analysis and phenopacket_util.check_incomplete_gene_record():
            info_log.warning(
                f"Removed {phenopacket_path.name} from the corpus due to missing gene fields."
            )
            continue
        if disease_analysis and phenopacket_util.check_incomplete_disease_record():
            info_log.warning(
                f"Removed {phenopacket_path.name} from the corpus due to missing disease fields."
            )
            continue
        if hg19_template_vcf or hg38_template_vcf:
            vcf_output_dir = output_dir.joinpath("vcf")
            vcf_output_dir.mkdir(exist_ok=True)
            create_spiked_vcf(
                vcf_output_dir, phenopacket_path, hg19_template_vcf, hg38_template_vcf
            )
        if not gene_identifier:
            # no gene identifier update requested: copy the phenopacket unchanged to the output directory
            shutil.copy(
                phenopacket_path, output_dir.joinpath(f"phenopackets/{phenopacket_path.name}")
            )
            continue
        create_updated_phenopacket(gene_identifier, phenopacket_path, phenopacket_output_dir)
@@ -38,8 +38,7 @@ def update_outdated_gene_context(
38
38
  interpretations = PhenopacketUtil(phenopacket).interpretations()
39
39
  updated_interpretations = GeneIdentifierUpdater(
40
40
  hgnc_data=hgnc_data, gene_identifier=gene_identifier
41
- ).update_genomic_interpretations_gene_identifier(interpretations)
42
-
41
+ ).update_genomic_interpretations_gene_identifier(interpretations, phenopacket_path)
43
42
  return PhenopacketRebuilder(phenopacket).update_interpretations(updated_interpretations)
44
43
 
45
44
 
@@ -1,6 +1,5 @@
1
1
  import json
2
-
3
- # import logging
2
+ import logging
4
3
  import os
5
4
  from collections import defaultdict
6
5
  from copy import copy
@@ -22,6 +21,8 @@ from phenopackets import (
22
21
 
23
22
  from pheval.prepare.custom_exceptions import IncorrectFileFormatError
24
23
 
24
+ info_log = logging.getLogger("info")
25
+
25
26
 
26
27
  class IncompatibleGenomeAssemblyError(Exception):
27
28
  """Exception raised for incompatible genome assembly."""
@@ -477,6 +478,59 @@ class PhenopacketUtil:
477
478
  variants.append(variant)
478
479
  return variants
479
480
 
481
def check_incomplete_variant_record(self) -> bool:
    """
    Report whether any diagnosed variant in the phenopacket is incomplete.

    A variant counts as incomplete when its chromosome, reference allele or alternate
    allele is an empty string, or when its position is 0 or an empty string.

    Returns:
        bool: True if at least one diagnosed variant is incomplete, False otherwise
        (including when there are no diagnosed variants).
    """
    return any(
        record.chrom == ""
        or record.pos == 0
        or record.pos == ""
        or record.ref == ""
        or record.alt == ""
        for record in self.diagnosed_variants()
    )
503
+
504
def check_incomplete_gene_record(self) -> bool:
    """
    Report whether any diagnosed gene in the phenopacket is incomplete.

    A gene counts as incomplete when its gene symbol or its gene identifier is an
    empty string.

    Returns:
        bool: True if at least one diagnosed gene is incomplete, False otherwise
        (including when there are no diagnosed genes).
    """
    return any(
        record.gene_symbol == "" or record.gene_identifier == ""
        for record in self.diagnosed_genes()
    )
519
+
520
def check_incomplete_disease_record(self) -> bool:
    """
    Check whether the phenopacket has no disease record at all.

    Only the presence of a diagnosis is tested: the result depends solely on whether
    self.diagnoses() is empty. The fields of individual disease records (name,
    identifier) are not inspected.

    Returns:
        bool: True if the phenopacket has no diagnoses, False otherwise.
    """
    return len(self.diagnoses()) == 0
533
+
480
534
 
481
535
  class PhenopacketRebuilder:
482
536
  """Class for rebuilding a Phenopacket"""
@@ -655,7 +709,7 @@ class GeneIdentifierUpdater:
655
709
  ]
656
710
 
657
711
  def update_genomic_interpretations_gene_identifier(
658
- self, interpretations: List[Interpretation]
712
+ self, interpretations: List[Interpretation], phenopacket_path: Path
659
713
  ) -> List[Interpretation]:
660
714
  """
661
715
  Update the genomic interpretations of a Phenopacket.
@@ -669,10 +723,16 @@ class GeneIdentifierUpdater:
669
723
  updated_interpretations = copy(list(interpretations))
670
724
  for updated_interpretation in updated_interpretations:
671
725
  for g in updated_interpretation.diagnosis.genomic_interpretations:
726
+ updated_gene_identifier = self.find_identifier(
727
+ g.variant_interpretation.variation_descriptor.gene_context.symbol
728
+ )
729
+ info_log.info(
730
+ f"Updating gene identifier in {phenopacket_path} from "
731
+ f"{g.variant_interpretation.variation_descriptor.gene_context.value_id}"
732
+ f"to {updated_gene_identifier}"
733
+ )
672
734
  g.variant_interpretation.variation_descriptor.gene_context.value_id = (
673
- self.find_identifier(
674
- g.variant_interpretation.variation_descriptor.gene_context.symbol
675
- )
735
+ updated_gene_identifier
676
736
  )
677
737
  del g.variant_interpretation.variation_descriptor.gene_context.alternate_ids[:]
678
738
  g.variant_interpretation.variation_descriptor.gene_context.alternate_ids.extend(
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes